2 changes: 2 additions & 0 deletions .github/workflows/cicd-main.yml
Expand Up @@ -379,6 +379,8 @@ jobs:
- script: L2_Launch_models_qwen_quantization
- script: L2_Launch_models_qwen_vl
- script: L2_Launch_models_qwen35_vl
- script: L2_Launch_models_flux
- script: L2_Launch_models_wan
- script: L2_Launch_recipes_gemma_vl
- script: L2_Launch_recipes_gpt_oss
- script: L2_Launch_models_qwen_vl_quantization
Expand Down
3 changes: 3 additions & 0 deletions examples/diffusion/README.md
@@ -0,0 +1,3 @@
# Megatron Examples

Recipes and configuration overrides for Megatron training.
3 changes: 3 additions & 0 deletions examples/diffusion/override_configs/README.md
@@ -0,0 +1,3 @@
# Override Configs

Parallelism configuration overrides for different CP/TP/SP sizes.
44 changes: 44 additions & 0 deletions examples/diffusion/override_configs/wan_pretrain_sample_data.yaml
@@ -0,0 +1,44 @@
# WAN Pretrain Mock Data Test Configuration
# Converted from L2_Function_Tests_GPU_Wan_Mock_Data.sh

model:
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  context_parallel_size: 1
  crossattn_emb_size: 1536
  hidden_size: 1536
  ffn_hidden_size: 8960
  num_attention_heads: 12
  num_layers: 3
  qkv_format: thd
  seq_length: 2048

train:
  eval_iters: 0
  train_iters: 10
  global_batch_size: 2
  micro_batch_size: 1

optimizer:
  lr: 5.0e-6
  min_lr: 5.0e-6

scheduler:
  lr_decay_style: constant
  lr_warmup_iters: 0

checkpoint:
  save: ${oc.env:CHECKPOINT_DIR,null}
  load: ${oc.env:CHECKPOINT_DIR,null}
  load_optim: false
  save_interval: 200

dataset:
  path: ${oc.env:DATASET_PATH,null}
  seq_length: 2048
  global_batch_size: 2
  micro_batch_size: 1
  packing_buffer_size: 50

logger:
  log_interval: 1
3 changes: 3 additions & 0 deletions examples/diffusion/recipes/README.md
@@ -0,0 +1,3 @@
# Recipes

Training recipes for Wan2.1 pretraining, fine-tuning, and weight verification.
Empty file.
158 changes: 158 additions & 0 deletions examples/diffusion/recipes/flux/README.md
@@ -0,0 +1,158 @@
# FLUX Examples

This directory contains example scripts for the FLUX diffusion model (text-to-image) with Megatron-Bridge: checkpoint conversion, inference, pretraining, and fine-tuning.

All commands below assume you run them from the **Megatron-Bridge repository root** unless noted. Use `uv run` when you need the project’s virtualenv (e.g. `uv run python ...`, `uv run torchrun ...`).

## Workspace Configuration

Use a `WORKSPACE` environment variable as the base directory for checkpoints and results. Default is `/workspace`. Override it if needed:

```bash
export WORKSPACE=/your/custom/path
```

Suggested layout:

- `${WORKSPACE}/checkpoints/flux/` – Megatron FLUX checkpoints (after import)
- `${WORKSPACE}/checkpoints/flux_hf/` – Hugging Face FLUX model (download or export)
- `${WORKSPACE}/results/flux/` – Training outputs (pretrain/finetune)
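
One way to set up the suggested layout up front (these paths are only the suggestions above; adjust to taste):

```shell
# Create the suggested workspace layout; override WORKSPACE as needed.
export WORKSPACE=${WORKSPACE:-/workspace}
mkdir -p "${WORKSPACE}/checkpoints/flux" \
         "${WORKSPACE}/checkpoints/flux_hf" \
         "${WORKSPACE}/results/flux"
```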

---

## 1. Checkpoint Conversion

The script [conversion/convert_checkpoints.py](conversion/convert_checkpoints.py) converts between Hugging Face (diffusers) and Megatron checkpoint formats.

**Source model:** [black-forest-labs/FLUX.1-dev](https://huggingface.co/black-forest-labs/FLUX.1-dev) (or a local clone).

### Download the Hugging Face model (optional)

If you want a local copy before conversion:

```bash
huggingface-cli download black-forest-labs/FLUX.1-dev \
  --local-dir ${WORKSPACE}/checkpoints/flux_hf/flux.1-dev \
  --local-dir-use-symlinks False
```

**Note**: Keeping this local copy is recommended: the inference pipeline later reuses its VAE and text encoders.

### Import: Hugging Face → Megatron

Convert a Hugging Face FLUX model to Megatron format:

```bash
uv run python examples/diffusion/recipes/flux/conversion/convert_checkpoints.py import \
  --hf-model ${WORKSPACE}/checkpoints/flux_hf/flux.1-dev \
  --megatron-path ${WORKSPACE}/checkpoints/flux/flux.1-dev
```

The Megatron checkpoint is written under `--megatron-path` (e.g. `.../flux.1-dev/iter_0000000/`). Use that path for inference and fine-tuning.

### Export: Megatron → Hugging Face

Export a Megatron checkpoint back to Hugging Face (e.g. for use in diffusers). You must pass the **reference** HF model (for config and non-DiT components) and the **Megatron iteration directory**:

```bash
uv run python examples/diffusion/recipes/flux/conversion/convert_checkpoints.py export \
  --hf-model ${WORKSPACE}/checkpoints/flux_hf/flux.1-dev \
  --megatron-path ${WORKSPACE}/checkpoints/flux/flux.1-dev/iter_0000000 \
  --hf-path ${WORKSPACE}/checkpoints/flux_hf/flux.1-dev_export
```

**Note:** The exported directory contains only the DiT transformer weights. For a full pipeline (VAE, text encoders, etc.), copy the original HF repo and replace its `transformer` folder with the exported one.
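
The copy-and-replace step can be sketched as a small helper. `assemble_pipeline` is a hypothetical name (not part of this repo), and the paths in the commented invocation follow the layout above:

```shell
# Hypothetical helper: build a full diffusers pipeline directory by
# swapping the exported DiT weights into a copy of the original repo.
assemble_pipeline() {
  local src=$1 exported=$2 dest=$3
  cp -r "$src" "$dest"                   # start from the complete original repo
  rm -rf "$dest/transformer"             # drop the original transformer weights
  cp -r "$exported" "$dest/transformer"  # swap in the exported weights
}

# Example invocation with the layout above:
# assemble_pipeline "${WORKSPACE}/checkpoints/flux_hf/flux.1-dev" \
#                   "${WORKSPACE}/checkpoints/flux_hf/flux.1-dev_export" \
#                   "${WORKSPACE}/checkpoints/flux_hf/flux.1-dev_full"
```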

---

## 2. Inference

The script [inference_flux.py](inference_flux.py) runs text-to-image generation with a Megatron-format FLUX checkpoint. You need:

- **FLUX checkpoint:** Megatron DiT (e.g. from the import step above).
- **VAE:** Path to VAE weights (often inside the same HF repo as FLUX, e.g. the `vae` directory that sits next to `transformer`, or a separate VAE checkpoint).
- **Text encoders:** T5 and CLIP are loaded from Hugging Face by default; you can override with local paths.

### Single prompt (default 1024×1024, 10 steps)

```bash
uv run python examples/diffusion/recipes/flux/inference_flux.py \
  --flux_ckpt ${WORKSPACE}/checkpoints/flux/flux.1-dev/iter_0000000 \
  --vae_ckpt ${WORKSPACE}/checkpoints/flux_hf/flux.1-dev/vae \
  --prompts "a dog holding a sign that says hello world" \
  --output_path ./flux_output
```


**VAE path:** If you downloaded FLUX.1-dev with `huggingface-cli`, the VAE is usually in the same repo (e.g. `${WORKSPACE}/checkpoints/flux_hf/flux.1-dev/vae`); use the path to the VAE subfolder or the main repo, depending on how the pipeline expects it.

---

## 3. Pretraining

The script [pretrain_flux.py](pretrain_flux.py) runs FLUX pretraining with the `pretrain_config()` recipe. Configuration can be overridden with Hydra-style CLI keys.

**Recipe:** [megatron.bridge.diffusion.recipes.flux.flux.pretrain_config](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/src/megatron/bridge/diffusion/recipes/flux/flux.py)

### Quick run with mock data (single node, 8 GPUs)

```bash
uv run torchrun --nproc_per_node=8 examples/diffusion/recipes/flux/pretrain_flux.py --mock
```

### With CLI overrides only

```bash
uv run torchrun --nproc_per_node=8 examples/diffusion/recipes/flux/pretrain_flux.py --mock \
  model.tensor_model_parallel_size=4 \
  train.train_iters=10000 \
  optimizer.lr=1e-4
```


### Flow matching options

```bash
uv run torchrun --nproc_per_node=8 examples/diffusion/recipes/flux/pretrain_flux.py --mock \
  --timestep-sampling logit_normal \
  --flow-shift 1.0 \
  --use-loss-weighting
```

Before pretraining with real data, set the dataset in the recipe or in your YAML/CLI (e.g. `data_paths`, dataset blend, and cache paths). For data preprocessing, see the Megatron-Bridge data tutorials.
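
For reference, a real-data override file might mirror the mock-data WAN config added in this PR (whose `dataset` block uses `path` and batch-size keys). The keys below are illustrative only and must be checked against the FLUX `pretrain_config()` recipe:

```yaml
# Illustrative only: key names follow the WAN override config in this
# repo and may differ for FLUX; verify against pretrain_config().
dataset:
  path: ${oc.env:DATASET_PATH,null}  # or a concrete dataset path
  global_batch_size: 64
  micro_batch_size: 1

train:
  train_iters: 100000
```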

---

## 4. Fine-Tuning

The script [finetune_flux.py](finetune_flux.py) fine-tunes a pretrained FLUX checkpoint (Megatron format). It loads model weights and resets optimizer and step count; config can be overridden via YAML and CLI as with pretraining.

Point `--load-checkpoint` at the **Megatron checkpoint directory** (either the base dir, e.g. `.../flux.1-dev`, or a specific iteration, e.g. `.../flux.1-dev/iter_0000000`):

```bash
uv run torchrun --nproc_per_node=8 examples/diffusion/recipes/flux/finetune_flux.py \
  --load-checkpoint ${WORKSPACE}/checkpoints/flux/flux.1-dev/iter_0000000 \
  --mock
```

**Note**: If you pass a path that ends with an `iter_XXXXXXX` directory, the script loads that iteration; otherwise it uses the latest iteration under the given path.
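
The latest-iteration rule can be mimicked from the shell, e.g. to check which checkpoint would be picked up. `latest_iter` is a hypothetical helper, not the script's own logic; it relies on the zero-padded `iter_*` names sorting lexicographically:

```shell
# Hypothetical helper: print the newest iter_* directory under a
# Megatron checkpoint root (zero-padded names sort lexicographically).
latest_iter() {
  ls -d "$1"/iter_* 2>/dev/null | sort | tail -n 1
}

# latest_iter "${WORKSPACE}/checkpoints/flux/flux.1-dev"
```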

**Note**: Loss may explode when training on a mock dataset.

---

## Summary: End-to-End Flow

1. **Conversion (HF → Megatron)**
Download FLUX.1-dev (optional), then run the `import` command. Use the created `iter_0000000` path as your Megatron checkpoint.

2. **Inference**
Run [inference_flux.py](inference_flux.py) with `--flux_ckpt` (Megatron `iter_*` path), `--vae_ckpt`, and `--prompts`.

3. **Pretraining**
Run [pretrain_flux.py](pretrain_flux.py) with `--mock` or your data config; optionally use `--config-file` and CLI overrides.

4. **Fine-Tuning**
Run [finetune_flux.py](finetune_flux.py) with `--load-checkpoint` set to a Megatron checkpoint (from the import step or a pretrain/finetune output), plus `--mock` or your data config and overrides.

For more details, see the docstrings in each script and the recipe in `src/megatron/bridge/diffusion/recipes/flux/flux.py`.
@@ -0,0 +1,56 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Example FLUX pretrain configuration override file
# This file shows common overrides for FLUX pretraining

# Model configuration
model:
  # Parallelism settings
  tensor_model_parallel_size: 4
  pipeline_model_parallel_size: 1
  context_parallel_size: 1
  sequence_parallel: false

  # FLUX architecture (FLUX-schnell defaults)
  num_joint_layers: 19
  num_single_layers: 38
  hidden_size: 3072
  num_attention_heads: 24
  in_channels: 64
  context_dim: 4096

  # For FLUX-dev, set guidance_embed: true
  guidance_embed: false
  guidance_scale: 3.5

# Training configuration
train:
  train_iters: 10000
  eval_interval: 2000
  eval_iters: 32
  global_batch_size: 64
  micro_batch_size: 1

# Optimizer configuration
optimizer:
  lr: 1.0e-4

# Checkpoint configuration
checkpoint:
  save_interval: 2000

# Logger configuration
logger:
  log_interval: 1