diff --git a/examples/models/vlm/qwen3_vl/README.md b/examples/models/vlm/qwen3_vl/README.md index da9da325a1..b93438e402 100644 --- a/examples/models/vlm/qwen3_vl/README.md +++ b/examples/models/vlm/qwen3_vl/README.md @@ -13,13 +13,16 @@ export WORKSPACE=/your/custom/path ``` Directory structure: + - `${WORKSPACE}/models/` - Converted checkpoints - `${WORKSPACE}/results/` - Training outputs and experiment results ## Checkpoint Conversion ### Import HF → Megatron + To import the HF VL model to your desired Megatron path: + ```bash python examples/conversion/convert_checkpoints.py import \ --hf-model Qwen/Qwen3-VL-8B-Instruct \ @@ -27,6 +30,7 @@ python examples/conversion/convert_checkpoints.py import \ ``` ### Export Megatron → HF + ```bash python examples/conversion/convert_checkpoints.py export \ --hf-model Qwen/Qwen3-VL-8B-Instruct \ @@ -48,15 +52,18 @@ python -m torch.distributed.run --nproc_per_node=4 examples/conversion/hf_to_meg ``` Note: + - `--megatron_model_path` is optional. If not specified, the script will convert the model and then run forward. - You can also use image URLs: `--image_path="https://example.com/image.jpg"` See the [inference.sh](inference.sh) script for commands to: + - Run inference with Hugging Face checkpoints - Run inference with imported Megatron checkpoints - Run inference with exported Hugging Face checkpoints **Expected output:** + ``` ... Generation step 46 @@ -88,8 +95,9 @@ Here is a breakdown of the key specifications: - `qwen3_vl_8b_finetune_config`: Finetuning for 8B VL model with PEFT support - `qwen3_vl_30b_a3b_finetune_config`: Finetuning for 30B-A3B VL model with PEFT support - `qwen3_vl_235b_a22b_finetune_config`: Finetuning for 235B-A22B VL model with PEFT support - + Before training, ensure the following environment variables are set: + 1. `HF_TOKEN`: to download models from HF Hub (if required) 2. `HF_HOME`: (optional) to avoid re-downloading models and datasets 3. `WANDB_API_KEY`: (optional) to enable WandB logging @@ -125,7 +133,7 @@ Follow the instructions [here](https://github.com/NVIDIA/Megatron-LM/tree/main/e __module__: megatron.bridge.recipes.qwen_vl.data.energon.task_encoder __class__: ChatMLWebdataset field_map: - imgs: jpg + imgs: jpgs conversation: json ``` diff --git a/src/megatron/bridge/recipes/qwen_vl/data/energon/task_encoder.py b/src/megatron/bridge/recipes/qwen_vl/data/energon/task_encoder.py index b6301ef5c5..5e8c674ed7 100644 --- a/src/megatron/bridge/recipes/qwen_vl/data/energon/task_encoder.py +++ b/src/megatron/bridge/recipes/qwen_vl/data/energon/task_encoder.py @@ -216,17 +216,30 @@ def __init__(self, imagespec): self.extensions_mapping = {"jpgs": "jpg", "mp4s": "jpg", "videos": "jpg"} self.image_handler = imagehandler(imagespec) + def _convert_to_tensor(self, data): + """Convert numpy array or bytes to tensor. + + The wds conversion script stores images as numpy arrays (HWC, uint8), + so we need to handle both numpy arrays and raw bytes. + """ + if isinstance(data, np.ndarray): + # Data is already a numpy array (HWC, uint8) from pickle + # Convert to tensor (CHW, float32 in [0,1]) + return torch.from_numpy(data).permute(2, 0, 1).float() / 255.0 + else: + # Data is raw bytes, use imagehandler to decode + return self.image_handler("jpg", data) + def __call__(self, key, data): """Perform nested image decoding.""" extension = re.sub(r".*[.]", "", key) if extension.lower() not in self.extensions: return None data = pickle.loads(data) - key = self.extensions_mapping[extension] if extension.lower() == "jpgs": - data = [self.image_handler(key, d) for d in data] + data = [self._convert_to_tensor(d) for d in data] else: - data = [[self.image_handler(key, d) for d in video] for video in data] + data = [[self._convert_to_tensor(d) for d in video] for video in data] return data