# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from functools import partial
from typing import Any, Callable

import torch

from datasets import Dataset, load_dataset
from datasets.distributed import split_dataset_by_node
from torch.distributed.checkpoint.stateful import Stateful
from torch.utils.data import IterableDataset

from torchtitan.components.dataloader import ParallelAwareDataloader
from torchtitan.components.tokenizer import BaseTokenizer
from torchtitan.config import JobConfig
from torchtitan.hf_datasets import DatasetConfig
from torchtitan.tools.logging import logger


def _load_c4_dataset(dataset_path: str, split: str):
    """Load the C4 dataset in streaming mode with the default English config."""
    return load_dataset(dataset_path, name="en", split=split, streaming=True)


def _process_c4_text(sample: dict[str, Any]) -> str:
    """Extract the raw text field from a C4 sample."""
    return sample["text"]


# Add your dataset here - more information at docs/datasets.md
DATASETS = {
    "c4": DatasetConfig(
        path="allenai/c4",
        loader=partial(_load_c4_dataset, split="train"),
        sample_processor=_process_c4_text,
    ),
    "c4_test": DatasetConfig(
        path="tests/assets/c4_test",
        loader=lambda path: load_dataset(path, split="train"),
        sample_processor=_process_c4_text,
    ),
    "c4_validation": DatasetConfig(
        path="allenai/c4",
        loader=partial(_load_c4_dataset, split="validation"),
        sample_processor=_process_c4_text,
    ),
}
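
# A hypothetical registration sketch (see docs/datasets.md): the "my_corpus"
# name, file path, and "text" field below are illustrative, not part of
# torchtitan. Any dataset can be plugged in as long as the loader returns a
# Hugging Face dataset and the sample processor maps one sample to a string.
#
# DATASETS["my_corpus"] = DatasetConfig(
#     path="data/my_corpus.jsonl",
#     loader=lambda path: load_dataset("json", data_files=path, split="train"),
#     sample_processor=lambda sample: sample["text"],
# )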


def _validate_dataset(
    dataset_name: str, dataset_path: str | None = None
) -> tuple[str, Callable, Callable]:
    """Validate the dataset name and resolve its path, loader, and sample processor."""
    if dataset_name not in DATASETS:
        raise ValueError(
            f"Dataset {dataset_name} is not supported. "
            f"Supported datasets are: {list(DATASETS.keys())}"
        )

    config = DATASETS[dataset_name]
    path = dataset_path or config.path
    logger.info(f"Preparing {dataset_name} dataset from {path}")
    return path, config.loader, config.sample_processor


class HuggingFaceTextDataset(IterableDataset, Stateful):
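    """Streams samples from a Hugging Face text dataset, tokenizes them, and
    packs the tokens into fixed-length sequences for next-token prediction.

    Implements ``Stateful`` so training can checkpoint and resume its exact
    position in the data stream; data is sharded across data-parallel ranks
    via ``split_dataset_by_node``.
    """
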
    def __init__(
        self,
        dataset_name: str,
        dataset_path: str | None,
        tokenizer: BaseTokenizer,
        seq_len: int = 2048,
        dp_rank: int = 0,
        dp_world_size: int = 1,
        infinite: bool = False,
    ) -> None:
        # Force lowercase for consistent comparison
        dataset_name = dataset_name.lower()

        path, dataset_loader, text_processor = _validate_dataset(
            dataset_name, dataset_path
        )
        ds = dataset_loader(path)

        self.dataset_name = dataset_name
        self._data = split_dataset_by_node(ds, dp_rank, dp_world_size)
        self._tokenizer = tokenizer
        self.seq_len = seq_len
        self.infinite = infinite
        self._text_processor = text_processor

        # Variables for checkpointing
        self._sample_idx = 0
        self._token_buffer: list[int] = []

    def _get_data_iter(self):
        # For map-style datasets, resume by skipping to the correct index
        # For iterable-style datasets, the underlying iterator already points to the correct index
        if isinstance(self._data, Dataset):
            if self._sample_idx == len(self._data):
                return iter([])
            else:
                return iter(self._data.skip(self._sample_idx))

        return iter(self._data)

    def __iter__(self):
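        """Yield ``({"input": input_ids}, labels)`` pairs in which both tensors
        have length ``seq_len`` and labels are the inputs shifted by one token
        for next-token prediction."""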
        # Buffer one extra token so input and label can be shifted by one
        max_buffer_token_len = 1 + self.seq_len

        while True:
            for sample in self._get_data_iter():
                # Use the dataset-specific text processor
                sample_text = self._text_processor(sample)
                sample_tokens = self._tokenizer.encode(
                    sample_text, add_bos=True, add_eos=True
                )
                self._token_buffer.extend(sample_tokens)
                self._sample_idx += 1

                while len(self._token_buffer) >= max_buffer_token_len:
                    x = torch.LongTensor(self._token_buffer[:max_buffer_token_len])
                    # Keep the leftover tokens for the next sequence
                    self._token_buffer = self._token_buffer[max_buffer_token_len:]
                    input = x[:-1]
                    label = x[1:]
                    yield {"input": input}, label

            if not self.infinite:
                logger.warning(f"Dataset {self.dataset_name} has run out of data")
                break
            else:
                # Reset offset for the next iteration
                self._sample_idx = 0
                logger.warning(f"Dataset {self.dataset_name} is being re-looped")
                # Ensures re-looping a dataset loaded from a checkpoint works correctly
                if not isinstance(self._data, Dataset):
                    if hasattr(self._data, "set_epoch") and hasattr(
                        self._data, "epoch"
                    ):
                        self._data.set_epoch(self._data.epoch + 1)

    def load_state_dict(self, state_dict):
        self._token_buffer = state_dict["token_buffer"]

        if isinstance(self._data, Dataset):
            self._sample_idx = state_dict["sample_idx"]
        else:
            assert "data" in state_dict
            self._data.load_state_dict(state_dict["data"])

    def state_dict(self):
        _state_dict = {"token_buffer": self._token_buffer}

        if isinstance(self._data, Dataset):
            _state_dict["sample_idx"] = self._sample_idx
        else:
            # Save the iterable dataset's state to later efficiently resume from it
            # https://huggingface.co/docs/datasets/v3.5.0/en/stream#save-a-dataset-checkpoint-and-resume-iteration
            _state_dict["data"] = self._data.state_dict()

        return _state_dict



def build_text_dataloader(
    dp_world_size: int,
    dp_rank: int,
    tokenizer: BaseTokenizer,
    job_config: JobConfig,
    infinite: bool = True,
) -> ParallelAwareDataloader:
    """Build a data loader for HuggingFace datasets."""
    dataset_name = job_config.training.dataset
    dataset_path = job_config.training.dataset_path
    batch_size = job_config.training.local_batch_size
    seq_len = job_config.training.seq_len

    hf_ds = HuggingFaceTextDataset(
        dataset_name=dataset_name,
        dataset_path=dataset_path,
        tokenizer=tokenizer,
        seq_len=seq_len,
        dp_rank=dp_rank,
        dp_world_size=dp_world_size,
        infinite=infinite,
    )

    return ParallelAwareDataloader(
        dataset=hf_ds,
        dp_rank=dp_rank,
        dp_world_size=dp_world_size,
        batch_size=batch_size,
    )


def build_text_validation_dataloader(
    dp_world_size: int,
    dp_rank: int,
    tokenizer: BaseTokenizer,
    job_config: JobConfig,
    infinite: bool = False,
) -> ParallelAwareDataloader:
    """Build a validation data loader for HuggingFace datasets."""
    dataset_name = job_config.validation.dataset
    dataset_path = job_config.validation.dataset_path
    batch_size = job_config.validation.local_batch_size
    seq_len = job_config.validation.seq_len

    hf_ds = HuggingFaceTextDataset(
        dataset_name=dataset_name,
        dataset_path=dataset_path,
        tokenizer=tokenizer,
        seq_len=seq_len,
        dp_rank=dp_rank,
        dp_world_size=dp_world_size,
        infinite=infinite,
    )

    return ParallelAwareDataloader(
        dataset=hf_ds,
        dp_rank=dp_rank,
        dp_world_size=dp_world_size,
        batch_size=batch_size,
    )
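
# Minimal usage sketch (hypothetical; assumes a JobConfig whose training
# section selects "c4" and a concrete BaseTokenizer instance named tokenizer):
#
# dataloader = build_text_dataloader(
#     dp_world_size=1,
#     dp_rank=0,
#     tokenizer=tokenizer,
#     job_config=job_config,
# )
# inputs, labels = next(iter(dataloader))  # inputs["input"]: [batch, seq_len]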