diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 86b6b7a..8386ec7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -29,7 +29,7 @@ repos: additional_dependencies: ["bandit[toml]"] - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.6.4 + rev: v0.6.7 hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix] diff --git a/maestro/trainer/common/data_loaders/datasets.py b/maestro/trainer/common/data_loaders/datasets.py index 7e0e316..0a2b436 100644 --- a/maestro/trainer/common/data_loaders/datasets.py +++ b/maestro/trainer/common/data_loaders/datasets.py @@ -3,7 +3,6 @@ from typing import Any from PIL import Image -from transformers.pipelines.base import Dataset class JSONLDataset: @@ -34,18 +33,4 @@ def __getitem__(self, idx: int) -> tuple[Image.Image, dict[str, Any]]: except FileNotFoundError: raise FileNotFoundError(f"Image file {image_path} not found.") else: - return (image, entry) - - -class DetectionDataset(Dataset): - def __init__(self, jsonl_file_path: str, image_directory_path: str) -> None: - self.dataset = JSONLDataset(jsonl_file_path, image_directory_path) - - def __len__(self) -> int: - return len(self.dataset) - - def __getitem__(self, idx): - image, data = self.dataset[idx] - prefix = data["prefix"] - suffix = data["suffix"] - return prefix, suffix, image + return image, entry diff --git a/maestro/trainer/common/data_loaders/loaders.py b/maestro/trainer/common/data_loaders/loaders.py new file mode 100644 index 0000000..e69de29 diff --git a/maestro/trainer/models/florence_2/loaders.py b/maestro/trainer/models/florence_2/loaders.py index c35bff5..be8b6e1 100644 --- a/maestro/trainer/models/florence_2/loaders.py +++ b/maestro/trainer/models/florence_2/loaders.py @@ -7,8 +7,23 @@ from PIL import Image from torch.utils.data import DataLoader from transformers import AutoProcessor +from transformers.pipelines.base import Dataset -from maestro.trainer.common.data_loaders.datasets import DetectionDataset +from 
maestro.trainer.common.data_loaders.datasets import JSONLDataset + + +class Florence2Dataset(Dataset): + def __init__(self, jsonl_file_path: str, image_directory_path: str) -> None: + self.dataset = JSONLDataset(jsonl_file_path, image_directory_path) + + def __len__(self) -> int: + return len(self.dataset) + + def __getitem__(self, idx): + image, data = self.dataset[idx] + prefix = data["prefix"] + suffix = data["suffix"] + return prefix, suffix, image def create_data_loaders( @@ -85,7 +100,7 @@ def create_split_data_loader( def load_split_dataset( dataset_location: str, split_name: str, -) -> Optional[DetectionDataset]: +) -> Optional[Florence2Dataset]: image_directory_path = os.path.join(dataset_location, split_name) jsonl_file_path = os.path.join(dataset_location, split_name, "annotations.jsonl") if not os.path.exists(image_directory_path): @@ -94,7 +109,7 @@ def load_split_dataset( if not os.path.exists(jsonl_file_path): logging.warning(f"Could not find JSONL file: {jsonl_file_path}") return None - return DetectionDataset( + return Florence2Dataset( jsonl_file_path=jsonl_file_path, image_directory_path=image_directory_path, ) diff --git a/maestro/trainer/models/florence_2/metrics.py b/maestro/trainer/models/florence_2/metrics.py index 501bc96..5e9980b 100644 --- a/maestro/trainer/models/florence_2/metrics.py +++ b/maestro/trainer/models/florence_2/metrics.py @@ -5,7 +5,7 @@ from PIL import Image from transformers import AutoProcessor -from maestro.trainer.common.data_loaders.datasets import DetectionDataset +from maestro.trainer.models.florence_2.loaders import Florence2Dataset DETECTION_CLASS_PATTERN = r"([a-zA-Z0-9 -]+)" @@ -59,7 +59,7 @@ def process_output_for_text_metric( return predictions -def get_unique_detection_classes(dataset: DetectionDataset) -> list[str]: +def get_unique_detection_classes(dataset: Florence2Dataset) -> list[str]: class_set = set() for i in range(len(dataset)): _, suffix, _ = dataset[i] diff --git 
a/maestro/trainer/models/qwen2_vl/__init__.py b/maestro/trainer/models/qwen2_vl/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/maestro/trainer/models/qwen2_vl/checkpoints.py b/maestro/trainer/models/qwen2_vl/checkpoints.py new file mode 100644 index 0000000..4ea3039 --- /dev/null +++ b/maestro/trainer/models/qwen2_vl/checkpoints.py @@ -0,0 +1 @@ +DEFAULT_QWEN2_VL_MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct" diff --git a/maestro/trainer/models/qwen2_vl/loaders.py b/maestro/trainer/models/qwen2_vl/loaders.py new file mode 100644 index 0000000..f4fe02f --- /dev/null +++ b/maestro/trainer/models/qwen2_vl/loaders.py @@ -0,0 +1,75 @@ +from transformers.pipelines.base import Dataset + +from maestro.trainer.common.data_loaders.datasets import JSONLDataset + +START_TOKEN_1 = 151644 +START_TOKEN_2 = 77091 +END_TOKEN = 151645 + + +def extract_assistant_content_ranges(token_list: list[int]) -> list[tuple[int, int]]: + """ + Identify the start and end indexes of assistant content ranges within a list of + tokens. + + The function searches for sequences that mark the start and end of assistant content + in the tokenized list, returning the corresponding index ranges. + + Args: + token_list (list[int]): A list of tokens to search. + + Returns: + list[tuple[int, int]]: A list of (start_index, end_index) tuples indicating the + assistant content ranges in the input list. + + Note: - Assistant content starts with the sequence [START_TOKEN_1, START_TOKEN_2], + which corresponds to the tokenized value of `"<|im_start|>assistant"`. + - Assistant content ends with END_TOKEN, which corresponds to the tokenized + value of `"<|im_end|>"`. + - Each start sequence has a corresponding end token. 
+ """ + start_indexes = [] + end_indexes = [] + + for i in range(len(token_list) - 1): + if token_list[i] == START_TOKEN_1 and token_list[i + 1] == START_TOKEN_2: + start_indexes.append(i) + for j in range(i + 2, len(token_list)): + if token_list[j] == END_TOKEN: + end_indexes.append(j) + break + + return list(zip(start_indexes, end_indexes)) + + +class Qwen2VLDataset(Dataset): + def __init__(self, jsonl_file_path: str, image_directory_path: str) -> None: + self.dataset = JSONLDataset(jsonl_file_path, image_directory_path) + + def __len__(self) -> int: + return len(self.dataset) + + def __getitem__(self, idx): + image, data = self.dataset[idx] + prefix = data["prefix"] + suffix = data["suffix"] + # fmt: off + return { + "messages": [ + { + "role": "user", + "content": [ + {"type": "image", "image": image}, + {"type": "text", "text": prefix} + ] + }, + { + "role": "assistant", + "content": [ + {"type": "text", "text": suffix} + ] + } + ] + } + # fmt: on diff --git a/pyproject.toml b/pyproject.toml index c3c15ca..e56900d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ classifiers = [ dependencies = [ "supervision~=0.24.0rc1", "requests>=2.31.0,<=2.32.3", - "transformers~=4.44.2", + "transformers @ git+https://github.com/huggingface/transformers", "torch~=2.4.0", "accelerate>=0.33,<0.35", "sentencepiece~=0.2.0",