From 879bc3d0ea7053cc526105d608561de245eb7dfb Mon Sep 17 00:00:00 2001 From: SkalskiP Date: Tue, 24 Sep 2024 14:41:03 +0200 Subject: [PATCH 01/10] initial commit --- maestro/trainer/models/qwen2-vl/__init__.py | 0 maestro/trainer/models/qwen2-vl/checkpoints.py | 1 + 2 files changed, 1 insertion(+) create mode 100644 maestro/trainer/models/qwen2-vl/__init__.py create mode 100644 maestro/trainer/models/qwen2-vl/checkpoints.py diff --git a/maestro/trainer/models/qwen2-vl/__init__.py b/maestro/trainer/models/qwen2-vl/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/maestro/trainer/models/qwen2-vl/checkpoints.py b/maestro/trainer/models/qwen2-vl/checkpoints.py new file mode 100644 index 0000000..4ea3039 --- /dev/null +++ b/maestro/trainer/models/qwen2-vl/checkpoints.py @@ -0,0 +1 @@ +DEFAULT_FLORENCE2_MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct" From bd3f74046b9626aed63f63203c768ad247620746 Mon Sep 17 00:00:00 2001 From: SkalskiP Date: Wed, 25 Sep 2024 14:13:31 +0200 Subject: [PATCH 02/10] reformat `DetectionDataset` to `Florence2Dataset` and update deps --- .../trainer/common/data_loaders/datasets.py | 17 +-------------- .../trainer/common/data_loaders/loaders.py | 0 maestro/trainer/models/florence_2/loaders.py | 21 ++++++++++++++++--- maestro/trainer/models/florence_2/metrics.py | 4 ++-- 4 files changed, 21 insertions(+), 21 deletions(-) create mode 100644 maestro/trainer/common/data_loaders/loaders.py diff --git a/maestro/trainer/common/data_loaders/datasets.py b/maestro/trainer/common/data_loaders/datasets.py index 7e0e316..0a2b436 100644 --- a/maestro/trainer/common/data_loaders/datasets.py +++ b/maestro/trainer/common/data_loaders/datasets.py @@ -3,7 +3,6 @@ from typing import Any from PIL import Image -from transformers.pipelines.base import Dataset class JSONLDataset: @@ -34,18 +33,4 @@ def __getitem__(self, idx: int) -> tuple[Image.Image, dict[str, Any]]: except FileNotFoundError: raise FileNotFoundError(f"Image file {image_path} not found.") else: - return (image, entry) - - -class DetectionDataset(Dataset): - def __init__(self, jsonl_file_path: str, image_directory_path: str) -> None: - self.dataset = JSONLDataset(jsonl_file_path, image_directory_path) - - def __len__(self) -> int: - return len(self.dataset) - - def __getitem__(self, idx): - image, data = self.dataset[idx] - prefix = data["prefix"] - suffix = data["suffix"] - return prefix, suffix, image + return image, entry diff --git a/maestro/trainer/common/data_loaders/loaders.py b/maestro/trainer/common/data_loaders/loaders.py new file mode 100644 index 0000000..e69de29 diff --git a/maestro/trainer/models/florence_2/loaders.py b/maestro/trainer/models/florence_2/loaders.py index c35bff5..be8b6e1 100644 --- a/maestro/trainer/models/florence_2/loaders.py +++ b/maestro/trainer/models/florence_2/loaders.py @@ -7,8 +7,23 @@ from PIL import Image from torch.utils.data import DataLoader from transformers import AutoProcessor +from transformers.pipelines.base import Dataset -from maestro.trainer.common.data_loaders.datasets import DetectionDataset +from maestro.trainer.common.data_loaders.datasets import JSONLDataset + + +class Florence2Dataset(Dataset): + def __init__(self, jsonl_file_path: str, image_directory_path: str) -> None: + self.dataset = JSONLDataset(jsonl_file_path, image_directory_path) + + def __len__(self) -> int: + return len(self.dataset) + + def __getitem__(self, idx): + image, data = self.dataset[idx] + prefix = data["prefix"] + suffix = data["suffix"] + return prefix, suffix, image def create_data_loaders( @@ -85,7 +100,7 @@ def create_split_data_loader( def load_split_dataset( dataset_location: str, split_name: str, -) -> Optional[DetectionDataset]: +) -> Optional[Florence2Dataset]: image_directory_path = os.path.join(dataset_location, split_name) jsonl_file_path = os.path.join(dataset_location, split_name, "annotations.jsonl") if not os.path.exists(image_directory_path): @@ -94,7 +109,7 @@ def load_split_dataset( if not os.path.exists(jsonl_file_path): logging.warning(f"Could not find JSONL file: {jsonl_file_path}") return None - return DetectionDataset( + return Florence2Dataset( jsonl_file_path=jsonl_file_path, image_directory_path=image_directory_path, ) diff --git a/maestro/trainer/models/florence_2/metrics.py b/maestro/trainer/models/florence_2/metrics.py index 501bc96..5e9980b 100644 --- a/maestro/trainer/models/florence_2/metrics.py +++ b/maestro/trainer/models/florence_2/metrics.py @@ -5,7 +5,7 @@ from PIL import Image from transformers import AutoProcessor -from maestro.trainer.common.data_loaders.datasets import DetectionDataset +from maestro.trainer.models.florence_2.loaders import Florence2Dataset DETECTION_CLASS_PATTERN = r"([a-zA-Z0-9 -]+)" @@ -59,7 +59,7 @@ def process_output_for_text_metric( return predictions -def get_unique_detection_classes(dataset: DetectionDataset) -> list[str]: +def get_unique_detection_classes(dataset: Florence2Dataset) -> list[str]: class_set = set() for i in range(len(dataset)): _, suffix, _ = dataset[i] From 32814b64505ae208fc53d5c145c0e7eef559cccc Mon Sep 17 00:00:00 2001 From: SkalskiP Date: Wed, 25 Sep 2024 14:13:31 +0200 Subject: [PATCH 03/10] reformat `DetectionDataset` to `Florence2Dataset` and update deps --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c3c15ca..e56900d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ classifiers = [ dependencies = [ "supervision~=0.24.0rc1", "requests>=2.31.0,<=2.32.3", - "transformers~=4.44.2", + "transformers @ git+https://github.com/huggingface/transformers", "torch~=2.4.0", "accelerate>=0.33,<0.35", "sentencepiece~=0.2.0", From 09708bbd31599c76d21a38a035fac502b17e67c2 Mon Sep 17 00:00:00 2001 From: SkalskiP Date: Wed, 25 Sep 2024 14:41:22 +0200 Subject: [PATCH 04/10] Qwen2-VL compatible dataloader. --- maestro/trainer/models/qwen2-vl/loaders.py | 34 ++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 maestro/trainer/models/qwen2-vl/loaders.py diff --git a/maestro/trainer/models/qwen2-vl/loaders.py b/maestro/trainer/models/qwen2-vl/loaders.py new file mode 100644 index 0000000..b88d39a --- /dev/null +++ b/maestro/trainer/models/qwen2-vl/loaders.py @@ -0,0 +1,34 @@ + +from transformers.pipelines.base import Dataset + +from maestro.trainer.common.data_loaders.datasets import JSONLDataset + + +class Qwen2VLDataset(Dataset): + def __init__(self, jsonl_file_path: str, image_directory_path: str) -> None: + self.dataset = JSONLDataset(jsonl_file_path, image_directory_path) + + def __len__(self) -> int: + return len(self.dataset) + + def __getitem__(self, idx): + image, data = self.dataset[idx] + prefix = data["prefix"] + suffix = data["suffix"] + return { + "messages": [ + { + "role": "user", + "content": [ + {"type": "image", "image": image}, + {"type": "text", "text": prefix} + ] + }, + { + "role": "assistant", + "content": [ + {"type": "text", "text": suffix} + ] + } + ] + } From 966d0e9a30783f153de82be90b2a42e37647e748 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 25 Sep 2024 12:42:20 +0000 Subject: [PATCH 05/10] =?UTF-8?q?fix(pre=5Fcommit):=20=F0=9F=8E=A8=20auto?= =?UTF-8?q?=20format=20pre-commit=20hooks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- maestro/trainer/models/qwen2-vl/loaders.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/maestro/trainer/models/qwen2-vl/loaders.py b/maestro/trainer/models/qwen2-vl/loaders.py index b88d39a..f84766c 100644 --- a/maestro/trainer/models/qwen2-vl/loaders.py +++ b/maestro/trainer/models/qwen2-vl/loaders.py @@ -1,4 +1,3 @@ - from transformers.pipelines.base import Dataset from maestro.trainer.common.data_loaders.datasets import JSONLDataset @@ -17,18 +16,7 @@ def __getitem__(self, idx): suffix = data["suffix"] return { "messages": [ - { - "role": "user", - "content": [ - {"type": "image", "image": image}, - {"type": "text", "text": prefix} - ] - }, - { - "role": "assistant", - "content": [ - {"type": "text", "text": suffix} - ] - } + {"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": prefix}]}, + {"role": "assistant", "content": [{"type": "text", "text": suffix}]}, ] } From 46295476bd38b6821a3557b7cf2177cb58e582ce Mon Sep 17 00:00:00 2001 From: SkalskiP Date: Wed, 25 Sep 2024 14:41:22 +0200 Subject: [PATCH 06/10] Qwen2-VL compatible dataloader. --- .../models/{qwen2-vl => qwen2_vl}/__init__.py | 0 .../models/{qwen2-vl => qwen2_vl}/checkpoints.py | 0 .../models/{qwen2-vl => qwen2_vl}/loaders.py | 16 ++++++++++++++-- 3 files changed, 14 insertions(+), 2 deletions(-) rename maestro/trainer/models/{qwen2-vl => qwen2_vl}/__init__.py (100%) rename maestro/trainer/models/{qwen2-vl => qwen2_vl}/checkpoints.py (100%) rename maestro/trainer/models/{qwen2-vl => qwen2_vl}/loaders.py (57%) diff --git a/maestro/trainer/models/qwen2-vl/__init__.py b/maestro/trainer/models/qwen2_vl/__init__.py similarity index 100% rename from maestro/trainer/models/qwen2-vl/__init__.py rename to maestro/trainer/models/qwen2_vl/__init__.py diff --git a/maestro/trainer/models/qwen2-vl/checkpoints.py b/maestro/trainer/models/qwen2_vl/checkpoints.py similarity index 100% rename from maestro/trainer/models/qwen2-vl/checkpoints.py rename to maestro/trainer/models/qwen2_vl/checkpoints.py diff --git a/maestro/trainer/models/qwen2-vl/loaders.py b/maestro/trainer/models/qwen2_vl/loaders.py similarity index 57% rename from maestro/trainer/models/qwen2-vl/loaders.py rename to maestro/trainer/models/qwen2_vl/loaders.py index f84766c..b88d39a 100644 --- a/maestro/trainer/models/qwen2-vl/loaders.py +++ b/maestro/trainer/models/qwen2_vl/loaders.py @@ -1,3 +1,4 @@ + from transformers.pipelines.base import Dataset from maestro.trainer.common.data_loaders.datasets import JSONLDataset @@ -16,7 +17,18 @@ def __getitem__(self, idx): suffix = data["suffix"] return { "messages": [ - {"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": prefix}]}, - {"role": "assistant", "content": [{"type": "text", "text": suffix}]}, + { + "role": "user", + "content": [ + {"type": "image", "image": image}, + {"type": "text", "text": prefix} + ] + }, + { + "role": "assistant", + "content": [ + {"type": "text", "text": suffix} + ] + } ] } From 20f418f980bcf462fb4cb8d37f6b1933b2f14166 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 25 Sep 2024 13:58:38 +0000 Subject: [PATCH 07/10] =?UTF-8?q?fix(pre=5Fcommit):=20=F0=9F=8E=A8=20auto?= =?UTF-8?q?=20format=20pre-commit=20hooks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- maestro/trainer/models/qwen2_vl/loaders.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/maestro/trainer/models/qwen2_vl/loaders.py b/maestro/trainer/models/qwen2_vl/loaders.py index b88d39a..f84766c 100644 --- a/maestro/trainer/models/qwen2_vl/loaders.py +++ b/maestro/trainer/models/qwen2_vl/loaders.py @@ -1,4 +1,3 @@ - from transformers.pipelines.base import Dataset from maestro.trainer.common.data_loaders.datasets import JSONLDataset @@ -17,18 +16,7 @@ def __getitem__(self, idx): suffix = data["suffix"] return { "messages": [ - { - "role": "user", - "content": [ - {"type": "image", "image": image}, - {"type": "text", "text": prefix} - ] - }, - { - "role": "assistant", - "content": [ - {"type": "text", "text": suffix} - ] - } + {"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": prefix}]}, + {"role": "assistant", "content": [{"type": "text", "text": suffix}]}, ] } From b42be9e4aaebb843ea28a342324bf93f45ac6c5a Mon Sep 17 00:00:00 2001 From: Onuralp SEZER Date: Wed, 25 Sep 2024 17:46:30 +0300 Subject: [PATCH 08/10] =?UTF-8?q?chore:=20=F0=9F=A7=B9=20ignore=20fmt=20fo?= =?UTF-8?q?r=20qwen2=5Fvl=20dataset=20loader?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Onuralp SEZER --- .pre-commit-config.yaml | 2 +- maestro/trainer/models/qwen2_vl/loaders.py | 17 +++++++++++++++-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 86b6b7a..8386ec7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -29,7 +29,7 @@ repos: additional_dependencies: ["bandit[toml]"] - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.6.4 + rev: v0.6.7 hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix] diff --git a/maestro/trainer/models/qwen2_vl/loaders.py b/maestro/trainer/models/qwen2_vl/loaders.py index f84766c..252eb30 100644 --- a/maestro/trainer/models/qwen2_vl/loaders.py +++ b/maestro/trainer/models/qwen2_vl/loaders.py @@ -14,9 +14,22 @@ def __getitem__(self, idx): image, data = self.dataset[idx] prefix = data["prefix"] suffix = data["suffix"] + # fmt: off return { "messages": [ - {"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": prefix}]}, - {"role": "assistant", "content": [{"type": "text", "text": suffix}]}, + { + "role": "user", + "content": [ + {"type": "image", "image": image}, + {"type": "text", "text": prefix} + ] + }, + { + "role": "assistant", + "content": [ + {"type": "text", "text": suffix} + ] + } ] } + # fmt: on From 5910fc6e5b7a0396a4146de5f153e2c7cde4d184 Mon Sep 17 00:00:00 2001 From: SkalskiP Date: Wed, 25 Sep 2024 20:29:04 +0200 Subject: [PATCH 09/10] add `extract_assistant_content_ranges` function --- maestro/trainer/models/qwen2_vl/loaders.py | 42 ++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/maestro/trainer/models/qwen2_vl/loaders.py b/maestro/trainer/models/qwen2_vl/loaders.py index 252eb30..224fa71 100644 --- a/maestro/trainer/models/qwen2_vl/loaders.py +++ b/maestro/trainer/models/qwen2_vl/loaders.py @@ -1,7 +1,49 @@ +from typing import List, Tuple + from transformers.pipelines.base import Dataset from maestro.trainer.common.data_loaders.datasets import JSONLDataset +START_TOKEN_1 = 151644 +START_TOKEN_2 = 77091 +END_TOKEN = 151645 + + +def extract_assistant_content_ranges(token_list: List[int]) -> List[Tuple[int, int]]: + """ + Identify the start and end indexes of assistant content ranges within a list of + tokens. + + The function searches for sequences that mark the start and end of assistant content + in the tokenized list, returning the corresponding index ranges. + + Args: + token_list (List[int]): A list of tokens to search. + + Returns: + List[Tuple[int, int]]: A list of (start_index, end_index) tuples indicating the + assistant content ranges in the input list. + + Note: + - Assistant content starts with the sequence [START_TOKEN_1, START_TOKEN_2], + which corresponds to the tokenized value of `"<|im_start|>assistant"`. + - Assistant content ends with END_TOKEN, which corresponds to the tokenized + value of `"<|im_end|>"`. + - Each start sequence has a corresponding end token. + """ + start_indexes = [] + end_indexes = [] + + for i in range(len(token_list) - 1): + if token_list[i] == START_TOKEN_1 and token_list[i + 1] == START_TOKEN_2: + start_indexes.append(i) + for j in range(i + 2, len(token_list)): + if token_list[j] == END_TOKEN: + end_indexes.append(j) + break + + return list(zip(start_indexes, end_indexes)) + class Qwen2VLDataset(Dataset): def __init__(self, jsonl_file_path: str, image_directory_path: str) -> None: From 79b075ee69444c04f13153bd491d323597972b1b Mon Sep 17 00:00:00 2001 From: SkalskiP Date: Wed, 25 Sep 2024 20:29:04 +0200 Subject: [PATCH 10/10] add `extract_assistant_content_ranges` function --- maestro/trainer/models/qwen2_vl/loaders.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/maestro/trainer/models/qwen2_vl/loaders.py b/maestro/trainer/models/qwen2_vl/loaders.py index 224fa71..f4fe02f 100644 --- a/maestro/trainer/models/qwen2_vl/loaders.py +++ b/maestro/trainer/models/qwen2_vl/loaders.py @@ -1,5 +1,3 @@ -from typing import List, Tuple - from transformers.pipelines.base import Dataset from maestro.trainer.common.data_loaders.datasets import JSONLDataset @@ -9,7 +7,7 @@ END_TOKEN = 151645 -def extract_assistant_content_ranges(token_list: List[int]) -> List[Tuple[int, int]]: +def extract_assistant_content_ranges(token_list: list[int]) -> list[tuple[int, int]]: """ Identify the start and end indexes of assistant content ranges within a list of tokens. @@ -18,10 +16,10 @@ def extract_assistant_content_ranges(token_list: List[int]) -> List[Tuple[int, i in the tokenized list, returning the corresponding index ranges. Args: - token_list (List[int]): A list of tokens to search. + token_list (list[int]): A list of tokens to search. Returns: - List[Tuple[int, int]]: A list of (start_index, end_index) tuples indicating the + list[tuple[int, int]]: A list of (start_index, end_index) tuples indicating the assistant content ranges in the input list. Note: