From 879bc3d0ea7053cc526105d608561de245eb7dfb Mon Sep 17 00:00:00 2001
From: SkalskiP <piotr.skalski92@gmail.com>
Date: Tue, 24 Sep 2024 14:41:03 +0200
Subject: [PATCH 01/10] initial commit

---
 maestro/trainer/models/qwen2-vl/__init__.py    | 0
 maestro/trainer/models/qwen2-vl/checkpoints.py | 1 +
 2 files changed, 1 insertion(+)
 create mode 100644 maestro/trainer/models/qwen2-vl/__init__.py
 create mode 100644 maestro/trainer/models/qwen2-vl/checkpoints.py

diff --git a/maestro/trainer/models/qwen2-vl/__init__.py b/maestro/trainer/models/qwen2-vl/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/maestro/trainer/models/qwen2-vl/checkpoints.py b/maestro/trainer/models/qwen2-vl/checkpoints.py
new file mode 100644
index 0000000..4ea3039
--- /dev/null
+++ b/maestro/trainer/models/qwen2-vl/checkpoints.py
@@ -0,0 +1 @@
+DEFAULT_QWEN2_VL_MODEL_ID = DEFAULT_FLORENCE2_MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"  # Florence2 name kept as alias

From bd3f74046b9626aed63f63203c768ad247620746 Mon Sep 17 00:00:00 2001
From: SkalskiP <piotr.skalski92@gmail.com>
Date: Wed, 25 Sep 2024 14:13:31 +0200
Subject: [PATCH 02/10] reformat `DetectionDataset` to `Florence2Dataset` and
 update deps

---
 .../trainer/common/data_loaders/datasets.py   | 17 +--------------
 .../trainer/common/data_loaders/loaders.py    |  0
 maestro/trainer/models/florence_2/loaders.py  | 21 ++++++++++++++++---
 maestro/trainer/models/florence_2/metrics.py  |  4 ++--
 4 files changed, 21 insertions(+), 21 deletions(-)
 create mode 100644 maestro/trainer/common/data_loaders/loaders.py

diff --git a/maestro/trainer/common/data_loaders/datasets.py b/maestro/trainer/common/data_loaders/datasets.py
index 7e0e316..0a2b436 100644
--- a/maestro/trainer/common/data_loaders/datasets.py
+++ b/maestro/trainer/common/data_loaders/datasets.py
@@ -3,7 +3,6 @@
 from typing import Any
 
 from PIL import Image
-from transformers.pipelines.base import Dataset
 
 
 class JSONLDataset:
@@ -34,18 +33,4 @@ def __getitem__(self, idx: int) -> tuple[Image.Image, dict[str, Any]]:
         except FileNotFoundError:
             raise FileNotFoundError(f"Image file {image_path} not found.")
         else:
-            return (image, entry)
-
-
-class DetectionDataset(Dataset):
-    def __init__(self, jsonl_file_path: str, image_directory_path: str) -> None:
-        self.dataset = JSONLDataset(jsonl_file_path, image_directory_path)
-
-    def __len__(self) -> int:
-        return len(self.dataset)
-
-    def __getitem__(self, idx):
-        image, data = self.dataset[idx]
-        prefix = data["prefix"]
-        suffix = data["suffix"]
-        return prefix, suffix, image
+            return image, entry
diff --git a/maestro/trainer/common/data_loaders/loaders.py b/maestro/trainer/common/data_loaders/loaders.py
new file mode 100644
index 0000000..e69de29
diff --git a/maestro/trainer/models/florence_2/loaders.py b/maestro/trainer/models/florence_2/loaders.py
index c35bff5..be8b6e1 100644
--- a/maestro/trainer/models/florence_2/loaders.py
+++ b/maestro/trainer/models/florence_2/loaders.py
@@ -7,8 +7,23 @@
 from PIL import Image
 from torch.utils.data import DataLoader
 from transformers import AutoProcessor
+from transformers.pipelines.base import Dataset
 
-from maestro.trainer.common.data_loaders.datasets import DetectionDataset
+from maestro.trainer.common.data_loaders.datasets import JSONLDataset
+
+
+class Florence2Dataset(Dataset):  # adapts JSONLDataset entries to (prefix, suffix, image) tuples
+    def __init__(self, jsonl_file_path: str, image_directory_path: str) -> None:
+        self.dataset = JSONLDataset(jsonl_file_path, image_directory_path)
+
+    def __len__(self) -> int:
+        return len(self.dataset)
+
+    def __getitem__(self, idx: int):
+        image, data = self.dataset[idx]  # JSONLDataset yields (image, entry)
+        prefix = data["prefix"]
+        suffix = data["suffix"]
+        return prefix, suffix, image  # image comes last here, unlike JSONLDataset's image-first order
 
 
 def create_data_loaders(
@@ -85,7 +100,7 @@ def create_split_data_loader(
 def load_split_dataset(
     dataset_location: str,
     split_name: str,
-) -> Optional[DetectionDataset]:
+) -> Optional[Florence2Dataset]:
     image_directory_path = os.path.join(dataset_location, split_name)
     jsonl_file_path = os.path.join(dataset_location, split_name, "annotations.jsonl")
     if not os.path.exists(image_directory_path):
@@ -94,7 +109,7 @@ def load_split_dataset(
     if not os.path.exists(jsonl_file_path):
         logging.warning(f"Could not find JSONL file: {jsonl_file_path}")
         return None
-    return DetectionDataset(
+    return Florence2Dataset(
         jsonl_file_path=jsonl_file_path,
         image_directory_path=image_directory_path,
     )
diff --git a/maestro/trainer/models/florence_2/metrics.py b/maestro/trainer/models/florence_2/metrics.py
index 501bc96..5e9980b 100644
--- a/maestro/trainer/models/florence_2/metrics.py
+++ b/maestro/trainer/models/florence_2/metrics.py
@@ -5,7 +5,7 @@
 from PIL import Image
 from transformers import AutoProcessor
 
-from maestro.trainer.common.data_loaders.datasets import DetectionDataset
+from maestro.trainer.models.florence_2.loaders import Florence2Dataset
 
 DETECTION_CLASS_PATTERN = r"([a-zA-Z0-9 -]+)<loc_\d+>"
 
@@ -59,7 +59,7 @@ def process_output_for_text_metric(
     return predictions
 
 
-def get_unique_detection_classes(dataset: DetectionDataset) -> list[str]:
+def get_unique_detection_classes(dataset: Florence2Dataset) -> list[str]:
     class_set = set()
     for i in range(len(dataset)):
         _, suffix, _ = dataset[i]

From 32814b64505ae208fc53d5c145c0e7eef559cccc Mon Sep 17 00:00:00 2001
From: SkalskiP <piotr.skalski92@gmail.com>
Date: Wed, 25 Sep 2024 14:13:31 +0200
Subject: [PATCH 03/10] reformat `DetectionDataset` to `Florence2Dataset` and
 update deps

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index c3c15ca..e56900d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,7 +36,7 @@ classifiers = [
 dependencies = [
     "supervision~=0.24.0rc1",
     "requests>=2.31.0,<=2.32.3",
-    "transformers~=4.44.2",
+    "transformers @ git+https://github.com/huggingface/transformers",
     "torch~=2.4.0",
     "accelerate>=0.33,<0.35",
     "sentencepiece~=0.2.0",

From 09708bbd31599c76d21a38a035fac502b17e67c2 Mon Sep 17 00:00:00 2001
From: SkalskiP <piotr.skalski92@gmail.com>
Date: Wed, 25 Sep 2024 14:41:22 +0200
Subject: [PATCH 04/10] Qwen2-VL compatible dataloader.

---
 maestro/trainer/models/qwen2-vl/loaders.py | 34 ++++++++++++++++++++++
 1 file changed, 34 insertions(+)
 create mode 100644 maestro/trainer/models/qwen2-vl/loaders.py

diff --git a/maestro/trainer/models/qwen2-vl/loaders.py b/maestro/trainer/models/qwen2-vl/loaders.py
new file mode 100644
index 0000000..b88d39a
--- /dev/null
+++ b/maestro/trainer/models/qwen2-vl/loaders.py
@@ -0,0 +1,34 @@
+
+from transformers.pipelines.base import Dataset
+
+from maestro.trainer.common.data_loaders.datasets import JSONLDataset
+
+
+class Qwen2VLDataset(Dataset):
+    def __init__(self, jsonl_file_path: str, image_directory_path: str) -> None:
+        self.dataset = JSONLDataset(jsonl_file_path, image_directory_path)
+
+    def __len__(self) -> int:
+        return len(self.dataset)
+
+    def __getitem__(self, idx):
+        image, data = self.dataset[idx]
+        prefix = data["prefix"]
+        suffix = data["suffix"]
+        return {
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image", "image": image},
+                        {"type": "text", "text": prefix}
+                    ]
+                },
+                {
+                    "role": "assistant",
+                    "content": [
+                        {"type": "text", "text": suffix}
+                    ]
+                }
+            ]
+        }

From 966d0e9a30783f153de82be90b2a42e37647e748 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 25 Sep 2024 12:42:20 +0000
Subject: [PATCH 05/10] =?UTF-8?q?fix(pre=5Fcommit):=20=F0=9F=8E=A8=20auto?=
 =?UTF-8?q?=20format=20pre-commit=20hooks?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 maestro/trainer/models/qwen2-vl/loaders.py | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/maestro/trainer/models/qwen2-vl/loaders.py b/maestro/trainer/models/qwen2-vl/loaders.py
index b88d39a..f84766c 100644
--- a/maestro/trainer/models/qwen2-vl/loaders.py
+++ b/maestro/trainer/models/qwen2-vl/loaders.py
@@ -1,4 +1,3 @@
-
 from transformers.pipelines.base import Dataset
 
 from maestro.trainer.common.data_loaders.datasets import JSONLDataset
@@ -17,18 +16,7 @@ def __getitem__(self, idx):
         suffix = data["suffix"]
         return {
             "messages": [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "image", "image": image},
-                        {"type": "text", "text": prefix}
-                    ]
-                },
-                {
-                    "role": "assistant",
-                    "content": [
-                        {"type": "text", "text": suffix}
-                    ]
-                }
+                {"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": prefix}]},
+                {"role": "assistant", "content": [{"type": "text", "text": suffix}]},
             ]
         }

From 46295476bd38b6821a3557b7cf2177cb58e582ce Mon Sep 17 00:00:00 2001
From: SkalskiP <piotr.skalski92@gmail.com>
Date: Wed, 25 Sep 2024 14:41:22 +0200
Subject: [PATCH 06/10] Qwen2-VL compatible dataloader.

---
 .../models/{qwen2-vl => qwen2_vl}/__init__.py    |  0
 .../models/{qwen2-vl => qwen2_vl}/checkpoints.py |  0
 .../models/{qwen2-vl => qwen2_vl}/loaders.py     | 16 ++++++++++++++--
 3 files changed, 14 insertions(+), 2 deletions(-)
 rename maestro/trainer/models/{qwen2-vl => qwen2_vl}/__init__.py (100%)
 rename maestro/trainer/models/{qwen2-vl => qwen2_vl}/checkpoints.py (100%)
 rename maestro/trainer/models/{qwen2-vl => qwen2_vl}/loaders.py (57%)

diff --git a/maestro/trainer/models/qwen2-vl/__init__.py b/maestro/trainer/models/qwen2_vl/__init__.py
similarity index 100%
rename from maestro/trainer/models/qwen2-vl/__init__.py
rename to maestro/trainer/models/qwen2_vl/__init__.py
diff --git a/maestro/trainer/models/qwen2-vl/checkpoints.py b/maestro/trainer/models/qwen2_vl/checkpoints.py
similarity index 100%
rename from maestro/trainer/models/qwen2-vl/checkpoints.py
rename to maestro/trainer/models/qwen2_vl/checkpoints.py
diff --git a/maestro/trainer/models/qwen2-vl/loaders.py b/maestro/trainer/models/qwen2_vl/loaders.py
similarity index 57%
rename from maestro/trainer/models/qwen2-vl/loaders.py
rename to maestro/trainer/models/qwen2_vl/loaders.py
index f84766c..b88d39a 100644
--- a/maestro/trainer/models/qwen2-vl/loaders.py
+++ b/maestro/trainer/models/qwen2_vl/loaders.py
@@ -1,3 +1,4 @@
+
 from transformers.pipelines.base import Dataset
 
 from maestro.trainer.common.data_loaders.datasets import JSONLDataset
@@ -16,7 +17,18 @@ def __getitem__(self, idx):
         suffix = data["suffix"]
         return {
             "messages": [
-                {"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": prefix}]},
-                {"role": "assistant", "content": [{"type": "text", "text": suffix}]},
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image", "image": image},
+                        {"type": "text", "text": prefix}
+                    ]
+                },
+                {
+                    "role": "assistant",
+                    "content": [
+                        {"type": "text", "text": suffix}
+                    ]
+                }
             ]
         }

From 20f418f980bcf462fb4cb8d37f6b1933b2f14166 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 25 Sep 2024 13:58:38 +0000
Subject: [PATCH 07/10] =?UTF-8?q?fix(pre=5Fcommit):=20=F0=9F=8E=A8=20auto?=
 =?UTF-8?q?=20format=20pre-commit=20hooks?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 maestro/trainer/models/qwen2_vl/loaders.py | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/maestro/trainer/models/qwen2_vl/loaders.py b/maestro/trainer/models/qwen2_vl/loaders.py
index b88d39a..f84766c 100644
--- a/maestro/trainer/models/qwen2_vl/loaders.py
+++ b/maestro/trainer/models/qwen2_vl/loaders.py
@@ -1,4 +1,3 @@
-
 from transformers.pipelines.base import Dataset
 
 from maestro.trainer.common.data_loaders.datasets import JSONLDataset
@@ -17,18 +16,7 @@ def __getitem__(self, idx):
         suffix = data["suffix"]
         return {
             "messages": [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "image", "image": image},
-                        {"type": "text", "text": prefix}
-                    ]
-                },
-                {
-                    "role": "assistant",
-                    "content": [
-                        {"type": "text", "text": suffix}
-                    ]
-                }
+                {"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": prefix}]},
+                {"role": "assistant", "content": [{"type": "text", "text": suffix}]},
             ]
         }

From b42be9e4aaebb843ea28a342324bf93f45ac6c5a Mon Sep 17 00:00:00 2001
From: Onuralp SEZER <thunderbirdtr@gmail.com>
Date: Wed, 25 Sep 2024 17:46:30 +0300
Subject: [PATCH 08/10] =?UTF-8?q?chore:=20=F0=9F=A7=B9=20ignore=20fmt=20fo?=
 =?UTF-8?q?r=20qwen2=5Fvl=20dataset=20loader?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Onuralp SEZER <thunderbirdtr@gmail.com>
---
 .pre-commit-config.yaml                    |  2 +-
 maestro/trainer/models/qwen2_vl/loaders.py | 17 +++++++++++++++--
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 86b6b7a..8386ec7 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -29,7 +29,7 @@ repos:
           additional_dependencies: ["bandit[toml]"]
 
   -   repo: https://github.com/astral-sh/ruff-pre-commit
-      rev: v0.6.4
+      rev: v0.6.7
       hooks:
       -   id: ruff
           args: [--fix, --exit-non-zero-on-fix]
diff --git a/maestro/trainer/models/qwen2_vl/loaders.py b/maestro/trainer/models/qwen2_vl/loaders.py
index f84766c..252eb30 100644
--- a/maestro/trainer/models/qwen2_vl/loaders.py
+++ b/maestro/trainer/models/qwen2_vl/loaders.py
@@ -14,9 +14,22 @@ def __getitem__(self, idx):
         image, data = self.dataset[idx]
         prefix = data["prefix"]
         suffix = data["suffix"]
+        # fmt: off
         return {
             "messages": [
-                {"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": prefix}]},
-                {"role": "assistant", "content": [{"type": "text", "text": suffix}]},
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image", "image": image},
+                        {"type": "text", "text": prefix}
+                    ]
+                },
+                {
+                    "role": "assistant",
+                    "content": [
+                        {"type": "text", "text": suffix}
+                    ]
+                }
             ]
         }
+        # fmt: on

From 5910fc6e5b7a0396a4146de5f153e2c7cde4d184 Mon Sep 17 00:00:00 2001
From: SkalskiP <piotr.skalski92@gmail.com>
Date: Wed, 25 Sep 2024 20:29:04 +0200
Subject: [PATCH 09/10] add `extract_assistant_content_ranges` function

---
 maestro/trainer/models/qwen2_vl/loaders.py | 42 ++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/maestro/trainer/models/qwen2_vl/loaders.py b/maestro/trainer/models/qwen2_vl/loaders.py
index 252eb30..224fa71 100644
--- a/maestro/trainer/models/qwen2_vl/loaders.py
+++ b/maestro/trainer/models/qwen2_vl/loaders.py
@@ -1,7 +1,49 @@
+from typing import List, Tuple
+
 from transformers.pipelines.base import Dataset
 
 from maestro.trainer.common.data_loaders.datasets import JSONLDataset
 
+START_TOKEN_1 = 151644
+START_TOKEN_2 = 77091
+END_TOKEN = 151645
+
+
+def extract_assistant_content_ranges(token_list: List[int]) -> List[Tuple[int, int]]:
+    """
+    Identify the start and end indexes of assistant content ranges within a list of
+    tokens.
+
+    The function searches for sequences that mark the start and end of assistant content
+    in the tokenized list, returning the corresponding index ranges.
+
+    Args:
+        token_list (List[int]): A list of tokens to search.
+
+    Returns:
+        List[Tuple[int, int]]: A list of (start_index, end_index) tuples indicating the
+        assistant content ranges in the input list.
+
+    Note:
+        - Assistant content starts with the sequence [START_TOKEN_1, START_TOKEN_2],
+        which corresponds to the tokenized value of `"<|im_start|>assistant"`.
+        - Assistant content ends with END_TOKEN, which corresponds to the tokenized
+        value of `"<|im_end|>"`.
+        - Each start sequence has a corresponding end token.
+    """
+    ranges = []
+    total = len(token_list)
+    for start in range(total - 1):
+        # Skip every position that is not the two-token assistant opener.
+        if token_list[start] != START_TOKEN_1 or token_list[start + 1] != START_TOKEN_2:
+            continue
+        # Pair the opener with the first END_TOKEN after it; an opener without one is dropped.
+        for end in range(start + 2, total):
+            if token_list[end] == END_TOKEN:
+                ranges.append((start, end))
+                break
+    return ranges
+
 
 class Qwen2VLDataset(Dataset):
     def __init__(self, jsonl_file_path: str, image_directory_path: str) -> None:

From 79b075ee69444c04f13153bd491d323597972b1b Mon Sep 17 00:00:00 2001
From: SkalskiP <piotr.skalski92@gmail.com>
Date: Wed, 25 Sep 2024 20:29:04 +0200
Subject: [PATCH 10/10] add `extract_assistant_content_ranges` function

---
 maestro/trainer/models/qwen2_vl/loaders.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/maestro/trainer/models/qwen2_vl/loaders.py b/maestro/trainer/models/qwen2_vl/loaders.py
index 224fa71..f4fe02f 100644
--- a/maestro/trainer/models/qwen2_vl/loaders.py
+++ b/maestro/trainer/models/qwen2_vl/loaders.py
@@ -1,5 +1,3 @@
-from typing import List, Tuple
-
 from transformers.pipelines.base import Dataset
 
 from maestro.trainer.common.data_loaders.datasets import JSONLDataset
@@ -9,7 +7,7 @@
 END_TOKEN = 151645
 
 
-def extract_assistant_content_ranges(token_list: List[int]) -> List[Tuple[int, int]]:
+def extract_assistant_content_ranges(token_list: list[int]) -> list[tuple[int, int]]:
     """
     Identify the start and end indexes of assistant content ranges within a list of
     tokens.
@@ -18,10 +16,10 @@ def extract_assistant_content_ranges(token_list: List[int]) -> List[Tuple[int, i
     in the tokenized list, returning the corresponding index ranges.
 
     Args:
-        token_list (List[int]): A list of tokens to search.
+        token_list (list[int]): A list of tokens to search.
 
     Returns:
-        List[Tuple[int, int]]: A list of (start_index, end_index) tuples indicating the
+        list[tuple[int, int]]: A list of (start_index, end_index) tuples indicating the
         assistant content ranges in the input list.
 
     Note: