revidee
diff --git a/‎comer/datamodule/__init__.py
+2-2 b/‎comer/datamodule/__init__.py
+2-2
diff --git a/‎comer/datamodule/crohme/__init__.py
+4 b/‎comer/datamodule/crohme/__init__.py
+4
diff --git a/‎comer/datamodule/crohme/batch.py
+110 b/‎comer/datamodule/crohme/batch.py
+110
diff --git a/‎comer/datamodule/crohme/datamodule.py
+102 b/‎comer/datamodule/crohme/datamodule.py
+102
diff --git a/‎comer/datamodule/dataset.py renamed to ‎comer/datamodule/crohme/dataset.py
+10-5 b/‎comer/datamodule/dataset.py renamed to ‎comer/datamodule/crohme/dataset.py
+10-5
diff --git a/‎comer/datamodule/dictionary.txt renamed to ‎comer/datamodule/crohme/dictionary.txt b/‎comer/datamodule/dictionary.txt renamed to ‎comer/datamodule/crohme/dictionary.txt
diff --git a/‎comer/datamodule/crohme/entry.py
+39 b/‎comer/datamodule/crohme/entry.py
+39
diff --git a/‎comer/datamodule/vocab.py renamed to ‎comer/datamodule/crohme/vocab.py b/‎comer/datamodule/vocab.py renamed to ‎comer/datamodule/crohme/vocab.py
@@ -1,5 +1,5 @@
-from .datamodule import Batch, CROHMEDatamodule
-from .vocab import vocab
+from comer.datamodule.crohme.datamodule import Batch, CROHMEDatamodule
+from comer.datamodule.crohme.vocab import vocab
 
 vocab_size = len(vocab)
 
 
@@ -0,0 +1,4 @@
+from .entry import DataEntry, extract_data_entries
+from .batch import Batch, BatchTuple, build_batches_from_entries, build_dataset
+from .dataset import CROHMEDataset
+from .datamodule import CROHMEDatamodule
@@ -0,0 +1,110 @@
+from dataclasses import dataclass
+from typing import List, Tuple, Callable
+from zipfile import ZipFile
+
+import numpy as np
+from torch import FloatTensor, LongTensor
+
+from comer.datamodule.crohme import DataEntry, extract_data_entries
+
+
+@dataclass
+class Batch:
+    img_bases: List[str]  # [b,]
+    imgs: FloatTensor  # [b, 1, H, W]
+    mask: LongTensor  # [b, H, W]
+    indices: List[List[int]]  # [b, l]
+
+    def __len__(self) -> int:
+        return len(self.img_bases)
+
+    def to(self, device) -> "Batch":
+        return Batch(
+            img_bases=self.img_bases,
+            imgs=self.imgs.to(device),
+            mask=self.mask.to(device),
+            indices=self.indices,
+        )
+
+
+# A BatchTuple is one pre-assembled batch: three parallel lists of equal length
+# (the batch length), in the order (file_names, images, labels).
+BatchTuple = Tuple[List[str], List[np.ndarray], List[List[str]]]
+
+# Pixel budget used as the default for both `batch_imagesize` and `max_imagesize`
+# in build_batches_from_entries; change according to your GPU memory.
+# NOTE(review): this is a float literal (320000.0) although those parameters are
+# annotated as `int`.
+MAX_SIZE = 32e4
+
+
+def build_batches_from_entries(
+        data: List[DataEntry],
+        batch_size: int,
+        batch_imagesize: int = MAX_SIZE,
+        maxlen: int = 200,
+        max_imagesize: int = MAX_SIZE,
+) -> List[BatchTuple]:
+    curr_fname_batch: List[str] = []
+    curr_feature_batch: List[np.ndarray] = []
+    curr_label_batch: List[List[str]] = []
+
+    total_fname_batches: List[List[str]] = []
+    total_feature_batches: List[List[np.ndarray]] = []
+    total_label_batches: List[List[List[str]]] = []
+
+    biggest_image_size = 0
+    get_entry_image_pixels: Callable[[DataEntry], int] = lambda x: x.image.size[0] * x.image.size[1]
+    data.sort(key=get_entry_image_pixels)
+
+    i = 0
+    for entry in data:
+        size = get_entry_image_pixels(entry)
+        image_arr = np.array(entry.image)
+        if size > biggest_image_size:
+            biggest_image_size = size
+        batch_image_size = biggest_image_size * (i + 1)
+        if len(entry.label) > maxlen:
+            print("label", i, "length bigger than", maxlen, "ignore")
+        elif size > max_imagesize:
+            print(
+                f"image: {entry.file_name} size: {image_arr.shape[0]} x {image_arr.shape[1]} = {size} bigger than {max_imagesize}, ignore"
+            )
+        else:
+            if batch_image_size > batch_imagesize or i == batch_size:
+                # a batch is full, add it to the "batch"-list and reset the current batch with the new entry.
+                total_fname_batches.append(curr_fname_batch)
+                total_feature_batches.append(curr_feature_batch)
+                total_label_batches.append(curr_label_batch)
+                # reset current batch
+                i = 0
+                biggest_image_size = size
+                curr_fname_batch = []
+                curr_feature_batch = []
+                curr_label_batch = []
+            # add the entry to the current batch
+            curr_fname_batch.append(entry.file_name)
+            curr_feature_batch.append(image_arr)
+            curr_label_batch.append(entry.label)
+            i += 1
+
+    # add last batch if it isn't empty
+    if len(curr_fname_batch) > 0:
+        total_fname_batches.append(curr_fname_batch)
+        total_feature_batches.append(curr_feature_batch)
+        total_label_batches.append(curr_label_batch)
+    print("total ", len(total_feature_batches), "batch data loaded")
+    return list(
+        # Zips batches into a 3-Tuple Tuple[ List[str] , List[np.ndarray], List[List[str]] ]
+        #                        Per batch:  file_names, images          , labels
+        zip(
+            total_fname_batches,
+            total_feature_batches,
+            total_label_batches
+        )
+    )
+
+
+def build_dataset(
+        archive: ZipFile,
+        folder: str,
+        batch_size: int
+) -> List[BatchTuple]:
+    return build_batches_from_entries(extract_data_entries(archive, folder), batch_size)
@@ -0,0 +1,102 @@
+import os
+from typing import List, Optional
+from zipfile import ZipFile
+
+import pytorch_lightning as pl
+import torch
+from comer.datamodule.crohme.dataset import CROHMEDataset
+from torch.utils.data.dataloader import DataLoader
+
+from comer.datamodule.crohme import Batch, build_dataset, BatchTuple
+from comer.datamodule.crohme.vocab import vocab
+
+
+# Used to transform a Lighting-Batch into some other form (here, our custom Batch)
+def collate_fn(batch: List[BatchTuple]) -> Batch:
+    assert len(batch) == 1
+    batch = batch[0]
+    fnames = batch[0]
+    images_x = batch[1]
+    seqs_y = [vocab.words2indices(x) for x in batch[2]]
+
+    heights_x = [s.size(1) for s in images_x]
+    widths_x = [s.size(2) for s in images_x]
+
+    n_samples = len(heights_x)
+    max_height_x = max(heights_x)
+    max_width_x = max(widths_x)
+
+    x = torch.zeros(n_samples, 1, max_height_x, max_width_x)
+    x_mask = torch.ones(n_samples, max_height_x, max_width_x, dtype=torch.bool)
+    for idx, s_x in enumerate(images_x):
+        x[idx, :, : heights_x[idx], : widths_x[idx]] = s_x
+        x_mask[idx, : heights_x[idx], : widths_x[idx]] = 0
+
+    # return fnames, x, x_mask, seqs_y
+    return Batch(fnames, x, x_mask, seqs_y)
+
+
+class CROHMEDatamodule(pl.LightningDataModule):
+    def __init__(
+            self,
+            zipfile_path: str = f"{os.path.dirname(os.path.realpath(__file__))}/../../data.zip",
+            test_year: str = "2014",
+            train_batch_size: int = 8,
+            eval_batch_size: int = 4,
+            num_workers: int = 5,
+            scale_aug: bool = False,
+    ) -> None:
+        super().__init__()
+        assert isinstance(test_year, str)
+        self.zipfile_path = zipfile_path
+        self.test_year = test_year
+        self.train_batch_size = train_batch_size
+        self.eval_batch_size = eval_batch_size
+        self.num_workers = num_workers
+        self.scale_aug = scale_aug
+
+        print(f"Load data from: {self.zipfile_path}")
+
+    def setup(self, stage: Optional[str] = None) -> None:
+        with ZipFile(self.zipfile_path) as archive:
+            if stage == "fit" or stage is None:
+                self.train_dataset = CROHMEDataset(
+                    build_dataset(archive, "train", self.train_batch_size),
+                    True,
+                    self.scale_aug,
+                )
+                self.val_dataset = CROHMEDataset(
+                    build_dataset(archive, self.test_year, self.eval_batch_size),
+                    False,
+                    self.scale_aug,
+                )
+            if stage == "test" or stage is None:
+                self.test_dataset = CROHMEDataset(
+                    build_dataset(archive, self.test_year, self.eval_batch_size),
+                    False,
+                    self.scale_aug,
+                )
+
+    def train_dataloader(self):
+        return DataLoader(
+            self.train_dataset,
+            shuffle=True,
+            num_workers=self.num_workers,
+            collate_fn=collate_fn,
+        )
+
+    def val_dataloader(self):
+        return DataLoader(
+            self.val_dataset,
+            shuffle=False,
+            num_workers=self.num_workers,
+            collate_fn=collate_fn,
+        )
+
+    def test_dataloader(self):
+        return DataLoader(
+            self.test_dataset,
+            shuffle=False,
+            num_workers=self.num_workers,
+            collate_fn=collate_fn,
+        )
@@ -1,7 +1,10 @@
+from typing import List
+
 import torchvision.transforms as tr
 from torch.utils.data.dataset import Dataset
 
-from .transforms import ScaleAugmentation, ScaleToLimitRange
+from comer.datamodule.crohme import BatchTuple
+from comer.datamodule.utils.transforms import ScaleAugmentation, ScaleToLimitRange
 
 K_MIN = 0.7
 K_MAX = 1.4
@@ -13,7 +16,9 @@
 
 
 class CROHMEDataset(Dataset):
-    def __init__(self, ds, is_train: bool, scale_aug: bool) -> None:
+    ds: List[BatchTuple]
+
+    def __init__(self, ds: List[BatchTuple], is_train: bool, scale_aug: bool) -> None:
         super().__init__()
         self.ds = ds
 
@@ -28,11 +33,11 @@ def __init__(self, ds, is_train: bool, scale_aug: bool) -> None:
         self.transform = tr.Compose(trans_list)
 
     def __getitem__(self, idx):
-        fname, img, caption = self.ds[idx]
+        file_names, images, labels = self.ds[idx]
 
-        img = [self.transform(im) for im in img]
+        images = [self.transform(im) for im in images]
 
-        return fname, img, caption
+        return file_names, images, labels
 
     def __len__(self):
         return len(self.ds)
@@ -0,0 +1,39 @@
+from dataclasses import dataclass
+from typing import TypedDict, List
+from zipfile import ZipFile
+
+from PIL import Image
+
+
+@dataclass
+class DataEntry:
+    file_name: str
+    image: Image
+    label: List[str]
+
+
+def extract_data_entries(archive: ZipFile, dir_name: str) -> List[DataEntry]:
+    """Extract all data need for a dataset from zip archive
+
+    Args:
+        archive (ZipFile):
+        dir_name (str): dir name in archive zip (eg: train, test_2014......)
+
+    Returns:
+        Data: list of tuple of image and formula
+    """
+    with archive.open(f"data/{dir_name}/caption.txt", "r") as f:
+        captions = f.readlines()
+    data: List[DataEntry] = []
+    for line in captions:
+        tmp: List[str] = line.decode().strip().split()
+        file_name: str = tmp[0]
+        label: List[str] = tmp[1:]
+        with archive.open(f"data/{dir_name}/img/{file_name}.bmp", "r") as f:
+            # move image to memory immediately, avoid lazy loading, which will lead to None pointer error in loading
+            img: Image.Image = Image.open(f).copy()
+        data.append(DataEntry(file_name, img, label))
+
+    print(f"Extract data from: {dir_name}, with data size: {len(data)}")
+
+    return data