SforAiDl · abhi-glitchhg · Mar 16, 2022 · Mar 31, 2022 · Mar 31, 2022 · Mar 31, 2022
diff --git a/.isort.cfg b/.isort.cfg
@@ -1,5 +1,5 @@
 [settings]
-known_third_party = cv2,einops,numpy,setuptools,timm,torch,torchvision
+known_third_party = cv2,einops,numpy,omegaconf,setuptools,timm,torch,torchvision
 multi_line_output=3
 include_trailing_comma=True
 force_grid_wrap=0

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -11,12 +11,12 @@ repos:
         - id: isort
 
   - repo: https://github.com/python/black
-    rev: 20.8b1
+    rev: 22.3.0
     hooks:
         - id: black
           language_version: python3.8
 
   # - repo: https://gitlab.com/pycqa/flake8
   #   rev: 3.8.3
   #   hooks:
-  #       - id: flake8
+  #       - id: flake8
diff --git a/requirements.txt b/requirements.txt
@@ -19,6 +19,7 @@ jinja2-time==0.2.0
 MarkupSafe==2.0.1
 nodeenv==1.6.0
 olefile
+omegaconf
 packaging==21.0
 Pillow
 platformdirs==2.3.0
@@ -41,4 +42,4 @@ timm==0.4.12
 typing-extensions
 urllib3==1.26.6
 virtualenv==20.7.2
-opencv-python==4.5.3.56
+opencv-python==4.5.3.56
diff --git a/tests/test_config.py b/tests/test_config.py
@@ -0,0 +1,48 @@
+import torch
+
+from vformer.config import LazyCall, instantiate
+from vformer.models import PVTSegmentation, SwinTransformer, VanillaViT, ViViTModel2
+
+
+def test_lazy():
+    """Describe four models lazily, build them, and check forward-pass output shapes."""
+    # classification models
+    vit_cfg = LazyCall(VanillaViT)(img_size=224, patch_size=7, n_classes=10)
+
+    swin_kwargs = dict(
+        img_size=224,
+        patch_size=4,
+        in_channels=3,
+        n_classes=10,
+        embedding_dim=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=7,
+        p_dropout=0.2,
+    )
+    swin_cfg = LazyCall(SwinTransformer)(**swin_kwargs)
+
+    vivit_kwargs = dict(
+        img_size=224,
+        in_channels=3,
+        patch_size=16,
+        embedding_dim=192,
+        depth=4,
+        num_heads=3,
+        head_dim=64,
+        num_frames=1,
+        n_classes=10,
+    )
+    vivit_cfg = LazyCall(ViViTModel2)(**vivit_kwargs)
+
+    # dense models: start from an empty call description and edit it afterwards
+    pvt_cfg = LazyCall(PVTSegmentation)()
+    pvt_cfg["img_size"] = 224
+
+    # random inputs are drawn before any model is built, as in the original order
+    image_batch = torch.randn(4, 3, 224, 224)
+    video_batch = torch.randn([32, 16, 3, 224, 224])
+
+    vit_model = instantiate(vit_cfg)
+    swin_model = instantiate(swin_cfg)
+    vivit_model = instantiate(vivit_cfg)
+    pvt_model = instantiate(pvt_cfg)
+
+    # (model, input, expected output shape), evaluated in this order
+    checks = (
+        (vit_model, image_batch, (4, 10)),
+        (swin_model, image_batch, (4, 10)),
+        (pvt_model, image_batch, (4, 1, 224, 224)),
+        (vivit_model, video_batch, (32, 10)),
+    )
+    for model, batch, expected_shape in checks:
+        assert model(batch).shape == expected_shape
diff --git a/vformer/attention/convvt.py b/vformer/attention/convvt.py
@@ -65,7 +65,7 @@ def __init__(
         self.with_cls_token = with_cls_token
         self.dim = dim_out
         self.num_heads = num_heads
-        self.scale = dim_out ** -0.5
+        self.scale = dim_out**-0.5
         self.h, self.w = img_size, img_size
         self.conv_proj_q = self._build_projection(
             dim_in, kernel_size, padding_q, stride_q, method

diff --git a/vformer/attention/cross.py b/vformer/attention/cross.py
@@ -41,7 +41,7 @@ def __init__(self, cls_dim, patch_dim, num_heads=8, head_dim=64):
 
         inner_dim = num_heads * head_dim
         self.num_heads = num_heads
-        self.scale = head_dim ** -0.5
+        self.scale = head_dim**-0.5
         self.fl = _Projection(cls_dim, patch_dim)
         self.gl = _Projection(patch_dim, cls_dim)
         self.to_k = nn.Linear(patch_dim, inner_dim)

diff --git a/vformer/attention/gated_positional.py b/vformer/attention/gated_positional.py
@@ -33,13 +33,13 @@ def __init__(self, dim, num_heads=8, head_dim=64, p_dropout=0):
 
     def rel_embedding(self, n):
 
-        l = int(n ** 0.5)
+        l = int(n**0.5)
         rel_indices_x = torch.arange(l).reshape(1, -1)
         rel_indices_y = torch.arange(l).reshape(-1, 1)
         indices = rel_indices_x - rel_indices_y
         rel_indices_x = indices.repeat(l, l)
         rel_indices_y = indices.repeat_interleave(l, dim=0).repeat_interleave(l, dim=1)
-        rel_indices_d = (rel_indices_x ** 2 + rel_indices_y ** 2) ** 0.5
+        rel_indices_d = (rel_indices_x**2 + rel_indices_y**2) ** 0.5
         self.rel_indices = torch.stack(
             [rel_indices_x, rel_indices_y, rel_indices_d], dim=-1
         )

diff --git a/vformer/attention/vanilla.py b/vformer/attention/vanilla.py
@@ -30,7 +30,7 @@ def __init__(self, dim, num_heads=8, head_dim=64, p_dropout=0.0):
         project_out = not (num_heads == 1 and head_dim == dim)
 
         self.num_heads = num_heads
-        self.scale = head_dim ** -0.5
+        self.scale = head_dim**-0.5
 
         self.attend = nn.Softmax(dim=-1)
         self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)

diff --git a/vformer/attention/window.py b/vformer/attention/window.py
@@ -43,7 +43,7 @@ def __init__(
         self.window_size = pair(window_size)
         self.num_heads = num_heads
         self.head_dim = dim // num_heads
-        self.scale = qk_scale or self.head_dim ** -0.5
+        self.scale = qk_scale or self.head_dim**-0.5
         self.qkv_bias = True
         self.relative_position_bias_table = nn.Parameter(
             torch.zeros(

diff --git a/vformer/config/__init__.py b/vformer/config/__init__.py
@@ -0,0 +1,2 @@
+from .config_utils import instantiate
+from .lazy import LazyCall
diff --git a/vformer/config/config_utils.py b/vformer/config/config_utils.py
@@ -0,0 +1,67 @@
+import logging
+import pydoc
+from collections import abc
+from typing import Any
+
+
+def _convert_target_to_string(t: Any) -> str:
+    """
+    Return a dotted import path for ``t``, preferring a shorter public path.
+
+    Starting from the top-level package, each prefix of ``t.__module__`` is
+    combined with ``t.__qualname__`` and looked up; the first candidate that
+    resolves to ``t`` itself is returned.
+
+    Args:
+        t: Object exposing ``__module__`` and ``__qualname__``.
+
+    Returns:
+        The first resolving candidate path, or ``"{module}.{qualname}"`` when
+        none of the shorter candidates resolves to ``t``.
+    """
+    module, qualname = t.__module__, t.__qualname__
+
+    module_parts = module.split(".")
+    for k in range(1, len(module_parts)):
+        prefix = ".".join(module_parts[:k])
+        candidate = f"{prefix}.{qualname}"
+        try:
+            if locate(candidate) is t:
+                return candidate
+        except (ImportError, TypeError):
+            # locate() raises TypeError for a name it cannot resolve; an
+            # unresolvable candidate just means "try the next, longer prefix".
+            continue
+    return f"{module}.{qualname}"
+
+
+def locate(name: str) -> Any:
+    """
+    Resolve a dotted path such as ``"collections.OrderedDict"`` to the object it names.
+
+    Args:
+        name: Dotted name of the object to look up.
+
+    Returns:
+        The object found by ``pydoc.locate``.
+
+    Raises:
+        TypeError: If ``pydoc.locate`` finds nothing under ``name``.
+    """
+    obj = pydoc.locate(name)
+
+    # Some cases (e.g. torch.optim.sgd.SGD) are not handled correctly by
+    # pydoc.locate. No fallback is implemented here, so those names raise too.
+    if obj is None:
+        raise TypeError(f"can't locate object {name} !")
+    return obj
+
+
+def instantiate(cfg):
+    """
+    Recursively turn a call description into the object it describes.
+
+    Lists and ``ListConfig`` objects are instantiated element-wise. A mapping
+    containing a ``"_target_"`` key is treated as a call: every value is
+    instantiated first, then the target is called with the remaining entries
+    as keyword arguments. Anything else is returned unchanged.
+
+    Args:
+        cfg: A mapping with ``"_target_"``, a list / ``ListConfig``, or any other object.
+
+    Returns:
+        The result of calling the target, a container of instantiated
+        elements, or ``cfg`` itself when it is not a recognised description.
+    """
+    from omegaconf import ListConfig
+
+    if isinstance(cfg, ListConfig):
+        built = [instantiate(item) for item in cfg]
+        return ListConfig(built, flags={"allow_objects": True})
+    if isinstance(cfg, list):
+        return [instantiate(item) for item in cfg]
+
+    is_call_description = isinstance(cfg, abc.Mapping) and "_target_" in cfg
+    if not is_call_description:
+        return cfg  # return as-is if don't know what to do
+
+    # conceptually equivalent to hydra.utils.instantiate(cfg) with _convert_=all,
+    # but faster: https://github.com/facebookresearch/hydra/issues/1200
+    call_kwargs = {key: instantiate(value) for key, value in cfg.items()}
+    target = instantiate(call_kwargs.pop("_target_"))
+
+    if isinstance(target, str):
+        # a string target is a dotted path that still has to be resolved
+        target_name = target
+        target = locate(target_name)
+        assert target is not None, target_name
+    else:
+        try:
+            target_name = target.__module__ + "." + target.__qualname__
+        except Exception:
+            # target could be anything, so the above could fail
+            target_name = str(target)
+
+    assert callable(target), f"_target_ {target} does not define a callable object"
+    try:
+        return target(**call_kwargs)
+    except TypeError:
+        logging.getLogger(__name__).error(f"Error when instantiating {target_name}!")
+        raise
diff --git a/vformer/config/lazy.py b/vformer/config/lazy.py
@@ -0,0 +1,59 @@
+from collections import abc
+from dataclasses import is_dataclass
+
+from omegaconf import DictConfig
+
+from .config_utils import _convert_target_to_string
+
+# copied from detectron 2
+
+
+class LazyCall:
+    """
+    Defer a call: calling a ``LazyCall`` object does not execute the wrapped
+    target, it returns a ``DictConfig`` recording the target and its arguments.
+
+    Only keyword arguments are accepted; positional arguments are not yet
+    supported.
+
+    Examples:
+    ::
+        from vformer.config import instantiate, LazyCall
+        layer_cfg = LazyCall(nn.Conv2d)(in_channels=32, out_channels=32)
+        layer_cfg.out_channels = 64   # can edit it afterwards
+        layer = instantiate(layer_cfg)
+    """
+
+    def __init__(self, target):
+        """Store ``target``; raise ``TypeError`` unless it is callable, a str, or a mapping."""
+        acceptable = callable(target) or isinstance(target, (str, abc.Mapping))
+        if not acceptable:
+            raise TypeError(
+                f"target of LazyCall must be a callable or defines a callable! Got {target}"
+            )
+        self._target = target
+
+    def __call__(self, **kwargs):
+        """Return a ``DictConfig`` holding ``kwargs`` plus a ``"_target_"`` entry."""
+        # omegaconf object cannot hold dataclass type
+        # https://github.com/omry/omegaconf/issues/784
+        # so a dataclass target is stored as its dotted-path string instead
+        target = (
+            _convert_target_to_string(self._target)
+            if is_dataclass(self._target)
+            else self._target
+        )
+        content = {**kwargs, "_target_": target}
+
+        return DictConfig(content=content, flags={"allow_objects": True})
+
+
+if __name__ == "__main__":
+    # Manual smoke test: build a lazy config, edit it, then instantiate it.
+    # This module uses a package-relative import at the top, so it has to be
+    # run as ``python -m vformer.config.lazy``; the import below is therefore
+    # written against the package path rather than a bare ``config_utils``.
+    import vformer.models
+
+    print("ok lets check :)")
+    model_config = LazyCall(vformer.models.VanillaViT)(
+        img_size=224, patch_size=7, n_classes=4
+    )
+    print(model_config)
+    # change kwargs
+    model_config["img_size"], model_config["patch_size"] = 256, 8
+    print(model_config)
+
+    from vformer.config.config_utils import instantiate
+
+    model = instantiate(model_config)
+    print(model)
diff --git a/vformer/encoder/vivit.py b/vformer/encoder/vivit.py
@@ -9,7 +9,7 @@
 
 @ENCODER_REGISTRY.register()
 class ViViTEncoderBlock(nn.Module):
-    """For model 3 only """
+    """For model 3 only"""
 
     def __init__(
         self, dim, num_heads, head_dim, p_dropout, out_dim=None, hidden_dim=None

diff --git a/vformer/models/classification/convvt.py b/vformer/models/classification/convvt.py
@@ -86,7 +86,7 @@ def __init__(
         for i in range(self.num_stages):
             stage = ConvVTStage(
                 in_channels=in_channels,
-                img_size=img_size // (4 * 2 ** i),
+                img_size=img_size // (4 * 2**i),
                 with_cls_token=False if i < self.num_stages - 1 else True,
                 patch_size=patch_size[i],
                 patch_stride=patch_stride[i],

diff --git a/vformer/models/classification/swin.py b/vformer/models/classification/swin.py
@@ -102,10 +102,10 @@ def __init__(
 
         for i_layer in range(len(depths)):
             layer = SwinEncoder(
-                dim=int(embedding_dim * (2 ** i_layer)),
+                dim=int(embedding_dim * (2**i_layer)),
                 input_resolution=(
-                    (self.patch_resolution[0] // (2 ** i_layer)),
-                    self.patch_resolution[1] // (2 ** i_layer),
+                    (self.patch_resolution[0] // (2**i_layer)),
+                    self.patch_resolution[1] // (2**i_layer),
                 ),
                 depth=depths[i_layer],
                 num_heads=num_heads[i_layer],

diff --git a/vformer/models/classification/vivit.py b/vformer/models/classification/vivit.py
@@ -71,7 +71,7 @@ def __init__(
             pool=pool,
         )
 
-        patch_dim = in_channels * patch_size ** 2
+        patch_dim = in_channels * patch_size**2
         self.patch_embedding = LinearVideoEmbedding(
             embedding_dim=embedding_dim,
             patch_height=patch_size,
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		from .config_utils import instantiate
		from .lazy import LazyCall