@@ -118,6 +118,17 @@ def _cfg(url='', **kwargs):
     'vit_deit_base_distilled_patch16_384': _cfg(
         url='https://dl.fbaipublicfiles.com/deit/deit_base_distilled_patch16_384-d0272ac0.pth',
         input_size=(3, 384, 384), crop_pct=1.0, classifier=('head', 'head_dist')),
+
+    # ViT ImageNet-21K-P pretraining
+    'vit_base_patch16_224_miil_in21k': _cfg(
+        url='https://miil-public-eu.oss-eu-central-1.aliyuncs.com/model-zoo/ImageNet_21K_P/models/timm/vit_base_patch16_224_in21k_miil.pth',
+        mean=(0, 0, 0), std=(1, 1, 1), crop_pct=0.875, interpolation='bilinear', num_classes=11221,
+    ),
+    'vit_base_patch16_224_miil': _cfg(
+        url='https://miil-public-eu.oss-eu-central-1.aliyuncs.com/model-zoo/ImageNet_21K_P/models/timm'
+            '/vit_base_patch16_224_1k_miil_84_4.pth',
+        mean=(0, 0, 0), std=(1, 1, 1), crop_pct=0.875, interpolation='bilinear',
+    ),
 }
 
 
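Note on the new `_cfg` entries: `mean=(0, 0, 0)` and `std=(1, 1, 1)` make the normalization step a no-op, i.e. the MIIL checkpoints expect raw [0, 1] inputs rather than ImageNet-normalized ones. A minimal sketch of how timm resolves these fields into an eval transform, assuming timm is installed with this commit applied (the model name below is the one added in the hunk above):

import timm
from timm.data import resolve_data_config, create_transform

model = timm.create_model('vit_base_patch16_224_miil', pretrained=True)
config = resolve_data_config({}, model=model)  # reads mean/std/crop_pct/interpolation from the model's default_cfg
transform = create_transform(**config)
# Roughly: Resize(256, bilinear) -> CenterCrop(224) -> ToTensor()
# -> Normalize((0, 0, 0), (1, 1, 1)); crop_pct=0.875 gives 224 / 0.875 = 256.
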
@@ -687,3 +698,23 @@ def vit_deit_base_distilled_patch16_384(pretrained=False, **kwargs):
     model = _create_vision_transformer(
         'vit_deit_base_distilled_patch16_384', pretrained=pretrained, distilled=True, **model_kwargs)
     return model
+
+
+@register_model
+def vit_base_patch16_224_miil_in21k(pretrained=False, **kwargs):
+    """ ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929).
+    Weights taken from: https://github.com/Alibaba-MIIL/ImageNet21K
+    """
+    model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, qkv_bias=False, **kwargs)
+    model = _create_vision_transformer('vit_base_patch16_224_miil_in21k', pretrained=pretrained, **model_kwargs)
+    return model
+
+
+@register_model
+def vit_base_patch16_224_miil(pretrained=False, **kwargs):
+    """ ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929).
+    Weights taken from: https://github.com/Alibaba-MIIL/ImageNet21K
+    """
+    model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, qkv_bias=False, **kwargs)
+    model = _create_vision_transformer('vit_base_patch16_224_miil', pretrained=pretrained, **model_kwargs)
+    return model
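
A hedged usage sketch for the two names registered above (assumes the commit is merged and the MIIL weight URLs are reachable): `@register_model` exposes both through timm's factory, and the in21k variant inherits `num_classes=11221` from its cfg.

import torch
import timm

model = timm.create_model('vit_base_patch16_224_miil_in21k', pretrained=True).eval()
with torch.no_grad():
    logits = model(torch.randn(1, 3, 224, 224))
print(logits.shape)  # expected: torch.Size([1, 11221]), per num_classes in the cfg

Note that both variants pass qkv_bias=False, unlike the stock vit_base_patch16_224 configs (which default to qkv_bias=True), so these checkpoints are not interchangeable with the standard ViT-B/16 weights.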