diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..53f8173 --- /dev/null +++ b/.gitignore @@ -0,0 +1,12 @@ +cache/* +*.txt +datasets/assets/* +pretrained_weights/* + +*.pyc + +cache/* +cache_ROAD/* +cache_ROADpp/* + + diff --git a/configuration/TubeR_CSN152_AVA22.yaml b/configuration/TubeR_CSN152_AVA22.yaml index 9ae6f33..8990f6b 100644 --- a/configuration/TubeR_CSN152_AVA22.yaml +++ b/configuration/TubeR_CSN152_AVA22.yaml @@ -1,13 +1,13 @@ DDP_CONFIG: WORLD_SIZE: 1 WORLD_RANK: 0 - GPU_WORLD_SIZE: 8 + GPU_WORLD_SIZE: 4 GPU_WORLD_RANK: 0 - DIST_URL: 'tcp://xxx.xxx.xxx.xxx:xxxx' - WOLRD_URLS: ['xxx.xxx.xxx.xxx'] - AUTO_RANK_MATCH: True + DIST_URL: 'tcp://161.73.173.217:23456' + WOLRD_URLS: ['161.73.173.217'] + AUTO_RANK_MATCH: False DIST_BACKEND: 'nccl' - GPU: 0 + GPU: 4 DISTRIBUTED: True CONFIG: @@ -17,7 +17,7 @@ CONFIG: USE_LOCATION: False TRAIN: EPOCH_NUM: 20 - BATCH_SIZE: 2 + BATCH_SIZE: 1 LR: 1e-4 MIN_LR: 1e-5 LR_BACKBONE: 1e-5 @@ -39,9 +39,9 @@ CONFIG: DATA: DATASET_NAME: 'ava' - LABEL_PATH: '/xxx/datasets/ava_action_list_v2.1_for_activitynet_2018.pbtxt' - ANNO_PATH: '/xxx/datasets/ava_{}_v22.json' - DATA_PATH: '/xxx/ava/frames/{}/' + LABEL_PATH: 'datasets/assets/ava_action_list_v2.1_for_activitynet_2018.pbtxt' + ANNO_PATH: 'datasets/assets/ava_{}_v22.json' + DATA_PATH: '/mnt/pluto-theta/salman/ROAD/Datasets/ava_download/frames/{}/' NUM_CLASSES: 80 MULTIGRID: False IMG_SIZE: 256 @@ -69,9 +69,9 @@ CONFIG: TEMP_LEN: 32 SAMPLE_RATE: 2 PRETRAINED: False - PRETRAIN_BACKBONE_DIR: "/xxx/irCSN_152_ft_kinetics_from_ig65m_f126851907.mat" - PRETRAIN_TRANSFORMER_DIR: "/xxx/detr.pth" - PRETRAINED_PATH: "/xxx/ADTR_CSN_152_decode_ava_22.pth" + PRETRAIN_BACKBONE_DIR: "pretrained_weights/irCSN_152_ft_kinetics_from_ig65m_f126851907.mat" + PRETRAIN_TRANSFORMER_DIR: "pretrained_weights/detr.pth" + PRETRAINED_PATH: "pretrained_weights/ADTR_CSN_152_decode_ava_22.pth" LOAD: True LOAD_FC: True @@ -94,9 +94,9 @@ CONFIG: CLIPS_MAX_NORM: 0.1 LOG: - BASE_PATH: '/xxx/AVA_Tuber' + BASE_PATH: 'cache/AVA_Tuber' LOG_DIR: 'tb_log' SAVE_DIR: 'checkpoints' - EVAL_DIR: '/xxx/AVA_Tuber/eval' + EVAL_DIR: 'cache/AVA_Tuber/eval' SAVE_FREQ: 1 RES_DIR: 'tmp2' \ No newline at end of file diff --git a/configuration/TubeR_CSN152_ROAD.yaml b/configuration/TubeR_CSN152_ROAD.yaml new file mode 100644 index 0000000..4aa78f8 --- /dev/null +++ b/configuration/TubeR_CSN152_ROAD.yaml @@ -0,0 +1,116 @@ +DDP_CONFIG: + WORLD_SIZE: 1 + WORLD_RANK: 0 + GPU_WORLD_SIZE: 4 + GPU_WORLD_RANK: 0 + DIST_URL: 'tcp://161.73.173.217:23457' + WOLRD_URLS: ['161.73.173.217'] + AUTO_RANK_MATCH: False + DIST_BACKEND: 'nccl' + GPU: 4 + DISTRIBUTED: True + +CONFIG: + EVAL_ONLY: False + TWO_STREAM: False + USE_LFB: False + USE_LOCATION: False + TRAIN: + EPOCH_NUM: 20 + BATCH_SIZE: 1 + LR: 1e-4 + MIN_LR: 1e-5 + LR_BACKBONE: 1e-5 + MOMENTUM: 0.9 + W_DECAY: 1e-4 + LR_POLICY: 'step' + USE_WARMUP: False + WARMUP_START_LR: 1e-5 + WARMUP_EPOCHS: 4 + LR_MILESTONE: [10, 15] + STEP: 0.1 + OPTIMIZER: + NAME: SGD + AUX_LOSS: True + + VAL: + BATCH_SIZE: 1 + FREQ: 2 + + + DATA: + DATASET: 'road' + DATASET_NAME: 'ava' + TRAIN_SUBSETS: [train_3] + VAL_SUBSETS: [train_3] + SEQ_LEN: 12 + MIN_SEQ_STEP: 1 + MAX_SEQ_STEP: 1 + DATA_ROOT: '/mnt/pluto-gamma/salman/ROAD/Datasets/' + ANNO_ROOT: '/mnt/pluto-gamma/salman/ROAD/Datasets/' + train_skip_step: 1 + skip_step: 1 + + # DATASET_NAME: 'ava' + LABEL_PATH: 'datasets/road_labels.pbtxt' + # ANNO_PATH: 'datasets/assets/ava_{}_v22.json' + # DATA_PATH: '/mnt/pluto-gamma/salman/ROAD/Datasets/ava_download/frames/{}/' + 
NUM_CLASSES: 41 + MULTIGRID: False + IMG_SIZE: 680 + IMG_RESHAPE_SIZE: 512 + TEMP_LEN: 12 + FRAME_RATE: 2 + + + MODEL: + SINGLE_FRAME: True + BACKBONE_NAME: CSN-152 + TEMPORAL_DS_STRATEGY: decode + LAST_STRIDE: False + GENERATE_LFB: False + NAME: 'ava_detr_9_224' + ENC_LAYERS: 6 + DEC_LAYERS: 6 + D_MODEL: 256 + NHEAD: 8 + NUM_ENCODER_LAYERS: 12 + DIM_FEEDFORWARD: 2048 + QUERY_NUM: 15 + NORMALIZE_BEFORE: False + DROPOUT: 0.1 + DS_RATE: 8 + TEMP_LEN: 12 + SAMPLE_RATE: 2 + PRETRAINED: True + PRETRAIN_BACKBONE_DIR: "pretrained_weights/irCSN_152_ft_kinetics_from_ig65m_f126851907.mat" + PRETRAIN_TRANSFORMER_DIR: "pretrained_weights/detr.pth" + PRETRAINED_PATH: "pretrained_weights/_TubeR_CSN152_AVA22.pth" + LOAD: True + LOAD_FC: True + + MATCHER: + COST_CLASS: 12 + COST_BBOX: 5 + COST_GIOU: 2 + BNY_LOSS: True + BEFORE: False + + LOSS_COFS: + MASK_COF: 1 + DICE_COF: 12 + BBOX_COF: 5 + GIOU_COF: 2 + EOS_COF: 0.1 + WEIGHT: 10 + WEIGHT_CHANGE: 1000 + LOSS_CHANGE_COF: 2 + CLIPS_MAX_NORM: 0.1 + + LOG: + BASE_PATH: 'cache_ROAD/AVA_Tuber' + LOG_DIR: 'tb_log' + SAVE_DIR: 'checkpoints' + EVAL_DIR: 'cache_ROAD/AVA_Tuber/eval' + SAVE_FREQ: 1 + RES_DIR: 'tmp2' \ No newline at end of file diff --git a/configuration/TubeR_CSN152_ROADpp.yaml b/configuration/TubeR_CSN152_ROADpp.yaml new file mode 100644 index 0000000..d319f57 --- /dev/null +++ b/configuration/TubeR_CSN152_ROADpp.yaml @@ -0,0 +1,117 @@ +DDP_CONFIG: + WORLD_SIZE: 1 + WORLD_RANK: 0 + GPU_WORLD_SIZE: 4 + GPU_WORLD_RANK: 0 + DIST_URL: 'tcp://161.73.173.213:23457' + WOLRD_URLS: ['161.73.173.213'] + AUTO_RANK_MATCH: False + DIST_BACKEND: 'nccl' + GPU: 4 + DISTRIBUTED: True + +CONFIG: + EVAL_ONLY: False + TWO_STREAM: False + USE_LFB: False + USE_LOCATION: False + TRAIN: + EPOCH_NUM: 20 + BATCH_SIZE: 1 + LR: 1e-4 + MIN_LR: 1e-5 + LR_BACKBONE: 1e-5 + MOMENTUM: 0.9 + W_DECAY: 1e-4 + LR_POLICY: 'step' + USE_WARMUP: False + WARMUP_START_LR: 1e-5 + WARMUP_EPOCHS: 4 + LR_MILESTONE: [10, 15] + STEP: 0.1 + OPTIMIZER: + NAME: SGD + AUX_LOSS: True + + VAL: + BATCH_SIZE: 1 + FREQ: 2 + + + DATA: + DATASET: 'roadpp' + DATASET_NAME: 'ava' + TRAIN_SUBSETS: [train] + VAL_SUBSETS: [val] + SEQ_LEN: 10 + MIN_SEQ_STEP: 1 + MAX_SEQ_STEP: 1 + DATA_ROOT: '../' + ANNO_ROOT: '../' + train_skip_step: 1 + skip_step: 1 + + # DATASET_NAME: 'ava' + LABEL_PATH: 'datasets/roadpp.pbtxt' + # ANNO_PATH: 'datasets/assets/ava_{}_v22.json' + # DATA_PATH: '/mnt/pluto-gamma/salman/ROAD/Datasets/ava_download/frames/{}/' + NUM_CLASSES: 43 + MULTIGRID: False + IMG_SIZE: 680 + IMG_RESHAPE_SIZE: 512 + TEMP_LEN: 10 + FRAME_RATE: 10 + + + MODEL: + SINGLE_FRAME: True + BACKBONE_NAME: CSN-152 + TEMPORAL_DS_STRATEGY: decode + LAST_STRIDE: False + GENERATE_LFB: False + NAME: 'ava_detr_9_224' + ENC_LAYERS: 6 + DEC_LAYERS: 6 + D_MODEL: 256 + NHEAD: 8 + NUM_ENCODER_LAYERS: 12 + DIM_FEEDFORWARD: 2048 + QUERY_NUM: 15 + NORMALIZE_BEFORE: False + DROPOUT: 0.1 + DS_RATE: 8 + TEMP_LEN: 12 + SAMPLE_RATE: 2 + PRETRAINED: True + PRETRAIN_BACKBONE_DIR: "pretrained_weights/irCSN_152_ft_kinetics_from_ig65m_f126851907.mat" + PRETRAIN_TRANSFORMER_DIR: "pretrained_weights/detr.pth" + PRETRAINED_PATH: "-.pth" + # PRETRAINED_PATH: "cache_ROADpp/roadpp_Tuber/2023-04-08-23-36-21/checkpoints/ckpt_epoch_6.pth" + LOAD: True + LOAD_FC: True + + MATCHER: + COST_CLASS: 12 + COST_BBOX: 5 + COST_GIOU: 2 + BNY_LOSS: True + BEFORE: False + + LOSS_COFS: + MASK_COF: 1 + DICE_COF: 12 + BBOX_COF: 5 + GIOU_COF: 2 + EOS_COF: 0.1 + WEIGHT: 10 + WEIGHT_CHANGE: 1000 + LOSS_CHANGE_COF: 2 + CLIPS_MAX_NORM: 0.1 + + LOG: + BASE_PATH: 
'cache_ROADpp/roadpp_Tuber' + LOG_DIR: 'tb_log' + SAVE_DIR: 'checkpoints' + EVAL_DIR: 'cache_ROADpp/roadpp_Tuber/eval' + SAVE_FREQ: 1 + RES_DIR: 'tmp2' \ No newline at end of file diff --git a/datasets/ava_frame.py b/datasets/ava_frame.py index 3b322dc..30b87d3 100644 --- a/datasets/ava_frame.py +++ b/datasets/ava_frame.py @@ -71,6 +71,12 @@ def __getitem__(self, index): imgs = torch.stack(imgs, dim=0) imgs = imgs.permute(1, 0, 2, 3) + print('img',imgs.shape) + print('tar',target) + print('tar shape',target.shape) + print(rr) + + return imgs, target def load_annotation(self, sample_id, video_frame_list): @@ -131,7 +137,7 @@ def load_annotation(self, sample_id, video_frame_list): return target def loadvideo(self, start_img, vid, frame_key): - video_frame_path = self.frame_path + video_frame_path = self.frame_path.format(vid) video_frame_list = sorted(glob(video_frame_path + '/*.jpg')) if len(video_frame_list) == 0: diff --git a/datasets/road_frames.py b/datasets/road_frames.py new file mode 100644 index 0000000..f0ad0cd --- /dev/null +++ b/datasets/road_frames.py @@ -0,0 +1,1231 @@ +import pandas as pd +import cv2 +import torch.utils.data as data +from glob import glob +import numpy as np +from utils.misc import collate_fn +import torch +import random +from PIL import Image +import torch.nn.functional as F +import datasets.video_transforms as T +import json + + +# class VideoDataset(data.Dataset): + +# def __init__(self, frame_path, video_frame_bbox, frame_keys_list, clip_len, frame_sample_rate, +# transforms, crop_size=224, resize_size=256, mode="train", class_num=80): +# self.video_frame_bbox = video_frame_bbox +# self.video_frame_list = frame_keys_list +# self.frame_path = frame_path + +# self.video_frame_list = self.video_frame_list + +# self.crop_size = crop_size +# self.clip_len = clip_len +# self.frame_sample_rate = frame_sample_rate +# self.class_num = class_num +# self.resize_size = resize_size + +# self.index_cnt = 0 +# self._transforms = transforms +# self.mode = mode + +# print("rescale size: {}, crop size: {}".format(resize_size, crop_size)) + +# def __getitem__(self, index): + +# frame_key = self.video_frame_list[index] +# print(frame_key) + + +# vid, frame_second = frame_key.split(",") +# timef = int(frame_second) - 900 + +# start_img = np.max((timef * 30 - self.clip_len // 2 * self.frame_sample_rate, 0)) + +# imgs, target = self.loadvideo(start_img, vid, frame_key) + +# if len(target) == 0 or target['boxes'].shape[0] == 0: +# pass +# else: +# if self._transforms is not None: +# imgs, target = self._transforms(imgs, target) + +# while len(target) == 0 or target['boxes'].shape[0] == 0: +# print('resample.') +# self.index_cnt -= 1 +# index = np.random.randint(len(self.video_frame_list)) +# frame_key = self.video_frame_list[index] +# vid, frame_second = frame_key.split(",") +# timef = int(frame_second) - 900 + +# start_img = np.max((timef * 30 - self.clip_len // 2 * self.frame_sample_rate, 0)) + +# imgs, target = self.loadvideo(start_img, vid, frame_key) + +# if len(target)==0 or target['boxes'].shape[0] == 0: +# pass +# else: +# if self._transforms is not None: +# imgs, target = self._transforms(imgs, target) + +# imgs = torch.stack(imgs, dim=0) +# imgs = imgs.permute(1, 0, 2, 3) + + +# print(imgs.shape) + +# print(target['image_id']) +# print(target['boxes']) +# print(target['raw_boxes']) +# print(target['labels']) +# print(target['size']) +# print(target['orig_size']) +# print(target['area']) + +# print(rr) +# return imgs, target + +# def load_annotation(self, sample_id, 
video_frame_list): + +# num_classes = self.class_num +# boxes, classes = [], [] +# target = {} + +# first_img = cv2.imread(video_frame_list[0]) + +# oh = first_img.shape[0] +# ow = first_img.shape[1] +# if oh <= ow: +# nh = self.resize_size +# nw = self.resize_size * (ow / oh) +# else: +# nw = self.resize_size +# nh = self.resize_size * (oh / ow) + +# p_t = int(self.clip_len // 2) +# key_pos = p_t +# anno_entity = self.video_frame_bbox[sample_id] + +# for i, bbox in enumerate(anno_entity["bboxes"]): +# label_tmp = np.zeros((num_classes, )) +# acts_p = anno_entity["acts"][i] +# for l in acts_p: +# label_tmp[l] = 1 + +# if np.sum(label_tmp) == 0: continue +# p_x = np.int(bbox[0] * nw) +# p_y = np.int(bbox[1] * nh) +# p_w = np.int(bbox[2] * nw) +# p_h = np.int(bbox[3] * nh) + +# boxes.append([p_t, p_x, p_y, p_w, p_h]) +# classes.append(label_tmp) + +# boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 5) +# boxes[:, 1::3].clamp_(min=0, max=int(nw)) +# boxes[:, 2::3].clamp_(min=0, max=nh) + +# if boxes.shape[0]: +# raw_boxes = F.pad(boxes, (1, 0, 0, 0), value=self.index_cnt) +# else: +# raw_boxes = boxes + +# classes = torch.as_tensor(classes, dtype=torch.float32).reshape(-1, num_classes) + +# target["image_id"] = [str(sample_id).replace(",", "_"), key_pos] +# target['boxes'] = boxes +# target['raw_boxes'] = raw_boxes +# target["labels"] = classes +# target["orig_size"] = torch.as_tensor([int(nh), int(nw)]) +# target["size"] = torch.as_tensor([int(nh), int(nw)]) +# self.index_cnt = self.index_cnt + 1 + +# return target + +# def loadvideo(self, start_img, vid, frame_key): +# video_frame_path = self.frame_path.format(vid) +# video_frame_list = sorted(glob(video_frame_path + '/*.jpg')) + +# if len(video_frame_list) == 0: +# print("path doesnt exist", video_frame_path) +# return [], [] + +# target = self.load_annotation(frame_key, video_frame_list) + +# start_img = np.max(start_img, 0) +# end_img = start_img + self.clip_len * self.frame_sample_rate +# indx_img = list(np.clip(range(start_img, end_img, self.frame_sample_rate), 0, len(video_frame_list) - 1)) +# buffer = [] +# for frame_idx in indx_img: +# tmp = Image.open(video_frame_list[frame_idx]) +# tmp = tmp.resize((target['orig_size'][1], target['orig_size'][0])) +# buffer.append(tmp) + +# return buffer, target + +# def __len__(self): +# return len(self.video_frame_list) + + +def make_transforms(image_set, cfg): + normalize = T.Compose([ + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + print("transform image crop: {}".format(cfg.CONFIG.DATA.IMG_SIZE)) + if image_set == 'train': + return T.Compose([ + T.RandomHorizontalFlip(), + T.RandomSizeCrop_Custom(cfg.CONFIG.DATA.IMG_SIZE), + T.ColorJitter(), + normalize, + ]) + + if image_set == 'val': + return T.Compose([ + T.Resize_Custom(cfg.CONFIG.DATA.IMG_SIZE), + normalize, + ]) + + if image_set == 'visual': + return T.Compose([ + T.Resize_Custom(cfg.CONFIG.DATA.IMG_SIZE), + normalize, + ]) + raise ValueError(f'unknown {image_set}') + +# def obtain_generated_bboxes_training(input_csv="/xxx/AVA_v2.2/ava_{}_v2.2.csv", +# eval_only=False, +# frame_root="/xxx/frames", +# mode="train"): +# import os +# from glob import glob +# used=[] +# input_csv = input_csv.format(mode) +# # frame_root = frame_root.format(mode) + +# video_frame_bbox = {} +# gt_sheet = pd.read_csv(input_csv, header=None) +# count = 0 +# frame_keys_list = set() +# missed_videos = set() + +# for index, row in gt_sheet.iterrows(): +# vid = row[0] +# if not os.path.isdir(frame_root + "/" + vid + 
""): +# missed_videos.add(vid) +# continue + +# frame_second = row[1] + +# bbox_conf = row[7] +# if bbox_conf < 0.8: +# continue +# frame_key = "{},{}".format(vid, str(frame_second).zfill(4)) + +# frame_keys_list.add(frame_key) + +# count += 1 +# bbox = [row[2], row[3], row[4], row[5]] +# gt = int(row[6]) + +# if frame_key not in video_frame_bbox.keys(): +# video_frame_bbox[frame_key] = {} +# video_frame_bbox[frame_key]["bboxes"] = [bbox] +# video_frame_bbox[frame_key]["acts"] = [[gt - 1]] +# else: +# if bbox not in video_frame_bbox[frame_key]["bboxes"]: +# video_frame_bbox[frame_key]["bboxes"].append(bbox) +# video_frame_bbox[frame_key]["acts"].append([gt - 1]) +# else: +# idx = video_frame_bbox[frame_key]["bboxes"].index(bbox) +# video_frame_bbox[frame_key]["acts"][idx].append(gt - 1) + +# print("missed vids:") +# print(missed_videos) +# return video_frame_bbox, list(frame_keys_list) + + +# def make_image_key(video_id, timestamp): +# """Returns a unique identifier for a video id & timestamp.""" +# return "%s,%04d" % (video_id, int(timestamp)) + + + + + + + + + + + + + + + + +""" + +Target is in xmin, ymin, xmax, ymax, label +coordinates are in range of [0, 1] normlised height and width + +""" + +import json, os +import torch +import pdb, time +import torch.utils as tutils +import pickle +# from .transforms import get_clip_list_resized +import torch.nn.functional as F +import numpy as np +from PIL import ImageFile +ImageFile.LOAD_TRUNCATED_IMAGES = True +from PIL import Image, ImageDraw +from modules.tube_helper import make_gt_tube +import random as random +from modules import utils +from random import shuffle + +logger = utils.get_logger(__name__) + + +def make_box_anno(llist): + box = [llist[2], llist[3], llist[4], llist[5]] + return [float(b) for b in box] + + +def read_ava_annotations(anno_file): + # print(anno_file) + lines = open(anno_file, 'r').readlines() + annotations = {} + is_train = anno_file.find('train') > -1 + + cc = 0 + for line in lines: + cc += 1 + # if cc>500: + # break + line = line.rstrip('\n') + line_list = line.split(',') + # print(line_list) + video_name = line_list[0] + if video_name not in annotations: + annotations[video_name] = {} + time_stamp = float(line_list[1]) + # print(line_list) + numf = float(line_list[7]) ## or score + ts = str(int(time_stamp)) + if len(line_list) > 2: + box = make_box_anno(line_list) + label = int(line_list[6]) + if ts not in annotations[video_name]: + annotations[video_name][ts] = [[time_stamp, box, label, numf]] + else: + annotations[video_name][ts] += [[time_stamp, box, label, numf]] + elif not is_train: + if video_name not in annotations: + annotations[video_name][ts] = [[time_stamp, None, None, numf]] + else: + annotations[video_name][ts] += [[time_stamp, None, None, numf]] + + # for video_name in annotations: + # print(video_name) + return annotations + + + +def read_labelmap(labelmap_file): + """Read label map and class ids.""" + + labelmap = {} + class_ids_map = {} + name = "" + class_id = "" + class_names = [] + print('load label map from ', labelmap_file) + count = 0 + with open(labelmap_file, "r") as f: + for line in f: + # print(line) + if line.startswith(" name:"): + name = line.split('"')[1] + elif line.startswith(" id:") or line.startswith(" label_id:"): + class_id = int(line.strip().split(" ")[-1]) + labelmap[name] = {'org_id':class_id, 'used_id': count} + class_ids_map[class_id] = {'used_id':count, 'clsname': name} + count += 1 + # print(class_id, name) + class_names.append(name) + + # class_names[0] + print('NUmber 
of classes are ', count) + + return class_names, class_ids_map, labelmap + + +def get_box(box, counts): + box = box.astype(np.float32) - 1 + box[2] += box[0] #convert width to xmax + box[3] += box[1] #converst height to ymax + for bi in range(4): + scale = 320 if bi % 2 == 0 else 240 + box[bi] /= scale + assert 0<=box[bi]<=1.01, box + # if add_one ==0: + box[bi] = min(1.0, max(0, box[bi])) + if counts is None: + box[bi] = box[bi]*682 if bi % 2 == 0 else box[bi]*512 + + return box, counts + +def get_frame_level_annos_ucf24(annotations, numf, num_classes, counts=None): + frame_level_annos = [ {'labeled':True,'ego_label':0,'boxes':[],'labels':[]} for _ in range(numf)] + add_one = 1 + # if num_classes == 24: + # add_one = 0 + for tubeid, tube in enumerate(annotations): + # print('numf00', numf, tube['sf'], tube['ef']) + for frame_index, frame_num in enumerate(np.arange(tube['sf'], tube['ef'], 1)): # start of the tube to end frame of the tube + label = tube['label'] + # assert action_id == label, 'Tube label and video label should be same' + box, counts = get_box(tube['boxes'][frame_index, :].copy(), counts) # get the box as an array + frame_level_annos[frame_num]['boxes'].append(box) + box_labels = np.zeros(num_classes) + # if add_one == 1: + box_labels[0] = 1 + box_labels[label+add_one] = 1 + frame_level_annos[frame_num]['labels'].append(box_labels) + frame_level_annos[frame_num]['ego_label'] = label+1 + # frame_level_annos[frame_index]['ego_label'][] = 1 + if counts is not None: + counts[0,0] += 1 + counts[label,1] += 1 + + return frame_level_annos, counts + + +def get_frame_level_annos_ava(annotations, numf, num_classes, class_ids_map, counts=None, split='val'): + frame_level_annos = [ {'labeled':False,'ego_label':-1,'boxes':[],'labels':[]} for _ in range(numf)] + + keyframes = [] + skip_count = 0 + timestamps = [ str(i) for i in range(902, 1799)] + + if split == 'train': + timestamps = [ts for ts in annotations] + + for ts in timestamps: + boxes = {} + time_stamp = int(ts) + frame_num = int((time_stamp - 900) * 30 + 1) + + if ts in annotations: + # pdb.set_trace() + assert time_stamp == int(annotations[ts][0][0]) + + for anno in annotations[ts]: + box_key = '_'.join('{:0.3f}'.format(b) for b in anno[1]) + assert 80>=anno[2]>=1, 'label should be between 1 and 80 but it is {} '.format(anno[2]) + if anno[2] not in class_ids_map: + skip_count += 1 + continue + + class_id = class_ids_map[anno[2]]['used_id'] + # print(class_id) + if box_key not in boxes: + boxes[box_key] = {'box':anno[1], 'labels':np.zeros(num_classes)} + + boxes[box_key]['labels'][class_id+1] = 1 + boxes[box_key]['labels'][0] = 1 + counts[class_id,1] += 1 + + new_boxes = [] + labels = [] + for box_key in boxes: + new_boxes.append(boxes[box_key]['box']) + labels.append(boxes[box_key]['labels']) + + if len(new_boxes): + new_boxes = np.asarray(new_boxes) + frame_level_annos[frame_num]['boxes'] = new_boxes + + labels = np.asarray(labels) + frame_level_annos[frame_num]['labels'] = labels + + frame_level_annos[frame_num]['labeled'] = True + frame_level_annos[frame_num]['ego_label'] = 1 + + + keyframes.append(frame_num) + if not frame_level_annos[frame_num]['labeled']: + frame_level_annos[frame_num]['ego_label'] = 0 + + return frame_level_annos, counts, keyframes, skip_count + + +def get_filtered_tubes_ucf24(annotations): + filtered_tubes = [] + for tubeid, tube in enumerate(annotations): + frames = [] + boxes = [] + label = tube['label'] + count = 0 + for frame_index, frame_num in enumerate(np.arange(tube['sf'], tube['ef'], 1)): + 
frames.append(frame_num+1) + box, _ = get_box(tube['boxes'][frame_index, :].copy(), None) + boxes.append(box) + count += 1 + assert count == tube['boxes'].shape[0], 'numb: {} count ={}'.format(tube['boxes'].shape[0], count) + temp_tube = make_gt_tube(frames, boxes, label) + filtered_tubes.append(temp_tube) + return filtered_tubes + + +def resize(image, size): + image = F.interpolate(image.unsqueeze(0), size=size, mode="nearest").squeeze(0) + return image + + +def filter_labels(ids, all_labels, used_labels): + """Filter the used ids""" + used_ids = [] + for id in ids: + label = all_labels[id] + if label in used_labels: + used_ids.append(used_labels.index(label)) + + return used_ids + + +def get_gt_video_list(anno_file, SUBSETS): + """Get video list form ground truth videos used in subset + and their ground truth tubes """ + + with open(anno_file, 'r') as fff: + final_annots = json.load(fff) + + video_list = [] + for videoname in final_annots['db']: + if is_part_of_subsets(final_annots['db'][videoname]['split_ids'], SUBSETS): + video_list.append(videoname) + + return video_list + + +def get_filtered_tubes(label_key, final_annots, videoname): + + key_tubes = final_annots['db'][videoname][label_key] + all_labels = final_annots['all_'+label_key.replace('tubes','labels')] + labels = final_annots[label_key.replace('tubes','labels')] + filtered_tubes = [] + for _ , tube in key_tubes.items(): + label_id = tube['label_id'] + label = all_labels[label_id] + if label in labels: + new_label_id = labels.index(label) + # temp_tube = GtTube(new_label_id) + frames = [] + boxes = [] + if 'annos' in tube.keys(): + for fn, anno_id in tube['annos'].items(): + frames.append(int(fn)) + anno = final_annots['db'][videoname]['frames'][fn]['annos'][anno_id] + box = anno['box'].copy() + for bi in range(4): + assert 0<=box[bi]<=1.01, box + box[bi] = min(1.0, max(0, box[bi])) + box[bi] = box[bi]*682 if bi % 2 == 0 else box[bi]*512 + boxes.append(box) + else: + for fn in tube['frames']: + frames.append(int(fn)) + + temp_tube = make_gt_tube(frames, boxes, new_label_id) + filtered_tubes.append(temp_tube) + + return filtered_tubes + + +def get_filtered_frames(label_key, final_annots, videoname, filtered_gts): + + frames = final_annots['db'][videoname]['frames'] + if label_key == 'agent_ness': + all_labels = [] + labels = [] + else: + all_labels = final_annots['all_'+label_key+'_labels'] + labels = final_annots[label_key+'_labels'] + + for frame_id , frame in frames.items(): + frame_name = '{:05d}'.format(int(frame_id)) + if frame['annotated']>0: + all_boxes = [] + if 'annos' in frame: + frame_annos = frame['annos'] + for key in frame_annos: + anno = frame_annos[key] + box = np.asarray(anno['box'].copy()) + for bi in range(4): + assert 0<=box[bi]<=1.01, box + box[bi] = min(1.0, max(0, box[bi])) + box[bi] = box[bi]*682 if bi % 2 == 0 else box[bi]*512 + if label_key == 'agent_ness': + filtered_ids = [0] + else: + filtered_ids = filter_labels(anno[label_key+'_ids'], all_labels, labels) + + if len(filtered_ids)>0: + all_boxes.append([box, filtered_ids]) + + filtered_gts[videoname+frame_name] = all_boxes + + return filtered_gts + +def get_av_actions(final_annots, videoname): + label_key = 'av_action' + frames = final_annots['db'][videoname]['frames'] + all_labels = final_annots['all_'+label_key+'_labels'] + labels = final_annots[label_key+'_labels'] + + filtered_gts = {} + for frame_id , frame in frames.items(): + frame_name = '{:05d}'.format(int(frame_id)) + if frame['annotated']>0: + gts = filter_labels(frame[label_key+'_ids'], 
all_labels, labels) + filtered_gts[videoname+frame_name] = gts + + return filtered_gts + +def get_video_tubes(final_annots, videoname): + + tubes = {} + for key in final_annots['db'][videoname].keys(): + if key.endswith('tubes'): + filtered_tubes = get_filtered_tubes(key, final_annots, videoname) + tubes[key] = filtered_tubes + + return tubes + + +def is_part_of_subsets(split_ids, SUBSETS): + + is_it = False + for subset in SUBSETS: + + if subset in split_ids: + is_it = True + + return is_it + + +class VideoDataset(tutils.data.Dataset): + """ + ROAD Detection dataset class for pytorch dataloader + """ + + def __init__(self, args, train=True, input_type='rgb', transform=None, + skip_step=1, full_test=False,crop_size=224, resize_size=256): + + self.num_of_classes = args.CONFIG.DATA.NUM_CLASSES + self.DATASET = args.CONFIG.DATA.DATASET + if train == True: + self.SUBSETS = args.CONFIG.DATA.TRAIN_SUBSETS + else: + self.SUBSETS = args.CONFIG.DATA.VAL_SUBSETS + + self.SEQ_LEN = args.CONFIG.DATA.SEQ_LEN + self.index_cnt = 0 + self.MIN_SEQ_STEP = args.CONFIG.DATA.MIN_SEQ_STEP + self.MAX_SEQ_STEP = args.CONFIG.DATA.MAX_SEQ_STEP + # self.MULIT_SCALE = args.MULIT_SCALE + self.full_test = full_test + self.skip_step = skip_step #max(skip_step, self.SEQ_LEN*self.MIN_SEQ_STEP/2) + self.num_steps = max(1, int(self.MAX_SEQ_STEP - self.MIN_SEQ_STEP + 1 )//2) + # self.input_type = input_type + self.input_type = input_type+'-images' + self.train = train + self.root = args.CONFIG.DATA.DATA_ROOT + args.CONFIG.DATA.DATASET + '/' + self._imgpath = os.path.join(self.root, self.input_type) + self.anno_root = self.root + if len(args.CONFIG.DATA.ANNO_ROOT)>1: + self.anno_root = args.CONFIG.DATA.ANNO_ROOT + + self.crop_size = crop_size + self.resize_size = resize_size + + + # self.image_sets = image_sets + self._transforms = transform + self.ids = list() + if self.DATASET == 'road': + self._make_lists_road() + elif self.DATASET == 'roadpp': + self._make_lists_roadpp() + elif self.DATASET == 'ucf24': + self._make_lists_ucf24() + else: + raise Exception('Specfiy corect dataset') + + self.num_label_types = len(self.label_types) + + + + + def _make_lists_ucf24(self): + + self.anno_file = os.path.join(self.anno_root, 'pyannot_with_class_names.pkl') + + with open(self.anno_file,'rb') as fff: + final_annots = pickle.load(fff) + + database = final_annots['db'] + self.trainvideos = final_annots['trainvideos'] + ucf_classes = final_annots['classes'] + self.label_types = ['action_ness', 'action'] # + # pdb.set_trace() + self.num_classes_list = [1, 24] + self.num_classes = 25 # one for action_ness + + self.ego_classes = ['Non_action'] + ucf_classes + self.num_ego_classes = len(self.ego_classes) + + counts = np.zeros((24, 2), dtype=np.int32) + + ratios = [1.0, 1.1, 1.1, 0.9, 1.1, 0.8, 0.7, 0.8, 1.1, 1.4, 1.0, 0.8, 0.7, 1.2, 1.0, 0.8, 0.7, 1.2, 1.2, 1.0, 0.9] + + self.video_list = [] + self.numf_list = [] + + frame_level_list = [] + + default_ego_label = np.zeros(self.num_ego_classes) + default_ego_label[0] = 1 + total_labeled_frame = 0 + total_num_frames = 0 + + for videoname in sorted(database.keys()): + is_part = 1 + if 'train' in self.SUBSETS and videoname not in self.trainvideos: + continue + elif 'test' in self.SUBSETS and videoname in self.trainvideos: + continue + # print(database[videoname].keys()) + action_id = database[videoname]['label'] + annotations = database[videoname]['annotations'] + + numf = database[videoname]['numf'] + self.numf_list.append(numf) + self.video_list.append(videoname) + + # frames = 
database[videoname]['frames'] + + frame_level_annos, counts = get_frame_level_annos_ucf24(annotations, numf, self.num_classes, counts) + + frames_with_boxes = 0 + for frame_index in range(numf): #frame_level_annos: + if len(frame_level_annos[frame_index]['labels'])>0: + frames_with_boxes += 1 + frame_level_annos[frame_index]['labels'] = np.asarray(frame_level_annos[frame_index]['labels'], dtype=np.float32) + frame_level_annos[frame_index]['boxes'] = np.asarray(frame_level_annos[frame_index]['boxes'], dtype=np.float32) + + total_labeled_frame += frames_with_boxes + total_num_frames += numf + + # logger.info('Frames with Boxes are {:d} out of {:d} in {:s}'.format(frames_with_boxes, numf, videoname)) + frame_level_list.append(frame_level_annos) + ## make ids + start_frames = [ f for f in range(numf-self.MIN_SEQ_STEP*self.SEQ_LEN, -1, -self.skip_step)] + + if self.full_test and 0 not in start_frames: + start_frames.append(0) + # logger.info('number of start frames: '+ str(len(start_frames))) + for frame_num in start_frames: + step_list = [s for s in range(self.MIN_SEQ_STEP, self.MAX_SEQ_STEP+1) if numf-s*self.SEQ_LEN>=frame_num] + shuffle(step_list) + # print(len(step_list), self.num_steps) + for s in range(min(self.num_steps, len(step_list))): + video_id = self.video_list.index(videoname) + self.ids.append([video_id, frame_num ,step_list[s]]) + + logger.info('Labeled frames {:d}/{:d}'.format(total_labeled_frame, total_num_frames)) + # pdb.set_trace() + ptrstr = '\n' + self.frame_level_list = frame_level_list + self.all_classes = [['action_ness'], ucf_classes.copy()] + for k, name in enumerate(self.label_types): + labels = self.all_classes[k] + # self.num_classes_list.append(len(labels)) + for c, cls_ in enumerate(labels): # just to see the distribution of train and test sets + ptrstr += '-'.join(self.SUBSETS) + ' {:05d} label: ind={:02d} name:{:s}\n'.format( + counts[c,k] , c, cls_) + + ptrstr += 'Number of ids are {:d}\n'.format(len(self.ids)) + ptrstr += 'Labeled frames {:d}/{:d}'.format(total_labeled_frame, total_num_frames) + self.childs = {} + self.num_videos = len(self.video_list) + self.print_str = ptrstr + + + def _make_lists_roadpp(self): + + # if self.MODE =='train': + # self.anno_file = os.path.join(self.root, 'road_plus_plus_trainval_v1.0.json') + # else: + # self.anno_file = os.path.join(self.root, 'road_plus_plus_test_v1.0.json') + + self.anno_file = os.path.join(self.root, 'road_plus_plus_trainval_v1.0.json') + with open(self.anno_file,'r') as fff: + final_annots = json.load(fff) + + database = final_annots['db'] + + # self.label_types = final_annots['label_types'] #['agent', 'action', 'loc', 'duplex', 'triplet'] # + self.label_types = ['agent', 'action', 'loc'] # + # print(self.label_types) + # print(rr) + + num_label_type = len(self.label_types) + self.num_classes = 1 ## one for presence + self.num_classes_list = [1] + for name in self.label_types: + logger.info('Number of {:s}: all :: {:d} to use: {:d}'.format(name, + len(final_annots['all_'+name+'_labels']),len(final_annots[name+'_labels']))) + numc = len(final_annots[name+'_labels']) + self.num_classes_list.append(numc) + self.num_classes += numc + + self.ego_classes = final_annots['av_action_labels'] + self.num_ego_classes = len(self.ego_classes) + + # counts = np.zeros((len(final_annots[self.label_types[-1] + '_labels']), num_label_type), dtype=np.int32) + counts = np.zeros((len(final_annots[self.label_types[0] + '_labels']) + len(final_annots[self.label_types[1] + '_labels']) +len(final_annots[self.label_types[2] + 
'_labels']) , num_label_type), dtype=np.int32) + + + self.video_list = [] + self.numf_list = [] + frame_level_list = [] + + for videoname in sorted(database.keys()): + # print(is_part_of_subsets(final_annots['db'][videoname]['split_ids'], self.SUBSETS)) + if not is_part_of_subsets(final_annots['db'][videoname]['split_ids'], self.SUBSETS): + continue + + numf = database[videoname]['numf'] + self.numf_list.append(numf) + self.video_list.append(videoname) + + frames = database[videoname]['frames'] + # print(numf) + frame_level_annos = [ {'labeled':False,'ego_label':-1,'boxes':np.asarray([]),'labels':np.asarray([])} for _ in range(numf)] + + frame_nums = [int(f) for f in frames.keys()] + frames_with_boxes = 0 + for frame_num in sorted(frame_nums): #loop from start to last possible frame which can make a legit sequence + frame_id = str(frame_num) + if frame_id in frames.keys() and frames[frame_id]['annotated']>0: + + frame_index = frame_num-1 + frame_level_annos[frame_index]['labeled'] = True + # frame_level_annos[frame_index]['ego_label'] = frames[frame_id]['av_action_ids'][0] + + frame = frames[frame_id] + if 'annos' not in frame.keys(): + frame = {'annos':{}} + + all_boxes = [] + all_labels = [] + frame_annos = frame['annos'] + # temp_img = cv2.imread('../roadpp/rgb-images/'+videoname+'/{:05d}.jpg'.format(frame_num)) + for key in frame_annos: + width, height = frame['width'], frame['height'] + anno = frame_annos[key] + box = anno['box'] + + assert box[0]0: + frames_with_boxes += 1 + frame_level_annos[frame_index]['labels'] = all_labels + frame_level_annos[frame_index]['boxes'] = all_boxes + + logger.info('Frames with Boxes are {:d} out of {:d} in {:s}'.format(frames_with_boxes, numf, videoname)) + frame_level_list.append(frame_level_annos) + + ## make ids + start_frames = [ f for f in range(numf-self.MIN_SEQ_STEP*self.SEQ_LEN, 1, -self.skip_step)] + if self.full_test and 1 not in start_frames: + start_frames.append(1) + logger.info('number of start frames: '+ str(len(start_frames))) + for frame_num in start_frames: + step_list = [s for s in range(self.MIN_SEQ_STEP, self.MAX_SEQ_STEP+1) if numf-s*self.SEQ_LEN>=frame_num] + shuffle(step_list) + # print(len(step_list), self.num_steps) + for s in range(min(self.num_steps, len(step_list))): + video_id = self.video_list.index(videoname) + if len(frame_level_list[video_id][frame_num+int(self.SEQ_LEN/2)]['boxes']) >0: + self.ids.append([video_id, frame_num ,step_list[s]]) + + # pdb.set_trace() + ptrstr = '' + self.frame_level_list = frame_level_list + self.all_classes = [['agent_ness']] + for k, name in enumerate(self.label_types): + labels = final_annots[name+'_labels'] + self.all_classes.append(labels) + # self.num_classes_list.append(len(labels)) + for c, cls_ in enumerate(labels): # just to see the distribution of train and test sets + ptrstr += '-'.join(self.SUBSETS) + ' {:05d} label: ind={:02d} name:{:s}\n'.format( + counts[c,k] , c, cls_) + + ptrstr += 'Number of ids are {:d}\n'.format(len(self.ids)) + + self.label_types = ['agent_ness'] + self.label_types + self.childs = {'duplex_childs':final_annots['duplex_childs'], 'triplet_childs':final_annots['triplet_childs']} + self.num_videos = len(self.video_list) + self.print_str = ptrstr + + + + def _make_lists_road(self): + + self.anno_file = os.path.join(self.root, 'road_trainval_v1.0.json') + + with open(self.anno_file,'r') as fff: + final_annots = json.load(fff) + + database = final_annots['db'] + + # self.label_types = final_annots['label_types'] #['agent', 'action', 'loc', 'duplex', 
'triplet'] # + self.label_types = ['agent', 'action', 'loc'] + num_label_type = len(self.label_types) + self.num_classes = 1 ## one for presence + self.num_classes_list = [1] + for name in self.label_types: + logger.info('Number of {:s}: all :: {:d} to use: {:d}'.format(name, + len(final_annots['all_'+name+'_labels']),len(final_annots[name+'_labels']))) + numc = len(final_annots[name+'_labels']) + self.num_classes_list.append(numc) + self.num_classes += numc + + self.ego_classes = final_annots['av_action_labels'] + self.num_ego_classes = len(self.ego_classes) + + counts = np.zeros(((len(final_annots[self.label_types[0] + '_labels'])+len(final_annots[self.label_types[1] + '_labels'])+len(final_annots[self.label_types[2] + '_labels'])), num_label_type), dtype=np.int32) + + self.video_list = [] + self.numf_list = [] + frame_level_list = [] + + for videoname in sorted(database.keys()): + + if not is_part_of_subsets(final_annots['db'][videoname]['split_ids'], self.SUBSETS): + continue + + numf = database[videoname]['numf'] + self.numf_list.append(numf) + self.video_list.append(videoname) + + frames = database[videoname]['frames'] + frame_level_annos = [ {'labeled':False,'ego_label':-1,'boxes':np.asarray([]),'labels':np.asarray([])} for _ in range(numf)] + + frame_nums = [int(f) for f in frames.keys()] + frames_with_boxes = 0 + for frame_num in sorted(frame_nums): #loop from start to last possible frame which can make a legit sequence + frame_id = str(frame_num) + if frame_id in frames.keys() and frames[frame_id]['annotated']>0: + + frame_index = frame_num-1 + frame_level_annos[frame_index]['labeled'] = True + frame_level_annos[frame_index]['ego_label'] = frames[frame_id]['av_action_ids'][0] + + frame = frames[frame_id] + if 'annos' not in frame.keys(): + frame = {'annos':{}} + + all_boxes = [] + all_labels = [] + frame_annos = frame['annos'] + for key in frame_annos: + width, height = frame['width'], frame['height'] + anno = frame_annos[key] + box = anno['box'] + + assert box[0]0: + frames_with_boxes += 1 + frame_level_annos[frame_index]['labels'] = all_labels + frame_level_annos[frame_index]['boxes'] = all_boxes + + logger.info('Frames with Boxes are {:d} out of {:d} in {:s}'.format(frames_with_boxes, numf, videoname)) + frame_level_list.append(frame_level_annos) + + ## make ids + start_frames = [ f for f in range(numf-self.MIN_SEQ_STEP*self.SEQ_LEN, 1, -self.skip_step)] + # if self.full_test and 0 not in start_frames: + # start_frames.append(0) + logger.info('number of start frames: '+ str(len(start_frames))) + for frame_num in start_frames: + step_list = [s for s in range(self.MIN_SEQ_STEP, self.MAX_SEQ_STEP+1) if numf-s*self.SEQ_LEN>=frame_num] + shuffle(step_list) + # print(len(step_list), self.num_steps) + for s in range(min(self.num_steps, len(step_list))): + video_id = self.video_list.index(videoname) + if len(frame_level_list[video_id][frame_num+int(self.SEQ_LEN/2)]['boxes']) >0: + self.ids.append([video_id, frame_num ,step_list[s]]) + # print(rr) + # pdb.set_trace() + ptrstr = '' + self.frame_level_list = frame_level_list + self.all_classes = [['agent_ness']] + for k, name in enumerate(self.label_types): + + labels = final_annots[name+'_labels'] + self.all_classes.append(labels) + + # self.num_classes_list.append(len(labels)) + for c, cls_ in enumerate(labels): # just to see the distribution of train and test sets + ptrstr += '-'.join(self.SUBSETS) + ' {:05d} label: ind={:02d} name:{:s}\n'.format( + counts[c,k] , c, cls_) + + ptrstr += 'Number of ids are {:d}\n'.format(len(self.ids)) 
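+        # Note: each entry appended to self.ids above is [video_id, start_frame, step_size];
+        # __getitem__ below expands it into a SEQ_LEN-frame clip sampled every step_size frames
+        # and keeps the boxes/labels of the centre (key) frame as the training target.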
+ + self.label_types = ['agent_ness'] + self.label_types + self.childs = {'duplex_childs':final_annots['duplex_childs'], 'triplet_childs':final_annots['triplet_childs']} + self.num_videos = len(self.video_list) + self.print_str = ptrstr + + def __len__(self): + return len(self.ids) + + def __getitem__(self, index): + id_info = self.ids[index] + + video_id, start_frame, step_size = id_info + videoname = self.video_list[video_id] + images = [] + frame_num = start_frame + ego_labels = np.zeros(self.SEQ_LEN)-1 + all_boxes = [] + labels = [] + ego_labels = [] + mask = np.zeros(self.SEQ_LEN, dtype=np.int) + indexs = [] + target = {} + + first_img = cv2.imread(self._imgpath + '/{:s}/{:05d}.jpg'.format(videoname, frame_num+int(self.SEQ_LEN/2))) + + oh = first_img.shape[0] + ow = first_img.shape[1] + if oh <= ow: + nh = self.resize_size + nw = self.resize_size * (ow / oh) + else: + nw = self.resize_size + nh = self.resize_size * (oh / ow) + + + p_t = int(self.SEQ_LEN // 2) + key_pos = p_t + target["image_id"] = [videoname+"_"+str(frame_num+key_pos), key_pos] + target["orig_size"] = torch.as_tensor([int(nh), int(nw)]) + target["size"] = torch.as_tensor([int(nh), int(nw)]) + + + for i in range(self.SEQ_LEN): + indexs.append(frame_num) + img_name = self._imgpath + '/{:s}/{:05d}.jpg'.format(videoname, frame_num) + # img_name = self._imgpath + '/{:s}/img_{:05d}.jpg'.format(videoname, frame_num) + img = Image.open(img_name) + img = img.resize((target['orig_size'][1], target['orig_size'][0])) + images.append(img) + if self.frame_level_list[video_id][frame_num]['labeled']: + mask[i] = 1 + all_boxes.append(self.frame_level_list[video_id][frame_num]['boxes'].copy()) + labels.append(self.frame_level_list[video_id][frame_num]['labels'].copy()) + # ego_labels.append(self.frame_level_list[video_id][frame_num]['ego_label']) + else: + self.index_cnt -= 1 + all_boxes.append(np.asarray([])) + labels.append(np.asarray([])) + # ego_labels.append(-1) + frame_num += step_size + + imgs, target = self._transforms(images, target) + + + imgs = torch.stack(imgs, dim=0) + imgs = imgs.permute(1, 0, 2, 3) + + + keyframe_box = all_boxes[key_pos] + keyframe_label = labels[key_pos] + + boxes = [] + for i, bbox in enumerate(keyframe_box): + boxes.append([p_t, bbox[0],bbox[1],bbox[2],bbox[3]]) + + boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 5) + boxes[:, 1::3].clamp_(min=0, max=int(nw)) + boxes[:, 2::3].clamp_(min=0, max=nh) + + if boxes.shape[0]: + raw_boxes = F.pad(boxes, (1, 0, 0, 0), value=self.index_cnt) + else: + raw_boxes = boxes + + for i, bbox in enumerate(raw_boxes): + raw_boxes[i][2] = np.int(raw_boxes[i][2] * nw) + raw_boxes[i][3] = np.int(raw_boxes[i][3] * nh) + raw_boxes[i][4] = np.int(raw_boxes[i][4] * nw) + raw_boxes[i][5] = np.int(raw_boxes[i][5] * nh) + + + + classes = torch.as_tensor(keyframe_label, dtype=torch.float32).reshape(-1, self.num_of_classes) + + target['boxes'] = boxes + target['raw_boxes'] = raw_boxes + target["labels"] = classes + self.index_cnt = self.index_cnt + 1 + + # print('img',imgs.shape) + # print('tar',target) + # print('tar shape',target.shape) + # print(rr) + + return imgs, target + + +def build_dataloader(cfg): + + + train_dataset = VideoDataset(cfg, train=True, skip_step=cfg.CONFIG.DATA.train_skip_step, transform=make_transforms("train", cfg),resize_size=cfg.CONFIG.DATA.IMG_RESHAPE_SIZE,crop_size=cfg.CONFIG.DATA.IMG_SIZE) + + + val_dataset = VideoDataset(cfg, train=False, transform=make_transforms("val", cfg), skip_step=cfg.CONFIG.DATA.skip_step, 
full_test=True,resize_size=cfg.CONFIG.DATA.IMG_SIZE,crop_size=cfg.CONFIG.DATA.IMG_SIZE) + + # train_bbox_json = json.load(open(cfg.CONFIG.DATA.ANNO_PATH.format("train"))) + # train_video_frame_bbox, train_frame_keys_list = train_bbox_json["video_frame_bbox"], train_bbox_json["frame_keys_list"] + + # train_dataset = VideoDataset(cfg.CONFIG.DATA.DATA_PATH, + # train_video_frame_bbox, + # train_frame_keys_list, + # transforms=make_transforms("train", cfg), + # frame_sample_rate=cfg.CONFIG.DATA.FRAME_RATE, + # clip_len=cfg.CONFIG.DATA.TEMP_LEN, + # resize_size=cfg.CONFIG.DATA.IMG_RESHAPE_SIZE, + # crop_size=cfg.CONFIG.DATA.IMG_SIZE, + # mode="train") + + # val_bbox_json = json.load(open(cfg.CONFIG.DATA.ANNO_PATH.format("val"))) + # val_video_frame_bbox, val_frame_keys_list = val_bbox_json["video_frame_bbox"], val_bbox_json["frame_keys_list"] + + # val_dataset = VideoDataset(cfg.CONFIG.DATA.DATA_PATH, + # val_video_frame_bbox, + # val_frame_keys_list, + # transforms=make_transforms("val", cfg), + # frame_sample_rate=cfg.CONFIG.DATA.FRAME_RATE, + # clip_len=cfg.CONFIG.DATA.TEMP_LEN, + # resize_size=cfg.CONFIG.DATA.IMG_SIZE, + # crop_size=cfg.CONFIG.DATA.IMG_SIZE, + # mode="val") + + if cfg.DDP_CONFIG.DISTRIBUTED: + train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) + val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset) + batch_sampler_train = torch.utils.data.BatchSampler(train_sampler, cfg.CONFIG.TRAIN.BATCH_SIZE, drop_last=True) + else: + train_sampler = None + val_sampler = None + batch_sampler_train = None + + train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=(train_sampler is None), + num_workers=9, pin_memory=True, batch_sampler=batch_sampler_train, + collate_fn=collate_fn) + val_loader = torch.utils.data.DataLoader( + val_dataset, batch_size=cfg.CONFIG.VAL.BATCH_SIZE, shuffle=(val_sampler is None), + num_workers=9, sampler=val_sampler, pin_memory=True, collate_fn=collate_fn) + + # print(cfg.CONFIG.DATA.ANNO_PATH.format("train"), cfg.CONFIG.DATA.ANNO_PATH.format("val")) + + return train_loader, val_loader, train_sampler, val_sampler, None + +def reverse_norm(imgs): + img = imgs + mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1)) + std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1)) + img = (img * std + mean) * 255.0 + img = img.transpose((1, 2, 0))[..., ::-1].astype(np.uint8) + return img + + + + + + + + diff --git a/datasets/road_labels.pbtxt b/datasets/road_labels.pbtxt new file mode 100644 index 0000000..d8faa3a --- /dev/null +++ b/datasets/road_labels.pbtxt @@ -0,0 +1,164 @@ +item { + name: "Ped" + id: 1 +} +item { + name: "Car" + id: 2 +} +item { + name: "Cyc" + id: 3 +} +item { + name: "Mobike" + id: 4 +} +item { + name: "MedVeh" + id: 5 +} +item { + name: "LarVeh" + id: 6 +} +item { + name: "Bus" + id: 7 +} +item { + name: "EmVeh" + id: 8 +} +item { + name: "TL" + id: 9 +} +item { + name: "OthTL" + id: 10 +} +item { + name: "Red" + id: 11 +} +item { + name: "Amber" + id: 12 +} +item { + name: "Green" + id: 13 +} +item { + name: "MovAway" + id: 14 +} +item { + name: "MovTow" + id: 15 +} +item { + name: "Mov" + id: 16 +} +item { + name: "Brake" + id: 17 +} +item { + name: "Stop" + id: 18 +} +item { + name: "IncatLft" + id: 19 +} +item { + name: "IncatRht" + id: 20 +} +item { + name: "HazLit" + id: 21 +} +item { + name: "TurLft" + id: 22 +} +item { + name: "TurRht" + id: 23 +} +item { + name: "Ovtak" + id: 24 +} +item { + name: "Wait2X" + id: 25 +} +item { + name: "XingFmLft" + id: 26 +} +item { + name: 
"XingFmRht" + id: 27 +} +item { + name: "Xing" + id: 28 +} +item { + name: "PushObj" + id: 29 +} +item { + name: "VehLane" + id: 30 +} +item { + name: "OutgoLane" + id: 31 +} +item { + name: "OutgoCycLane" + id: 32 +} +item { + name: "IncomLane" + id: 33 +} +item { + name: "IncomCycLane" + id: 34 +} +item { + name: "Pav" + id: 35 +} +item { + name: "LftPav" + id: 36 +} +item { + name: "RhtPav" + id: 37 +} +item { + name: "Jun" + id: 38 +} +item { + name: "xing" + id: 39 +} +item { + name: "BusStop" + id: 40 +} +item { + name: "parking" + id: 41 +} \ No newline at end of file diff --git a/datasets/roadpp.pbtxt b/datasets/roadpp.pbtxt new file mode 100644 index 0000000..de6c7e5 --- /dev/null +++ b/datasets/roadpp.pbtxt @@ -0,0 +1,172 @@ +item { + name: "Ped" + id: 1 +} +item { + name: "Car" + id: 2 +} +item { + name: "Mobike" + id: 3 +} +item { + name: "SmalVeh" + id: 4 +} +item { + name: "MedVeh" + id: 5 +} +item { + name: "LarVeh" + id: 6 +} +item { + name: "Bus" + id: 7 +} +item { + name: "EmVeh" + id: 8 +} +item { + name: "MovAway" + id: 9 +} +item { + name: "MovTow" + id: 10 +} +item { + name: "Mov" + id: 11 +} +item { + name: "Rev" + id: 12 +} +item { + name: "Brake" + id: 13 +} +item { + name: "Stop" + id: 14 +} +item { + name: "IncatLft" + id: 15 +} +item { + name: "IncatRht" + id: 16 +} +item { + name: "HazLit" + id: 17 +} +item { + name: "TurLft" + id: 18 +} +item { + name: "TurRht" + id: 19 +} +item { + name: "MovRht" + id: 20 +} +item { + name: "MovLft" + id: 21 +} +item { + name: "Ovtak" + id: 22 +} +item { + name: "Wait2X" + id: 23 +} +item { + name: "XingFmLft" + id: 24 +} +item { + name: "XingFmRht" + id: 25 +} +item { + name: "Xing" + id: 26 +} +item { + name: "PushObj" + id: 27 +} +item { + name: "VehLane" + id: 28 +} +item { + name: "OutgoLane" + id: 29 +} +item { + name: "OutgoCycLane" + id: 30 +} +item { + name: "OutgoBusLane" + id: 31 +} +item { + name: "IncomLane" + id: 32 +} +item { + name: "IncomCycLane" + id: 33 +} +item { + name: "IncomBusLane" + id: 34 +} +item { + name: "Pav" + id: 35 +} +item { + name: "LftPav" + id: 36 +} +item { + name: "RhtPav" + id: 37 +} +item { + name: "Jun" + id: 38 +} +item { + name: "xing" + id: 39 +} +item { + name: "BusStop" + id: 40 +} +item { + name: "parking" + id: 41 +} +item { + name: "LftParking" + id: 42 +} +item { + name: "rightParking" + id: 43 +} \ No newline at end of file diff --git a/datasets/roadpp_labels.pbtxt b/datasets/roadpp_labels.pbtxt new file mode 100644 index 0000000..361391c --- /dev/null +++ b/datasets/roadpp_labels.pbtxt @@ -0,0 +1,172 @@ +item { + name: "Ped" + id: 1 +} +item { + name: "Car" + id: 2 +} +item { + name: "Mobike" + id: 3 +} +item { + name: "SmalVeh" + id: 4 +} +item { + name: "MedVeh" + id: 5 +} +item { + name: "LarVeh" + id: 6 +} +item { + name: "Bus" + id: 7 +} +item { + name: "EmVeh" + id: 8 +} +item { + name: "MovAway" + id: 9 +} +item { + name: "MovTow" + id: 10 +} +item { + name: "Mov" + id: 11 +} +item { + name: "Rev" + id: 12 +} +item { + name: "Brake" + id: 13 +} +item { + name: "Stop" + id: 14 +} +item { + name: "IncatLft" + id: 15 +} +item { + name: "IncatRht" + id: 16 +} +item { + name: "HazLit" + id: 17 +} +item { + name: "TurLft" + id: 18 +} +item { + name: "TurRht" + id: 19 +} +item { + name: "MovRht" + id: 20 +} +item { + name: "MovLft" + id: 21 +} +item { + name: "Ovtak" + id: 22 +} +item { + name: "Wait2X" + id: 23 +} +item { + name: "XingFmLft" + id: 24 +} +item { + name: "XingFmRht" + id: 25 +} +item { + name: "Xing" + id: 26 +} +item { + name: "PushObj" + id: 27 +} +item { 
+ name: "VehLane" + id: 28 +} +item { + name: "OutgoLane" + id: 29 +} +item { + name: "OutgoCycLane" + id: 30 +} +item { + name: "OutgoBusLane" + id: 31 +} +item { + name: "IncomLane" + id: 32 +} +item { + name: "IncomCycLane" + id: 33 +} +item { + name: "IncomBusLane" + id: 34 +} +item { + name: "Pav" + id: 35 +} +item { + name: "LftPav" + id: 36 +} +item { + name: "RhtPav" + id: 37 +} +item { + name: "Jun" + id: 38 +} +item { + name: "xing" + id: 39 +} +item { + name: "BusStop" + id: 40 +} +item { + name: "parking" + id: 41 +} +item { + name: "LftParking" + id: 42 +} +item { + name: "rightParking" + id: 43 +} \ No newline at end of file diff --git a/eval_tuber_roadpp.py b/eval_tuber_roadpp.py new file mode 100644 index 0000000..785f801 --- /dev/null +++ b/eval_tuber_roadpp.py @@ -0,0 +1,59 @@ +import argparse +import datetime +import time + +import torch +import torch.optim +from tensorboardX import SummaryWriter + +from models.tuber_ava import build_model +from utils.model_utils import deploy_model, load_model, save_checkpoint +from utils.video_action_recognition import validate_tuber_detection +from pipelines.video_action_recognition_config import get_cfg_defaults +from pipelines.launch import spawn_workers +from utils.utils import build_log_dir +from datasets.road_frames import build_dataloader + + +def main_worker(cfg): + # create tensorboard and logs + if cfg.DDP_CONFIG.GPU_WORLD_RANK == 0: + tb_logdir = build_log_dir(cfg) + writer = SummaryWriter(log_dir=tb_logdir) + else: + writer = None + # cfg.freeze() + + # create model + print('Creating TubeR model: %s' % cfg.CONFIG.MODEL.NAME) + model, criterion, postprocessors = build_model(cfg) + model = deploy_model(model, cfg, is_tuber=True) + num_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) + print('Number of parameters in the model: %6.2fM' % (num_parameters / 1000000)) + + # create dataset and dataloader + _, test_loader, _, test_sampler,_ = build_dataloader(cfg) + + # docs: add resume option + if not cfg.CONFIG.MODEL.LOAD: raise ("model dir not found") + model, _ = load_model(model, cfg, load_fc=cfg.CONFIG.MODEL.LOAD_FC) + + print('Start Validation...') + start_time = time.time() + validate_tuber_detection(cfg, model, criterion, postprocessors, test_loader, 0, writer) + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('testing time {}'.format(total_time_str)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Train video action recognition transformer models.') + parser.add_argument('--config-file', + default='/xxx/TubeR_AVA_v2.2_CSN-152.yaml', + help='path to config file.') + args = parser.parse_args() + + cfg = get_cfg_defaults() + cfg.merge_from_file(args.config_file) + spawn_workers(main_worker, cfg) diff --git a/evaluates/evaluate_ava.py b/evaluates/evaluate_ava.py index dbb7849..4acd2f1 100644 --- a/evaluates/evaluate_ava.py +++ b/evaluates/evaluate_ava.py @@ -29,11 +29,18 @@ class STDetectionEvaluater(object): def __init__(self, label_path, tiou_thresholds=[0.5], load_from_dataset=False, class_num=60): self.label_path = label_path + # print('lab_path', self.label_path) categories, class_whitelist = read_labelmap(self.label_path) + # print('categories', categories) + # print('class_whitelist', class_whitelist) + self.class_num = class_num + # print('self.class_num', self.class_num) + + if class_num == 80: self.exclude_keys = [] - f = open("/xxx/datasets/ava_val_excluded_timestamps_v2.1.csv") + f = 
open("datasets/assets/ava_val_excluded_timestamps_v2.1.csv") while True: line = f.readline().strip() if not line: break diff --git a/evaluates/utils/object_detection_evaluation.py b/evaluates/utils/object_detection_evaluation.py index 63bd217..892c35e 100644 --- a/evaluates/utils/object_detection_evaluation.py +++ b/evaluates/utils/object_detection_evaluation.py @@ -132,9 +132,10 @@ def __init__(self, Raises: ValueError: If the category ids are not 1-indexed. """ + super(ObjectDetectionEvaluator, self).__init__(categories) self._num_classes = max([cat['id'] for cat in categories]) - + if min(cat['id'] for cat in categories) < 1: raise ValueError('Classes should be 1-indexed.') diff --git a/modules/__init__.py b/modules/__init__.py new file mode 100644 index 0000000..7a00503 --- /dev/null +++ b/modules/__init__.py @@ -0,0 +1,19 @@ +class AverageMeter(object): + """Computes and stores the average and current value""" + def __init__(self, momentum=0.95): + self.momentum = momentum + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.count = 0 + + def update(self, val, n=1): + if n>0: + self.val = val + if self.count == 0: + self.avg = self.val + else: + self.avg = self.avg*self.momentum + (1-self.momentum)* val + self.count += n diff --git a/modules/anchor_box_kmeans.py b/modules/anchor_box_kmeans.py new file mode 100644 index 0000000..0f98d5a --- /dev/null +++ b/modules/anchor_box_kmeans.py @@ -0,0 +1,72 @@ +import torch +from math import sqrt as sqrt +from itertools import product as product +import numpy as np +from modules.utils import BufferList + + +class anchorBox(torch.nn.Module): + """Compute anchorbox coordinates in center-offset form for each source + feature map. + """ + def __init__(self, aspect_ratios =[0.5, 1 / 1., 1.5], + scale_ratios = [1.,]): + + super(anchorBox, self).__init__() + self.aspect_ratios = aspect_ratios + self.scale_ratios = scale_ratios + self.default_sizes= [0.01, 0.06, 0.2, 0.4, 0.85] + self.anchor_boxes = len(self.aspect_ratios)*len(self.scale_ratios) + self.ar = self.anchor_boxes + self.num_anchors = self.ar + self.cell_anchors = BufferList(self._get_cell_anchors()) + + def _get_cell_anchors(self): + anchors = [] + base_anchors = np.asarray([[0.0000, 0.0000, 0.0141, 0.0365], + [0.0000, 0.0000, 0.0178, 0.0614], + [0.0000, 0.0000, 0.0343, 0.0487], + [0.0000, 0.0000, 0.0450, 0.1475], + [0.0000, 0.0000, 0.0284, 0.0986], + [0.0000, 0.0000, 0.0667, 0.0691], + [0.0000, 0.0000, 0.0699, 0.2465], + [0.0000, 0.0000, 0.1629, 0.1744], + [0.0000, 0.0000, 0.1110, 0.1124], + [0.0000, 0.0000, 0.1349, 0.3740], + [0.0000, 0.0000, 0.2773, 0.3713], + [0.0000, 0.0000, 0.2406, 0.2320], + [0.0000, 0.0000, 0.3307, 0.6395], + [0.0000, 0.0000, 0.7772, 0.6261], + [0.0000, 0.0000, 0.4732, 0.3153]]) + + for s1 in range(len(self.default_sizes)): + p_anchors = base_anchors[s1*3:(s1+1)*3,:] + p_anchors[:,:2] = p_anchors[:,:2]-p_anchors[:,2:]/2.0 + p_anchors[:,2:] = p_anchors[:,2:]/2.0 + p_anchors = torch.FloatTensor(p_anchors).cuda() + # print(p_anchors) + anchors.append(p_anchors) + + return anchors + + # based on forward from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/modeling/rpn/anchor_generator.py + def forward(self, grid_sizes): + + anchors = [] + for size, base_anchors in zip(grid_sizes, self.cell_anchors): + grid_height, grid_width = size + stride_h = 1.0/grid_height + stride_w = 1.0/grid_width + device = base_anchors.device + shifts_x = torch.arange(0, grid_width, dtype=torch.float32, device=device).cuda() + shifts_y = 
torch.arange(0, grid_height, dtype=torch.float32, device=device).cuda() + shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x) + shift_x = (shift_x.reshape(-1) + 0.5) * stride_w + shift_y = (shift_y.reshape(-1) + 0.5) * stride_h + shifts = torch.stack((shift_x, shift_y, shift_x, shift_y), dim=1) + anchors.append( (shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)).reshape(-1, 4) ) + + anchors = torch.cat(anchors, 0) + anchors.clamp_(max=1, min=0) + return anchors + diff --git a/modules/anchor_box_retinanet.py b/modules/anchor_box_retinanet.py new file mode 100644 index 0000000..2b79619 --- /dev/null +++ b/modules/anchor_box_retinanet.py @@ -0,0 +1,80 @@ +import torch +from math import sqrt as sqrt +from itertools import product as product +import numpy as np +from modules.utils import BufferList + +class anchorBox(torch.nn.Module): + """Compute anchorbox coordinates in center-offset form for each source + feature map. + """ + def __init__(self, sizes = [32, 64, 128, 256, 512], + ratios = np.asarray([0.5, 1 / 1., 2.0]), + strides = [8, 16, 32, 64, 128], + scales = np.array([1, 1.25992, 1.58740])): + + super(anchorBox, self).__init__() + self.sizes = sizes + self.ratios = ratios + self.scales = scales + self.strides = strides + self.ar = len(self.ratios)*len(self.ratios) + self.cell_anchors = BufferList(self._get_cell_anchors()) + + def _get_cell_anchors(self): + anchors = [] + for s1 in self.sizes: + p_anchors = np.asarray(self._gen_generate_anchors_on_one_level(s1)) + p_anchors = torch.FloatTensor(p_anchors).cuda() + anchors.append(p_anchors) + + return anchors + + # modified from https://github.com/fizyr/keras-retinanet/blob/master/keras_retinanet/utils/anchors.py + # Copyright 2017-2018 Fizyr (https://fizyr.com) + def _gen_generate_anchors_on_one_level(self, base_size=32): + + """ + Generate anchor (reference) windows by enumerating aspect ratios X + scales w.r.t. a reference window. 
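+        Anchors are returned in (x1, y1, x2, y2) form centred on the origin;
+        forward() later shifts them over each feature-map grid.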
+ + """ + + num_anchors = len(self.ratios) * len(self.scales) + + # initialize output anchors + anchors = np.zeros((num_anchors, 4)) + + # print(self.scales) + # scale base_size + anchors[:, 2:] = base_size * np.tile(self.scales, (2, len(self.ratios))).T + # print(anchors) + # compute areas of anchors + areas = anchors[:, 2] * anchors[:, 3] + + anchors[:, 2] = np.sqrt(areas / np.repeat(self.ratios, len(self.scales))) + anchors[:, 3] = anchors[:, 2] * np.repeat(self.ratios, len(self.scales)) + # print(anchors) + # transform from (x_ctr, y_ctr, w, h) -> (x1, y1, x2, y2) + anchors[:, 0::2] -= np.tile(anchors[:, 2] * 0.5, (2, 1)).T + anchors[:, 1::2] -= np.tile(anchors[:, 3] * 0.5, (2, 1)).T + # print(anchors) + return anchors + + # forward from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/modeling/rpn/anchor_generator.py + def forward(self, grid_sizes): + + anchors = [] + for size, stride, base_anchors in zip(grid_sizes, self.strides, self.cell_anchors): + grid_height, grid_width = size + device = base_anchors.device + shifts_x = torch.arange(0, grid_width, dtype=torch.float32, device=device) + shifts_y = torch.arange(0, grid_height, dtype=torch.float32, device=device) + shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x) + shift_x = (shift_x.reshape(-1) + 0.5) * stride + shift_y = (shift_y.reshape(-1) + 0.5) * stride + shifts = torch.stack((shift_x, shift_y, shift_x, shift_y), dim=1) + anchors.append( (shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)).reshape(-1, 4) ) + + return torch.cat(anchors, 0) + diff --git a/modules/box_utils.py b/modules/box_utils.py new file mode 100644 index 0000000..94a973a --- /dev/null +++ b/modules/box_utils.py @@ -0,0 +1,401 @@ +import torch, pdb, math +import numpy as np +import torchvision + + +def match_anchors_wIgnore(gt_boxes, gt_labels, anchors, pos_th=0.5, nge_th=0.4, variances=[0.1, 0.2], seq_len=1): + # pdb.set_trace() + # pdb.set_trace() + num_mt = int(gt_labels.size(0)/seq_len) + + # pdb.set_trace() + seq_overlaps =[] + inds = torch.LongTensor([m*seq_len for m in range(num_mt)]) + # print('indexs device', inds.device) + # print(inds, num_mt) + ## get indexes of first frame in seq for each microtube + gt_labels = gt_labels[inds] + # print('gtb', gt_boxes) + # print('anchors', anchors[:10]) + + for s in range(seq_len): + seq_overlaps.append(jaccard(gt_boxes[inds+s, :], anchors)) + # pdb.set_trace() + overlaps = seq_overlaps[0] + # print('overlap max ', overlaps.max()) + ## Compute average overlap + for s in range(seq_len-1): + overlaps = overlaps + seq_overlaps[s+1] + overlaps = overlaps/float(seq_len) + # pdb.set_trace() + best_anchor_overlap, best_anchor_idx = overlaps.max(1, keepdim=True) + + # print('MIN VAL::', best_anchor_overlap.min().item()) + # if best_anchor_overlap.min().item()<0.25: + # print('MIN VAL::', best_anchor_overlap.min().item()) + # print('lower than o.5', best_anchor_overlap, gt_boxes) + # [1,num_anchors] best ground truth for each anchor + + best_truth_overlap, best_truth_idx = overlaps.max(0, keepdim=True) + best_truth_idx.squeeze_(0) + best_truth_overlap.squeeze_(0) + best_anchor_idx.squeeze_(1) + best_anchor_overlap.squeeze_(1) + best_truth_overlap.index_fill_(0, best_anchor_idx, 2) # ensure best anchor + # ensure every gt matches with its anchor of max overlap + for j in range(best_anchor_idx.size(0)): + best_truth_idx[best_anchor_idx[j]] = j + + conf = gt_labels[best_truth_idx] + 1 # assigned nearest class label + conf[best_truth_overlap < pos_th] = -1 # label as ignore + 
conf[best_truth_overlap < nge_th] = 0 # label as background + + for s in range(seq_len): + st = gt_boxes[inds + s, :] + matches = st[best_truth_idx] # Shape: [num_anchors,4] + if s == 0: + loc = encode(matches, anchors[:, s * 4:(s + 1) * 4], variances) + # Shape: [num_anchors, 4] -- encode the gt boxes for frame i + else: + temp = encode(matches, anchors[:, s * 4:(s + 1) * 4], variances) + loc = torch.cat([loc, temp], 1) # shape: [num_anchors x 4 * seq_len] : stacking the location targets for different frames + # pdb.set_trace() + return conf, loc + + +def hard_negative_mining(loss, labels, neg_pos_ratio): + """ + It is used to suppress the presence of a large number of negative predictions. + It works on image level not batch level. + For any example/image, it keeps all the positive predictions and + cuts the number of negative predictions to make sure the ratio + between the negative examples and positive examples is no more than + the given ratio for an image. + Args: + loss (N, num_anchors): the loss for each example. + labels (N, num_anchors): the labels. + neg_pos_ratio: the ratio between the negative examples and positive examples. + + """ + + pos_mask = labels > 0 + num_pos = pos_mask.long().sum(dim=1, keepdim=True) + num_neg = num_pos * neg_pos_ratio + + loss[pos_mask] = -math.inf + _, indexes = loss.sort(dim=1, descending=True) + _, orders = indexes.sort(dim=1) + neg_mask = orders < num_neg + return pos_mask | neg_mask + + +def point_form(boxes): + """ Convert anchor_boxes to (xmin, ymin, xmax, ymax) + representation for comparison to point form ground truth data. + Args: + boxes: (tensor) center-size default boxes from anchorbox layers. + Return: + boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. + """ + return torch.cat((boxes[:, :2] - boxes[:, 2:]/2, # xmin, ymin + boxes[:, :2] + boxes[:, 2:]/2), 1) # xmax, ymax + + +def center_size(boxes): + """ Convert anchor_boxes to (cx, cy, w, h) + representation for comparison to center-size form ground truth data. + Args: + boxes: (tensor) point_form boxes + Return: + boxes: (tensor) Converted cx, cy, w, h form of boxes. + """ + return torch.cat(((boxes[:, 2:] + boxes[:, :2])/2, # cx, cy + boxes[:, 2:] - boxes[:, :2]), 1) # w, h + + +def intersect(box_a, box_b): + """ + + We resize both tensors to [A,B,2] without new malloc: + [A,2] -> [A,1,2] -> [A,B,2] + [B,2] -> [1,B,2] -> [A,B,2] + Then we compute the area of intersect between box_a and box_b. + Args: + box_a: (tensor) bounding boxes, Shape: [A,4]. + box_b: (tensor) bounding boxes, Shape: [B,4]. + Return: + (tensor) intersection area, Shape: [A,B]. + + """ + # print(box_a, box_b) + A = box_a.size(0) + B = box_b.size(0) + # pdb.set_trace() + # print(box_a.type(), box_b.type()) + max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), + box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) + min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), + box_b[:, :2].unsqueeze(0).expand(A, B, 2)) + inter = torch.clamp((max_xy - min_xy), min=0) + return inter[:, :, 0] * inter[:, :, 1] + + +def jaccard(box_a, box_b): + """Compute the jaccard overlap of two sets of boxes. The jaccard overlap + is simply the intersection over union of two boxes. Here we operate on + ground truth boxes and default boxes. 
+ E.g.: + A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) + Args: + box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4] + box_b: (tensor) anchor boxes from anchorbox layers, Shape: [num_anchors,4] + Return: + jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)] + """ + # pdb.set_trace() + inter = intersect(box_a, box_b) + area_a = ((box_a[:, 2]-box_a[:, 0]) * + (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] + area_b = ((box_b[:, 2]-box_b[:, 0]) * + (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] + + union = area_a + area_b - inter + min_union = union.min() + + # print('minnin ', min_union, union) + + return inter / union # [A,B] + + +def get_ovlp_cellwise(overlaps): + feature_maps = [38, 19, 10, 5, 3, 1] + aratios = [4, 6, 6, 6, 4, 4] + dim = 0 + for f in feature_maps: + dim += f*f + out_ovlp = np.zeros(dim) + count = 0 + st = 0 + for k, f in enumerate(feature_maps): + ar = aratios[k] + for i in range(f*f): + et = st+ar + ovlps_tmp = overlaps[0, st:et] + #pdb.set_trace() + out_ovlp[count] = max(ovlps_tmp) + count += 1 + st = et + assert count == dim + + return out_ovlp + + +def encode(matched, anchors, variances): + + """ + + Encode the variances from the anchorbox layers into the ground truth boxes + we have matched (based on jaccard overlap) with the anchor boxes. + Args: + matched: (tensor) Coords of ground truth for each anchor in point-form + Shape: [num_anchors, 4]. + anchors: (tensor) anchor boxes in center-offset form + Shape: [num_anchors,4]. + variances: (list[float]) Variances of anchorboxes + + Return: + encoded boxes (tensor), Shape: [num_anchors, 4] + + """ + + TO_REMOVE = 1 if anchors[0,2]>1 else 0 # TODO remove + ex_widths = anchors[:, 2] - anchors[:, 0] + TO_REMOVE + ex_heights = anchors[:, 3] - anchors[:, 1] + TO_REMOVE + ex_ctr_x = anchors[:, 0] + 0.5 * ex_widths + ex_ctr_y = anchors[:, 1] + 0.5 * ex_heights + + gt_widths = matched[:, 2] - matched[:, 0] + TO_REMOVE + gt_heights = matched[:, 3] - matched[:, 1] + TO_REMOVE + gt_ctr_x = matched[:, 0] + 0.5 * gt_widths + gt_ctr_y = matched[:, 1] + 0.5 * gt_heights + + + targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths / variances[0] + targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights / variances[0] + targets_dw = torch.log(gt_widths / ex_widths) / variances[1] + targets_dh = torch.log(gt_heights / ex_heights) / variances[1] + + targets = torch.stack((targets_dx, targets_dy, targets_dw, targets_dh), dim=1) + + return targets + +def decode(loc, anchors, variances=[0.1, 0.2], bbox_xform_clip=math.log(1000. / 16)): +# """ +# Decode locations from predictions using anchors to undo +# the encoding we did for offset regression at train time. +# Args: +# loc (tensor): location predictions for loc layers, +# Shape: [num_anchors,4] +# anchors (tensor): anchor boxes in center-offset form. +# Shape: [num_anchors,4]. 
+# variances: (list[float]) Variances of anchorboxes +# Return: +# decoded bounding box predictions +# """ +# #pdb.set_trace() + + TO_REMOVE = 1 if anchors[0,2]>1 else 0 # TODO remove + widths = anchors[:, 2] - anchors[:, 0] + TO_REMOVE + heights = anchors[:, 3] - anchors[:, 1] + TO_REMOVE + ctr_x = anchors[:, 0] + 0.5 * widths + ctr_y = anchors[:, 1] + 0.5 * heights + + dx = loc[:, 0::4] * variances[0] + dy = loc[:, 1::4] * variances[0] + dw = loc[:, 2::4] * variances[1] + dh = loc[:, 3::4] * variances[1] + + # Prevent sending too large values into torch.exp() + dw = torch.clamp(dw, max=bbox_xform_clip) + dh = torch.clamp(dh, max=bbox_xform_clip) + + pred_ctr_x = dx * widths[:, None] + ctr_x[:, None] + pred_ctr_y = dy * heights[:, None] + ctr_y[:, None] + pred_w = torch.exp(dw) * widths[:, None] + pred_h = torch.exp(dh) * heights[:, None] + + pred_boxes = torch.zeros_like(loc) + # x1 + pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w + # y1 + pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h + # x2 (note: "- 1" is correct; don't be fooled by the asymmetry) + pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w - TO_REMOVE + # y2 (note: "- 1" is correct; don't be fooled by the asymmetry) + pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h - TO_REMOVE + + return pred_boxes + + +# Adapted from https://github.com/Hakuyume/chainer-ssd +def decode_01(loc, anchors, variances): + """Decode locations from predictions using anchors to undo + the encoding we did for offset regression at train time. + Args: + loc (tensor): location predictions for loc layers, + Shape: [num_anchors,4] + anchors (tensor): anchor boxes in center-offset form. + Shape: [num_anchors,4]. + variances: (list[float]) Variances of anchorboxes + Return: + decoded bounding box predictions + """ + #pdb.set_trace() + boxes = torch.cat(( + anchors[:, :2] + loc[:, :2] * variances[0] * anchors[:, 2:], + anchors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) + boxes[:, :2] -= boxes[:, 2:] / 2 + boxes[:, 2:] += boxes[:, :2] + return boxes + +def decode_seq(loc, anchors, variances, seq_len): + boxes = [] + #print('variances', variances) + for s in range(seq_len): + if s == 0: + boxes = decode(loc[:, :4], anchors[:, :4], variances) + else: + boxes = torch.cat((boxes,decode(loc[:,s*4:(s+1)*4], anchors[:,s*4:(s+1)*4], variances)),1) + + return boxes + + +def log_sum_exp(x): + """Utility function for computing log_sum_exp while determining + This will be used to determine unaveraged confidence loss across + all examples in a batch. + Args: + x (Variable(tensor)): conf_preds from conf layers + """ + x_max = x.data.max() + return torch.log(torch.sum(torch.exp(x-x_max), 1, keepdim=True)) + x_max + + + +# def nms_pt(boxes, scores, overlap=0.5): +# keep = torchvision.ops.nms(boxes, scores, overlap) +# return keep + # gpu_keep = torchvision.ops.nms(boxes_for_nms.to('cuda'), scores.to('cuda'), iou_threshold) + +# Original author: Francisco Massa: +# https://github.com/fmassa/object-detection.torch +# Ported to PyTorch by Max deGroot (02/01/2017) +def nms(boxes, scores, overlap=0.5, top_k=20, use_old_code=False): + """Apply non-maximum suppression at test time to avoid detecting too many + overlapping bounding boxes for a given object. + Args: + boxes: (tensor) The location preds for the img, Shape: [num_anchors,4]. + scores: (tensor) The class predscores for the img, Shape:[num_anchors]. + overlap: (float) The overlap thresh for suppressing unnecessary boxes. + top_k: (int) The Maximum number of box preds to consider. 
+ Return: + The indices of the kept boxes with respect to num_anchors. + """ + if use_old_code: + keep = scores.new(scores.size(0)).zero_().long() + if boxes.numel() == 0: + return keep, 0 + x1 = boxes[:, 0] + y1 = boxes[:, 1] + x2 = boxes[:, 2] + y2 = boxes[:, 3] + area = torch.mul(x2 - x1, y2 - y1) + v, idx = scores.sort(0) # sort in ascending order + # I = I[v >= 0.01] + idx = idx[-top_k:] # indices of the top-k largest vals + xx1 = boxes.new() + yy1 = boxes.new() + xx2 = boxes.new() + yy2 = boxes.new() + w = boxes.new() + h = boxes.new() + + # keep = torch.Tensor() + count = 0 + while idx.numel() > 0: + i = idx[-1] # index of current largest val + # keep.append(i) + keep[count] = i + count += 1 + if idx.size(0) == 1: + break + idx = idx[:-1] # remove kept element from view + # load bboxes of next highest vals + torch.index_select(x1, 0, idx, out=xx1) + torch.index_select(y1, 0, idx, out=yy1) + torch.index_select(x2, 0, idx, out=xx2) + torch.index_select(y2, 0, idx, out=yy2) + # store element-wise max with next highest score + xx1 = torch.clamp(xx1, min=x1[i]) + yy1 = torch.clamp(yy1, min=y1[i]) + xx2 = torch.clamp(xx2, max=x2[i]) + yy2 = torch.clamp(yy2, max=y2[i]) + w.resize_as_(xx2) + h.resize_as_(yy2) + w = xx2 - xx1 + h = yy2 - yy1 + # check sizes of xx1 and xx2.. after each iteration + w = torch.clamp(w, min=0.0) + h = torch.clamp(h, min=0.0) + inter = w*h + # IoU = i / (area(a) + area(b) - i) + rem_areas = torch.index_select(area, 0, idx) # load remaining areas + union = (rem_areas - inter) + area[i] + IoU = inter/union # store result in iou + # keep only elements with an IoU <= overlap + idx = idx[IoU.le(overlap)] + else: + keep = torchvision.ops.nms(boxes, scores, overlap) + count = keep.shape[0] + + return keep, count diff --git a/modules/detection_loss.py b/modules/detection_loss.py new file mode 100644 index 0000000..f0a3a9f --- /dev/null +++ b/modules/detection_loss.py @@ -0,0 +1,165 @@ +""" + +Copyright (c) 2019 Gurkirt Singh + All Rights Reserved. + +""" + +import torch.nn as nn +import torch.nn.functional as F +import torch, pdb, time +from modules import box_utils + + +# Credits:: from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/layers/smooth_l1_loss.py +# smooth l1 with beta +def smooth_l1_loss(input, target, beta=1. / 9, reduction='sum'): + n = torch.abs(input - target) + cond = n < beta + loss = torch.where(cond, 0.5 * n ** 2 / beta, n - 0.5 * beta) + if reduction == 'mean': + return loss.mean() + return loss.sum() + + +def sigmoid_focal_loss(preds, labels, num_pos, alpha, gamma): + '''Args:: + preds: sigmoid activated predictions + labels: one hot encoded labels + num_pos: number of positive samples + alpha: weighting factor to balance +ve and -ve examples + gamma: exponent factor to balance easy and hard examples + Return:: + loss: computed loss, reduced by sum and normalised by num_pos + ''' + loss = F.binary_cross_entropy(preds, labels, reduction='none') + alpha_factor = alpha * labels + (1.0 - alpha) * (1.0 - labels) + pt = preds * labels + (1.0 - preds) * (1.0 - labels) + focal_weight = alpha_factor * ((1-pt) ** gamma) + loss = (loss * focal_weight).sum() / num_pos + return loss + +def get_one_hot_labels(tgt_labels, numc): + new_labels = torch.zeros([tgt_labels.shape[0], numc], device=tgt_labels.device) + new_labels[:, tgt_labels] = 1.0 + return new_labels + + + +class FocalLoss(nn.Module): + def __init__(self, args, alpha=0.25, gamma=2.0): + """Implement the detection loss. 
+ Basically, combines focal classification loss + and Smooth L1 regression loss. + """ + super(FocalLoss, self).__init__() + self.positive_threshold = args.POSTIVE_THRESHOLD + self.negative_threshold = args.NEGTIVE_THRESHOLD + self.num_classes = args.num_classes + self.num_label_types = args.num_label_types + self.num_classes_list = args.num_classes_list + self.alpha = 0.25 + self.gamma = 2.0 + + + def forward(self, confidence, predicted_locations, gt_boxes, gt_labels, counts, anchors, ego_preds, ego_labels): + ## gt_boxes, gt_labels, counts, ancohor_boxes + + """ + + Compute classification loss and smooth l1 loss. + Args: + confidence (batch_size, num_anchors, num_classes): class predictions. + locations (batch_size, num_anchors, 4): predicted locations. + boxes list of len = batch_size and nx4 arrarys + anchors: (num_anchors, 4) + + """ + ego_preds = torch.sigmoid(ego_preds) + ps = confidence.shape + preds = torch.sigmoid(confidence) + # ps = predicted_locations.shape + # predicted_locations = predicted_locations.view(ps[0],ps[1], -1, [-1]) + ball_labels = [] + bgt_locations = [] + blabels_bin = [] + # mask = torch.zeros([preds.shape[0],preds.shape[1]], dtype=torch.int) + + with torch.no_grad(): + # gt_boxes = gt_boxes.cpu() + # gt_labels = gt_labels.cpu() + # anchors = anchors.cpu() + # device = torch.device("cpu") + device = preds.device + zeros_tensor = torch.zeros(1, gt_labels.shape[-1], device=device) + for b in range(gt_boxes.shape[0]): + all_labels = [] + gt_locations = [] + labels_bin = [] + for s in range(gt_boxes.shape[1]): + gt_boxes_batch = gt_boxes[b, s, :counts[b,s], :] + gt_labels_batch = gt_labels[b, s, :counts[b,s], :] + if counts[b,s]>0: + gt_dumy_labels_batch = torch.LongTensor([i for i in range(counts[b,s])]).to(device) + conf, loc = box_utils.match_anchors_wIgnore(gt_boxes_batch, gt_dumy_labels_batch, + anchors, pos_th=self.positive_threshold, nge_th=self.negative_threshold ) + else: + loc = torch.zeros_like(anchors, device=device) + conf = ego_labels.new_zeros(anchors.shape[0], device=device) - 1 + + # print(conf.device) + # print(loc.device) + gt_locations.append(loc) + labels_bin.append(conf) + + dumy_conf = conf.clone() + dumy_conf[dumy_conf<0] = 0 + labels_bs = torch.cat((zeros_tensor, gt_labels_batch),0) + batch_labels = labels_bs[dumy_conf,:] + all_labels.append(batch_labels) + + all_labels = torch.stack(all_labels, 0).float() + gt_locations = torch.stack(gt_locations, 0) + labels_bin = torch.stack(labels_bin, 0).float() + ball_labels.append(all_labels) + bgt_locations.append(gt_locations) + blabels_bin.append(labels_bin) + + all_labels = torch.stack(ball_labels, 0) + gt_locations = torch.stack(bgt_locations, 0) + labels_bin = torch.stack(blabels_bin, 0) + # mask = labels_bin > -1 + # device = ego_preds.device + # all_labels = all_labels.to(device) + # gt_locations = gt_locations.to(device) + # labels_bin = labels_bin.to(device) + + # bgt_locations = [] + # blabels_bin = [] + pos_mask = labels_bin > 0 + num_pos = max(1.0, float(pos_mask.sum())) + + gt_locations = gt_locations[pos_mask].reshape(-1, 4) + predicted_locations = predicted_locations[pos_mask].reshape(-1, 4) + regression_loss = smooth_l1_loss(predicted_locations, gt_locations)/(num_pos * 4.0) + + # if regression_loss.item()>40: + # pdb.set_trace() + + mask = labels_bin > -1 # Get mask to remove ignore examples + + masked_labels = all_labels[mask].reshape(-1, self.num_classes) # Remove Ignore labels + masked_preds = preds[mask].reshape(-1, self.num_classes) # Remove Ignore preds + cls_loss = 
sigmoid_focal_loss(masked_preds, masked_labels, num_pos, self.alpha, self.gamma) + + mask = ego_labels>-1 + numc = ego_preds.shape[-1] + masked_preds = ego_preds[mask].reshape(-1, numc) # Remove Ignore preds + masked_labels = ego_labels[mask].reshape(-1) # Remove Ignore labels + one_hot_labels = get_one_hot_labels(masked_labels, numc) + ego_loss = 0 + if one_hot_labels.shape[0]>0: + ego_loss = sigmoid_focal_loss(masked_preds, one_hot_labels, one_hot_labels.shape[0], self.alpha, self.gamma) + + # print(regression_loss, cls_loss, ego_loss) + return regression_loss, cls_loss/8.0 + ego_loss/4.0 \ No newline at end of file diff --git a/modules/evaluation.py b/modules/evaluation.py new file mode 100644 index 0000000..14d3b09 --- /dev/null +++ b/modules/evaluation.py @@ -0,0 +1,692 @@ +''' + +Author:: Gurkirt Singh + +''' +import copy +import os +import json +import time +import pdb +import pickle +import numpy as np +import scipy.io as io # to save detection as mat files +from data.datasets import is_part_of_subsets, get_filtered_tubes, get_filtered_frames, filter_labels, read_ava_annotations +from data.datasets import get_frame_level_annos_ucf24, get_filtered_tubes_ucf24, read_labelmap +from modules.tube_helper import get_tube_3Diou, make_det_tube +from modules import utils +logger = utils.get_logger(__name__) + +def voc_ap(rec, prec, use_07_metric=False): + """ ap = voc_ap(rec, prec, [use_07_metric]) + Compute VOC AP given precision and recall. + If use_07_metric is true, uses the + VOC 07 11 point method (default:False). + """ + if use_07_metric: + # 11 point metric + ap = 0. + for t in np.arange(0., 1.1, 0.1): + if np.sum(rec >= t) == 0: + p = 0 + else: + p = np.max(prec[rec >= t]) + ap = ap + p / 11. + else: + # correct AP calculation + # first append sentinel values at the end + mrec = np.concatenate(([0.], rec, [1.])) + mpre = np.concatenate(([0.], prec, [0.])) + + # compute the precision envelope + for i in range(mpre.size - 1, 0, -1): + mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) + + # to calculate area under PR curve, look for points + # where X axis (recall) changes value + i = np.where(mrec[1:] != mrec[:-1])[0] + + # and sum (\Delta recall) * prec + ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) + return ap*100 + + +def pr_to_ap(pr): + """ + Compute AP given precision-recall + pr is a Nx2 array with first row being precision and second row being recall + """ + + prdif = pr[1:, 1] - pr[:-1, 1] + prsum = pr[1:, 0] + pr[:-1, 0] + + return np.sum(prdif * prsum * 0.5) + + +def get_gt_of_cls(gt_boxes, cls): + cls_gt_boxes = [] + for i in range(gt_boxes.shape[0]): + if len(gt_boxes.shape) > 1 and int(gt_boxes[i, -1]) == cls: + cls_gt_boxes.append(gt_boxes[i, :-1]) + return np.asarray(cls_gt_boxes) + +def compute_iou_dict(det, cls_gt_boxes): + # print(cls_gt_boxes, type(cls_gt_boxes)) + cls_gt_boxes = cls_gt_boxes.reshape(-1,4) + # print(cls_gt_boxes, type(cls_gt_boxes)) + return compute_iou(det['box'], cls_gt_boxes)[0] + +def compute_iou(box, cls_gt_boxes): + + ious = np.zeros(cls_gt_boxes.shape[0]) + + for m in range(cls_gt_boxes.shape[0]): + gtbox = cls_gt_boxes[m] + + xmin = max(gtbox[0], box[0]) + ymin = max(gtbox[1], box[1]) + xmax = min(gtbox[2], box[2]) + ymax = min(gtbox[3], box[3]) + iw = np.maximum(xmax - xmin, 0.) + ih = np.maximum(ymax - ymin, 0.) 
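+ # iw and ih are the clipped width and height of the intersection rectangle + # between the detection box and the m-th ground-truth box; the IoU computed + # below is intersection / (area(gt) + area(det) - intersection).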
+ if iw > 0 and ih > 0: + intsc = iw*ih + else: + intsc = 0.0 + union = (gtbox[2] - gtbox[0]) * (gtbox[3] - gtbox[1]) + \ + (box[2] - box[0]) * (box[3] - box[1]) - intsc + ious[m] = intsc/union + + return ious + + +def evaluate_detections(gt_boxes, det_boxes, classes=[], iou_thresh=0.5): + + ap_strs = [] + num_frames = len(gt_boxes) + logger.info('Evaluating for '+ str(num_frames) + ' frames') + ap_all = np.zeros(len(classes), dtype=np.float32) + # loop over each class 'cls' + for cls_ind, class_name in enumerate(classes): + scores = np.zeros(num_frames * 2000) + istp = np.zeros(num_frames * 2000) + det_count = 0 + num_postives = 0.0 + for nf in range(num_frames): # loop over each frame 'nf' + # if len(gt_boxes[nf])>0 and len(det_boxes[cls_ind][nf]): + # get frame detections for class cls in nf + frame_det_boxes = np.copy(det_boxes[cls_ind][nf]) + # get gt boxes for class cls in nf frame + cls_gt_boxes = get_gt_of_cls(np.copy(gt_boxes[nf]), cls_ind) + num_postives += cls_gt_boxes.shape[0] + # check if there are dection for class cls in nf frame + if frame_det_boxes.shape[0] > 0: + # sort in descending order + sorted_ids = np.argsort(-frame_det_boxes[:, -1]) + for k in sorted_ids: # start from best scoring detection of cls to end + box = frame_det_boxes[k, :-1] # detection bounfing box + score = frame_det_boxes[k, -1] # detection score + ispositive = False # set ispostive to false every time + # we can only find a postive detection + if cls_gt_boxes.shape[0] > 0: + # if there is atleast one gt bounding for class cls is there in frame nf + # compute IOU between remaining gt boxes + iou = compute_iou(box, cls_gt_boxes) + # and detection boxes + # get the max IOU window gt index + maxid = np.argmax(iou) + # check is max IOU is greater than detection threshold + if iou[maxid] >= iou_thresh: + ispositive = True # if yes then this is ture positive detection + # remove assigned gt box + cls_gt_boxes = np.delete(cls_gt_boxes, maxid, 0) + # fill score array with score of current detection + scores[det_count] = score + if ispositive: + # set current detection index (det_count) + istp[det_count] = 1 + # to 1 if it is true postive example + det_count += 1 + + if num_postives < 1: + num_postives = 1 + + scores = scores[:det_count] + istp = istp[:det_count] + argsort_scores = np.argsort(-scores) # sort in descending order + istp = istp[argsort_scores] # reorder istp's on score sorting + fp = np.cumsum(istp == 0) # get false positives + tp = np.cumsum(istp == 1) # get true positives + fp = fp.astype(np.float64) + tp = tp.astype(np.float64) + recall = tp / float(num_postives) # compute recall + # compute precision + precision = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) + # compute average precision using voc2007 metric + cls_ap = voc_ap(recall, precision) + ap_all[cls_ind] = cls_ap + ap_str = class_name + ' : ' + \ + str(num_postives) + ' : ' + str(det_count) + ' : ' + str(cls_ap) + ap_strs.append(ap_str) + + mAP = np.mean(ap_all) + logger.info('Mean ap '+ str(mAP)) + return mAP, ap_all, ap_strs + + +def evaluate(gts, dets, all_classes, iou_thresh=0.5): + # np.mean(ap_all), ap_all, ap_strs + aps, aps_all, ap_strs = [], [], [] + for nlt in range(len(gts)): + a, b, c = evaluate_detections( + gts[nlt], dets[nlt], all_classes[nlt], iou_thresh) + aps.append(a) + aps_all.append(b) + ap_strs.append(c) + return aps, aps_all, ap_strs + + +def get_class_ap_from_scores(scores, istp, num_postives): + # num_postives = np.sum(istp) + if num_postives < 1: + num_postives = 1 + argsort_scores = np.argsort(-scores) 
# sort in descending order + istp = istp[argsort_scores] # reorder istp's on score sorting + fp = np.cumsum(istp == 0) # get false positives + tp = np.cumsum(istp == 1) # get true positives + fp = fp.astype(np.float64) + tp = tp.astype(np.float64) + recall = tp / float(num_postives) # compute recall + # compute precision + precision = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) + # compute average precision using voc2007 metric + cls_ap = voc_ap(recall, precision) + return cls_ap + + +def evaluate_ego(gts, dets, classes): + ap_strs = [] + num_frames = gts.shape[0] + logger.info('Evaluating for ' + str(num_frames) + ' frames') + + if num_frames<1: + return 0, [0, 0], ['no gts present','no gts present'] + + ap_all = [] + sap = 0.0 + for cls_ind, class_name in enumerate(classes): + scores = dets[:, cls_ind] + istp = np.zeros_like(gts) + istp[gts == cls_ind] = 1 + det_count = num_frames + num_postives = np.sum(istp) + cls_ap = get_class_ap_from_scores(scores, istp, num_postives) + ap_all.append(cls_ap) + sap += cls_ap + ap_str = class_name + ' : ' + \ + str(num_postives) + ' : ' + str(det_count) + ' : ' + str(cls_ap) + ap_strs.append(ap_str) + + mAP = sap/len(classes) + ap_strs.append('FRAME Mean AP:: {:0.2f}'.format(mAP)) + + return mAP, ap_all, ap_strs + + +def get_gt_tubes_ucf(final_annots, subset, label_type): + """Get video list form ground truth videos used in subset + and their ground truth tubes """ + + video_list = [] + tubes = {} + for videoname in final_annots['db']: + if videoname not in final_annots['trainvideos']: + video_list.append(videoname) + tubes[videoname] = get_filtered_tubes( + label_type+'_tubes', final_annots, videoname) + + return video_list, tubes + + +def get_gt_tubes(final_annots, subset, label_type, dataset): + """Get video list form ground truth videos used in subset + and their ground truth tubes """ + + video_list = [] + tubes = {} + for videoname in final_annots['db']: + if dataset == 'road': + cond = is_part_of_subsets(final_annots['db'][videoname]['split_ids'], [subset]) + else: + cond = videoname not in final_annots['trainvideos'] + if cond: + video_list.append(videoname) + if dataset == 'road': + tubes[videoname] = get_filtered_tubes( + label_type+'_tubes', final_annots, videoname) + else: + tubes[videoname] = get_filtered_tubes_ucf24(final_annots['db'][videoname]['annotations']) + + return video_list, tubes + + +def get_det_class_tubes(tubes, cl_id): + class_tubes = [] + for video, video_tubes in tubes.items(): + for tube in video_tubes: + if tube['label_id'] == cl_id: + # scores, boxes = tube['scores'], tube['boxes'] + # frames, label_id = tube['frames'], tube['label_id'] + class_tubes.append([video, tube]) #make_det_tube(scores, boxes, frames, label_id)]) + return class_tubes + + +def get_gt_class_tubes(tubes, cl_id): + class_tubes = {} + for video, video_tubes in tubes.items(): + class_tubes[video] = [] + for tube in video_tubes: + if tube['label_id'] == cl_id: + class_tubes[video].append(tube) + return class_tubes + +def compute_class_ap(class_dets, class_gts, match_func, iou_thresh, metric_type=None): + + fn = max(1, sum([len(class_gts[iid]) + for iid in class_gts])) # false negatives + num_postives = fn + + if len(class_dets) == 0: + return 0,num_postives ,0,0 + pr = np.empty((len(class_dets) + 1, 2), dtype=np.float32) + pr[0, 0] = 1.0 + pr[0, 1] = 0.0 + + + fp = 0 # false positives + tp = 0 # true positives + + scores = np.zeros(len(class_dets)) + istp = np.zeros(len(class_dets)) + + inv_det_scores = np.asarray([-det[1]['score'] for det in 
class_dets]) + indexs = np.argsort(inv_det_scores) + count = 0 + for count, det_id in enumerate(indexs): + is_positive = False + detection = class_dets[det_id] + iid, det = detection + score = det['score'] + # pdb.set_trace() + if len(class_gts[iid]) > 0: + if metric_type is None: + ious = np.asarray([match_func(det, gt) + for gt in class_gts[iid]]) + else: + ious = np.asarray([match_func(det, gt, metric_type) + for gt in class_gts[iid]]) + # print(ious) + max_iou_id = np.argmax(ious) + if ious[max_iou_id] >= iou_thresh: + is_positive = True + del class_gts[iid][max_iou_id] + + scores[count] = score + + if is_positive: + istp[count] = 1 + tp += 1 + fn -= 1 + else: + fp += 1 + + pr[count+1, 0] = float(tp) / float(tp + fp) + pr[count+1, 1] = float(tp) / float(tp + fn) + + class_ap = float(100*pr_to_ap(pr)) + + return class_ap, num_postives, count, pr[count+1, 1] + + +def evaluate_tubes(anno_file, det_file, subset='val_3', dataset='road', iou_thresh=0.2, metric_type='stiou'): + + logger.info('Evaluating tubes for datasets '+ dataset) + logger.info('GT FILE:: '+ anno_file) + logger.info('Result File:: '+ det_file) + + if dataset == 'road': + with open(anno_file, 'r') as fff: + final_annots = json.load(fff) + else: + with open(anno_file, 'rb') as fff: + final_annots = pickle.load(fff) + + with open(det_file, 'rb') as fff: + detections = pickle.load(fff) + + if dataset == 'road': + label_types = final_annots['label_types'] + else: + label_types = ['action'] + + results = {} + for _, label_type in enumerate(label_types): + + if dataset != 'road': + classes = final_annots['classes'] + else: + classes = final_annots[label_type+'_labels'] + + logger.info('Evaluating {} {}'.format(label_type, len(classes))) + ap_all = [] + re_all = [] + ap_strs = [] + sap = 0.0 + video_list, gt_tubes = get_gt_tubes(final_annots, subset, label_type, dataset) + det_tubes = {} + + for videoname in video_list: + det_tubes[videoname] = detections[label_type][videoname] + + for cl_id, class_name in enumerate(classes): + + class_dets = get_det_class_tubes(det_tubes, cl_id) + class_gts = get_gt_class_tubes(gt_tubes, cl_id) + + class_ap, num_postives, count, recall = compute_class_ap(class_dets, class_gts, get_tube_3Diou, iou_thresh, metric_type=metric_type) + + recall = recall*100 + sap += class_ap + ap_all.append(class_ap) + re_all.append(recall) + ap_str = class_name + ' : ' + str(num_postives) + \ + ' : ' + str(count) + ' : ' + str(class_ap) +\ + ' : ' + str(recall) + ap_strs.append(ap_str) + mAP = sap/len(classes) + mean_recall = np.mean(np.asarray(re_all)) + ap_strs.append('\nMean AP:: {:0.2f} mean Recall {:0.2f}'.format(mAP,mean_recall)) + results[label_type] = {'mAP':mAP, 'ap_all':ap_all, 'ap_strs':ap_strs, 'recalls':re_all, 'mR':mean_recall} + logger.info('MAP:: {}'.format(mAP)) + + return results + + + +def get_gt_frames_ucf24(final_annots, label_type): + """Get video list form ground truth videos used in subset + and their ground truth frames """ + + frames = {} + trainvideos = final_annots['trainvideos'] + # labels = final_annots['classes'] + labels = ['action_ness'] + final_annots['classes'] + num_classes = len(labels) + database = final_annots['db'] + for videoname in final_annots['db']: + if videoname not in trainvideos: + numf = database[videoname]['numf'] + fframe_level_annos, _ = get_frame_level_annos_ucf24(database[videoname]['annotations'], numf, num_classes) + for frame_id , frame in enumerate(fframe_level_annos): + frame_name = '{:05d}'.format(int(frame_id+1)) + all_boxes = [] + label = 0 if label_type 
== 'action_ness' else database[videoname]['label'] + for k in range(len(frame['boxes'])): + all_boxes.append([frame['boxes'][k], [label]]) + frames[videoname+frame_name] = all_boxes + + return frames + + +def get_gt_frames_ava(final_annots, label_type): + """Get video list form ground truth videos used in subset + and their ground truth frames """ + + assert label_type in ['action_ness', 'actions'], 'only valid for action classes not for actionness but TODO: should be easy to incorprate just add to eval_framewise_ego_actions_ucf24 as preds are same but gt in this format {}'.format(label_type) + + frames = {} + # trainvideos = final_annots['trainvideos'] + # labels = final_annots['classes'] + # labels = ['action_ness'] + final_annots['classes'] + # num_classes = len(labels) + # database = final_annots['db'] + for videoname in final_annots: + # class_ids_map + for ts in final_annots[videoname]: + boxes = {} + time_stamp = int(ts) + frame_num = int((time_stamp - 900) * 30 + 1) + frame_name = '{:05d}'.format(frame_num) + if ts in final_annots[videoname]: + # assert time_stamp == int(annotations[ts][0][0]) + for anno in final_annots[videoname][ts]: + box_key = '_'.join('{:0.3f}'.format(b) for b in anno[1]) + box = copy.deepcopy(anno[1]) + for bi in range(4): + assert 0<=box[bi]<=1.01, box + box[bi] = min(1.0, max(0, box[bi])) + box[bi] = box[bi]*682 if bi % 2 == 0 else box[bi]*512 + + box = np.asarray(box) + + assert 80>=anno[2]>=1, 'label should be between 1 and 80 but it is {} '.format(anno[2]) + + if box_key not in boxes: + boxes[box_key] = {'box':box, 'labels':[]} + if label_type == 'action_ness': + boxes[box_key]['labels'].append(0) + else: + boxes[box_key]['labels'].append(anno[2]) + + + all_boxes = [] + for box_key in boxes: + all_boxes.append([boxes[box_key]['box'], boxes[box_key]['labels']]) + frames[videoname+frame_name] = all_boxes + + return frames + + +def get_gt_frames(final_annots, subsets, label_type, dataset): + """Get video list form ground truth videos used in subset + and their ground truth frames """ + if dataset == 'road': + # video_list = [] + frames = {} + if not isinstance(subsets, list): + subsets = [subsets] + for videoname in final_annots['db']: + if is_part_of_subsets(final_annots['db'][videoname]['split_ids'], subsets): + # video_list.append(videoname) + frames = get_filtered_frames( + label_type, final_annots, videoname, frames) + elif dataset == 'ucf24': + return get_gt_frames_ucf24(final_annots, label_type) + else: + return get_gt_frames_ava(final_annots, label_type) + + return frames + + +def get_det_class_frames(dets, cl_id, frame_ids, dataset): + class_dets = [] + for frame_id in dets: + if dataset == 'ucf24' or frame_id in frame_ids: + all_frames_dets = dets[frame_id][cl_id] + for i in range(all_frames_dets.shape[0]): + det = {'box':all_frames_dets[i,:4], 'score':all_frames_dets[i,4]} + class_dets.append([frame_id, det]) + return class_dets + + +def get_gt_class_frames(gts, cl_id): + frames = {} + for frame_id, frame in gts.items(): + boxes = [] + for anno in frame: + if cl_id in anno[1]: + boxes.append(anno[0].copy()) + frames[frame_id] = boxes + + return frames + + +def eval_framewise_ego_actions_road(final_annots, detections, subsets): + """Get video list form ground truth videos used in subset + and their ground truth frames """ + + + if not isinstance(subsets, list): + subsets = [subsets] + + label_key = 'av_action' + filtered_gts = [] + filtered_preds = [] + all_labels = final_annots['all_'+label_key+'_labels'] + labels = 
final_annots[label_key+'_labels'] + for videoname in final_annots['db']: + if is_part_of_subsets(final_annots['db'][videoname]['split_ids'], subsets): + # label_key = 'av_actions' + frames = final_annots['db'][videoname]['frames'] + + for frame_id , frame in frames.items(): + # frame_name = '{:05d}'.format(int(frame_id)) + frame_name = '{:05d}'.format(int(frame_id)) + if frame['annotated']>0: + gts = filter_labels(frame[label_key+'_ids'], all_labels, labels) + filtered_gts.append(gts) + frame_name = '{:05d}'.format(int(frame_id)) + filtered_preds.append(detections[videoname+frame_name]) + + gts = np.asarray(filtered_gts) + preds = np.asarray(filtered_preds) + return evaluate_ego(gts, preds, labels) + + +def eval_framewise_ego_actions_ucf24(final_annots, detections, subsets): + """Get video list form ground truth videos used in subset + and their ground truth frames """ + + filtered_gts = [] + filtered_preds = [] + trainvideos = final_annots['trainvideos'] + labels = ['Non_action'] + final_annots['classes'] + num_classes = len(labels) + database = final_annots['db'] + for videoname in final_annots['db']: + if videoname not in trainvideos: + numf = database[videoname]['numf'] + fframe_level_annos, _ = get_frame_level_annos_ucf24(database[videoname]['annotations'], numf, num_classes) + for frame_id , frame in enumerate(fframe_level_annos): + frame_name = '{:05d}'.format(int(frame_id+1)) + gts = [frame['ego_label']] + filtered_gts.append(gts) + filtered_preds.append(detections[videoname+frame_name]) + + gts = np.asarray(filtered_gts) + preds = np.asarray(filtered_preds) + + return evaluate_ego(gts, preds, labels) + + +def eval_framewise_ego_actions(final_annots, detections, subsets, dataset='road'): + if dataset == 'road': + return eval_framewise_ego_actions_road(final_annots, detections, subsets) + else: + return eval_framewise_ego_actions_ucf24(final_annots, detections, subsets) + + +def evaluate_frames(anno_file, det_file, subset, iou_thresh=0.5, dataset='road'): + + + logger.info('Evaluating frames for datasets '+ dataset) + t0 = time.perf_counter() + if dataset == 'road': + with open(anno_file, 'r') as fff: + final_annots = json.load(fff) + elif dataset == 'ucf24': + with open(anno_file, 'rb') as fff: + final_annots = pickle.load(fff) + elif dataset == 'ava': + final_annots = read_ava_annotations(anno_file) + labelmap_file = os.path.join(os. path. 
dirname(anno_file), 'ava_actions.pbtxt') + class_names_ava, class_ids_map, label_map = read_labelmap(labelmap_file) + + with open(det_file, 'rb') as fff: + detections = pickle.load(fff) + + results = {} + if dataset == 'road': + label_types = ['av_actions'] + ['agent_ness'] + final_annots['label_types'] + elif dataset == 'ucf24': + label_types = ['frame_actions', 'action_ness', 'action'] + elif dataset == 'ava': + label_types = ['action_ness', 'actions'] + else: + raise Exception('Define data type prpperly follwong is not in the list ::: '+dataset) + + t1 = time.perf_counter() + logger.info('Time taken to load for evaluation {}'.format(t1-t0)) + for nlt, label_type in enumerate(label_types): + if label_type in ['av_actions', 'frame_actions']: + mAP, ap_all, ap_strs = eval_framewise_ego_actions(final_annots, detections[label_type], subset, dataset) + re_all = [1.0 for _ in range(len(ap_all))] + for apstr in ap_strs: + logger.info(apstr) + else: + # t0 = time.perf_counter() + ap_all = [] + ap_strs = [] + re_all = [] + sap = 0.0 + gt_frames = get_gt_frames(final_annots, subset, label_type, dataset) + t1 = time.perf_counter() + # logger.info('Time taken to get GT frame for evaluation {}'.format(t0-t1)) + if label_type == 'agent_ness': + classes = ['agent_ness'] + elif label_type == 'action_ness': + classes = ['action_ness'] + elif dataset == 'ava': + classes = class_names_ava + elif dataset != 'road': + classes = final_annots['classes'] ## valid for ucf24 + else: + classes = final_annots[label_type+'_labels'] + + for cl_id, class_name in enumerate(classes): + t1 = time.perf_counter() + # print(cl_id, class_name, label_type) + ## gather gt of class "class_name" from frames which are not marked igonre + if dataset == 'ava' and label_type != 'action_ness': + class_gts = get_gt_class_frames(gt_frames, label_map[class_name]['org_id']) + else: + class_gts = get_gt_class_frames(gt_frames, cl_id) + + t2 = time.perf_counter() + + frame_ids = [f for f in class_gts.keys()] + ## gather detection from only that are there in gt or not marked ignore + class_dets = get_det_class_frames(detections[label_type], cl_id, frame_ids, dataset) + t3 = time.perf_counter() + + class_ap, num_postives, count, recall = compute_class_ap(class_dets, class_gts, compute_iou_dict, iou_thresh) + + recall = recall*100 + sap += class_ap + ap_all.append(class_ap) + re_all.append(recall) + ap_str = class_name + ' : ' + str(num_postives) + \ + ' : ' + str(count) + ' : ' + str(class_ap) +\ + ' : ' + str(recall) + ap_strs.append(ap_str) + t4 = time.perf_counter() + + + mAP = sap/len(classes) + mean_recall = np.mean(np.asarray(re_all)) + ap_strs.append('\nMean AP:: {:0.2f} mean Recall {:0.2f}'.format(mAP,mean_recall)) + results[label_type] = {'mAP':mAP, 'ap_all':ap_all, 'ap_strs':ap_strs, 'recalls':re_all, 'mR':mean_recall} + logger.info('{} MAP:: {}'.format(label_type, mAP)) + t1 = time.perf_counter() + logger.info('Time taken to complete evaluation {}'.format(t1-t0)) + return results \ No newline at end of file diff --git a/modules/gen_agent_paths.py b/modules/gen_agent_paths.py new file mode 100644 index 0000000..dce06b4 --- /dev/null +++ b/modules/gen_agent_paths.py @@ -0,0 +1,276 @@ +import numpy as np +import pdb + +def update_agent_paths(live_paths, dead_paths, dets, num_classes_to_use, time_stamp, iouth=0.1, costtype='scoreiou', jumpgap=5, min_len=5): ## trim_threshold=100, keep_num=60, + num_box = dets['boxes'].shape[0] + if len(live_paths) == 0: + # Start a path for each box in first frame + for b in range(num_box): + 
live_paths.append({'boxes': None, 'scores': [], 'allScores': None, 'foundAt': [], 'count': 1}) + live_paths[b]['boxes'] = dets['boxes'][b, :].reshape(1,-1) # bth box x0,y0,x1,y1 at frame t + live_paths[b]['scores'].append(dets['scores'][b]) # action score of bth box at frame t + live_paths[b]['allScores'] = dets['allScores'][b, :].reshape(1,-1) # scores for all action for bth box at frame t + live_paths[b]['foundAt'].append(time_stamp) # frame box was found in + live_paths[b]['count'] = 1 # current box count for bth box tube + else: + # Link each path to detections at frame t + lp_count = len(live_paths) # total paths at time t + dead_count = 0 + covered_boxes = np.zeros(num_box) + path_order_score = np.zeros(lp_count) + avoid_dets = [] + for lp in range(lp_count): + # Check whether path has gone stale + if time_stamp - live_paths[lp]['foundAt'][-1] <= jumpgap: + # IoU scores for path lp + as1 = live_paths[lp]['allScores'][-1,:num_classes_to_use] + as2 = dets['allScores'][:,:num_classes_to_use] + box_to_lp_score = score_of_edge(live_paths[lp], dets, iouth, costtype, avoid_dets, as1, as2, jumpgap) + + if np.sum(box_to_lp_score) > 0.1: + # print('We are here', np.sum(box_to_lp_score)) + # check if there's at least one match to detection in this frame + maxInd = np.argmax(box_to_lp_score) + # m_score = np.max(box_to_lp_score) + live_paths[lp]['count'] = live_paths[lp]['count'] + 1 + live_paths[lp]['boxes'] = np.vstack((live_paths[lp]['boxes'], dets['boxes'][maxInd, :])) + live_paths[lp]['scores'].append(dets['scores'][maxInd]) + live_paths[lp]['allScores'] = np.vstack((live_paths[lp]['allScores'], dets['allScores'][maxInd, :])) + live_paths[lp]['foundAt'].append(time_stamp) + avoid_dets.append(maxInd) + covered_boxes[maxInd] = 1 + + # else: + # live_paths[lp]['lastfound'] += 1 + scores = sorted(np.asarray(live_paths[lp]['scores'])) + num_sc = len(scores) + path_order_score[lp] = np.mean(np.asarray(scores[int(max(0, num_sc - jumpgap-1)):num_sc])) + else: + # If the path is stale, increment the dead_count + dead_count += 1 + + # Sort the path based on score of the boxes and terminate dead path + if len(path_order_score)>1 or dead_count>0: + # print('sorting path') + live_paths, dead_paths = sort_live_paths(live_paths, path_order_score, dead_paths, jumpgap, time_stamp) + + + # start new paths using boxes that are not assigned + lp_count = len(live_paths) + if np.sum(covered_boxes) < num_box: + for b in range(num_box): + if covered_boxes[b] < 0.99: + # print('numb and covered ', num_box, covered_boxes) + live_paths.append({'boxes': [], 'scores': [], 'allScores': None, 'foundAt': [], 'count': 1}) + live_paths[lp_count]['boxes'] = dets['boxes'][b, :].reshape(1,-1) # bth box x0,y0,x1,y1 at frame t + live_paths[lp_count]['scores'].append(dets['scores'][b]) # action score of bth box at frame t + live_paths[lp_count]['allScores'] = dets['allScores'][b, :].reshape(1,-1) # scores for all action for bth box at frame t + live_paths[lp_count]['count'] = 1 # current box count for bth box tube + live_paths[lp_count]['foundAt'].append(time_stamp) # frame box was found in + lp_count += 1 + + # live_paths = trim_paths(live_paths, trim_threshold, keep_num) + # dead_paths = remove_dead_paths(dead_paths, min_len, time_stamp) + + return live_paths, dead_paths + +def trim_paths(live_paths, trim_threshold, keep_num): + lp_count = len(live_paths) + for lp in range(lp_count): + # print(live_paths[lp]['boxes'].shape, live_paths[lp]['allScores'].shape) + if len(live_paths[lp]['boxes']) > trim_threshold: + 
live_paths[lp]['boxes'] = live_paths[lp]['boxes'][-keep_num:, :] + live_paths[lp]['scores'] = live_paths[lp]['scores'][-keep_num:] + live_paths[lp]['allScores'] = live_paths[lp]['allScores'][-keep_num:, :] + live_paths[lp]['foundAt'] = live_paths[lp]['foundAt'][-keep_num:] + return live_paths + + +def remove_dead_paths(live_paths, min_len, time_stamp): + dead_paths = [] + dp_count = 0 + for olp in range(len(dead_paths)): + if len(dead_paths[olp]['boxes']) >= min_len: + dead_paths.append({'boxes': None, 'scores': None, 'allScores': None, + 'foundAt': None, 'count': None}) + dead_paths[dp_count]['boxes'] = live_paths[olp]['boxes'] + dead_paths[dp_count]['scores'] = live_paths[olp]['scores'] + dead_paths[dp_count]['allScores'] = live_paths[olp]['allScores'] + dead_paths[dp_count]['foundAt'] = live_paths[olp]['foundAt'] + dead_paths[dp_count]['count'] = live_paths[olp]['count'] + dp_count += 1 + + return dead_paths + +def sort_live_paths(live_paths, path_order_score, dead_paths, jumpgap, time_stamp): + inds = path_order_score.flatten().argsort()[::-1] + sorted_live_paths = [] + lpc = 0 + dp_count = len(dead_paths) + for lp in range(len(live_paths)): + olp = inds[lp] + if time_stamp-live_paths[olp]['foundAt'][-1] <= jumpgap: + sorted_live_paths.append({'boxes': None, 'scores': None, 'allScores': None, + 'foundAt': None, 'count': None}) + sorted_live_paths[lpc]['boxes'] = live_paths[olp]['boxes'] + sorted_live_paths[lpc]['scores'] = live_paths[olp]['scores'] + sorted_live_paths[lpc]['allScores'] = live_paths[olp]['allScores'] + sorted_live_paths[lpc]['foundAt'] = live_paths[olp]['foundAt'] + sorted_live_paths[lpc]['count'] = live_paths[olp]['count'] + lpc += 1 + else: + dead_paths.append({'boxes': None, 'scores': None, 'allScores': None, + 'foundAt': None, 'count': None}) + dead_paths[dp_count]['boxes'] = live_paths[olp]['boxes'] + dead_paths[dp_count]['scores'] = live_paths[olp]['scores'] + dead_paths[dp_count]['allScores'] = live_paths[olp]['allScores'] + dead_paths[dp_count]['foundAt'] = live_paths[olp]['foundAt'] + dead_paths[dp_count]['count'] = live_paths[olp]['count'] + dp_count = dp_count + 1 + + return sorted_live_paths, dead_paths + +def copy_live_to_dead(live_paths, dead_paths, min_len): + dp_count = len(dead_paths) + for lp in range(len(live_paths)): + # path_score = np.mean(live_paths[lp]['scores']) + # if len(live_paths[lp]['boxes']) >= min_len or path_score > 0.01: + dead_paths.append({'boxes': None, 'scores': None, 'allScores': None, + 'foundAt': None, 'count': None}) + dead_paths[dp_count]['boxes'] = live_paths[lp]['boxes'] + dead_paths[dp_count]['scores'] = live_paths[lp]['scores'] + dead_paths[dp_count]['allScores'] = live_paths[lp]['allScores'] + dead_paths[dp_count]['foundAt'] = live_paths[lp]['foundAt'] + dead_paths[dp_count]['count'] = live_paths[lp]['count'] + dp_count = dp_count + 1 + + return dead_paths + + +def score_of_edge(v1, v2, iouth, costtype, avoid_dets, as1, as2, jumpgap): + + N2 = v2['boxes'].shape[0] + score = np.zeros(N2) + curent_boxes = v1['boxes'][-1,:] + tm = min(jumpgap+1, v1['boxes'].shape[0]) + past_boxes = v1['boxes'][-tm, :] + expected_boxes = curent_boxes + (curent_boxes-past_boxes)/max(1,tm-1) + ious = bbox_overlaps(expected_boxes, v2['boxes']) + if ious.any()>1: + print(ious) + # pdb.set_trace() + for i in range(0, N2): + if ious[i] >= iouth and i not in avoid_dets: + scores2 = v2['scores'][i] + if costtype == 'score': + score[i] = scores2 + elif costtype == 'scoreiou': + score[i] = (scores2 + ious[i])/2 + elif costtype == 'ioul2': + score[i] = 
(scores2 + ious[i])/2 + invl2_diff = 1.0/np.sqrt(np.sum((as1-as2[i,:])**2)) + score[i] += invl2_diff + elif costtype == 'iou': + score[i] = ious[i] + return score + + +def intersect(box_a, box_b): + # A = box_a.size(0) + B = box_b.shape[0] + inters = np.zeros(B) + for b in range(B): + max_x = min(box_a[2], box_b[b, 2]) + max_y = min(box_a[3], box_b[b, 3]) + min_x = max(box_a[0], box_b[b, 0]) + min_y = max(box_a[1], box_b[b, 1]) + inters[b] = (max_x-min_x)*(max_y-min_y) + return inters + +def bbox_overlaps(box_a, box_b): + + inter = intersect(box_a, box_b) + area_a = (box_a[2]-box_a[0])*(box_a[3]-box_a[1]) + B = box_b.shape[0] + ious = np.zeros(B) + for b in range(B): + if inter[b]>0: + area_b = (box_b[b,2] - box_b[b,0]) * (box_b[b,3] - box_b[b,1]) + union = area_a + area_b - inter[b] + ious[b] = inter[b]/union + return ious + +def check_if_sorted(array): + sorted = True + for i in range(len(array)-1): + if array[i]>array[i+1]: + sorted = False + break + return sorted + +def are_there_gaps(array): + gaps = False + for i in range(len(array)-1): + if array[i+1] - array[i] > 1 : + gaps = True + # print(array[i+1], array[i]) + break + return gaps + + +def fill_gaps(paths, min_len_with_gaps=8, minscore=0.3): + lp_count = len(paths) + new_paths = [] + filling_gaps = 0 + for lp in range(lp_count): + path = paths[lp] + path_score = np.mean(path['scores']) + if len(path['boxes']) >= min_len_with_gaps or path_score > minscore: + foundAt = path['foundAt'] + assert sorted(foundAt), 'foundAt should have been sorted i.e., paths should be built incremently' + if are_there_gaps(foundAt): + if len(foundAt)<=min_len_with_gaps: + continue + filling_gaps += 1 + numb = foundAt[-1] - foundAt[0] + 1 + new_path = {'boxes': np.zeros((numb,4)), 'scores': np.zeros(numb), + 'allScores': np.zeros((numb, path['allScores'].shape[1])), + 'foundAt': np.zeros(numb, dtype=np.int32)} + + count = 0 + fn = foundAt[0] + for n in range(len(foundAt)): + next_ = foundAt[n] + if fn == next_ : + new_path['foundAt'][count] = foundAt[n] + new_path['boxes'][count, :] = path['boxes'][n, :] + new_path['scores'][count] = path['scores'][n] + new_path['allScores'][count, :] = path['allScores'][n, :] + count += 1 + fn += 1 + else: + pfn = fn-1 + pcount = count -1 + while fn <= next_: + weight = (fn - pfn) / (next_ - pfn) + new_path['foundAt'][count] = fn + new_path['boxes'][count,:] = new_path['boxes'][pcount,:] + weight*(path['boxes'][n,:] - new_path['boxes'][pcount,:]) + new_path['allScores'][count,:] = new_path['allScores'][pcount,:] + weight*(path['allScores'][n,:] - new_path['allScores'][pcount,:]) + new_path['scores'][count] = new_path['scores'][pcount] + weight*(path['scores'][n] - new_path['scores'][pcount]) + # print(fn, weight, path['boxes'][n,:] - new_path['boxes'][pcount,:], foundAt) + # pdb.set_trace() + fn += 1 + count += 1 + # pdb.set_trace() + assert count == numb, 'count {:d} numb {:d} are not equal'.format(count, numb) + else: + new_path = {'boxes': path['boxes'], 'scores': path['scores'], + 'allScores': path['allScores'], + 'foundAt': path['foundAt']} + + new_paths.append(new_path) + + # paths[lp]['labels'] = paths[lp]['labels'][-keep_num:] + # print('Number of tube paths with gaps are ', filling_gaps) + + return paths diff --git a/modules/solver.py b/modules/solver.py new file mode 100644 index 0000000..4ffff27 --- /dev/null +++ b/modules/solver.py @@ -0,0 +1,76 @@ +import torch, pdb +import torch.optim as optim +# from .madamw import Adam as AdamM +# from .adamw import Adam as AdamW + +from torch.optim.lr_scheduler import 
MultiStepLR + +class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler): + def __init__(self, optimizer, MILESTONES, GAMMAS, last_epoch=-1): + self.MILESTONES = MILESTONES + self.GAMMAS = GAMMAS + assert len(GAMMAS) == len(MILESTONES), 'MILESTONES and GAMMAS should be of same length GAMMAS are of len ' + (len(GAMMAS)) + ' and MILESTONES '+ str(len(MILESTONES)) + super(WarmupMultiStepLR, self).__init__(optimizer, last_epoch) + + def get_lr(self): + if self.last_epoch not in self.MILESTONES: + return [group['lr'] for group in self.optimizer.param_groups] + else: + index = self.MILESTONES.index(self.last_epoch) + return [group['lr'] * self.GAMMAS[index] for group in self.optimizer.param_groups] + + #def print_lr(self): + # print([[group['name'], group['lr']] for group in self.optimizer.param_groups]) + +def get_optim(args, net): + freeze_layers = ['backbone_net.layer'+str(n) for n in range(1, args.FREEZE_UPTO+1)] + params = [] + solver_print_str = '\n\nSolver configs are as follow \n\n\n' + for key, value in net.named_parameters(): + + if args.FREEZE_UPTO>0 and (key.find('backbone.conv1')>-1 or key.find('backbone.bn1')>-1): # Freeze first conv layer and bn layer in resnet + value.requires_grad = False + continue + + if key.find('backbone')>-1: + for layer_id in freeze_layers: + if key.find(layer_id)>-1: + value.requires_grad = False + continue + + if not value.requires_grad: + continue + + lr = args.LR + wd = args.WEIGHT_DECAY + + if args.OPTIM == 'ADAM': + wd = 0.0 + + if "bias" in key: + lr = lr*2.0 + + if args.OPTIM == 'SGD': + params += [{"params": [value], "name":key, "lr": lr, "weight_decay":wd, "momentum":args.MOMENTUM}] + else: + params += [{"params": [value], "name":key, "lr": lr, "weight_decay":wd}] + + print_l = key +' is trained at the rate of ' + str(lr) + print(print_l) + solver_print_str += print_l + '\n' + + + if args.OPTIM == 'SGD': + optimizer = optim.SGD(params) + elif args.OPTIM == 'ADAM': + optimizer = optim.Adam(params) + else: + raise NotImplementedError('Define optimiser type') + + solver_print_str += 'optimizer is '+ args.OPTIM + '\nDone solver configs\n\n' + + #print(args.MILSTONES, args.GAMMAS) + #scheduler = WarmupMultiStepLR(optimizer, args.MILESTONES, args.GAMMAS) + scheduler = MultiStepLR(optimizer, args.MILESTONES, args.GAMMA) + + return optimizer, scheduler, solver_print_str diff --git a/modules/tube_helper.py b/modules/tube_helper.py new file mode 100644 index 0000000..280fccc --- /dev/null +++ b/modules/tube_helper.py @@ -0,0 +1,375 @@ +import numpy as np +import pdb +from modules import utils +import scipy.signal as signal +logger = utils.get_logger(__name__) +from scipy.signal import savgol_filter +# from gen_dets import make_joint_probs_from_marginals +from modules.utils import make_joint_probs_from_marginals + +over_s = 0.0 +under_s = 0.0 +over_e = 0.0 +under_e = 0.0 +oa_s = 0.0 +ua_s = 0.0 +oa_e = 0.0 +ua_e = 0.0 + +def make_det_tube(scores, boxes, frames, label_id): + tube = {} + tube['label_id'] =label_id + tube['scores'] = np.asarray(scores) + tube['boxes'] = np.asarray(boxes) + tube['score'] = np.mean(scores) + tube['frames'] = np.asarray(frames) + # assert tube['frames'].shape[0] == tube['boxes'].shape[0], 'must be equal' + return tube + +def get_nonnp_det_tube(scores, boxes, start, end, label_id, score=None): + tube = {} + tube['label_id'] =label_id + tube['scores'] = scores + tube['boxes'] = boxes + + if score is not None: + tube['score'] = score + else: + tube['score'] = float(np.mean(scores)) + + tube['frames'] = np.asarray([i for i in 
range(start, end)]) + assert len(tube['frames']) == len(tube['boxes']), 'must be equal' + + return tube + +def make_gt_tube(frames, boxes, label_id): + frames = np.asarray(frames) + indexs = np.argsort(frames) + frames = frames[indexs] + boxes = np.asarray(boxes) + if boxes.shape[0]>0: + boxes = boxes[indexs,:] + tube = {} + tube['frames'] = frames + tube['boxes'] = boxes + tube['label_id'] = label_id + return tube + +def trim_tubes(start_id, numc, paths, childs, num_classes_list, topk=5, alpha=3, min_len=3, trim_method='None'): + """ Trim the paths into tubes using DP""" + tubes = [] + for path in paths: + if len(childs)>0: + allScores = make_joint_probs_from_marginals(path['allScores'], childs, num_classes_list, start_id=0) + else: + allScores = path['allScores'] + allScores = allScores[:,start_id:start_id+numc] + path_start_frame = path['foundAt'][0] + if allScores.shape[0]<=min_len: + continue + + # print(allScores.shape) + if trim_method == 'none': # + # print('no trimming') + topk_classes, topk_scores = get_topk_classes(allScores, topk) + for i in range(topk): + label, start, end = topk_classes[i], path_start_frame, allScores.shape[0] + path_start_frame + if end-start+1 > min_len: + # tube = get_nonnp_det_tube(allScores[:,label], path['boxes'], int(start), int(end), int(label)) + tube = get_nonnp_det_tube(allScores[:,label], path['boxes'], int(start), int(end), int(label), score=topk_scores[i]) + tubes.append(tube) + elif trim_method == 'dpscores': ## standarded method Multi class-DP + allScores = path['allScores'][:,start_id:start_id+numc] + score_mat = np.transpose(allScores.copy()) + for _ in range(topk): + (segments, _) = dpEMmax(score_mat, alpha) + # print(segments) + labels, starts, ends = getLabels(segments) + # print(labels, starts, ends) + for i in range(len(labels)): + if ends[i] - starts[i] >= min_len: + scores = score_mat[labels[i], starts[i]:ends[i]+1] + boxes = path['boxes'][starts[i]:ends[i]+1, :] + start = starts[i] + path_start_frame + end = ends[i] + path_start_frame + 1 + tube = get_nonnp_det_tube(scores, boxes, int(start), int(end), int(labels[i])) + tubes.append(tube) + score_mat[labels[i], starts[i]:ends[i]+1] = 0.0 + + elif trim_method == 'dpscorestopn': ## bit fancy only select top segments + score_mat = np.transpose(allScores.copy()) + for _ in range(topk): + (segments, _) = dpEMmax(score_mat, alpha) + # print(segments) + labels, starts, ends = getLabels(segments) + # print(labels, starts, ends) + num_seg = labels.shape[0] + seg_scores = np.zeros(num_seg) + for i in range(min(2,len(labels))): + if ends[i] - starts[i] >= min_len: + scores = score_mat[labels[i], starts[i]:ends[i]+1] + seg_scores[i] = np.mean(scores) + else: + score_mat[labels[i], starts[i]:ends[i]+1] = 0.0 + seg_scores[i] = 0.0 + + inds = np.argsort(-seg_scores) + for ii in range(min(2, num_seg)): + i = inds[ii] + # if ends[i] - starts[i] >= min_len: + scores = score_mat[labels[i], starts[i]:ends[i]+1] + boxes = path['boxes'][starts[i]:ends[i]+1, :] + start = starts[i] + path_start_frame + if boxes.shape[0] != -starts[i] + ends[i] + 1: + print('We have exceptions', boxes.shape[0], -starts[i] + ends[i]+1) + end = ends[i] + path_start_frame + 1 + tube = get_nonnp_det_tube(scores, boxes, int(start), int(end), int(labels[i])) + tubes.append(tube) + score_mat[labels[i], starts[i]:ends[i]+1] = 0.0 + else: #indvidual class-wise dp + aa = 0 + if alpha == 0 and numc == 24: + # alphas = [1, 1, 16, 1, 1, 2, 16, 8, 4, 16, 6, 16, 20, 16, 1, 16, 16, 20, 16, 2, 4, 8, 1, 20] + # alphas = [1, 1, 8, 1, 1, 3, 16, 
16, 2, 16, 3, 16, 20, 16, 1, 8, 8, 8, 16, 2, 2, 8, 1, 20] + # alphas = [1, 5, 16, 8, 1, 3, 16, 16, 16, 3, 8, 16, 16, 16, 1, 5, 16, 16, 5, 2, 1, 8, 3, 16] + # alphas = [1, 3, 16, 2, 1, 3, 8, 16, 16, 3, 3, 16, 16, 16, 1, 5, 16, 8, 5, 2, 1, 16, 2, 16] + alphas = [1, 1, 16, 3, 1, 8, 16, 16, 10, 10, 3, 16, 16, 10, 1, 8, 16, 16, 16, 2, 1, 8, 2, 16] + else: + alphas = np.zeros(numc)+alpha + + topk_classes, topk_scores = get_topk_classes(allScores, topk) + for idx in range(topk_classes.shape[0]): + current_label = int(topk_classes[idx]) + if numc == 24: + in_scores = path['allScores'][:,start_id-1] + else: + in_scores = allScores[:,current_label] + + smooth_scores = signal.medfilt(in_scores, 5) + smooth_scores = in_scores/np.max(smooth_scores) + score_mat = np.hstack((smooth_scores[:, np.newaxis], 1 - smooth_scores[:, np.newaxis])) + score_mat = np.transpose(score_mat.copy()) + (segments, _) = dpEMmax(score_mat, alphas[current_label]) + labels, starts, ends = getLabels(segments) + for i in range(len(labels)): + if ends[i] - starts[i] >= min_len and labels[i]==0: + scores = allScores[starts[i]:ends[i]+1, current_label] + sorted_classes = np.argsort(-scores) + sorted_scores = scores[sorted_classes] + topn = max(1,int(sorted_scores.shape[0]/2)) + mscore = np.mean(sorted_scores[:topn]) + boxes = path['boxes'][starts[i]:ends[i]+1, :] + start = starts[i] + path_start_frame + end = ends[i] + path_start_frame + 1 + sf = max(1,int(start)-aa) + ef = int(end)-(start-sf) + tube = get_nonnp_det_tube(scores, boxes, sf, ef, int(current_label), score=mscore) #topk_scores[idx]) + tubes.append(tube) + # score_mat[labels[i], starts[i]:ends[i]+1] = 0.0 + return tubes + +def getLabels(segments, cls=1): + starts = np.zeros(len(segments), dtype='int32') + ends = np.zeros(len(segments), dtype='int32') + labels = np.zeros(len(segments), dtype='int32') + fl = 0 + i=0 + starts[i]=0 + fl = segments[0] + labels[i] = segments[0] +# print segments[0] +# pdb.set_trace() + for ii in range(len(segments)): + if abs(segments[ii] -fl)>0: + ends[i]=ii-1 + fl = segments[ii] + i+=1 + starts[i]=ii + labels[i] = fl + ends[i] = len(segments)-1 + return labels[:i+1],starts[:i+1],ends[:i+1] + +def get_topk_classes(allScores, topk): + scores = np.zeros(allScores.shape[1]) + # print(scores.shape) + topn = max(1, allScores.shape[1]//4) + for k in range(scores.shape[0]): + temp_scores = allScores[:,k] + sorted_score = np.sort(-temp_scores) + # print(sorted_score[:topn]) + scores[k] = np.mean(-sorted_score[:topn]) + sorted_classes = np.argsort(-scores) + sorted_scores = scores[sorted_classes] + # sorted_scores = sorted_scores/np.sum(sorted_scores) + # print(sorted_scores) + return sorted_classes[:topk], sorted_scores[:topk] + + +def dpEMmax(M, alpha=3): + (r,c) = np.shape(M) + D = np.zeros((r, c+1)) # add an extra column + D[:,0] = 1 # % put the maximum cost + D[:, 1:(c+1)] = M + phi = np.zeros((r,c)) + for j in range(1,c): + for i in range(r): + v1 = np.ones(r)*alpha + v1[i] = 0 + values= D[:, j-1] - v1 + tb = np.argmax(values) + dmax = max(values) + D[i,j] = D[i,j]+dmax + phi[i,j] = tb + + q = c-1 + values= D[:, c-1] + p = np.argmax(values) + i = p + j = q + ps = np.zeros(c) + ps[q] = p + while j>0: + tb = phi[i,j] + j = int(j-1) + q = j + ps[q] = tb + i = int(tb) + + D = D[:,1:] + return (ps,D) + + +def intersect(box_a, box_b): + # A = box_a.size(0) + B = box_b.shape[0] + inters = np.zeros(B) + for b in range(B): + max_x = min(box_a[2], box_b[b, 2]) + max_y = min(box_a[3], box_b[b, 3]) + min_x = max(box_a[0], box_b[b, 0]) + min_y = 
max(box_a[1], box_b[b, 1]) + inters[b] = (max_x-min_x)*(max_y-min_y) + return inters + + +def bbox_overlaps(box_a, box_b): + + inter = intersect(box_a, box_b) + area_a = (box_a[2]-box_a[0])*(box_a[3]-box_a[1]) + B = box_b.shape[0] + ious = np.zeros(B) + for b in range(B): + if inter[b]>0: + area_b = (box_b[b,2] - box_b[b,0]) * (box_b[b,3] - box_b[b,1]) + union = area_a + area_b - inter[b] + ious[b] = inter[b]/union + return ious + + +def get_tube_3Diou(tube_a, tube_b , metric_type='stiou'): + """Compute the spatio-temporal IoU between two tubes""" + + + + tmin = max(tube_a['frames'][0], tube_b['frames'][0]) + tmax = min(tube_a['frames'][-1], tube_b['frames'][-1]) + + if tmax < tmin: return 0.0 + + temporal_inter = tmax - tmin + 1 + temporal_union = max(tube_a['frames'][-1], tube_b['frames'][-1]) - min(tube_a['frames'][0], tube_b['frames'][0]) + 1 + tiou = temporal_inter / temporal_union + if metric_type == 'tiou': + return tiou + # try: + + tube_a_boxes = tube_a['boxes'][int(np.where(tube_a['frames'] == tmin)[0][0]): int( + np.where(tube_a['frames'] == tmax)[0][0]) + 1, :] + tube_b_boxes = tube_b['boxes'][int(np.where(tube_b['frames'] == tmin)[0][0]): int( + np.where(tube_b['frames'] == tmax)[0][0]) + 1, :] + # except: + # pdb.set_trace() print('something', tube_a_boxes, tube_b_boxes, iou) + + siou = iou3d(tube_a_boxes, tube_b_boxes) + + global over_s, over_e, under_s, under_e, oa_s, oa_e, ua_s, ua_e + + if tube_a['frames'][-1]>= tube_b['frames'][-1]: + over_e += 1 + oa_e += tube_a['frames'][-1] - tube_b['frames'][-1] + else: + under_e += 1 + ua_e += tube_a['frames'][-1] - tube_b['frames'][-1] + + if tube_a['frames'][0]<= tube_b['frames'][0]: + over_s += 1 + oa_s += tube_a['frames'][0] - tube_b['frames'][0] + else: + under_s += 1 + ua_s += tube_a['frames'][0] - tube_b['frames'][0] + + # if not (tube_a['frames'][-1]>= tube_b['frames'][-1] and tube_a['frames'][0]<= tube_b['frames'][0]): + # tiou = 1.0 + # logger.info('over_s {} over_e {} under_s {} under_e {} oa_s {} oa_e {} ua_s {} ua_e {}'.format(over_s, over_e, under_s, under_e, oa_s, oa_e, ua_s, ua_e)) + # if siou>0.5 and temporal_inter>= tube_b['frames'][-1]-tube_b['frames'][0]: + # print(tube_b['frames'][0],tube_b['frames'][-1], tube_a['frames'][0],tube_a['frames'][-1], tube_a['scores']) + if metric_type == 'siou': + return siou + else: + return siou * tiou + + +def iou3d(tube_a, tube_b): + """Compute the IoU between two tubes with same temporal extent""" + + assert tube_a.shape[0] == tube_b.shape[0] + # assert np.all(b1[:, 0] == b2[:, 0]) + + ov = overlap2d(tube_a,tube_b) + + return np.mean(ov / (area2d(tube_a) + area2d(tube_b) - ov) ) + + +def area2d(b): + """Compute the areas for a set of 2D boxes""" + + return (b[:,2]-b[:,0]+1) * (b[:,3]-b[:,1]+1) + + +def overlap2d(b1, b2): + """Compute the overlaps between a set of boxes b1 and one box b2""" + + xmin = np.maximum(b1[:,0], b2[:,0]) + ymin = np.maximum(b1[:,1], b2[:,1]) + xmax = np.minimum(b1[:,2] + 1, b2[:,2] + 1) + ymax = np.minimum(b1[:,3] + 1, b2[:,3] + 1) + + width = np.maximum(0, xmax - xmin) + height = np.maximum(0, ymax - ymin) + + return width * height + +def nms3dt(tubes, overlap=0.6): + """Compute NMS of scored tubes. 
Tubes are given as list of (tube, score) + return the list of indices to keep + """ + + if not tubes: + return np.array([], dtype=np.int32) + + I = np.argsort([t['score'] for t in tubes]) + indices = np.zeros(I.size, dtype=np.int32) + counter = 0 + + while I.size > 0: + i = I[-1] + indices[counter] = i + counter += 1 + ious = np.array([get_tube_3Diou(tubes[ii], tubes[i]) for ii in I[:-1]]) + I = I[np.where(ious <= overlap)[0]] + indices = indices[:counter] + final_tubes = [] + for ind in indices: + final_tubes.append(tubes[ind]) + + return final_tubes diff --git a/modules/utils.py b/modules/utils.py new file mode 100644 index 0000000..df6b292 --- /dev/null +++ b/modules/utils.py @@ -0,0 +1,288 @@ +import os, sys +import shutil +import socket +import getpass +import copy +import numpy as np +from modules.box_utils import nms +import datetime +import logging +import torch +import pdb +import torchvision + +# from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/modeling/rpn/anchor_generator.py +class BufferList(torch.nn.Module): + """ + Similar to nn.ParameterList, but for buffers + """ + + def __init__(self, buffers=None): + super(BufferList, self).__init__() + if buffers is not None: + self.extend(buffers) + + def extend(self, buffers): + offset = len(self) + for i, buffer in enumerate(buffers): + self.register_buffer(str(offset + i), buffer) + return self + + def __len__(self): + return len(self._buffers) + + def __iter__(self): + return iter(self._buffers.values()) + +def setup_logger(args): + """ + Sets up the logging. + """ + log_file_name = '{:s}/{:s}-{date:%m-%d-%Hx}.log'.format(args.SAVE_ROOT, args.MODE, date=datetime.datetime.now()) + args.log_dir = 'logs/'+args.exp_name+'/' + if not os.path.isdir(args.log_dir): + os.makedirs(args.log_dir) + + added_log_file = '{}{}-{date:%m-%d-%Hx}.log'.format(args.log_dir, args.MODE, date=datetime.datetime.now()) + + + # Set up logging format. + _FORMAT = "[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s" + + logging.root.handlers = [] + logging.basicConfig( + level=logging.INFO, format=_FORMAT, stream=sys.stdout + ) + logging.getLogger().addHandler(logging.FileHandler(log_file_name, mode='a')) + # logging.getLogger().addHandler(logging.FileHandler(added_log_file, mode='a')) + + +def get_logger(name): + """ + Retrieve the logger with the specified name or, if name is None, return a + logger which is the root logger of the hierarchy. + Args: + name (string): name of the logger. 
+ """ + return logging.getLogger(name) + +def copy_source(source_dir): + if not os.path.isdir(source_dir): + os.system('mkdir -p ' + source_dir) + + for dirpath, dirs, files in os.walk('./', topdown=True): + for file in files: + if file.endswith('.py'): #fnmatch.filter(files, filepattern): + shutil.copy2(os.path.join(dirpath, file), source_dir) + + +def set_args(args): + args.MAX_SIZE = int(args.MIN_SIZE*1.35) + args.MILESTONES = [int(val) for val in args.MILESTONES.split(',')] + #args.GAMMAS = [float(val) for val in args.GAMMAS.split(',')] + args.EVAL_EPOCHS = [int(val) for val in args.EVAL_EPOCHS.split(',')] + + args.TRAIN_SUBSETS = [val for val in args.TRAIN_SUBSETS.split(',') if len(val)>1] + args.VAL_SUBSETS = [val for val in args.VAL_SUBSETS.split(',') if len(val)>1] + args.TEST_SUBSETS = [val for val in args.TEST_SUBSETS.split(',') if len(val)>1] + args.TUBES_EVAL_THRESHS = [ float(val) for val in args.TUBES_EVAL_THRESHS.split(',') if len(val)>0.0001] + args.model_subtype = args.MODEL_TYPE.split('-')[0] + ## check if subsets are okay + possible_subets = ['test', 'train','val'] + for idx in range(1,4): + possible_subets.append('train_'+str(idx)) + possible_subets.append('val_'+str(idx)) + + if len(args.VAL_SUBSETS) < 1 and args.DATASET == 'road': + args.VAL_SUBSETS = [ss.replace('train', 'val') for ss in args.TRAIN_SUBSETS] + if len(args.TEST_SUBSETS) < 1: + # args.TEST_SUBSETS = [ss.replace('train', 'val') for ss in args.TRAIN_SUBSETS] + args.TEST_SUBSETS = args.VAL_SUBSETS + + for subsets in [args.TRAIN_SUBSETS, args.VAL_SUBSETS, args.TEST_SUBSETS]: + for subset in subsets: + assert subset in possible_subets, 'subest should from one of these '+''.join(possible_subets) + + args.DATASET = args.DATASET.lower() + args.ARCH = args.ARCH.lower() + + args.MEANS =[0.485, 0.456, 0.406] + args.STDS = [0.229, 0.224, 0.225] + + username = getpass.getuser() + hostname = socket.gethostname() + args.hostname = hostname + args.user = username + + args.model_init = 'kinetics' + + args.MODEL_PATH = args.MODEL_PATH[:-1] if args.MODEL_PATH.endswith('/') else args.MODEL_PATH + + assert args.MODEL_PATH.endswith('kinetics-pt') or args.MODEL_PATH.endswith('imagenet-pt') + args.model_init = 'imagenet' if args.MODEL_PATH.endswith('imagenet-pt') else 'kinetics' + + if args.MODEL_PATH == 'imagenet': + args.MODEL_PATH = os.path.join(args.MODEL_PATH, args.ARCH+'.pth') + else: + args.MODEL_PATH = os.path.join(args.MODEL_PATH, args.ARCH+args.MODEL_TYPE+'.pth') + + + print('Your working directories are::\nLOAD::> ', args.DATA_ROOT, '\nSAVE::> ', args.SAVE_ROOT) + print('Your model will be initialized using', args.MODEL_PATH) + + return args + + +def create_exp_name(args): + """Create name of experiment using training parameters """ + splits = ''.join([split[0]+split[-1] for split in args.TRAIN_SUBSETS]) + args.exp_name = '{:s}{:s}{:d}-P{:s}-b{:0d}s{:d}x{:d}x{:d}-{:s}{:s}-h{:d}x{:d}x{:d}'.format( + args.ARCH, args.MODEL_TYPE, + args.MIN_SIZE, args.model_init, args.BATCH_SIZE, + args.SEQ_LEN, args.MIN_SEQ_STEP, args.MAX_SEQ_STEP, + args.DATASET, splits, + args.HEAD_LAYERS, args.CLS_HEAD_TIME_SIZE, + args.REG_HEAD_TIME_SIZE, + ) + + args.SAVE_ROOT += args.DATASET+'/' + args.SAVE_ROOT = args.SAVE_ROOT+'cache/'+args.exp_name+'/' + if not os.path.isdir(args.SAVE_ROOT): + print('Create: ', args.SAVE_ROOT) + os.makedirs(args.SAVE_ROOT) + + return args + +# Freeze batch normlisation layers +def set_bn_eval(m): + classname = m.__class__.__name__ + if classname.find('BatchNorm') > -1: + m.eval() + if m.affine: + 
m.weight.requires_grad = False + m.bias.requires_grad = False + + +def get_individual_labels(gt_boxes, tgt_labels): + # print(gt_boxes.shape, tgt_labels.shape) + new_gts = np.zeros((gt_boxes.shape[0]*20, 5)) + ccc = 0 + for n in range(tgt_labels.shape[0]): + for t in range(tgt_labels.shape[1]): + if tgt_labels[n,t]>0: + new_gts[ccc, :4] = gt_boxes[n,:] + new_gts[ccc, 4] = t + ccc += 1 + return new_gts[:ccc,:] + + +def get_individual_location_labels(gt_boxes, tgt_labels): + return [gt_boxes, tgt_labels] + + +def filter_detections(args, scores, decoded_boxes_batch): + c_mask = scores.gt(args.CONF_THRESH) # greater than minmum threshold + scores = scores[c_mask].squeeze() + if scores.dim() == 0 or scores.shape[0] == 0: + return np.asarray([]) + + boxes = decoded_boxes_batch[c_mask, :].view(-1, 4) + ids, counts = nms(boxes, scores, args.NMS_THRESH, args.TOPK*5) # idsn - ids after nms + scores = scores[ids[:min(args.TOPK,counts)]].cpu().numpy() + boxes = boxes[ids[:min(args.TOPK,counts)]].cpu().numpy() + cls_dets = np.hstack((boxes, scores[:, np.newaxis])).astype(np.float32, copy=True) + + return cls_dets + + +def filter_detections_for_tubing(args, scores, decoded_boxes_batch, confidences): + c_mask = scores.gt(args.CONF_THRESH) # greater than minmum threshold + scores = scores[c_mask].squeeze() + if scores.dim() == 0 or scores.shape[0] == 0: + return np.zeros((0,200)) + + boxes = decoded_boxes_batch[c_mask, :].clone().view(-1, 4) + numc = confidences.shape[-1] + confidences = confidences[c_mask,:].clone().view(-1, numc) + + max_k = min(args.TOPK*60, scores.shape[0]) + ids, counts = nms(boxes, scores, args.NMS_THRESH, max_k) # idsn - ids after nms + scores = scores[ids[:min(args.TOPK,counts)]].cpu().numpy() + boxes = boxes[ids[:min(args.TOPK,counts)],:].cpu().numpy() + confidences = confidences[ids[:min(args.TOPK, counts)],:].cpu().numpy() + cls_dets = np.hstack((boxes, scores[:, np.newaxis])).astype(np.float32, copy=True) + save_data = np.hstack((cls_dets, confidences[:,1:])).astype(np.float32) + #print(save_data.shape) + return save_data + + +def filter_detections_for_dumping(args, scores, decoded_boxes_batch, confidences): + c_mask = scores.gt(args.GEN_CONF_THRESH) # greater than minmum threshold + scores = scores[c_mask].squeeze() + if scores.dim() == 0 or scores.shape[0] == 0: + return np.zeros((0,5)), np.zeros((0,200)) + + boxes = decoded_boxes_batch[c_mask, :].clone().view(-1, 4) + numc = confidences.shape[-1] + confidences = confidences[c_mask,:].clone().view(-1, numc) + + # sorted_ind = np.argsort(-scores.cpu().numpy()) + # sorted_ind = sorted_ind[:topk*10] + # boxes_np = boxes.cpu().numpy() + # confidences_np = confidences.cpu().numpy() + # save_data = np.hstack((boxes_np[sorted_ind,:], confidences_np[sorted_ind, :])) + # args.GEN_TOPK, args.GEN_NMS + + max_k = min(args.GEN_TOPK*500, scores.shape[0]) + ids, counts = nms(boxes, scores, args.GEN_NMS, max_k) # idsn - ids after nms + # keepids = torchvision.ops.nms(boxes, scores, args.GEN_NMS) + # pdb.set_trace() + scores = scores[ids[:min(args.GEN_TOPK,counts)]].cpu().numpy() + boxes = boxes[ids[:min(args.GEN_TOPK,counts)],:].cpu().numpy() + confidences = confidences[ids[:min(args.GEN_TOPK, counts)],:].cpu().numpy() + cls_dets = np.hstack((boxes, scores[:, np.newaxis])).astype(np.float32, copy=True) + save_data = np.hstack((cls_dets, confidences[:,1:])).astype(np.float32) + #print(save_data.shape) + return cls_dets, save_data + +def make_joint_probs_from_marginals(frame_dets, childs, num_classes_list, start_id=4): + + # pdb.set_trace() + 
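+    # Illustrative note (assumed column layout, inferred from filter_detections_for_dumping above,
+    # not stated explicitly in the code): each row of frame_dets is laid out as
+    # [x1, y1, x2, y2, objectness, <agent scores>, <action scores>, <location scores>, <duplex scores>, <triplet scores>].
+    # add_list below records where the agent/action/location score blocks start; for every composite
+    # duplex/triplet class, the marginal scores of its child classes (given by childs['duplex_childs']
+    # and childs['triplet_childs']) are multiplied to approximate a joint score, which then overwrites
+    # that composite class's column.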
+ add_list = copy.deepcopy(num_classes_list[:3]) + add_list[0] = start_id+1 + add_list[1] = add_list[0]+add_list[1] + add_list[2] = add_list[1]+add_list[2] + # for ind in range(frame_dets.shape[0]): + for nlt, ltype in enumerate(['duplex','triplet']): + lchilds = childs[ltype+'_childs'] + lstart = start_id + for num in num_classes_list[:4+nlt]: + lstart += num + + for c in range(num_classes_list[4+nlt]): + tmp_scores = [] + for chid, ch in enumerate(lchilds[c]): + if len(tmp_scores)<1: + tmp_scores = copy.deepcopy(frame_dets[:,add_list[chid]+ch]) + else: + tmp_scores *= frame_dets[:,add_list[chid]+ch] + frame_dets[:,lstart+c] = tmp_scores + + return frame_dets + + + +def eval_strings(): + return ["Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = ", + "Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = ", + "Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = ", + "Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = ", + "Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = ", + "Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = ", + "Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = ", + "Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = ", + "Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = ", + "Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = ", + "Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = ", + "Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = "] diff --git a/train_tuber_road.py b/train_tuber_road.py new file mode 100644 index 0000000..083b652 --- /dev/null +++ b/train_tuber_road.py @@ -0,0 +1,104 @@ +import argparse +import datetime +import time + +import torch +import torch.optim +from tensorboardX import SummaryWriter + +from models.tuber_ava import build_model +from utils.model_utils import deploy_model, load_model, save_checkpoint +from utils.video_action_recognition import train_tuber_detection, validate_tuber_detection +from pipelines.video_action_recognition_config import get_cfg_defaults +from pipelines.launch import spawn_workers +from utils.utils import build_log_dir +from datasets.road_frames import build_dataloader +from utils.lr_scheduler import build_scheduler + + +def main_worker(cfg): + # create tensorboard and logs + if cfg.DDP_CONFIG.GPU_WORLD_RANK == 0: + tb_logdir = build_log_dir(cfg) + writer = SummaryWriter(log_dir=tb_logdir) + else: + writer = None + # cfg.freeze() + + # create model + print('Creating TubeR model: %s' % cfg.CONFIG.MODEL.NAME) + model, criterion, postprocessors = build_model(cfg) + model = deploy_model(model, cfg, is_tuber=True) + num_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) + print('Number of parameters in the model: %6.2fM' % (num_parameters / 1000000)) + + # create dataset and dataloader + train_loader, val_loader, train_sampler, val_sampler, mg_sampler = build_dataloader(cfg) + + + # create criterion + criterion = criterion.cuda() + + param_dicts = [ + {"params": [p for n, p in model.named_parameters() if "backbone" not in n and "class_embed" not in n and "query_embed" not in n and p.requires_grad]}, + { + "params": [p for n, p in model.named_parameters() if "backbone" in n and p.requires_grad], + "lr": cfg.CONFIG.TRAIN.LR_BACKBONE, + }, + { + "params": [p for n, p in model.named_parameters() if "class_embed" in n and p.requires_grad], + "lr": cfg.CONFIG.TRAIN.LR, #10 + }, + { + 
"params": [p for n, p in model.named_parameters() if "query_embed" in n and p.requires_grad], + "lr": cfg.CONFIG.TRAIN.LR, #10 + }, + ] + + # create optimizer + optimizer = torch.optim.AdamW(param_dicts, lr=cfg.CONFIG.TRAIN.LR, weight_decay=cfg.CONFIG.TRAIN.W_DECAY) + + # create lr scheduler + if cfg.CONFIG.TRAIN.LR_POLICY == "step": + lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[30,60], gamma=0.1) + else: + lr_scheduler = build_scheduler(cfg, optimizer, len(train_loader)) + + # docs: add resume option + if cfg.CONFIG.MODEL.LOAD: + model, _ = load_model(model, cfg, load_fc=cfg.CONFIG.MODEL.LOAD_FC) + + print('Start training...') + start_time = time.time() + max_accuracy = 0.0 + for epoch in range(cfg.CONFIG.TRAIN.START_EPOCH, cfg.CONFIG.TRAIN.EPOCH_NUM): + if cfg.DDP_CONFIG.DISTRIBUTED: + train_sampler.set_epoch(epoch) + + train_tuber_detection(cfg, model, criterion, train_loader, optimizer, epoch, cfg.CONFIG.LOSS_COFS.CLIPS_MAX_NORM, lr_scheduler, writer) + + if cfg.DDP_CONFIG.GPU_WORLD_RANK == 0 and ( + epoch % cfg.CONFIG.LOG.SAVE_FREQ == 0 or epoch == cfg.CONFIG.TRAIN.EPOCH_NUM - 1): + save_checkpoint(cfg, epoch, model, max_accuracy, optimizer, lr_scheduler) + + if epoch % cfg.CONFIG.VAL.FREQ == 0 or epoch == cfg.CONFIG.TRAIN.EPOCH_NUM - 1: + validate_tuber_detection(cfg, model, criterion, postprocessors, val_loader, epoch, writer) + + if writer is not None: + writer.close() + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('Training time {}'.format(total_time_str)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Train video action recognition transformer models.') + parser.add_argument('--config-file', + default='/xxx/TubeR_AVA_v2.1_CSN-152.yaml', + help='path to config file.') + args = parser.parse_args() + + cfg = get_cfg_defaults() + cfg.merge_from_file(args.config_file) + spawn_workers(main_worker, cfg) diff --git a/train_tuber_roadpp.py b/train_tuber_roadpp.py new file mode 100644 index 0000000..083b652 --- /dev/null +++ b/train_tuber_roadpp.py @@ -0,0 +1,104 @@ +import argparse +import datetime +import time + +import torch +import torch.optim +from tensorboardX import SummaryWriter + +from models.tuber_ava import build_model +from utils.model_utils import deploy_model, load_model, save_checkpoint +from utils.video_action_recognition import train_tuber_detection, validate_tuber_detection +from pipelines.video_action_recognition_config import get_cfg_defaults +from pipelines.launch import spawn_workers +from utils.utils import build_log_dir +from datasets.road_frames import build_dataloader +from utils.lr_scheduler import build_scheduler + + +def main_worker(cfg): + # create tensorboard and logs + if cfg.DDP_CONFIG.GPU_WORLD_RANK == 0: + tb_logdir = build_log_dir(cfg) + writer = SummaryWriter(log_dir=tb_logdir) + else: + writer = None + # cfg.freeze() + + # create model + print('Creating TubeR model: %s' % cfg.CONFIG.MODEL.NAME) + model, criterion, postprocessors = build_model(cfg) + model = deploy_model(model, cfg, is_tuber=True) + num_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) + print('Number of parameters in the model: %6.2fM' % (num_parameters / 1000000)) + + # create dataset and dataloader + train_loader, val_loader, train_sampler, val_sampler, mg_sampler = build_dataloader(cfg) + + + # create criterion + criterion = criterion.cuda() + + param_dicts = [ + {"params": [p for n, p in model.named_parameters() if 
"backbone" not in n and "class_embed" not in n and "query_embed" not in n and p.requires_grad]}, + { + "params": [p for n, p in model.named_parameters() if "backbone" in n and p.requires_grad], + "lr": cfg.CONFIG.TRAIN.LR_BACKBONE, + }, + { + "params": [p for n, p in model.named_parameters() if "class_embed" in n and p.requires_grad], + "lr": cfg.CONFIG.TRAIN.LR, #10 + }, + { + "params": [p for n, p in model.named_parameters() if "query_embed" in n and p.requires_grad], + "lr": cfg.CONFIG.TRAIN.LR, #10 + }, + ] + + # create optimizer + optimizer = torch.optim.AdamW(param_dicts, lr=cfg.CONFIG.TRAIN.LR, weight_decay=cfg.CONFIG.TRAIN.W_DECAY) + + # create lr scheduler + if cfg.CONFIG.TRAIN.LR_POLICY == "step": + lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[30,60], gamma=0.1) + else: + lr_scheduler = build_scheduler(cfg, optimizer, len(train_loader)) + + # docs: add resume option + if cfg.CONFIG.MODEL.LOAD: + model, _ = load_model(model, cfg, load_fc=cfg.CONFIG.MODEL.LOAD_FC) + + print('Start training...') + start_time = time.time() + max_accuracy = 0.0 + for epoch in range(cfg.CONFIG.TRAIN.START_EPOCH, cfg.CONFIG.TRAIN.EPOCH_NUM): + if cfg.DDP_CONFIG.DISTRIBUTED: + train_sampler.set_epoch(epoch) + + train_tuber_detection(cfg, model, criterion, train_loader, optimizer, epoch, cfg.CONFIG.LOSS_COFS.CLIPS_MAX_NORM, lr_scheduler, writer) + + if cfg.DDP_CONFIG.GPU_WORLD_RANK == 0 and ( + epoch % cfg.CONFIG.LOG.SAVE_FREQ == 0 or epoch == cfg.CONFIG.TRAIN.EPOCH_NUM - 1): + save_checkpoint(cfg, epoch, model, max_accuracy, optimizer, lr_scheduler) + + if epoch % cfg.CONFIG.VAL.FREQ == 0 or epoch == cfg.CONFIG.TRAIN.EPOCH_NUM - 1: + validate_tuber_detection(cfg, model, criterion, postprocessors, val_loader, epoch, writer) + + if writer is not None: + writer.close() + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('Training time {}'.format(total_time_str)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Train video action recognition transformer models.') + parser.add_argument('--config-file', + default='/xxx/TubeR_AVA_v2.1_CSN-152.yaml', + help='path to config file.') + args = parser.parse_args() + + cfg = get_cfg_defaults() + cfg.merge_from_file(args.config_file) + spawn_workers(main_worker, cfg) diff --git a/utils/video_action_recognition.py b/utils/video_action_recognition.py index 065f12a..20aa3e6 100644 --- a/utils/video_action_recognition.py +++ b/utils/video_action_recognition.py @@ -92,14 +92,18 @@ def train_tuber_detection(cfg, model, criterion, data_loader, optimizer, epoch, # metric_logger.add_meter('class_error', utils.SmoothedValue(window_size=1, fmt='{value:.2f}')) header = 'Epoch: [{}]'.format(epoch) print_freq = 10 - + skip_iter = False for idx, data in enumerate(data_loader): + # if idx > 10: + # break data_time.update(time.time() - end) # for samples, targets in metric_logger.log_every(data_loader, print_freq, epoch, ddp_params, writer, header): device = "cuda:" + str(cfg.DDP_CONFIG.GPU) samples = data[0] + + if cfg.CONFIG.TWO_STREAM: samples2 = data[1] targets = data[2] @@ -116,10 +120,23 @@ def train_tuber_detection(cfg, model, criterion, data_loader, optimizer, epoch, else: lfb_features = data[-1] lfb_features = lfb_features.to(device) + + + + # for target in targets: + # if len(target['boxes']) == 0: + # skip_iter = True + # break + # if skip_iter: + # print("skip iteration ...") + # skip_iter=False + # continue + for t in targets: del t["image_id"] 
samples = samples.to(device) targets = [{k: v.to(device) for k, v in t.items()} for t in targets] + if cfg.CONFIG.TWO_STREAM: if cfg.CONFIG.USE_LFB: @@ -190,7 +207,8 @@ def train_tuber_detection(cfg, model, criterion, data_loader, optimizer, epoch, class_err.update(loss_dict_reduced['class_error'], len(targets)) if cfg.CONFIG.MATCHER.BNY_LOSS: - losses_ce_b.update(loss_dict_reduced['loss_ce_b'].item(), len(targets)) + losses_ce_b.update(loss_dict_reduced['loss_ce_b'].item(), len(targets)) + if not math.isfinite(loss_value): print("Loss is {}, stopping training".format(loss_value)) @@ -254,6 +272,8 @@ def validate_tuber_detection(cfg, model, criterion, postprocessors, data_loader, print("all tmp files removed") for idx, data in enumerate(data_loader): + # if idx > 10: + # break data_time.update(time.time() - end) # for samples, targets in metric_logger.log_every(data_loader, print_freq, epoch, ddp_params, writer, header): @@ -302,6 +322,11 @@ def validate_tuber_detection(cfg, model, criterion, postprocessors, data_loader, else: outputs = model(samples) + # print(outputs) + # print(targets) + # print(rr) + + loss_dict = criterion(outputs, targets) weight_dict = criterion.weight_dict
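+        # Note (illustrative, DETR-style reduction assumed rather than shown in this hunk): criterion
+        # returns a dict of unweighted loss terms, and the scalar used for logging is typically formed as
+        #   losses = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict)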