diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..53f8173 --- /dev/null +++ b/.gitignore @@ -0,0 +1,12 @@ +cache/* +*.txt +datasets/assets/* +pretrained_weights/* + +*.pyc + +cache/* +cache_ROAD/* +cache_ROADpp/* + + diff --git a/configuration/TubeR_CSN152_AVA22.yaml b/configuration/TubeR_CSN152_AVA22.yaml index 9ae6f33..8990f6b 100644 --- a/configuration/TubeR_CSN152_AVA22.yaml +++ b/configuration/TubeR_CSN152_AVA22.yaml @@ -1,13 +1,13 @@ DDP_CONFIG: WORLD_SIZE: 1 WORLD_RANK: 0 - GPU_WORLD_SIZE: 8 + GPU_WORLD_SIZE: 4 GPU_WORLD_RANK: 0 - DIST_URL: 'tcp://xxx.xxx.xxx.xxx:xxxx' - WOLRD_URLS: ['xxx.xxx.xxx.xxx'] - AUTO_RANK_MATCH: True + DIST_URL: 'tcp://161.73.173.217:23456' + WOLRD_URLS: ['161.73.173.217'] + AUTO_RANK_MATCH: False DIST_BACKEND: 'nccl' - GPU: 0 + GPU: 4 DISTRIBUTED: True CONFIG: @@ -17,7 +17,7 @@ CONFIG: USE_LOCATION: False TRAIN: EPOCH_NUM: 20 - BATCH_SIZE: 2 + BATCH_SIZE: 1 LR: 1e-4 MIN_LR: 1e-5 LR_BACKBONE: 1e-5 @@ -39,9 +39,9 @@ CONFIG: DATA: DATASET_NAME: 'ava' - LABEL_PATH: '/xxx/datasets/ava_action_list_v2.1_for_activitynet_2018.pbtxt' - ANNO_PATH: '/xxx/datasets/ava_{}_v22.json' - DATA_PATH: '/xxx/ava/frames/{}/' + LABEL_PATH: 'datasets/assets/ava_action_list_v2.1_for_activitynet_2018.pbtxt' + ANNO_PATH: 'datasets/assets/ava_{}_v22.json' + DATA_PATH: '/mnt/pluto-theta/salman/ROAD/Datasets/ava_download/frames/{}/' NUM_CLASSES: 80 MULTIGRID: False IMG_SIZE: 256 @@ -69,9 +69,9 @@ CONFIG: TEMP_LEN: 32 SAMPLE_RATE: 2 PRETRAINED: False - PRETRAIN_BACKBONE_DIR: "/xxx/irCSN_152_ft_kinetics_from_ig65m_f126851907.mat" - PRETRAIN_TRANSFORMER_DIR: "/xxx/detr.pth" - PRETRAINED_PATH: "/xxx/ADTR_CSN_152_decode_ava_22.pth" + PRETRAIN_BACKBONE_DIR: "pretrained_weights/irCSN_152_ft_kinetics_from_ig65m_f126851907.mat" + PRETRAIN_TRANSFORMER_DIR: "pretrained_weights/detr.pth" + PRETRAINED_PATH: "pretrained_weights/ADTR_CSN_152_decode_ava_22.pth" LOAD: True LOAD_FC: True @@ -94,9 +94,9 @@ CONFIG: CLIPS_MAX_NORM: 0.1 LOG: - BASE_PATH: '/xxx/AVA_Tuber' + BASE_PATH: 'cache/AVA_Tuber' LOG_DIR: 'tb_log' SAVE_DIR: 'checkpoints' - EVAL_DIR: '/xxx/AVA_Tuber/eval' + EVAL_DIR: 'cache/AVA_Tuber/eval' SAVE_FREQ: 1 RES_DIR: 'tmp2' \ No newline at end of file diff --git a/configuration/TubeR_CSN152_ROAD.yaml b/configuration/TubeR_CSN152_ROAD.yaml new file mode 100644 index 0000000..4aa78f8 --- /dev/null +++ b/configuration/TubeR_CSN152_ROAD.yaml @@ -0,0 +1,116 @@ +DDP_CONFIG: + WORLD_SIZE: 1 + WORLD_RANK: 0 + GPU_WORLD_SIZE: 4 + GPU_WORLD_RANK: 0 + DIST_URL: 'tcp://161.73.173.217:23457' + WOLRD_URLS: ['161.73.173.217'] + AUTO_RANK_MATCH: False + DIST_BACKEND: 'nccl' + GPU: 4 + DISTRIBUTED: True + +CONFIG: + EVAL_ONLY: False + TWO_STREAM: False + USE_LFB: False + USE_LOCATION: False + TRAIN: + EPOCH_NUM: 20 + BATCH_SIZE: 1 + LR: 1e-4 + MIN_LR: 1e-5 + LR_BACKBONE: 1e-5 + MOMENTUM: 0.9 + W_DECAY: 1e-4 + LR_POLICY: 'step' + USE_WARMUP: False + WARMUP_START_LR: 1e-5 + WARMUP_EPOCHS: 4 + LR_MILESTONE: [10, 15] + STEP: 0.1 + OPTIMIZER: + NAME: SGD + AUX_LOSS: True + + VAL: + BATCH_SIZE: 1 + FREQ: 2 + + + DATA: + DATASET: 'road' + DATASET_NAME: 'ava' + TRAIN_SUBSETS: [train_3] + VAL_SUBSETS: [train_3] + SEQ_LEN: 12 + MIN_SEQ_STEP: 1 + MAX_SEQ_STEP: 1 + DATA_ROOT: '/mnt/pluto-gamma/salman/ROAD/Datasets/' + ANNO_ROOT: '/mnt/pluto-gamma/salman/ROAD/Datasets/' + train_skip_step: 1 + skip_step: 1 + + # DATASET_NAME: 'ava' + LABEL_PATH: 'datasets/road_labels.pbtxt' + # ANNO_PATH: 'datasets/assets/ava_{}_v22.json' + # DATA_PATH: '/mnt/pluto-gamma/salman/ROAD/Datasets/ava_download/frames/{}/' + 
NUM_CLASSES: 41 + MULTIGRID: False + IMG_SIZE: 680 + IMG_RESHAPE_SIZE: 512 + TEMP_LEN: 12 + FRAME_RATE: 2 + + + MODEL: + SINGLE_FRAME: True + BACKBONE_NAME: CSN-152 + TEMPORAL_DS_STRATEGY: decode + LAST_STRIDE: False + GENERATE_LFB: False + NAME: 'ava_detr_9_224' + ENC_LAYERS: 6 + DEC_LAYERS: 6 + D_MODEL: 256 + NHEAD: 8 + NUM_ENCODER_LAYERS: 12 + DIM_FEEDFORWARD: 2048 + QUERY_NUM: 15 + NORMALIZE_BEFORE: False + DROPOUT: 0.1 + DS_RATE: 8 + TEMP_LEN: 12 + SAMPLE_RATE: 2 + PRETRAINED: True + PRETRAIN_BACKBONE_DIR: "pretrained_weights/irCSN_152_ft_kinetics_from_ig65m_f126851907.mat" + PRETRAIN_TRANSFORMER_DIR: "pretrained_weights/detr.pth" + PRETRAINED_PATH: "pretrained_weights/_TubeR_CSN152_AVA22.pth" + LOAD: True + LOAD_FC: True + + MATCHER: + COST_CLASS: 12 + COST_BBOX: 5 + COST_GIOU: 2 + BNY_LOSS: True + BEFORE: False + + LOSS_COFS: + MASK_COF: 1 + DICE_COF: 12 + BBOX_COF: 5 + GIOU_COF: 2 + EOS_COF: 0.1 + WEIGHT: 10 + WEIGHT_CHANGE: 1000 + LOSS_CHANGE_COF: 2 + CLIPS_MAX_NORM: 0.1 + + LOG: + BASE_PATH: 'cache_ROAD/AVA_Tuber' + LOG_DIR: 'tb_log' + SAVE_DIR: 'checkpoints' + EVAL_DIR: 'cache_ROAD/AVA_Tuber/eval' + SAVE_FREQ: 1 + RES_DIR: 'tmp2' \ No newline at end of file diff --git a/configuration/TubeR_CSN152_ROADpp.yaml b/configuration/TubeR_CSN152_ROADpp.yaml new file mode 100644 index 0000000..d319f57 --- /dev/null +++ b/configuration/TubeR_CSN152_ROADpp.yaml @@ -0,0 +1,117 @@ +DDP_CONFIG: + WORLD_SIZE: 1 + WORLD_RANK: 0 + GPU_WORLD_SIZE: 4 + GPU_WORLD_RANK: 0 + DIST_URL: 'tcp://161.73.173.213:23457' + WOLRD_URLS: ['161.73.173.213'] + AUTO_RANK_MATCH: False + DIST_BACKEND: 'nccl' + GPU: 4 + DISTRIBUTED: True + +CONFIG: + EVAL_ONLY: False + TWO_STREAM: False + USE_LFB: False + USE_LOCATION: False + TRAIN: + EPOCH_NUM: 20 + BATCH_SIZE: 1 + LR: 1e-4 + MIN_LR: 1e-5 + LR_BACKBONE: 1e-5 + MOMENTUM: 0.9 + W_DECAY: 1e-4 + LR_POLICY: 'step' + USE_WARMUP: False + WARMUP_START_LR: 1e-5 + WARMUP_EPOCHS: 4 + LR_MILESTONE: [10, 15] + STEP: 0.1 + OPTIMIZER: + NAME: SGD + AUX_LOSS: True + + VAL: + BATCH_SIZE: 1 + FREQ: 2 + + + DATA: + DATASET: 'roadpp' + DATASET_NAME: 'ava' + TRAIN_SUBSETS: [train] + VAL_SUBSETS: [val] + SEQ_LEN: 10 + MIN_SEQ_STEP: 1 + MAX_SEQ_STEP: 1 + DATA_ROOT: '../' + ANNO_ROOT: '../' + train_skip_step: 1 + skip_step: 1 + + # DATASET_NAME: 'ava' + LABEL_PATH: 'datasets/roadpp.pbtxt' + # ANNO_PATH: 'datasets/assets/ava_{}_v22.json' + # DATA_PATH: '/mnt/pluto-gamma/salman/ROAD/Datasets/ava_download/frames/{}/' + NUM_CLASSES: 43 + MULTIGRID: False + IMG_SIZE: 680 + IMG_RESHAPE_SIZE: 512 + TEMP_LEN: 10 + FRAME_RATE: 10 + + + MODEL: + SINGLE_FRAME: True + BACKBONE_NAME: CSN-152 + TEMPORAL_DS_STRATEGY: decode + LAST_STRIDE: False + GENERATE_LFB: False + NAME: 'ava_detr_9_224' + ENC_LAYERS: 6 + DEC_LAYERS: 6 + D_MODEL: 256 + NHEAD: 8 + NUM_ENCODER_LAYERS: 12 + DIM_FEEDFORWARD: 2048 + QUERY_NUM: 15 + NORMALIZE_BEFORE: False + DROPOUT: 0.1 + DS_RATE: 8 + TEMP_LEN: 12 + SAMPLE_RATE: 2 + PRETRAINED: True + PRETRAIN_BACKBONE_DIR: "pretrained_weights/irCSN_152_ft_kinetics_from_ig65m_f126851907.mat" + PRETRAIN_TRANSFORMER_DIR: "pretrained_weights/detr.pth" + PRETRAINED_PATH: "-.pth" + # PRETRAINED_PATH: "cache_ROADpp/roadpp_Tuber/2023-04-08-23-36-21/checkpoints/ckpt_epoch_6.pth" + LOAD: True + LOAD_FC: True + + MATCHER: + COST_CLASS: 12 + COST_BBOX: 5 + COST_GIOU: 2 + BNY_LOSS: True + BEFORE: False + + LOSS_COFS: + MASK_COF: 1 + DICE_COF: 12 + BBOX_COF: 5 + GIOU_COF: 2 + EOS_COF: 0.1 + WEIGHT: 10 + WEIGHT_CHANGE: 1000 + LOSS_CHANGE_COF: 2 + CLIPS_MAX_NORM: 0.1 + + LOG: + BASE_PATH: 
'cache_ROADpp/roadpp_Tuber' + LOG_DIR: 'tb_log' + SAVE_DIR: 'checkpoints' + EVAL_DIR: 'cache_ROADpp/roadpp_Tuber/eval' + SAVE_FREQ: 1 + RES_DIR: 'tmp2' \ No newline at end of file diff --git a/datasets/ava_frame.py b/datasets/ava_frame.py index 3b322dc..30b87d3 100644 --- a/datasets/ava_frame.py +++ b/datasets/ava_frame.py @@ -71,6 +71,12 @@ def __getitem__(self, index): imgs = torch.stack(imgs, dim=0) imgs = imgs.permute(1, 0, 2, 3) + print('img',imgs.shape) + print('tar',target) + print('tar shape',target.shape) + print(rr) + + return imgs, target def load_annotation(self, sample_id, video_frame_list): @@ -131,7 +137,7 @@ def load_annotation(self, sample_id, video_frame_list): return target def loadvideo(self, start_img, vid, frame_key): - video_frame_path = self.frame_path + video_frame_path = self.frame_path.format(vid) video_frame_list = sorted(glob(video_frame_path + '/*.jpg')) if len(video_frame_list) == 0: diff --git a/datasets/road_frames.py b/datasets/road_frames.py new file mode 100644 index 0000000..f0ad0cd --- /dev/null +++ b/datasets/road_frames.py @@ -0,0 +1,1231 @@ +import pandas as pd +import cv2 +import torch.utils.data as data +from glob import glob +import numpy as np +from utils.misc import collate_fn +import torch +import random +from PIL import Image +import torch.nn.functional as F +import datasets.video_transforms as T +import json + + +# class VideoDataset(data.Dataset): + +# def __init__(self, frame_path, video_frame_bbox, frame_keys_list, clip_len, frame_sample_rate, +# transforms, crop_size=224, resize_size=256, mode="train", class_num=80): +# self.video_frame_bbox = video_frame_bbox +# self.video_frame_list = frame_keys_list +# self.frame_path = frame_path + +# self.video_frame_list = self.video_frame_list + +# self.crop_size = crop_size +# self.clip_len = clip_len +# self.frame_sample_rate = frame_sample_rate +# self.class_num = class_num +# self.resize_size = resize_size + +# self.index_cnt = 0 +# self._transforms = transforms +# self.mode = mode + +# print("rescale size: {}, crop size: {}".format(resize_size, crop_size)) + +# def __getitem__(self, index): + +# frame_key = self.video_frame_list[index] +# print(frame_key) + + +# vid, frame_second = frame_key.split(",") +# timef = int(frame_second) - 900 + +# start_img = np.max((timef * 30 - self.clip_len // 2 * self.frame_sample_rate, 0)) + +# imgs, target = self.loadvideo(start_img, vid, frame_key) + +# if len(target) == 0 or target['boxes'].shape[0] == 0: +# pass +# else: +# if self._transforms is not None: +# imgs, target = self._transforms(imgs, target) + +# while len(target) == 0 or target['boxes'].shape[0] == 0: +# print('resample.') +# self.index_cnt -= 1 +# index = np.random.randint(len(self.video_frame_list)) +# frame_key = self.video_frame_list[index] +# vid, frame_second = frame_key.split(",") +# timef = int(frame_second) - 900 + +# start_img = np.max((timef * 30 - self.clip_len // 2 * self.frame_sample_rate, 0)) + +# imgs, target = self.loadvideo(start_img, vid, frame_key) + +# if len(target)==0 or target['boxes'].shape[0] == 0: +# pass +# else: +# if self._transforms is not None: +# imgs, target = self._transforms(imgs, target) + +# imgs = torch.stack(imgs, dim=0) +# imgs = imgs.permute(1, 0, 2, 3) + + +# print(imgs.shape) + +# print(target['image_id']) +# print(target['boxes']) +# print(target['raw_boxes']) +# print(target['labels']) +# print(target['size']) +# print(target['orig_size']) +# print(target['area']) + +# print(rr) +# return imgs, target + +# def load_annotation(self, sample_id, 
video_frame_list): + +# num_classes = self.class_num +# boxes, classes = [], [] +# target = {} + +# first_img = cv2.imread(video_frame_list[0]) + +# oh = first_img.shape[0] +# ow = first_img.shape[1] +# if oh <= ow: +# nh = self.resize_size +# nw = self.resize_size * (ow / oh) +# else: +# nw = self.resize_size +# nh = self.resize_size * (oh / ow) + +# p_t = int(self.clip_len // 2) +# key_pos = p_t +# anno_entity = self.video_frame_bbox[sample_id] + +# for i, bbox in enumerate(anno_entity["bboxes"]): +# label_tmp = np.zeros((num_classes, )) +# acts_p = anno_entity["acts"][i] +# for l in acts_p: +# label_tmp[l] = 1 + +# if np.sum(label_tmp) == 0: continue +# p_x = np.int(bbox[0] * nw) +# p_y = np.int(bbox[1] * nh) +# p_w = np.int(bbox[2] * nw) +# p_h = np.int(bbox[3] * nh) + +# boxes.append([p_t, p_x, p_y, p_w, p_h]) +# classes.append(label_tmp) + +# boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 5) +# boxes[:, 1::3].clamp_(min=0, max=int(nw)) +# boxes[:, 2::3].clamp_(min=0, max=nh) + +# if boxes.shape[0]: +# raw_boxes = F.pad(boxes, (1, 0, 0, 0), value=self.index_cnt) +# else: +# raw_boxes = boxes + +# classes = torch.as_tensor(classes, dtype=torch.float32).reshape(-1, num_classes) + +# target["image_id"] = [str(sample_id).replace(",", "_"), key_pos] +# target['boxes'] = boxes +# target['raw_boxes'] = raw_boxes +# target["labels"] = classes +# target["orig_size"] = torch.as_tensor([int(nh), int(nw)]) +# target["size"] = torch.as_tensor([int(nh), int(nw)]) +# self.index_cnt = self.index_cnt + 1 + +# return target + +# def loadvideo(self, start_img, vid, frame_key): +# video_frame_path = self.frame_path.format(vid) +# video_frame_list = sorted(glob(video_frame_path + '/*.jpg')) + +# if len(video_frame_list) == 0: +# print("path doesnt exist", video_frame_path) +# return [], [] + +# target = self.load_annotation(frame_key, video_frame_list) + +# start_img = np.max(start_img, 0) +# end_img = start_img + self.clip_len * self.frame_sample_rate +# indx_img = list(np.clip(range(start_img, end_img, self.frame_sample_rate), 0, len(video_frame_list) - 1)) +# buffer = [] +# for frame_idx in indx_img: +# tmp = Image.open(video_frame_list[frame_idx]) +# tmp = tmp.resize((target['orig_size'][1], target['orig_size'][0])) +# buffer.append(tmp) + +# return buffer, target + +# def __len__(self): +# return len(self.video_frame_list) + + +def make_transforms(image_set, cfg): + normalize = T.Compose([ + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + print("transform image crop: {}".format(cfg.CONFIG.DATA.IMG_SIZE)) + if image_set == 'train': + return T.Compose([ + T.RandomHorizontalFlip(), + T.RandomSizeCrop_Custom(cfg.CONFIG.DATA.IMG_SIZE), + T.ColorJitter(), + normalize, + ]) + + if image_set == 'val': + return T.Compose([ + T.Resize_Custom(cfg.CONFIG.DATA.IMG_SIZE), + normalize, + ]) + + if image_set == 'visual': + return T.Compose([ + T.Resize_Custom(cfg.CONFIG.DATA.IMG_SIZE), + normalize, + ]) + raise ValueError(f'unknown {image_set}') + +# def obtain_generated_bboxes_training(input_csv="/xxx/AVA_v2.2/ava_{}_v2.2.csv", +# eval_only=False, +# frame_root="/xxx/frames", +# mode="train"): +# import os +# from glob import glob +# used=[] +# input_csv = input_csv.format(mode) +# # frame_root = frame_root.format(mode) + +# video_frame_bbox = {} +# gt_sheet = pd.read_csv(input_csv, header=None) +# count = 0 +# frame_keys_list = set() +# missed_videos = set() + +# for index, row in gt_sheet.iterrows(): +# vid = row[0] +# if not os.path.isdir(frame_root + "/" + vid + 
""): +# missed_videos.add(vid) +# continue + +# frame_second = row[1] + +# bbox_conf = row[7] +# if bbox_conf < 0.8: +# continue +# frame_key = "{},{}".format(vid, str(frame_second).zfill(4)) + +# frame_keys_list.add(frame_key) + +# count += 1 +# bbox = [row[2], row[3], row[4], row[5]] +# gt = int(row[6]) + +# if frame_key not in video_frame_bbox.keys(): +# video_frame_bbox[frame_key] = {} +# video_frame_bbox[frame_key]["bboxes"] = [bbox] +# video_frame_bbox[frame_key]["acts"] = [[gt - 1]] +# else: +# if bbox not in video_frame_bbox[frame_key]["bboxes"]: +# video_frame_bbox[frame_key]["bboxes"].append(bbox) +# video_frame_bbox[frame_key]["acts"].append([gt - 1]) +# else: +# idx = video_frame_bbox[frame_key]["bboxes"].index(bbox) +# video_frame_bbox[frame_key]["acts"][idx].append(gt - 1) + +# print("missed vids:") +# print(missed_videos) +# return video_frame_bbox, list(frame_keys_list) + + +# def make_image_key(video_id, timestamp): +# """Returns a unique identifier for a video id & timestamp.""" +# return "%s,%04d" % (video_id, int(timestamp)) + + + + + + + + + + + + + + + + +""" + +Target is in xmin, ymin, xmax, ymax, label +coordinates are in range of [0, 1] normlised height and width + +""" + +import json, os +import torch +import pdb, time +import torch.utils as tutils +import pickle +# from .transforms import get_clip_list_resized +import torch.nn.functional as F +import numpy as np +from PIL import ImageFile +ImageFile.LOAD_TRUNCATED_IMAGES = True +from PIL import Image, ImageDraw +from modules.tube_helper import make_gt_tube +import random as random +from modules import utils +from random import shuffle + +logger = utils.get_logger(__name__) + + +def make_box_anno(llist): + box = [llist[2], llist[3], llist[4], llist[5]] + return [float(b) for b in box] + + +def read_ava_annotations(anno_file): + # print(anno_file) + lines = open(anno_file, 'r').readlines() + annotations = {} + is_train = anno_file.find('train') > -1 + + cc = 0 + for line in lines: + cc += 1 + # if cc>500: + # break + line = line.rstrip('\n') + line_list = line.split(',') + # print(line_list) + video_name = line_list[0] + if video_name not in annotations: + annotations[video_name] = {} + time_stamp = float(line_list[1]) + # print(line_list) + numf = float(line_list[7]) ## or score + ts = str(int(time_stamp)) + if len(line_list) > 2: + box = make_box_anno(line_list) + label = int(line_list[6]) + if ts not in annotations[video_name]: + annotations[video_name][ts] = [[time_stamp, box, label, numf]] + else: + annotations[video_name][ts] += [[time_stamp, box, label, numf]] + elif not is_train: + if video_name not in annotations: + annotations[video_name][ts] = [[time_stamp, None, None, numf]] + else: + annotations[video_name][ts] += [[time_stamp, None, None, numf]] + + # for video_name in annotations: + # print(video_name) + return annotations + + + +def read_labelmap(labelmap_file): + """Read label map and class ids.""" + + labelmap = {} + class_ids_map = {} + name = "" + class_id = "" + class_names = [] + print('load label map from ', labelmap_file) + count = 0 + with open(labelmap_file, "r") as f: + for line in f: + # print(line) + if line.startswith(" name:"): + name = line.split('"')[1] + elif line.startswith(" id:") or line.startswith(" label_id:"): + class_id = int(line.strip().split(" ")[-1]) + labelmap[name] = {'org_id':class_id, 'used_id': count} + class_ids_map[class_id] = {'used_id':count, 'clsname': name} + count += 1 + # print(class_id, name) + class_names.append(name) + + # class_names[0] + print('NUmber 
of classes are ', count) + + return class_names, class_ids_map, labelmap + + +def get_box(box, counts): + box = box.astype(np.float32) - 1 + box[2] += box[0] #convert width to xmax + box[3] += box[1] #converst height to ymax + for bi in range(4): + scale = 320 if bi % 2 == 0 else 240 + box[bi] /= scale + assert 0<=box[bi]<=1.01, box + # if add_one ==0: + box[bi] = min(1.0, max(0, box[bi])) + if counts is None: + box[bi] = box[bi]*682 if bi % 2 == 0 else box[bi]*512 + + return box, counts + +def get_frame_level_annos_ucf24(annotations, numf, num_classes, counts=None): + frame_level_annos = [ {'labeled':True,'ego_label':0,'boxes':[],'labels':[]} for _ in range(numf)] + add_one = 1 + # if num_classes == 24: + # add_one = 0 + for tubeid, tube in enumerate(annotations): + # print('numf00', numf, tube['sf'], tube['ef']) + for frame_index, frame_num in enumerate(np.arange(tube['sf'], tube['ef'], 1)): # start of the tube to end frame of the tube + label = tube['label'] + # assert action_id == label, 'Tube label and video label should be same' + box, counts = get_box(tube['boxes'][frame_index, :].copy(), counts) # get the box as an array + frame_level_annos[frame_num]['boxes'].append(box) + box_labels = np.zeros(num_classes) + # if add_one == 1: + box_labels[0] = 1 + box_labels[label+add_one] = 1 + frame_level_annos[frame_num]['labels'].append(box_labels) + frame_level_annos[frame_num]['ego_label'] = label+1 + # frame_level_annos[frame_index]['ego_label'][] = 1 + if counts is not None: + counts[0,0] += 1 + counts[label,1] += 1 + + return frame_level_annos, counts + + +def get_frame_level_annos_ava(annotations, numf, num_classes, class_ids_map, counts=None, split='val'): + frame_level_annos = [ {'labeled':False,'ego_label':-1,'boxes':[],'labels':[]} for _ in range(numf)] + + keyframes = [] + skip_count = 0 + timestamps = [ str(i) for i in range(902, 1799)] + + if split == 'train': + timestamps = [ts for ts in annotations] + + for ts in timestamps: + boxes = {} + time_stamp = int(ts) + frame_num = int((time_stamp - 900) * 30 + 1) + + if ts in annotations: + # pdb.set_trace() + assert time_stamp == int(annotations[ts][0][0]) + + for anno in annotations[ts]: + box_key = '_'.join('{:0.3f}'.format(b) for b in anno[1]) + assert 80>=anno[2]>=1, 'label should be between 1 and 80 but it is {} '.format(anno[2]) + if anno[2] not in class_ids_map: + skip_count += 1 + continue + + class_id = class_ids_map[anno[2]]['used_id'] + # print(class_id) + if box_key not in boxes: + boxes[box_key] = {'box':anno[1], 'labels':np.zeros(num_classes)} + + boxes[box_key]['labels'][class_id+1] = 1 + boxes[box_key]['labels'][0] = 1 + counts[class_id,1] += 1 + + new_boxes = [] + labels = [] + for box_key in boxes: + new_boxes.append(boxes[box_key]['box']) + labels.append(boxes[box_key]['labels']) + + if len(new_boxes): + new_boxes = np.asarray(new_boxes) + frame_level_annos[frame_num]['boxes'] = new_boxes + + labels = np.asarray(labels) + frame_level_annos[frame_num]['labels'] = labels + + frame_level_annos[frame_num]['labeled'] = True + frame_level_annos[frame_num]['ego_label'] = 1 + + + keyframes.append(frame_num) + if not frame_level_annos[frame_num]['labeled']: + frame_level_annos[frame_num]['ego_label'] = 0 + + return frame_level_annos, counts, keyframes, skip_count + + +def get_filtered_tubes_ucf24(annotations): + filtered_tubes = [] + for tubeid, tube in enumerate(annotations): + frames = [] + boxes = [] + label = tube['label'] + count = 0 + for frame_index, frame_num in enumerate(np.arange(tube['sf'], tube['ef'], 1)): + 
frames.append(frame_num+1) + box, _ = get_box(tube['boxes'][frame_index, :].copy(), None) + boxes.append(box) + count += 1 + assert count == tube['boxes'].shape[0], 'numb: {} count ={}'.format(tube['boxes'].shape[0], count) + temp_tube = make_gt_tube(frames, boxes, label) + filtered_tubes.append(temp_tube) + return filtered_tubes + + +def resize(image, size): + image = F.interpolate(image.unsqueeze(0), size=size, mode="nearest").squeeze(0) + return image + + +def filter_labels(ids, all_labels, used_labels): + """Filter the used ids""" + used_ids = [] + for id in ids: + label = all_labels[id] + if label in used_labels: + used_ids.append(used_labels.index(label)) + + return used_ids + + +def get_gt_video_list(anno_file, SUBSETS): + """Get video list form ground truth videos used in subset + and their ground truth tubes """ + + with open(anno_file, 'r') as fff: + final_annots = json.load(fff) + + video_list = [] + for videoname in final_annots['db']: + if is_part_of_subsets(final_annots['db'][videoname]['split_ids'], SUBSETS): + video_list.append(videoname) + + return video_list + + +def get_filtered_tubes(label_key, final_annots, videoname): + + key_tubes = final_annots['db'][videoname][label_key] + all_labels = final_annots['all_'+label_key.replace('tubes','labels')] + labels = final_annots[label_key.replace('tubes','labels')] + filtered_tubes = [] + for _ , tube in key_tubes.items(): + label_id = tube['label_id'] + label = all_labels[label_id] + if label in labels: + new_label_id = labels.index(label) + # temp_tube = GtTube(new_label_id) + frames = [] + boxes = [] + if 'annos' in tube.keys(): + for fn, anno_id in tube['annos'].items(): + frames.append(int(fn)) + anno = final_annots['db'][videoname]['frames'][fn]['annos'][anno_id] + box = anno['box'].copy() + for bi in range(4): + assert 0<=box[bi]<=1.01, box + box[bi] = min(1.0, max(0, box[bi])) + box[bi] = box[bi]*682 if bi % 2 == 0 else box[bi]*512 + boxes.append(box) + else: + for fn in tube['frames']: + frames.append(int(fn)) + + temp_tube = make_gt_tube(frames, boxes, new_label_id) + filtered_tubes.append(temp_tube) + + return filtered_tubes + + +def get_filtered_frames(label_key, final_annots, videoname, filtered_gts): + + frames = final_annots['db'][videoname]['frames'] + if label_key == 'agent_ness': + all_labels = [] + labels = [] + else: + all_labels = final_annots['all_'+label_key+'_labels'] + labels = final_annots[label_key+'_labels'] + + for frame_id , frame in frames.items(): + frame_name = '{:05d}'.format(int(frame_id)) + if frame['annotated']>0: + all_boxes = [] + if 'annos' in frame: + frame_annos = frame['annos'] + for key in frame_annos: + anno = frame_annos[key] + box = np.asarray(anno['box'].copy()) + for bi in range(4): + assert 0<=box[bi]<=1.01, box + box[bi] = min(1.0, max(0, box[bi])) + box[bi] = box[bi]*682 if bi % 2 == 0 else box[bi]*512 + if label_key == 'agent_ness': + filtered_ids = [0] + else: + filtered_ids = filter_labels(anno[label_key+'_ids'], all_labels, labels) + + if len(filtered_ids)>0: + all_boxes.append([box, filtered_ids]) + + filtered_gts[videoname+frame_name] = all_boxes + + return filtered_gts + +def get_av_actions(final_annots, videoname): + label_key = 'av_action' + frames = final_annots['db'][videoname]['frames'] + all_labels = final_annots['all_'+label_key+'_labels'] + labels = final_annots[label_key+'_labels'] + + filtered_gts = {} + for frame_id , frame in frames.items(): + frame_name = '{:05d}'.format(int(frame_id)) + if frame['annotated']>0: + gts = filter_labels(frame[label_key+'_ids'], 
all_labels, labels) + filtered_gts[videoname+frame_name] = gts + + return filtered_gts + +def get_video_tubes(final_annots, videoname): + + tubes = {} + for key in final_annots['db'][videoname].keys(): + if key.endswith('tubes'): + filtered_tubes = get_filtered_tubes(key, final_annots, videoname) + tubes[key] = filtered_tubes + + return tubes + + +def is_part_of_subsets(split_ids, SUBSETS): + + is_it = False + for subset in SUBSETS: + + if subset in split_ids: + is_it = True + + return is_it + + +class VideoDataset(tutils.data.Dataset): + """ + ROAD Detection dataset class for pytorch dataloader + """ + + def __init__(self, args, train=True, input_type='rgb', transform=None, + skip_step=1, full_test=False,crop_size=224, resize_size=256): + + self.num_of_classes = args.CONFIG.DATA.NUM_CLASSES + self.DATASET = args.CONFIG.DATA.DATASET + if train == True: + self.SUBSETS = args.CONFIG.DATA.TRAIN_SUBSETS + else: + self.SUBSETS = args.CONFIG.DATA.VAL_SUBSETS + + self.SEQ_LEN = args.CONFIG.DATA.SEQ_LEN + self.index_cnt = 0 + self.MIN_SEQ_STEP = args.CONFIG.DATA.MIN_SEQ_STEP + self.MAX_SEQ_STEP = args.CONFIG.DATA.MAX_SEQ_STEP + # self.MULIT_SCALE = args.MULIT_SCALE + self.full_test = full_test + self.skip_step = skip_step #max(skip_step, self.SEQ_LEN*self.MIN_SEQ_STEP/2) + self.num_steps = max(1, int(self.MAX_SEQ_STEP - self.MIN_SEQ_STEP + 1 )//2) + # self.input_type = input_type + self.input_type = input_type+'-images' + self.train = train + self.root = args.CONFIG.DATA.DATA_ROOT + args.CONFIG.DATA.DATASET + '/' + self._imgpath = os.path.join(self.root, self.input_type) + self.anno_root = self.root + if len(args.CONFIG.DATA.ANNO_ROOT)>1: + self.anno_root = args.CONFIG.DATA.ANNO_ROOT + + self.crop_size = crop_size + self.resize_size = resize_size + + + # self.image_sets = image_sets + self._transforms = transform + self.ids = list() + if self.DATASET == 'road': + self._make_lists_road() + elif self.DATASET == 'roadpp': + self._make_lists_roadpp() + elif self.DATASET == 'ucf24': + self._make_lists_ucf24() + else: + raise Exception('Specfiy corect dataset') + + self.num_label_types = len(self.label_types) + + + + + def _make_lists_ucf24(self): + + self.anno_file = os.path.join(self.anno_root, 'pyannot_with_class_names.pkl') + + with open(self.anno_file,'rb') as fff: + final_annots = pickle.load(fff) + + database = final_annots['db'] + self.trainvideos = final_annots['trainvideos'] + ucf_classes = final_annots['classes'] + self.label_types = ['action_ness', 'action'] # + # pdb.set_trace() + self.num_classes_list = [1, 24] + self.num_classes = 25 # one for action_ness + + self.ego_classes = ['Non_action'] + ucf_classes + self.num_ego_classes = len(self.ego_classes) + + counts = np.zeros((24, 2), dtype=np.int32) + + ratios = [1.0, 1.1, 1.1, 0.9, 1.1, 0.8, 0.7, 0.8, 1.1, 1.4, 1.0, 0.8, 0.7, 1.2, 1.0, 0.8, 0.7, 1.2, 1.2, 1.0, 0.9] + + self.video_list = [] + self.numf_list = [] + + frame_level_list = [] + + default_ego_label = np.zeros(self.num_ego_classes) + default_ego_label[0] = 1 + total_labeled_frame = 0 + total_num_frames = 0 + + for videoname in sorted(database.keys()): + is_part = 1 + if 'train' in self.SUBSETS and videoname not in self.trainvideos: + continue + elif 'test' in self.SUBSETS and videoname in self.trainvideos: + continue + # print(database[videoname].keys()) + action_id = database[videoname]['label'] + annotations = database[videoname]['annotations'] + + numf = database[videoname]['numf'] + self.numf_list.append(numf) + self.video_list.append(videoname) + + # frames = 
database[videoname]['frames'] + + frame_level_annos, counts = get_frame_level_annos_ucf24(annotations, numf, self.num_classes, counts) + + frames_with_boxes = 0 + for frame_index in range(numf): #frame_level_annos: + if len(frame_level_annos[frame_index]['labels'])>0: + frames_with_boxes += 1 + frame_level_annos[frame_index]['labels'] = np.asarray(frame_level_annos[frame_index]['labels'], dtype=np.float32) + frame_level_annos[frame_index]['boxes'] = np.asarray(frame_level_annos[frame_index]['boxes'], dtype=np.float32) + + total_labeled_frame += frames_with_boxes + total_num_frames += numf + + # logger.info('Frames with Boxes are {:d} out of {:d} in {:s}'.format(frames_with_boxes, numf, videoname)) + frame_level_list.append(frame_level_annos) + ## make ids + start_frames = [ f for f in range(numf-self.MIN_SEQ_STEP*self.SEQ_LEN, -1, -self.skip_step)] + + if self.full_test and 0 not in start_frames: + start_frames.append(0) + # logger.info('number of start frames: '+ str(len(start_frames))) + for frame_num in start_frames: + step_list = [s for s in range(self.MIN_SEQ_STEP, self.MAX_SEQ_STEP+1) if numf-s*self.SEQ_LEN>=frame_num] + shuffle(step_list) + # print(len(step_list), self.num_steps) + for s in range(min(self.num_steps, len(step_list))): + video_id = self.video_list.index(videoname) + self.ids.append([video_id, frame_num ,step_list[s]]) + + logger.info('Labeled frames {:d}/{:d}'.format(total_labeled_frame, total_num_frames)) + # pdb.set_trace() + ptrstr = '\n' + self.frame_level_list = frame_level_list + self.all_classes = [['action_ness'], ucf_classes.copy()] + for k, name in enumerate(self.label_types): + labels = self.all_classes[k] + # self.num_classes_list.append(len(labels)) + for c, cls_ in enumerate(labels): # just to see the distribution of train and test sets + ptrstr += '-'.join(self.SUBSETS) + ' {:05d} label: ind={:02d} name:{:s}\n'.format( + counts[c,k] , c, cls_) + + ptrstr += 'Number of ids are {:d}\n'.format(len(self.ids)) + ptrstr += 'Labeled frames {:d}/{:d}'.format(total_labeled_frame, total_num_frames) + self.childs = {} + self.num_videos = len(self.video_list) + self.print_str = ptrstr + + + def _make_lists_roadpp(self): + + # if self.MODE =='train': + # self.anno_file = os.path.join(self.root, 'road_plus_plus_trainval_v1.0.json') + # else: + # self.anno_file = os.path.join(self.root, 'road_plus_plus_test_v1.0.json') + + self.anno_file = os.path.join(self.root, 'road_plus_plus_trainval_v1.0.json') + with open(self.anno_file,'r') as fff: + final_annots = json.load(fff) + + database = final_annots['db'] + + # self.label_types = final_annots['label_types'] #['agent', 'action', 'loc', 'duplex', 'triplet'] # + self.label_types = ['agent', 'action', 'loc'] # + # print(self.label_types) + # print(rr) + + num_label_type = len(self.label_types) + self.num_classes = 1 ## one for presence + self.num_classes_list = [1] + for name in self.label_types: + logger.info('Number of {:s}: all :: {:d} to use: {:d}'.format(name, + len(final_annots['all_'+name+'_labels']),len(final_annots[name+'_labels']))) + numc = len(final_annots[name+'_labels']) + self.num_classes_list.append(numc) + self.num_classes += numc + + self.ego_classes = final_annots['av_action_labels'] + self.num_ego_classes = len(self.ego_classes) + + # counts = np.zeros((len(final_annots[self.label_types[-1] + '_labels']), num_label_type), dtype=np.int32) + counts = np.zeros((len(final_annots[self.label_types[0] + '_labels']) + len(final_annots[self.label_types[1] + '_labels']) +len(final_annots[self.label_types[2] + 
'_labels']) , num_label_type), dtype=np.int32) + + + self.video_list = [] + self.numf_list = [] + frame_level_list = [] + + for videoname in sorted(database.keys()): + # print(is_part_of_subsets(final_annots['db'][videoname]['split_ids'], self.SUBSETS)) + if not is_part_of_subsets(final_annots['db'][videoname]['split_ids'], self.SUBSETS): + continue + + numf = database[videoname]['numf'] + self.numf_list.append(numf) + self.video_list.append(videoname) + + frames = database[videoname]['frames'] + # print(numf) + frame_level_annos = [ {'labeled':False,'ego_label':-1,'boxes':np.asarray([]),'labels':np.asarray([])} for _ in range(numf)] + + frame_nums = [int(f) for f in frames.keys()] + frames_with_boxes = 0 + for frame_num in sorted(frame_nums): #loop from start to last possible frame which can make a legit sequence + frame_id = str(frame_num) + if frame_id in frames.keys() and frames[frame_id]['annotated']>0: + + frame_index = frame_num-1 + frame_level_annos[frame_index]['labeled'] = True + # frame_level_annos[frame_index]['ego_label'] = frames[frame_id]['av_action_ids'][0] + + frame = frames[frame_id] + if 'annos' not in frame.keys(): + frame = {'annos':{}} + + all_boxes = [] + all_labels = [] + frame_annos = frame['annos'] + # temp_img = cv2.imread('../roadpp/rgb-images/'+videoname+'/{:05d}.jpg'.format(frame_num)) + for key in frame_annos: + width, height = frame['width'], frame['height'] + anno = frame_annos[key] + box = anno['box'] + + assert box[0]0: + frames_with_boxes += 1 + frame_level_annos[frame_index]['labels'] = all_labels + frame_level_annos[frame_index]['boxes'] = all_boxes + + logger.info('Frames with Boxes are {:d} out of {:d} in {:s}'.format(frames_with_boxes, numf, videoname)) + frame_level_list.append(frame_level_annos) + + ## make ids + start_frames = [ f for f in range(numf-self.MIN_SEQ_STEP*self.SEQ_LEN, 1, -self.skip_step)] + if self.full_test and 1 not in start_frames: + start_frames.append(1) + logger.info('number of start frames: '+ str(len(start_frames))) + for frame_num in start_frames: + step_list = [s for s in range(self.MIN_SEQ_STEP, self.MAX_SEQ_STEP+1) if numf-s*self.SEQ_LEN>=frame_num] + shuffle(step_list) + # print(len(step_list), self.num_steps) + for s in range(min(self.num_steps, len(step_list))): + video_id = self.video_list.index(videoname) + if len(frame_level_list[video_id][frame_num+int(self.SEQ_LEN/2)]['boxes']) >0: + self.ids.append([video_id, frame_num ,step_list[s]]) + + # pdb.set_trace() + ptrstr = '' + self.frame_level_list = frame_level_list + self.all_classes = [['agent_ness']] + for k, name in enumerate(self.label_types): + labels = final_annots[name+'_labels'] + self.all_classes.append(labels) + # self.num_classes_list.append(len(labels)) + for c, cls_ in enumerate(labels): # just to see the distribution of train and test sets + ptrstr += '-'.join(self.SUBSETS) + ' {:05d} label: ind={:02d} name:{:s}\n'.format( + counts[c,k] , c, cls_) + + ptrstr += 'Number of ids are {:d}\n'.format(len(self.ids)) + + self.label_types = ['agent_ness'] + self.label_types + self.childs = {'duplex_childs':final_annots['duplex_childs'], 'triplet_childs':final_annots['triplet_childs']} + self.num_videos = len(self.video_list) + self.print_str = ptrstr + + + + def _make_lists_road(self): + + self.anno_file = os.path.join(self.root, 'road_trainval_v1.0.json') + + with open(self.anno_file,'r') as fff: + final_annots = json.load(fff) + + database = final_annots['db'] + + # self.label_types = final_annots['label_types'] #['agent', 'action', 'loc', 'duplex', 
'triplet'] # + self.label_types = ['agent', 'action', 'loc'] + num_label_type = len(self.label_types) + self.num_classes = 1 ## one for presence + self.num_classes_list = [1] + for name in self.label_types: + logger.info('Number of {:s}: all :: {:d} to use: {:d}'.format(name, + len(final_annots['all_'+name+'_labels']),len(final_annots[name+'_labels']))) + numc = len(final_annots[name+'_labels']) + self.num_classes_list.append(numc) + self.num_classes += numc + + self.ego_classes = final_annots['av_action_labels'] + self.num_ego_classes = len(self.ego_classes) + + counts = np.zeros(((len(final_annots[self.label_types[0] + '_labels'])+len(final_annots[self.label_types[1] + '_labels'])+len(final_annots[self.label_types[2] + '_labels'])), num_label_type), dtype=np.int32) + + self.video_list = [] + self.numf_list = [] + frame_level_list = [] + + for videoname in sorted(database.keys()): + + if not is_part_of_subsets(final_annots['db'][videoname]['split_ids'], self.SUBSETS): + continue + + numf = database[videoname]['numf'] + self.numf_list.append(numf) + self.video_list.append(videoname) + + frames = database[videoname]['frames'] + frame_level_annos = [ {'labeled':False,'ego_label':-1,'boxes':np.asarray([]),'labels':np.asarray([])} for _ in range(numf)] + + frame_nums = [int(f) for f in frames.keys()] + frames_with_boxes = 0 + for frame_num in sorted(frame_nums): #loop from start to last possible frame which can make a legit sequence + frame_id = str(frame_num) + if frame_id in frames.keys() and frames[frame_id]['annotated']>0: + + frame_index = frame_num-1 + frame_level_annos[frame_index]['labeled'] = True + frame_level_annos[frame_index]['ego_label'] = frames[frame_id]['av_action_ids'][0] + + frame = frames[frame_id] + if 'annos' not in frame.keys(): + frame = {'annos':{}} + + all_boxes = [] + all_labels = [] + frame_annos = frame['annos'] + for key in frame_annos: + width, height = frame['width'], frame['height'] + anno = frame_annos[key] + box = anno['box'] + + assert box[0]0: + frames_with_boxes += 1 + frame_level_annos[frame_index]['labels'] = all_labels + frame_level_annos[frame_index]['boxes'] = all_boxes + + logger.info('Frames with Boxes are {:d} out of {:d} in {:s}'.format(frames_with_boxes, numf, videoname)) + frame_level_list.append(frame_level_annos) + + ## make ids + start_frames = [ f for f in range(numf-self.MIN_SEQ_STEP*self.SEQ_LEN, 1, -self.skip_step)] + # if self.full_test and 0 not in start_frames: + # start_frames.append(0) + logger.info('number of start frames: '+ str(len(start_frames))) + for frame_num in start_frames: + step_list = [s for s in range(self.MIN_SEQ_STEP, self.MAX_SEQ_STEP+1) if numf-s*self.SEQ_LEN>=frame_num] + shuffle(step_list) + # print(len(step_list), self.num_steps) + for s in range(min(self.num_steps, len(step_list))): + video_id = self.video_list.index(videoname) + if len(frame_level_list[video_id][frame_num+int(self.SEQ_LEN/2)]['boxes']) >0: + self.ids.append([video_id, frame_num ,step_list[s]]) + # print(rr) + # pdb.set_trace() + ptrstr = '' + self.frame_level_list = frame_level_list + self.all_classes = [['agent_ness']] + for k, name in enumerate(self.label_types): + + labels = final_annots[name+'_labels'] + self.all_classes.append(labels) + + # self.num_classes_list.append(len(labels)) + for c, cls_ in enumerate(labels): # just to see the distribution of train and test sets + ptrstr += '-'.join(self.SUBSETS) + ' {:05d} label: ind={:02d} name:{:s}\n'.format( + counts[c,k] , c, cls_) + + ptrstr += 'Number of ids are {:d}\n'.format(len(self.ids)) 
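+        # Note: each entry appended to self.ids above is [video_id, start_frame, step_size];
+        # __getitem__ below expands it into a SEQ_LEN-frame clip sampled every step_size frames
+        # and keeps the boxes/labels of the centre (key) frame as the training target.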
+ + self.label_types = ['agent_ness'] + self.label_types + self.childs = {'duplex_childs':final_annots['duplex_childs'], 'triplet_childs':final_annots['triplet_childs']} + self.num_videos = len(self.video_list) + self.print_str = ptrstr + + def __len__(self): + return len(self.ids) + + def __getitem__(self, index): + id_info = self.ids[index] + + video_id, start_frame, step_size = id_info + videoname = self.video_list[video_id] + images = [] + frame_num = start_frame + ego_labels = np.zeros(self.SEQ_LEN)-1 + all_boxes = [] + labels = [] + ego_labels = [] + mask = np.zeros(self.SEQ_LEN, dtype=np.int) + indexs = [] + target = {} + + first_img = cv2.imread(self._imgpath + '/{:s}/{:05d}.jpg'.format(videoname, frame_num+int(self.SEQ_LEN/2))) + + oh = first_img.shape[0] + ow = first_img.shape[1] + if oh <= ow: + nh = self.resize_size + nw = self.resize_size * (ow / oh) + else: + nw = self.resize_size + nh = self.resize_size * (oh / ow) + + + p_t = int(self.SEQ_LEN // 2) + key_pos = p_t + target["image_id"] = [videoname+"_"+str(frame_num+key_pos), key_pos] + target["orig_size"] = torch.as_tensor([int(nh), int(nw)]) + target["size"] = torch.as_tensor([int(nh), int(nw)]) + + + for i in range(self.SEQ_LEN): + indexs.append(frame_num) + img_name = self._imgpath + '/{:s}/{:05d}.jpg'.format(videoname, frame_num) + # img_name = self._imgpath + '/{:s}/img_{:05d}.jpg'.format(videoname, frame_num) + img = Image.open(img_name) + img = img.resize((target['orig_size'][1], target['orig_size'][0])) + images.append(img) + if self.frame_level_list[video_id][frame_num]['labeled']: + mask[i] = 1 + all_boxes.append(self.frame_level_list[video_id][frame_num]['boxes'].copy()) + labels.append(self.frame_level_list[video_id][frame_num]['labels'].copy()) + # ego_labels.append(self.frame_level_list[video_id][frame_num]['ego_label']) + else: + self.index_cnt -= 1 + all_boxes.append(np.asarray([])) + labels.append(np.asarray([])) + # ego_labels.append(-1) + frame_num += step_size + + imgs, target = self._transforms(images, target) + + + imgs = torch.stack(imgs, dim=0) + imgs = imgs.permute(1, 0, 2, 3) + + + keyframe_box = all_boxes[key_pos] + keyframe_label = labels[key_pos] + + boxes = [] + for i, bbox in enumerate(keyframe_box): + boxes.append([p_t, bbox[0],bbox[1],bbox[2],bbox[3]]) + + boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 5) + boxes[:, 1::3].clamp_(min=0, max=int(nw)) + boxes[:, 2::3].clamp_(min=0, max=nh) + + if boxes.shape[0]: + raw_boxes = F.pad(boxes, (1, 0, 0, 0), value=self.index_cnt) + else: + raw_boxes = boxes + + for i, bbox in enumerate(raw_boxes): + raw_boxes[i][2] = np.int(raw_boxes[i][2] * nw) + raw_boxes[i][3] = np.int(raw_boxes[i][3] * nh) + raw_boxes[i][4] = np.int(raw_boxes[i][4] * nw) + raw_boxes[i][5] = np.int(raw_boxes[i][5] * nh) + + + + classes = torch.as_tensor(keyframe_label, dtype=torch.float32).reshape(-1, self.num_of_classes) + + target['boxes'] = boxes + target['raw_boxes'] = raw_boxes + target["labels"] = classes + self.index_cnt = self.index_cnt + 1 + + # print('img',imgs.shape) + # print('tar',target) + # print('tar shape',target.shape) + # print(rr) + + return imgs, target + + +def build_dataloader(cfg): + + + train_dataset = VideoDataset(cfg, train=True, skip_step=cfg.CONFIG.DATA.train_skip_step, transform=make_transforms("train", cfg),resize_size=cfg.CONFIG.DATA.IMG_RESHAPE_SIZE,crop_size=cfg.CONFIG.DATA.IMG_SIZE) + + + val_dataset = VideoDataset(cfg, train=False, transform=make_transforms("val", cfg), skip_step=cfg.CONFIG.DATA.skip_step, 
full_test=True,resize_size=cfg.CONFIG.DATA.IMG_SIZE,crop_size=cfg.CONFIG.DATA.IMG_SIZE) + + # train_bbox_json = json.load(open(cfg.CONFIG.DATA.ANNO_PATH.format("train"))) + # train_video_frame_bbox, train_frame_keys_list = train_bbox_json["video_frame_bbox"], train_bbox_json["frame_keys_list"] + + # train_dataset = VideoDataset(cfg.CONFIG.DATA.DATA_PATH, + # train_video_frame_bbox, + # train_frame_keys_list, + # transforms=make_transforms("train", cfg), + # frame_sample_rate=cfg.CONFIG.DATA.FRAME_RATE, + # clip_len=cfg.CONFIG.DATA.TEMP_LEN, + # resize_size=cfg.CONFIG.DATA.IMG_RESHAPE_SIZE, + # crop_size=cfg.CONFIG.DATA.IMG_SIZE, + # mode="train") + + # val_bbox_json = json.load(open(cfg.CONFIG.DATA.ANNO_PATH.format("val"))) + # val_video_frame_bbox, val_frame_keys_list = val_bbox_json["video_frame_bbox"], val_bbox_json["frame_keys_list"] + + # val_dataset = VideoDataset(cfg.CONFIG.DATA.DATA_PATH, + # val_video_frame_bbox, + # val_frame_keys_list, + # transforms=make_transforms("val", cfg), + # frame_sample_rate=cfg.CONFIG.DATA.FRAME_RATE, + # clip_len=cfg.CONFIG.DATA.TEMP_LEN, + # resize_size=cfg.CONFIG.DATA.IMG_SIZE, + # crop_size=cfg.CONFIG.DATA.IMG_SIZE, + # mode="val") + + if cfg.DDP_CONFIG.DISTRIBUTED: + train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) + val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset) + batch_sampler_train = torch.utils.data.BatchSampler(train_sampler, cfg.CONFIG.TRAIN.BATCH_SIZE, drop_last=True) + else: + train_sampler = None + val_sampler = None + batch_sampler_train = None + + train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=(train_sampler is None), + num_workers=9, pin_memory=True, batch_sampler=batch_sampler_train, + collate_fn=collate_fn) + val_loader = torch.utils.data.DataLoader( + val_dataset, batch_size=cfg.CONFIG.VAL.BATCH_SIZE, shuffle=(val_sampler is None), + num_workers=9, sampler=val_sampler, pin_memory=True, collate_fn=collate_fn) + + # print(cfg.CONFIG.DATA.ANNO_PATH.format("train"), cfg.CONFIG.DATA.ANNO_PATH.format("val")) + + return train_loader, val_loader, train_sampler, val_sampler, None + +def reverse_norm(imgs): + img = imgs + mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1)) + std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1)) + img = (img * std + mean) * 255.0 + img = img.transpose((1, 2, 0))[..., ::-1].astype(np.uint8) + return img + + + + + + + + diff --git a/datasets/road_labels.pbtxt b/datasets/road_labels.pbtxt new file mode 100644 index 0000000..d8faa3a --- /dev/null +++ b/datasets/road_labels.pbtxt @@ -0,0 +1,164 @@ +item { + name: "Ped" + id: 1 +} +item { + name: "Car" + id: 2 +} +item { + name: "Cyc" + id: 3 +} +item { + name: "Mobike" + id: 4 +} +item { + name: "MedVeh" + id: 5 +} +item { + name: "LarVeh" + id: 6 +} +item { + name: "Bus" + id: 7 +} +item { + name: "EmVeh" + id: 8 +} +item { + name: "TL" + id: 9 +} +item { + name: "OthTL" + id: 10 +} +item { + name: "Red" + id: 11 +} +item { + name: "Amber" + id: 12 +} +item { + name: "Green" + id: 13 +} +item { + name: "MovAway" + id: 14 +} +item { + name: "MovTow" + id: 15 +} +item { + name: "Mov" + id: 16 +} +item { + name: "Brake" + id: 17 +} +item { + name: "Stop" + id: 18 +} +item { + name: "IncatLft" + id: 19 +} +item { + name: "IncatRht" + id: 20 +} +item { + name: "HazLit" + id: 21 +} +item { + name: "TurLft" + id: 22 +} +item { + name: "TurRht" + id: 23 +} +item { + name: "Ovtak" + id: 24 +} +item { + name: "Wait2X" + id: 25 +} +item { + name: "XingFmLft" + id: 26 +} +item { + name: 
"XingFmRht" + id: 27 +} +item { + name: "Xing" + id: 28 +} +item { + name: "PushObj" + id: 29 +} +item { + name: "VehLane" + id: 30 +} +item { + name: "OutgoLane" + id: 31 +} +item { + name: "OutgoCycLane" + id: 32 +} +item { + name: "IncomLane" + id: 33 +} +item { + name: "IncomCycLane" + id: 34 +} +item { + name: "Pav" + id: 35 +} +item { + name: "LftPav" + id: 36 +} +item { + name: "RhtPav" + id: 37 +} +item { + name: "Jun" + id: 38 +} +item { + name: "xing" + id: 39 +} +item { + name: "BusStop" + id: 40 +} +item { + name: "parking" + id: 41 +} \ No newline at end of file diff --git a/datasets/roadpp.pbtxt b/datasets/roadpp.pbtxt new file mode 100644 index 0000000..de6c7e5 --- /dev/null +++ b/datasets/roadpp.pbtxt @@ -0,0 +1,172 @@ +item { + name: "Ped" + id: 1 +} +item { + name: "Car" + id: 2 +} +item { + name: "Mobike" + id: 3 +} +item { + name: "SmalVeh" + id: 4 +} +item { + name: "MedVeh" + id: 5 +} +item { + name: "LarVeh" + id: 6 +} +item { + name: "Bus" + id: 7 +} +item { + name: "EmVeh" + id: 8 +} +item { + name: "MovAway" + id: 9 +} +item { + name: "MovTow" + id: 10 +} +item { + name: "Mov" + id: 11 +} +item { + name: "Rev" + id: 12 +} +item { + name: "Brake" + id: 13 +} +item { + name: "Stop" + id: 14 +} +item { + name: "IncatLft" + id: 15 +} +item { + name: "IncatRht" + id: 16 +} +item { + name: "HazLit" + id: 17 +} +item { + name: "TurLft" + id: 18 +} +item { + name: "TurRht" + id: 19 +} +item { + name: "MovRht" + id: 20 +} +item { + name: "MovLft" + id: 21 +} +item { + name: "Ovtak" + id: 22 +} +item { + name: "Wait2X" + id: 23 +} +item { + name: "XingFmLft" + id: 24 +} +item { + name: "XingFmRht" + id: 25 +} +item { + name: "Xing" + id: 26 +} +item { + name: "PushObj" + id: 27 +} +item { + name: "VehLane" + id: 28 +} +item { + name: "OutgoLane" + id: 29 +} +item { + name: "OutgoCycLane" + id: 30 +} +item { + name: "OutgoBusLane" + id: 31 +} +item { + name: "IncomLane" + id: 32 +} +item { + name: "IncomCycLane" + id: 33 +} +item { + name: "IncomBusLane" + id: 34 +} +item { + name: "Pav" + id: 35 +} +item { + name: "LftPav" + id: 36 +} +item { + name: "RhtPav" + id: 37 +} +item { + name: "Jun" + id: 38 +} +item { + name: "xing" + id: 39 +} +item { + name: "BusStop" + id: 40 +} +item { + name: "parking" + id: 41 +} +item { + name: "LftParking" + id: 42 +} +item { + name: "rightParking" + id: 43 +} \ No newline at end of file diff --git a/datasets/roadpp_labels.pbtxt b/datasets/roadpp_labels.pbtxt new file mode 100644 index 0000000..361391c --- /dev/null +++ b/datasets/roadpp_labels.pbtxt @@ -0,0 +1,172 @@ +item { + name: "Ped" + id: 1 +} +item { + name: "Car" + id: 2 +} +item { + name: "Mobike" + id: 3 +} +item { + name: "SmalVeh" + id: 4 +} +item { + name: "MedVeh" + id: 5 +} +item { + name: "LarVeh" + id: 6 +} +item { + name: "Bus" + id: 7 +} +item { + name: "EmVeh" + id: 8 +} +item { + name: "MovAway" + id: 9 +} +item { + name: "MovTow" + id: 10 +} +item { + name: "Mov" + id: 11 +} +item { + name: "Rev" + id: 12 +} +item { + name: "Brake" + id: 13 +} +item { + name: "Stop" + id: 14 +} +item { + name: "IncatLft" + id: 15 +} +item { + name: "IncatRht" + id: 16 +} +item { + name: "HazLit" + id: 17 +} +item { + name: "TurLft" + id: 18 +} +item { + name: "TurRht" + id: 19 +} +item { + name: "MovRht" + id: 20 +} +item { + name: "MovLft" + id: 21 +} +item { + name: "Ovtak" + id: 22 +} +item { + name: "Wait2X" + id: 23 +} +item { + name: "XingFmLft" + id: 24 +} +item { + name: "XingFmRht" + id: 25 +} +item { + name: "Xing" + id: 26 +} +item { + name: "PushObj" + id: 27 +} +item { 
+ name: "VehLane" + id: 28 +} +item { + name: "OutgoLane" + id: 29 +} +item { + name: "OutgoCycLane" + id: 30 +} +item { + name: "OutgoBusLane" + id: 31 +} +item { + name: "IncomLane" + id: 32 +} +item { + name: "IncomCycLane" + id: 33 +} +item { + name: "IncomBusLane" + id: 34 +} +item { + name: "Pav" + id: 35 +} +item { + name: "LftPav" + id: 36 +} +item { + name: "RhtPav" + id: 37 +} +item { + name: "Jun" + id: 38 +} +item { + name: "xing" + id: 39 +} +item { + name: "BusStop" + id: 40 +} +item { + name: "parking" + id: 41 +} +item { + name: "LftParking" + id: 42 +} +item { + name: "rightParking" + id: 43 +} \ No newline at end of file diff --git a/eval_tuber_roadpp.py b/eval_tuber_roadpp.py new file mode 100644 index 0000000..785f801 --- /dev/null +++ b/eval_tuber_roadpp.py @@ -0,0 +1,59 @@ +import argparse +import datetime +import time + +import torch +import torch.optim +from tensorboardX import SummaryWriter + +from models.tuber_ava import build_model +from utils.model_utils import deploy_model, load_model, save_checkpoint +from utils.video_action_recognition import validate_tuber_detection +from pipelines.video_action_recognition_config import get_cfg_defaults +from pipelines.launch import spawn_workers +from utils.utils import build_log_dir +from datasets.road_frames import build_dataloader + + +def main_worker(cfg): + # create tensorboard and logs + if cfg.DDP_CONFIG.GPU_WORLD_RANK == 0: + tb_logdir = build_log_dir(cfg) + writer = SummaryWriter(log_dir=tb_logdir) + else: + writer = None + # cfg.freeze() + + # create model + print('Creating TubeR model: %s' % cfg.CONFIG.MODEL.NAME) + model, criterion, postprocessors = build_model(cfg) + model = deploy_model(model, cfg, is_tuber=True) + num_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) + print('Number of parameters in the model: %6.2fM' % (num_parameters / 1000000)) + + # create dataset and dataloader + _, test_loader, _, test_sampler,_ = build_dataloader(cfg) + + # docs: add resume option + if not cfg.CONFIG.MODEL.LOAD: raise ("model dir not found") + model, _ = load_model(model, cfg, load_fc=cfg.CONFIG.MODEL.LOAD_FC) + + print('Start Validation...') + start_time = time.time() + validate_tuber_detection(cfg, model, criterion, postprocessors, test_loader, 0, writer) + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('testing time {}'.format(total_time_str)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Train video action recognition transformer models.') + parser.add_argument('--config-file', + default='/xxx/TubeR_AVA_v2.2_CSN-152.yaml', + help='path to config file.') + args = parser.parse_args() + + cfg = get_cfg_defaults() + cfg.merge_from_file(args.config_file) + spawn_workers(main_worker, cfg) diff --git a/evaluates/evaluate_ava.py b/evaluates/evaluate_ava.py index dbb7849..4acd2f1 100644 --- a/evaluates/evaluate_ava.py +++ b/evaluates/evaluate_ava.py @@ -29,11 +29,18 @@ class STDetectionEvaluater(object): def __init__(self, label_path, tiou_thresholds=[0.5], load_from_dataset=False, class_num=60): self.label_path = label_path + # print('lab_path', self.label_path) categories, class_whitelist = read_labelmap(self.label_path) + # print('categories', categories) + # print('class_whitelist', class_whitelist) + self.class_num = class_num + # print('self.class_num', self.class_num) + + if class_num == 80: self.exclude_keys = [] - f = open("/xxx/datasets/ava_val_excluded_timestamps_v2.1.csv") + f = 
open("datasets/assets/ava_val_excluded_timestamps_v2.1.csv") while True: line = f.readline().strip() if not line: break diff --git a/evaluates/utils/object_detection_evaluation.py b/evaluates/utils/object_detection_evaluation.py index 63bd217..892c35e 100644 --- a/evaluates/utils/object_detection_evaluation.py +++ b/evaluates/utils/object_detection_evaluation.py @@ -132,9 +132,10 @@ def __init__(self, Raises: ValueError: If the category ids are not 1-indexed. """ + super(ObjectDetectionEvaluator, self).__init__(categories) self._num_classes = max([cat['id'] for cat in categories]) - + if min(cat['id'] for cat in categories) < 1: raise ValueError('Classes should be 1-indexed.') diff --git a/modules/__init__.py b/modules/__init__.py new file mode 100644 index 0000000..7a00503 --- /dev/null +++ b/modules/__init__.py @@ -0,0 +1,19 @@ +class AverageMeter(object): + """Computes and stores the average and current value""" + def __init__(self, momentum=0.95): + self.momentum = momentum + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.count = 0 + + def update(self, val, n=1): + if n>0: + self.val = val + if self.count == 0: + self.avg = self.val + else: + self.avg = self.avg*self.momentum + (1-self.momentum)* val + self.count += n diff --git a/modules/anchor_box_kmeans.py b/modules/anchor_box_kmeans.py new file mode 100644 index 0000000..0f98d5a --- /dev/null +++ b/modules/anchor_box_kmeans.py @@ -0,0 +1,72 @@ +import torch +from math import sqrt as sqrt +from itertools import product as product +import numpy as np +from modules.utils import BufferList + + +class anchorBox(torch.nn.Module): + """Compute anchorbox coordinates in center-offset form for each source + feature map. + """ + def __init__(self, aspect_ratios =[0.5, 1 / 1., 1.5], + scale_ratios = [1.,]): + + super(anchorBox, self).__init__() + self.aspect_ratios = aspect_ratios + self.scale_ratios = scale_ratios + self.default_sizes= [0.01, 0.06, 0.2, 0.4, 0.85] + self.anchor_boxes = len(self.aspect_ratios)*len(self.scale_ratios) + self.ar = self.anchor_boxes + self.num_anchors = self.ar + self.cell_anchors = BufferList(self._get_cell_anchors()) + + def _get_cell_anchors(self): + anchors = [] + base_anchors = np.asarray([[0.0000, 0.0000, 0.0141, 0.0365], + [0.0000, 0.0000, 0.0178, 0.0614], + [0.0000, 0.0000, 0.0343, 0.0487], + [0.0000, 0.0000, 0.0450, 0.1475], + [0.0000, 0.0000, 0.0284, 0.0986], + [0.0000, 0.0000, 0.0667, 0.0691], + [0.0000, 0.0000, 0.0699, 0.2465], + [0.0000, 0.0000, 0.1629, 0.1744], + [0.0000, 0.0000, 0.1110, 0.1124], + [0.0000, 0.0000, 0.1349, 0.3740], + [0.0000, 0.0000, 0.2773, 0.3713], + [0.0000, 0.0000, 0.2406, 0.2320], + [0.0000, 0.0000, 0.3307, 0.6395], + [0.0000, 0.0000, 0.7772, 0.6261], + [0.0000, 0.0000, 0.4732, 0.3153]]) + + for s1 in range(len(self.default_sizes)): + p_anchors = base_anchors[s1*3:(s1+1)*3,:] + p_anchors[:,:2] = p_anchors[:,:2]-p_anchors[:,2:]/2.0 + p_anchors[:,2:] = p_anchors[:,2:]/2.0 + p_anchors = torch.FloatTensor(p_anchors).cuda() + # print(p_anchors) + anchors.append(p_anchors) + + return anchors + + # based on forward from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/modeling/rpn/anchor_generator.py + def forward(self, grid_sizes): + + anchors = [] + for size, base_anchors in zip(grid_sizes, self.cell_anchors): + grid_height, grid_width = size + stride_h = 1.0/grid_height + stride_w = 1.0/grid_width + device = base_anchors.device + shifts_x = torch.arange(0, grid_width, dtype=torch.float32, device=device).cuda() + shifts_y = 
torch.arange(0, grid_height, dtype=torch.float32, device=device).cuda() + shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x) + shift_x = (shift_x.reshape(-1) + 0.5) * stride_w + shift_y = (shift_y.reshape(-1) + 0.5) * stride_h + shifts = torch.stack((shift_x, shift_y, shift_x, shift_y), dim=1) + anchors.append( (shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)).reshape(-1, 4) ) + + anchors = torch.cat(anchors, 0) + anchors.clamp_(max=1, min=0) + return anchors + diff --git a/modules/anchor_box_retinanet.py b/modules/anchor_box_retinanet.py new file mode 100644 index 0000000..2b79619 --- /dev/null +++ b/modules/anchor_box_retinanet.py @@ -0,0 +1,80 @@ +import torch +from math import sqrt as sqrt +from itertools import product as product +import numpy as np +from modules.utils import BufferList + +class anchorBox(torch.nn.Module): + """Compute anchorbox coordinates in center-offset form for each source + feature map. + """ + def __init__(self, sizes = [32, 64, 128, 256, 512], + ratios = np.asarray([0.5, 1 / 1., 2.0]), + strides = [8, 16, 32, 64, 128], + scales = np.array([1, 1.25992, 1.58740])): + + super(anchorBox, self).__init__() + self.sizes = sizes + self.ratios = ratios + self.scales = scales + self.strides = strides + self.ar = len(self.ratios)*len(self.ratios) + self.cell_anchors = BufferList(self._get_cell_anchors()) + + def _get_cell_anchors(self): + anchors = [] + for s1 in self.sizes: + p_anchors = np.asarray(self._gen_generate_anchors_on_one_level(s1)) + p_anchors = torch.FloatTensor(p_anchors).cuda() + anchors.append(p_anchors) + + return anchors + + # modified from https://github.com/fizyr/keras-retinanet/blob/master/keras_retinanet/utils/anchors.py + # Copyright 2017-2018 Fizyr (https://fizyr.com) + def _gen_generate_anchors_on_one_level(self, base_size=32): + + """ + Generate anchor (reference) windows by enumerating aspect ratios X + scales w.r.t. a reference window. 
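+        Anchors are returned in (x1, y1, x2, y2) form centred on the origin;
+        forward() later shifts them over each feature-map grid.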
+ + """ + + num_anchors = len(self.ratios) * len(self.scales) + + # initialize output anchors + anchors = np.zeros((num_anchors, 4)) + + # print(self.scales) + # scale base_size + anchors[:, 2:] = base_size * np.tile(self.scales, (2, len(self.ratios))).T + # print(anchors) + # compute areas of anchors + areas = anchors[:, 2] * anchors[:, 3] + + anchors[:, 2] = np.sqrt(areas / np.repeat(self.ratios, len(self.scales))) + anchors[:, 3] = anchors[:, 2] * np.repeat(self.ratios, len(self.scales)) + # print(anchors) + # transform from (x_ctr, y_ctr, w, h) -> (x1, y1, x2, y2) + anchors[:, 0::2] -= np.tile(anchors[:, 2] * 0.5, (2, 1)).T + anchors[:, 1::2] -= np.tile(anchors[:, 3] * 0.5, (2, 1)).T + # print(anchors) + return anchors + + # forward from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/modeling/rpn/anchor_generator.py + def forward(self, grid_sizes): + + anchors = [] + for size, stride, base_anchors in zip(grid_sizes, self.strides, self.cell_anchors): + grid_height, grid_width = size + device = base_anchors.device + shifts_x = torch.arange(0, grid_width, dtype=torch.float32, device=device) + shifts_y = torch.arange(0, grid_height, dtype=torch.float32, device=device) + shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x) + shift_x = (shift_x.reshape(-1) + 0.5) * stride + shift_y = (shift_y.reshape(-1) + 0.5) * stride + shifts = torch.stack((shift_x, shift_y, shift_x, shift_y), dim=1) + anchors.append( (shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)).reshape(-1, 4) ) + + return torch.cat(anchors, 0) + diff --git a/modules/box_utils.py b/modules/box_utils.py new file mode 100644 index 0000000..94a973a --- /dev/null +++ b/modules/box_utils.py @@ -0,0 +1,401 @@ +import torch, pdb, math +import numpy as np +import torchvision + + +def match_anchors_wIgnore(gt_boxes, gt_labels, anchors, pos_th=0.5, nge_th=0.4, variances=[0.1, 0.2], seq_len=1): + # pdb.set_trace() + # pdb.set_trace() + num_mt = int(gt_labels.size(0)/seq_len) + + # pdb.set_trace() + seq_overlaps =[] + inds = torch.LongTensor([m*seq_len for m in range(num_mt)]) + # print('indexs device', inds.device) + # print(inds, num_mt) + ## get indexes of first frame in seq for each microtube + gt_labels = gt_labels[inds] + # print('gtb', gt_boxes) + # print('anchors', anchors[:10]) + + for s in range(seq_len): + seq_overlaps.append(jaccard(gt_boxes[inds+s, :], anchors)) + # pdb.set_trace() + overlaps = seq_overlaps[0] + # print('overlap max ', overlaps.max()) + ## Compute average overlap + for s in range(seq_len-1): + overlaps = overlaps + seq_overlaps[s+1] + overlaps = overlaps/float(seq_len) + # pdb.set_trace() + best_anchor_overlap, best_anchor_idx = overlaps.max(1, keepdim=True) + + # print('MIN VAL::', best_anchor_overlap.min().item()) + # if best_anchor_overlap.min().item()<0.25: + # print('MIN VAL::', best_anchor_overlap.min().item()) + # print('lower than o.5', best_anchor_overlap, gt_boxes) + # [1,num_anchors] best ground truth for each anchor + + best_truth_overlap, best_truth_idx = overlaps.max(0, keepdim=True) + best_truth_idx.squeeze_(0) + best_truth_overlap.squeeze_(0) + best_anchor_idx.squeeze_(1) + best_anchor_overlap.squeeze_(1) + best_truth_overlap.index_fill_(0, best_anchor_idx, 2) # ensure best anchor + # ensure every gt matches with its anchor of max overlap + for j in range(best_anchor_idx.size(0)): + best_truth_idx[best_anchor_idx[j]] = j + + conf = gt_labels[best_truth_idx] + 1 # assigned nearest class label + conf[best_truth_overlap < pos_th] = -1 # label as ignore + 
conf[best_truth_overlap < nge_th] = 0 # label as background + + for s in range(seq_len): + st = gt_boxes[inds + s, :] + matches = st[best_truth_idx] # Shape: [num_anchors,4] + if s == 0: + loc = encode(matches, anchors[:, s * 4:(s + 1) * 4], variances) + # Shape: [num_anchors, 4] -- encode the gt boxes for frame i + else: + temp = encode(matches, anchors[:, s * 4:(s + 1) * 4], variances) + loc = torch.cat([loc, temp], 1) # shape: [num_anchors x 4 * seq_len] : stacking the location targets for different frames + # pdb.set_trace() + return conf, loc + + +def hard_negative_mining(loss, labels, neg_pos_ratio): + """ + It is used to suppress the presence of a large number of negative predictions. + It works on image level not batch level. + For any example/image, it keeps all the positive predictions and + cuts the number of negative predictions to make sure the ratio + between the negative examples and positive examples is no more than + the given ratio for an image. + Args: + loss (N, num_anchors): the loss for each example. + labels (N, num_anchors): the labels. + neg_pos_ratio: the ratio between the negative examples and positive examples. + + """ + + pos_mask = labels > 0 + num_pos = pos_mask.long().sum(dim=1, keepdim=True) + num_neg = num_pos * neg_pos_ratio + + loss[pos_mask] = -math.inf + _, indexes = loss.sort(dim=1, descending=True) + _, orders = indexes.sort(dim=1) + neg_mask = orders < num_neg + return pos_mask | neg_mask + + +def point_form(boxes): + """ Convert anchor_boxes to (xmin, ymin, xmax, ymax) + representation for comparison to point form ground truth data. + Args: + boxes: (tensor) center-size default boxes from anchorbox layers. + Return: + boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. + """ + return torch.cat((boxes[:, :2] - boxes[:, 2:]/2, # xmin, ymin + boxes[:, :2] + boxes[:, 2:]/2), 1) # xmax, ymax + + +def center_size(boxes): + """ Convert anchor_boxes to (cx, cy, w, h) + representation for comparison to center-size form ground truth data. + Args: + boxes: (tensor) point_form boxes + Return: + boxes: (tensor) Converted cx, cy, w, h form of boxes. + """ + return torch.cat(((boxes[:, 2:] + boxes[:, :2])/2, # cx, cy + boxes[:, 2:] - boxes[:, :2]), 1) # w, h + + +def intersect(box_a, box_b): + """ + + We resize both tensors to [A,B,2] without new malloc: + [A,2] -> [A,1,2] -> [A,B,2] + [B,2] -> [1,B,2] -> [A,B,2] + Then we compute the area of intersect between box_a and box_b. + Args: + box_a: (tensor) bounding boxes, Shape: [A,4]. + box_b: (tensor) bounding boxes, Shape: [B,4]. + Return: + (tensor) intersection area, Shape: [A,B]. + + """ + # print(box_a, box_b) + A = box_a.size(0) + B = box_b.size(0) + # pdb.set_trace() + # print(box_a.type(), box_b.type()) + max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), + box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) + min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), + box_b[:, :2].unsqueeze(0).expand(A, B, 2)) + inter = torch.clamp((max_xy - min_xy), min=0) + return inter[:, :, 0] * inter[:, :, 1] + + +def jaccard(box_a, box_b): + """Compute the jaccard overlap of two sets of boxes. The jaccard overlap + is simply the intersection over union of two boxes. Here we operate on + ground truth boxes and default boxes. 
+ E.g.: + A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) + Args: + box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4] + box_b: (tensor) anchor boxes from anchorbox layers, Shape: [num_anchors,4] + Return: + jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)] + """ + # pdb.set_trace() + inter = intersect(box_a, box_b) + area_a = ((box_a[:, 2]-box_a[:, 0]) * + (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] + area_b = ((box_b[:, 2]-box_b[:, 0]) * + (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] + + union = area_a + area_b - inter + min_union = union.min() + + # print('minnin ', min_union, union) + + return inter / union # [A,B] + + +def get_ovlp_cellwise(overlaps): + feature_maps = [38, 19, 10, 5, 3, 1] + aratios = [4, 6, 6, 6, 4, 4] + dim = 0 + for f in feature_maps: + dim += f*f + out_ovlp = np.zeros(dim) + count = 0 + st = 0 + for k, f in enumerate(feature_maps): + ar = aratios[k] + for i in range(f*f): + et = st+ar + ovlps_tmp = overlaps[0, st:et] + #pdb.set_trace() + out_ovlp[count] = max(ovlps_tmp) + count += 1 + st = et + assert count == dim + + return out_ovlp + + +def encode(matched, anchors, variances): + + """ + + Encode the variances from the anchorbox layers into the ground truth boxes + we have matched (based on jaccard overlap) with the anchor boxes. + Args: + matched: (tensor) Coords of ground truth for each anchor in point-form + Shape: [num_anchors, 4]. + anchors: (tensor) anchor boxes in center-offset form + Shape: [num_anchors,4]. + variances: (list[float]) Variances of anchorboxes + + Return: + encoded boxes (tensor), Shape: [num_anchors, 4] + + """ + + TO_REMOVE = 1 if anchors[0,2]>1 else 0 # TODO remove + ex_widths = anchors[:, 2] - anchors[:, 0] + TO_REMOVE + ex_heights = anchors[:, 3] - anchors[:, 1] + TO_REMOVE + ex_ctr_x = anchors[:, 0] + 0.5 * ex_widths + ex_ctr_y = anchors[:, 1] + 0.5 * ex_heights + + gt_widths = matched[:, 2] - matched[:, 0] + TO_REMOVE + gt_heights = matched[:, 3] - matched[:, 1] + TO_REMOVE + gt_ctr_x = matched[:, 0] + 0.5 * gt_widths + gt_ctr_y = matched[:, 1] + 0.5 * gt_heights + + + targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths / variances[0] + targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights / variances[0] + targets_dw = torch.log(gt_widths / ex_widths) / variances[1] + targets_dh = torch.log(gt_heights / ex_heights) / variances[1] + + targets = torch.stack((targets_dx, targets_dy, targets_dw, targets_dh), dim=1) + + return targets + +def decode(loc, anchors, variances=[0.1, 0.2], bbox_xform_clip=math.log(1000. / 16)): +# """ +# Decode locations from predictions using anchors to undo +# the encoding we did for offset regression at train time. +# Args: +# loc (tensor): location predictions for loc layers, +# Shape: [num_anchors,4] +# anchors (tensor): anchor boxes in center-offset form. +# Shape: [num_anchors,4]. 
+# variances: (list[float]) Variances of anchorboxes +# Return: +# decoded bounding box predictions +# """ +# #pdb.set_trace() + + TO_REMOVE = 1 if anchors[0,2]>1 else 0 # TODO remove + widths = anchors[:, 2] - anchors[:, 0] + TO_REMOVE + heights = anchors[:, 3] - anchors[:, 1] + TO_REMOVE + ctr_x = anchors[:, 0] + 0.5 * widths + ctr_y = anchors[:, 1] + 0.5 * heights + + dx = loc[:, 0::4] * variances[0] + dy = loc[:, 1::4] * variances[0] + dw = loc[:, 2::4] * variances[1] + dh = loc[:, 3::4] * variances[1] + + # Prevent sending too large values into torch.exp() + dw = torch.clamp(dw, max=bbox_xform_clip) + dh = torch.clamp(dh, max=bbox_xform_clip) + + pred_ctr_x = dx * widths[:, None] + ctr_x[:, None] + pred_ctr_y = dy * heights[:, None] + ctr_y[:, None] + pred_w = torch.exp(dw) * widths[:, None] + pred_h = torch.exp(dh) * heights[:, None] + + pred_boxes = torch.zeros_like(loc) + # x1 + pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w + # y1 + pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h + # x2 (note: "- 1" is correct; don't be fooled by the asymmetry) + pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w - TO_REMOVE + # y2 (note: "- 1" is correct; don't be fooled by the asymmetry) + pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h - TO_REMOVE + + return pred_boxes + + +# Adapted from https://github.com/Hakuyume/chainer-ssd +def decode_01(loc, anchors, variances): + """Decode locations from predictions using anchors to undo + the encoding we did for offset regression at train time. + Args: + loc (tensor): location predictions for loc layers, + Shape: [num_anchors,4] + anchors (tensor): anchor boxes in center-offset form. + Shape: [num_anchors,4]. + variances: (list[float]) Variances of anchorboxes + Return: + decoded bounding box predictions + """ + #pdb.set_trace() + boxes = torch.cat(( + anchors[:, :2] + loc[:, :2] * variances[0] * anchors[:, 2:], + anchors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) + boxes[:, :2] -= boxes[:, 2:] / 2 + boxes[:, 2:] += boxes[:, :2] + return boxes + +def decode_seq(loc, anchors, variances, seq_len): + boxes = [] + #print('variances', variances) + for s in range(seq_len): + if s == 0: + boxes = decode(loc[:, :4], anchors[:, :4], variances) + else: + boxes = torch.cat((boxes,decode(loc[:,s*4:(s+1)*4], anchors[:,s*4:(s+1)*4], variances)),1) + + return boxes + + +def log_sum_exp(x): + """Utility function for computing log_sum_exp while determining + This will be used to determine unaveraged confidence loss across + all examples in a batch. + Args: + x (Variable(tensor)): conf_preds from conf layers + """ + x_max = x.data.max() + return torch.log(torch.sum(torch.exp(x-x_max), 1, keepdim=True)) + x_max + + + +# def nms_pt(boxes, scores, overlap=0.5): +# keep = torchvision.ops.nms(boxes, scores, overlap) +# return keep + # gpu_keep = torchvision.ops.nms(boxes_for_nms.to('cuda'), scores.to('cuda'), iou_threshold) + +# Original author: Francisco Massa: +# https://github.com/fmassa/object-detection.torch +# Ported to PyTorch by Max deGroot (02/01/2017) +def nms(boxes, scores, overlap=0.5, top_k=20, use_old_code=False): + """Apply non-maximum suppression at test time to avoid detecting too many + overlapping bounding boxes for a given object. + Args: + boxes: (tensor) The location preds for the img, Shape: [num_anchors,4]. + scores: (tensor) The class predscores for the img, Shape:[num_anchors]. + overlap: (float) The overlap thresh for suppressing unnecessary boxes. + top_k: (int) The Maximum number of box preds to consider. 
+ Return: + The indices of the kept boxes with respect to num_anchors. + """ + if use_old_code: + keep = scores.new(scores.size(0)).zero_().long() + if boxes.numel() == 0: + return keep, 0 + x1 = boxes[:, 0] + y1 = boxes[:, 1] + x2 = boxes[:, 2] + y2 = boxes[:, 3] + area = torch.mul(x2 - x1, y2 - y1) + v, idx = scores.sort(0) # sort in ascending order + # I = I[v >= 0.01] + idx = idx[-top_k:] # indices of the top-k largest vals + xx1 = boxes.new() + yy1 = boxes.new() + xx2 = boxes.new() + yy2 = boxes.new() + w = boxes.new() + h = boxes.new() + + # keep = torch.Tensor() + count = 0 + while idx.numel() > 0: + i = idx[-1] # index of current largest val + # keep.append(i) + keep[count] = i + count += 1 + if idx.size(0) == 1: + break + idx = idx[:-1] # remove kept element from view + # load bboxes of next highest vals + torch.index_select(x1, 0, idx, out=xx1) + torch.index_select(y1, 0, idx, out=yy1) + torch.index_select(x2, 0, idx, out=xx2) + torch.index_select(y2, 0, idx, out=yy2) + # store element-wise max with next highest score + xx1 = torch.clamp(xx1, min=x1[i]) + yy1 = torch.clamp(yy1, min=y1[i]) + xx2 = torch.clamp(xx2, max=x2[i]) + yy2 = torch.clamp(yy2, max=y2[i]) + w.resize_as_(xx2) + h.resize_as_(yy2) + w = xx2 - xx1 + h = yy2 - yy1 + # check sizes of xx1 and xx2.. after each iteration + w = torch.clamp(w, min=0.0) + h = torch.clamp(h, min=0.0) + inter = w*h + # IoU = i / (area(a) + area(b) - i) + rem_areas = torch.index_select(area, 0, idx) # load remaining areas + union = (rem_areas - inter) + area[i] + IoU = inter/union # store result in iou + # keep only elements with an IoU <= overlap + idx = idx[IoU.le(overlap)] + else: + keep = torchvision.ops.nms(boxes, scores, overlap) + count = keep.shape[0] + + return keep, count diff --git a/modules/detection_loss.py b/modules/detection_loss.py new file mode 100644 index 0000000..f0a3a9f --- /dev/null +++ b/modules/detection_loss.py @@ -0,0 +1,165 @@ +""" + +Copyright (c) 2019 Gurkirt Singh + All Rights Reserved. + +""" + +import torch.nn as nn +import torch.nn.functional as F +import torch, pdb, time +from modules import box_utils + + +# Credits:: from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/layers/smooth_l1_loss.py +# smooth l1 with beta +def smooth_l1_loss(input, target, beta=1. / 9, reduction='sum'): + n = torch.abs(input - target) + cond = n < beta + loss = torch.where(cond, 0.5 * n ** 2 / beta, n - 0.5 * beta) + if reduction == 'mean': + return loss.mean() + return loss.sum() + + +def sigmoid_focal_loss(preds, labels, num_pos, alpha, gamma): + '''Args:: + preds: sigmoid activated predictions + labels: one hot encoded labels + num_pos: number of positive samples + alpha: weighting factor to balance +ve and -ve examples + gamma: exponent factor to balance easy and hard examples + Return:: + loss: computed loss, reduced by sum and normalised by num_pos + ''' + loss = F.binary_cross_entropy(preds, labels, reduction='none') + alpha_factor = alpha * labels + (1.0 - alpha) * (1.0 - labels) + pt = preds * labels + (1.0 - preds) * (1.0 - labels) + focal_weight = alpha_factor * ((1-pt) ** gamma) + loss = (loss * focal_weight).sum() / num_pos + return loss + +def get_one_hot_labels(tgt_labels, numc): + new_labels = torch.zeros([tgt_labels.shape[0], numc], device=tgt_labels.device) + new_labels[:, tgt_labels] = 1.0 + return new_labels + + + +class FocalLoss(nn.Module): + def __init__(self, args, alpha=0.25, gamma=2.0): + """Implement the detection loss. 
+ Basically, combines focal classification loss + and Smooth L1 regression loss. + """ + super(FocalLoss, self).__init__() + self.positive_threshold = args.POSTIVE_THRESHOLD + self.negative_threshold = args.NEGTIVE_THRESHOLD + self.num_classes = args.num_classes + self.num_label_types = args.num_label_types + self.num_classes_list = args.num_classes_list + self.alpha = 0.25 + self.gamma = 2.0 + + + def forward(self, confidence, predicted_locations, gt_boxes, gt_labels, counts, anchors, ego_preds, ego_labels): + ## gt_boxes, gt_labels, counts, ancohor_boxes + + """ + + Compute classification loss and smooth l1 loss. + Args: + confidence (batch_size, num_anchors, num_classes): class predictions. + locations (batch_size, num_anchors, 4): predicted locations. + boxes list of len = batch_size and nx4 arrarys + anchors: (num_anchors, 4) + + """ + ego_preds = torch.sigmoid(ego_preds) + ps = confidence.shape + preds = torch.sigmoid(confidence) + # ps = predicted_locations.shape + # predicted_locations = predicted_locations.view(ps[0],ps[1], -1, [-1]) + ball_labels = [] + bgt_locations = [] + blabels_bin = [] + # mask = torch.zeros([preds.shape[0],preds.shape[1]], dtype=torch.int) + + with torch.no_grad(): + # gt_boxes = gt_boxes.cpu() + # gt_labels = gt_labels.cpu() + # anchors = anchors.cpu() + # device = torch.device("cpu") + device = preds.device + zeros_tensor = torch.zeros(1, gt_labels.shape[-1], device=device) + for b in range(gt_boxes.shape[0]): + all_labels = [] + gt_locations = [] + labels_bin = [] + for s in range(gt_boxes.shape[1]): + gt_boxes_batch = gt_boxes[b, s, :counts[b,s], :] + gt_labels_batch = gt_labels[b, s, :counts[b,s], :] + if counts[b,s]>0: + gt_dumy_labels_batch = torch.LongTensor([i for i in range(counts[b,s])]).to(device) + conf, loc = box_utils.match_anchors_wIgnore(gt_boxes_batch, gt_dumy_labels_batch, + anchors, pos_th=self.positive_threshold, nge_th=self.negative_threshold ) + else: + loc = torch.zeros_like(anchors, device=device) + conf = ego_labels.new_zeros(anchors.shape[0], device=device) - 1 + + # print(conf.device) + # print(loc.device) + gt_locations.append(loc) + labels_bin.append(conf) + + dumy_conf = conf.clone() + dumy_conf[dumy_conf<0] = 0 + labels_bs = torch.cat((zeros_tensor, gt_labels_batch),0) + batch_labels = labels_bs[dumy_conf,:] + all_labels.append(batch_labels) + + all_labels = torch.stack(all_labels, 0).float() + gt_locations = torch.stack(gt_locations, 0) + labels_bin = torch.stack(labels_bin, 0).float() + ball_labels.append(all_labels) + bgt_locations.append(gt_locations) + blabels_bin.append(labels_bin) + + all_labels = torch.stack(ball_labels, 0) + gt_locations = torch.stack(bgt_locations, 0) + labels_bin = torch.stack(blabels_bin, 0) + # mask = labels_bin > -1 + # device = ego_preds.device + # all_labels = all_labels.to(device) + # gt_locations = gt_locations.to(device) + # labels_bin = labels_bin.to(device) + + # bgt_locations = [] + # blabels_bin = [] + pos_mask = labels_bin > 0 + num_pos = max(1.0, float(pos_mask.sum())) + + gt_locations = gt_locations[pos_mask].reshape(-1, 4) + predicted_locations = predicted_locations[pos_mask].reshape(-1, 4) + regression_loss = smooth_l1_loss(predicted_locations, gt_locations)/(num_pos * 4.0) + + # if regression_loss.item()>40: + # pdb.set_trace() + + mask = labels_bin > -1 # Get mask to remove ignore examples + + masked_labels = all_labels[mask].reshape(-1, self.num_classes) # Remove Ignore labels + masked_preds = preds[mask].reshape(-1, self.num_classes) # Remove Ignore preds + cls_loss = 
sigmoid_focal_loss(masked_preds, masked_labels, num_pos, self.alpha, self.gamma) + + mask = ego_labels>-1 + numc = ego_preds.shape[-1] + masked_preds = ego_preds[mask].reshape(-1, numc) # Remove Ignore preds + masked_labels = ego_labels[mask].reshape(-1) # Remove Ignore labels + one_hot_labels = get_one_hot_labels(masked_labels, numc) + ego_loss = 0 + if one_hot_labels.shape[0]>0: + ego_loss = sigmoid_focal_loss(masked_preds, one_hot_labels, one_hot_labels.shape[0], self.alpha, self.gamma) + + # print(regression_loss, cls_loss, ego_loss) + return regression_loss, cls_loss/8.0 + ego_loss/4.0 \ No newline at end of file diff --git a/modules/evaluation.py b/modules/evaluation.py new file mode 100644 index 0000000..14d3b09 --- /dev/null +++ b/modules/evaluation.py @@ -0,0 +1,692 @@ +''' + +Author:: Gurkirt Singh + +''' +import copy +import os +import json +import time +import pdb +import pickle +import numpy as np +import scipy.io as io # to save detection as mat files +from data.datasets import is_part_of_subsets, get_filtered_tubes, get_filtered_frames, filter_labels, read_ava_annotations +from data.datasets import get_frame_level_annos_ucf24, get_filtered_tubes_ucf24, read_labelmap +from modules.tube_helper import get_tube_3Diou, make_det_tube +from modules import utils +logger = utils.get_logger(__name__) + +def voc_ap(rec, prec, use_07_metric=False): + """ ap = voc_ap(rec, prec, [use_07_metric]) + Compute VOC AP given precision and recall. + If use_07_metric is true, uses the + VOC 07 11 point method (default:False). + """ + if use_07_metric: + # 11 point metric + ap = 0. + for t in np.arange(0., 1.1, 0.1): + if np.sum(rec >= t) == 0: + p = 0 + else: + p = np.max(prec[rec >= t]) + ap = ap + p / 11. + else: + # correct AP calculation + # first append sentinel values at the end + mrec = np.concatenate(([0.], rec, [1.])) + mpre = np.concatenate(([0.], prec, [0.])) + + # compute the precision envelope + for i in range(mpre.size - 1, 0, -1): + mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) + + # to calculate area under PR curve, look for points + # where X axis (recall) changes value + i = np.where(mrec[1:] != mrec[:-1])[0] + + # and sum (\Delta recall) * prec + ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) + return ap*100 + + +def pr_to_ap(pr): + """ + Compute AP given precision-recall + pr is a Nx2 array with first row being precision and second row being recall + """ + + prdif = pr[1:, 1] - pr[:-1, 1] + prsum = pr[1:, 0] + pr[:-1, 0] + + return np.sum(prdif * prsum * 0.5) + + +def get_gt_of_cls(gt_boxes, cls): + cls_gt_boxes = [] + for i in range(gt_boxes.shape[0]): + if len(gt_boxes.shape) > 1 and int(gt_boxes[i, -1]) == cls: + cls_gt_boxes.append(gt_boxes[i, :-1]) + return np.asarray(cls_gt_boxes) + +def compute_iou_dict(det, cls_gt_boxes): + # print(cls_gt_boxes, type(cls_gt_boxes)) + cls_gt_boxes = cls_gt_boxes.reshape(-1,4) + # print(cls_gt_boxes, type(cls_gt_boxes)) + return compute_iou(det['box'], cls_gt_boxes)[0] + +def compute_iou(box, cls_gt_boxes): + + ious = np.zeros(cls_gt_boxes.shape[0]) + + for m in range(cls_gt_boxes.shape[0]): + gtbox = cls_gt_boxes[m] + + xmin = max(gtbox[0], box[0]) + ymin = max(gtbox[1], box[1]) + xmax = min(gtbox[2], box[2]) + ymax = min(gtbox[3], box[3]) + iw = np.maximum(xmax - xmin, 0.) + ih = np.maximum(ymax - ymin, 0.) 
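+ # iw and ih are the clipped width and height of the intersection rectangle + # between the detection box and the m-th ground-truth box; the IoU computed + # below is intersection / (area(gt) + area(det) - intersection).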
+ if iw > 0 and ih > 0: + intsc = iw*ih + else: + intsc = 0.0 + union = (gtbox[2] - gtbox[0]) * (gtbox[3] - gtbox[1]) + \ + (box[2] - box[0]) * (box[3] - box[1]) - intsc + ious[m] = intsc/union + + return ious + + +def evaluate_detections(gt_boxes, det_boxes, classes=[], iou_thresh=0.5): + + ap_strs = [] + num_frames = len(gt_boxes) + logger.info('Evaluating for '+ str(num_frames) + ' frames') + ap_all = np.zeros(len(classes), dtype=np.float32) + # loop over each class 'cls' + for cls_ind, class_name in enumerate(classes): + scores = np.zeros(num_frames * 2000) + istp = np.zeros(num_frames * 2000) + det_count = 0 + num_postives = 0.0 + for nf in range(num_frames): # loop over each frame 'nf' + # if len(gt_boxes[nf])>0 and len(det_boxes[cls_ind][nf]): + # get frame detections for class cls in nf + frame_det_boxes = np.copy(det_boxes[cls_ind][nf]) + # get gt boxes for class cls in nf frame + cls_gt_boxes = get_gt_of_cls(np.copy(gt_boxes[nf]), cls_ind) + num_postives += cls_gt_boxes.shape[0] + # check if there are dection for class cls in nf frame + if frame_det_boxes.shape[0] > 0: + # sort in descending order + sorted_ids = np.argsort(-frame_det_boxes[:, -1]) + for k in sorted_ids: # start from best scoring detection of cls to end + box = frame_det_boxes[k, :-1] # detection bounfing box + score = frame_det_boxes[k, -1] # detection score + ispositive = False # set ispostive to false every time + # we can only find a postive detection + if cls_gt_boxes.shape[0] > 0: + # if there is atleast one gt bounding for class cls is there in frame nf + # compute IOU between remaining gt boxes + iou = compute_iou(box, cls_gt_boxes) + # and detection boxes + # get the max IOU window gt index + maxid = np.argmax(iou) + # check is max IOU is greater than detection threshold + if iou[maxid] >= iou_thresh: + ispositive = True # if yes then this is ture positive detection + # remove assigned gt box + cls_gt_boxes = np.delete(cls_gt_boxes, maxid, 0) + # fill score array with score of current detection + scores[det_count] = score + if ispositive: + # set current detection index (det_count) + istp[det_count] = 1 + # to 1 if it is true postive example + det_count += 1 + + if num_postives < 1: + num_postives = 1 + + scores = scores[:det_count] + istp = istp[:det_count] + argsort_scores = np.argsort(-scores) # sort in descending order + istp = istp[argsort_scores] # reorder istp's on score sorting + fp = np.cumsum(istp == 0) # get false positives + tp = np.cumsum(istp == 1) # get true positives + fp = fp.astype(np.float64) + tp = tp.astype(np.float64) + recall = tp / float(num_postives) # compute recall + # compute precision + precision = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) + # compute average precision using voc2007 metric + cls_ap = voc_ap(recall, precision) + ap_all[cls_ind] = cls_ap + ap_str = class_name + ' : ' + \ + str(num_postives) + ' : ' + str(det_count) + ' : ' + str(cls_ap) + ap_strs.append(ap_str) + + mAP = np.mean(ap_all) + logger.info('Mean ap '+ str(mAP)) + return mAP, ap_all, ap_strs + + +def evaluate(gts, dets, all_classes, iou_thresh=0.5): + # np.mean(ap_all), ap_all, ap_strs + aps, aps_all, ap_strs = [], [], [] + for nlt in range(len(gts)): + a, b, c = evaluate_detections( + gts[nlt], dets[nlt], all_classes[nlt], iou_thresh) + aps.append(a) + aps_all.append(b) + ap_strs.append(c) + return aps, aps_all, ap_strs + + +def get_class_ap_from_scores(scores, istp, num_postives): + # num_postives = np.sum(istp) + if num_postives < 1: + num_postives = 1 + argsort_scores = np.argsort(-scores) 
# sort in descending order + istp = istp[argsort_scores] # reorder istp's on score sorting + fp = np.cumsum(istp == 0) # get false positives + tp = np.cumsum(istp == 1) # get true positives + fp = fp.astype(np.float64) + tp = tp.astype(np.float64) + recall = tp / float(num_postives) # compute recall + # compute precision + precision = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) + # compute average precision using voc2007 metric + cls_ap = voc_ap(recall, precision) + return cls_ap + + +def evaluate_ego(gts, dets, classes): + ap_strs = [] + num_frames = gts.shape[0] + logger.info('Evaluating for ' + str(num_frames) + ' frames') + + if num_frames<1: + return 0, [0, 0], ['no gts present','no gts present'] + + ap_all = [] + sap = 0.0 + for cls_ind, class_name in enumerate(classes): + scores = dets[:, cls_ind] + istp = np.zeros_like(gts) + istp[gts == cls_ind] = 1 + det_count = num_frames + num_postives = np.sum(istp) + cls_ap = get_class_ap_from_scores(scores, istp, num_postives) + ap_all.append(cls_ap) + sap += cls_ap + ap_str = class_name + ' : ' + \ + str(num_postives) + ' : ' + str(det_count) + ' : ' + str(cls_ap) + ap_strs.append(ap_str) + + mAP = sap/len(classes) + ap_strs.append('FRAME Mean AP:: {:0.2f}'.format(mAP)) + + return mAP, ap_all, ap_strs + + +def get_gt_tubes_ucf(final_annots, subset, label_type): + """Get video list form ground truth videos used in subset + and their ground truth tubes """ + + video_list = [] + tubes = {} + for videoname in final_annots['db']: + if videoname not in final_annots['trainvideos']: + video_list.append(videoname) + tubes[videoname] = get_filtered_tubes( + label_type+'_tubes', final_annots, videoname) + + return video_list, tubes + + +def get_gt_tubes(final_annots, subset, label_type, dataset): + """Get video list form ground truth videos used in subset + and their ground truth tubes """ + + video_list = [] + tubes = {} + for videoname in final_annots['db']: + if dataset == 'road': + cond = is_part_of_subsets(final_annots['db'][videoname]['split_ids'], [subset]) + else: + cond = videoname not in final_annots['trainvideos'] + if cond: + video_list.append(videoname) + if dataset == 'road': + tubes[videoname] = get_filtered_tubes( + label_type+'_tubes', final_annots, videoname) + else: + tubes[videoname] = get_filtered_tubes_ucf24(final_annots['db'][videoname]['annotations']) + + return video_list, tubes + + +def get_det_class_tubes(tubes, cl_id): + class_tubes = [] + for video, video_tubes in tubes.items(): + for tube in video_tubes: + if tube['label_id'] == cl_id: + # scores, boxes = tube['scores'], tube['boxes'] + # frames, label_id = tube['frames'], tube['label_id'] + class_tubes.append([video, tube]) #make_det_tube(scores, boxes, frames, label_id)]) + return class_tubes + + +def get_gt_class_tubes(tubes, cl_id): + class_tubes = {} + for video, video_tubes in tubes.items(): + class_tubes[video] = [] + for tube in video_tubes: + if tube['label_id'] == cl_id: + class_tubes[video].append(tube) + return class_tubes + +def compute_class_ap(class_dets, class_gts, match_func, iou_thresh, metric_type=None): + + fn = max(1, sum([len(class_gts[iid]) + for iid in class_gts])) # false negatives + num_postives = fn + + if len(class_dets) == 0: + return 0,num_postives ,0,0 + pr = np.empty((len(class_dets) + 1, 2), dtype=np.float32) + pr[0, 0] = 1.0 + pr[0, 1] = 0.0 + + + fp = 0 # false positives + tp = 0 # true positives + + scores = np.zeros(len(class_dets)) + istp = np.zeros(len(class_dets)) + + inv_det_scores = np.asarray([-det[1]['score'] for det in 
class_dets]) + indexs = np.argsort(inv_det_scores) + count = 0 + for count, det_id in enumerate(indexs): + is_positive = False + detection = class_dets[det_id] + iid, det = detection + score = det['score'] + # pdb.set_trace() + if len(class_gts[iid]) > 0: + if metric_type is None: + ious = np.asarray([match_func(det, gt) + for gt in class_gts[iid]]) + else: + ious = np.asarray([match_func(det, gt, metric_type) + for gt in class_gts[iid]]) + # print(ious) + max_iou_id = np.argmax(ious) + if ious[max_iou_id] >= iou_thresh: + is_positive = True + del class_gts[iid][max_iou_id] + + scores[count] = score + + if is_positive: + istp[count] = 1 + tp += 1 + fn -= 1 + else: + fp += 1 + + pr[count+1, 0] = float(tp) / float(tp + fp) + pr[count+1, 1] = float(tp) / float(tp + fn) + + class_ap = float(100*pr_to_ap(pr)) + + return class_ap, num_postives, count, pr[count+1, 1] + + +def evaluate_tubes(anno_file, det_file, subset='val_3', dataset='road', iou_thresh=0.2, metric_type='stiou'): + + logger.info('Evaluating tubes for datasets '+ dataset) + logger.info('GT FILE:: '+ anno_file) + logger.info('Result File:: '+ det_file) + + if dataset == 'road': + with open(anno_file, 'r') as fff: + final_annots = json.load(fff) + else: + with open(anno_file, 'rb') as fff: + final_annots = pickle.load(fff) + + with open(det_file, 'rb') as fff: + detections = pickle.load(fff) + + if dataset == 'road': + label_types = final_annots['label_types'] + else: + label_types = ['action'] + + results = {} + for _, label_type in enumerate(label_types): + + if dataset != 'road': + classes = final_annots['classes'] + else: + classes = final_annots[label_type+'_labels'] + + logger.info('Evaluating {} {}'.format(label_type, len(classes))) + ap_all = [] + re_all = [] + ap_strs = [] + sap = 0.0 + video_list, gt_tubes = get_gt_tubes(final_annots, subset, label_type, dataset) + det_tubes = {} + + for videoname in video_list: + det_tubes[videoname] = detections[label_type][videoname] + + for cl_id, class_name in enumerate(classes): + + class_dets = get_det_class_tubes(det_tubes, cl_id) + class_gts = get_gt_class_tubes(gt_tubes, cl_id) + + class_ap, num_postives, count, recall = compute_class_ap(class_dets, class_gts, get_tube_3Diou, iou_thresh, metric_type=metric_type) + + recall = recall*100 + sap += class_ap + ap_all.append(class_ap) + re_all.append(recall) + ap_str = class_name + ' : ' + str(num_postives) + \ + ' : ' + str(count) + ' : ' + str(class_ap) +\ + ' : ' + str(recall) + ap_strs.append(ap_str) + mAP = sap/len(classes) + mean_recall = np.mean(np.asarray(re_all)) + ap_strs.append('\nMean AP:: {:0.2f} mean Recall {:0.2f}'.format(mAP,mean_recall)) + results[label_type] = {'mAP':mAP, 'ap_all':ap_all, 'ap_strs':ap_strs, 'recalls':re_all, 'mR':mean_recall} + logger.info('MAP:: {}'.format(mAP)) + + return results + + + +def get_gt_frames_ucf24(final_annots, label_type): + """Get video list form ground truth videos used in subset + and their ground truth frames """ + + frames = {} + trainvideos = final_annots['trainvideos'] + # labels = final_annots['classes'] + labels = ['action_ness'] + final_annots['classes'] + num_classes = len(labels) + database = final_annots['db'] + for videoname in final_annots['db']: + if videoname not in trainvideos: + numf = database[videoname]['numf'] + fframe_level_annos, _ = get_frame_level_annos_ucf24(database[videoname]['annotations'], numf, num_classes) + for frame_id , frame in enumerate(fframe_level_annos): + frame_name = '{:05d}'.format(int(frame_id+1)) + all_boxes = [] + label = 0 if label_type 
== 'action_ness' else database[videoname]['label'] + for k in range(len(frame['boxes'])): + all_boxes.append([frame['boxes'][k], [label]]) + frames[videoname+frame_name] = all_boxes + + return frames + + +def get_gt_frames_ava(final_annots, label_type): + """Get video list form ground truth videos used in subset + and their ground truth frames """ + + assert label_type in ['action_ness', 'actions'], 'only valid for action classes not for actionness but TODO: should be easy to incorprate just add to eval_framewise_ego_actions_ucf24 as preds are same but gt in this format {}'.format(label_type) + + frames = {} + # trainvideos = final_annots['trainvideos'] + # labels = final_annots['classes'] + # labels = ['action_ness'] + final_annots['classes'] + # num_classes = len(labels) + # database = final_annots['db'] + for videoname in final_annots: + # class_ids_map + for ts in final_annots[videoname]: + boxes = {} + time_stamp = int(ts) + frame_num = int((time_stamp - 900) * 30 + 1) + frame_name = '{:05d}'.format(frame_num) + if ts in final_annots[videoname]: + # assert time_stamp == int(annotations[ts][0][0]) + for anno in final_annots[videoname][ts]: + box_key = '_'.join('{:0.3f}'.format(b) for b in anno[1]) + box = copy.deepcopy(anno[1]) + for bi in range(4): + assert 0<=box[bi]<=1.01, box + box[bi] = min(1.0, max(0, box[bi])) + box[bi] = box[bi]*682 if bi % 2 == 0 else box[bi]*512 + + box = np.asarray(box) + + assert 80>=anno[2]>=1, 'label should be between 1 and 80 but it is {} '.format(anno[2]) + + if box_key not in boxes: + boxes[box_key] = {'box':box, 'labels':[]} + if label_type == 'action_ness': + boxes[box_key]['labels'].append(0) + else: + boxes[box_key]['labels'].append(anno[2]) + + + all_boxes = [] + for box_key in boxes: + all_boxes.append([boxes[box_key]['box'], boxes[box_key]['labels']]) + frames[videoname+frame_name] = all_boxes + + return frames + + +def get_gt_frames(final_annots, subsets, label_type, dataset): + """Get video list form ground truth videos used in subset + and their ground truth frames """ + if dataset == 'road': + # video_list = [] + frames = {} + if not isinstance(subsets, list): + subsets = [subsets] + for videoname in final_annots['db']: + if is_part_of_subsets(final_annots['db'][videoname]['split_ids'], subsets): + # video_list.append(videoname) + frames = get_filtered_frames( + label_type, final_annots, videoname, frames) + elif dataset == 'ucf24': + return get_gt_frames_ucf24(final_annots, label_type) + else: + return get_gt_frames_ava(final_annots, label_type) + + return frames + + +def get_det_class_frames(dets, cl_id, frame_ids, dataset): + class_dets = [] + for frame_id in dets: + if dataset == 'ucf24' or frame_id in frame_ids: + all_frames_dets = dets[frame_id][cl_id] + for i in range(all_frames_dets.shape[0]): + det = {'box':all_frames_dets[i,:4], 'score':all_frames_dets[i,4]} + class_dets.append([frame_id, det]) + return class_dets + + +def get_gt_class_frames(gts, cl_id): + frames = {} + for frame_id, frame in gts.items(): + boxes = [] + for anno in frame: + if cl_id in anno[1]: + boxes.append(anno[0].copy()) + frames[frame_id] = boxes + + return frames + + +def eval_framewise_ego_actions_road(final_annots, detections, subsets): + """Get video list form ground truth videos used in subset + and their ground truth frames """ + + + if not isinstance(subsets, list): + subsets = [subsets] + + label_key = 'av_action' + filtered_gts = [] + filtered_preds = [] + all_labels = final_annots['all_'+label_key+'_labels'] + labels = 
final_annots[label_key+'_labels'] + for videoname in final_annots['db']: + if is_part_of_subsets(final_annots['db'][videoname]['split_ids'], subsets): + # label_key = 'av_actions' + frames = final_annots['db'][videoname]['frames'] + + for frame_id , frame in frames.items(): + # frame_name = '{:05d}'.format(int(frame_id)) + frame_name = '{:05d}'.format(int(frame_id)) + if frame['annotated']>0: + gts = filter_labels(frame[label_key+'_ids'], all_labels, labels) + filtered_gts.append(gts) + frame_name = '{:05d}'.format(int(frame_id)) + filtered_preds.append(detections[videoname+frame_name]) + + gts = np.asarray(filtered_gts) + preds = np.asarray(filtered_preds) + return evaluate_ego(gts, preds, labels) + + +def eval_framewise_ego_actions_ucf24(final_annots, detections, subsets): + """Get video list form ground truth videos used in subset + and their ground truth frames """ + + filtered_gts = [] + filtered_preds = [] + trainvideos = final_annots['trainvideos'] + labels = ['Non_action'] + final_annots['classes'] + num_classes = len(labels) + database = final_annots['db'] + for videoname in final_annots['db']: + if videoname not in trainvideos: + numf = database[videoname]['numf'] + fframe_level_annos, _ = get_frame_level_annos_ucf24(database[videoname]['annotations'], numf, num_classes) + for frame_id , frame in enumerate(fframe_level_annos): + frame_name = '{:05d}'.format(int(frame_id+1)) + gts = [frame['ego_label']] + filtered_gts.append(gts) + filtered_preds.append(detections[videoname+frame_name]) + + gts = np.asarray(filtered_gts) + preds = np.asarray(filtered_preds) + + return evaluate_ego(gts, preds, labels) + + +def eval_framewise_ego_actions(final_annots, detections, subsets, dataset='road'): + if dataset == 'road': + return eval_framewise_ego_actions_road(final_annots, detections, subsets) + else: + return eval_framewise_ego_actions_ucf24(final_annots, detections, subsets) + + +def evaluate_frames(anno_file, det_file, subset, iou_thresh=0.5, dataset='road'): + + + logger.info('Evaluating frames for datasets '+ dataset) + t0 = time.perf_counter() + if dataset == 'road': + with open(anno_file, 'r') as fff: + final_annots = json.load(fff) + elif dataset == 'ucf24': + with open(anno_file, 'rb') as fff: + final_annots = pickle.load(fff) + elif dataset == 'ava': + final_annots = read_ava_annotations(anno_file) + labelmap_file = os.path.join(os. path. 
dirname(anno_file), 'ava_actions.pbtxt') + class_names_ava, class_ids_map, label_map = read_labelmap(labelmap_file) + + with open(det_file, 'rb') as fff: + detections = pickle.load(fff) + + results = {} + if dataset == 'road': + label_types = ['av_actions'] + ['agent_ness'] + final_annots['label_types'] + elif dataset == 'ucf24': + label_types = ['frame_actions', 'action_ness', 'action'] + elif dataset == 'ava': + label_types = ['action_ness', 'actions'] + else: + raise Exception('Define data type prpperly follwong is not in the list ::: '+dataset) + + t1 = time.perf_counter() + logger.info('Time taken to load for evaluation {}'.format(t1-t0)) + for nlt, label_type in enumerate(label_types): + if label_type in ['av_actions', 'frame_actions']: + mAP, ap_all, ap_strs = eval_framewise_ego_actions(final_annots, detections[label_type], subset, dataset) + re_all = [1.0 for _ in range(len(ap_all))] + for apstr in ap_strs: + logger.info(apstr) + else: + # t0 = time.perf_counter() + ap_all = [] + ap_strs = [] + re_all = [] + sap = 0.0 + gt_frames = get_gt_frames(final_annots, subset, label_type, dataset) + t1 = time.perf_counter() + # logger.info('Time taken to get GT frame for evaluation {}'.format(t0-t1)) + if label_type == 'agent_ness': + classes = ['agent_ness'] + elif label_type == 'action_ness': + classes = ['action_ness'] + elif dataset == 'ava': + classes = class_names_ava + elif dataset != 'road': + classes = final_annots['classes'] ## valid for ucf24 + else: + classes = final_annots[label_type+'_labels'] + + for cl_id, class_name in enumerate(classes): + t1 = time.perf_counter() + # print(cl_id, class_name, label_type) + ## gather gt of class "class_name" from frames which are not marked igonre + if dataset == 'ava' and label_type != 'action_ness': + class_gts = get_gt_class_frames(gt_frames, label_map[class_name]['org_id']) + else: + class_gts = get_gt_class_frames(gt_frames, cl_id) + + t2 = time.perf_counter() + + frame_ids = [f for f in class_gts.keys()] + ## gather detection from only that are there in gt or not marked ignore + class_dets = get_det_class_frames(detections[label_type], cl_id, frame_ids, dataset) + t3 = time.perf_counter() + + class_ap, num_postives, count, recall = compute_class_ap(class_dets, class_gts, compute_iou_dict, iou_thresh) + + recall = recall*100 + sap += class_ap + ap_all.append(class_ap) + re_all.append(recall) + ap_str = class_name + ' : ' + str(num_postives) + \ + ' : ' + str(count) + ' : ' + str(class_ap) +\ + ' : ' + str(recall) + ap_strs.append(ap_str) + t4 = time.perf_counter() + + + mAP = sap/len(classes) + mean_recall = np.mean(np.asarray(re_all)) + ap_strs.append('\nMean AP:: {:0.2f} mean Recall {:0.2f}'.format(mAP,mean_recall)) + results[label_type] = {'mAP':mAP, 'ap_all':ap_all, 'ap_strs':ap_strs, 'recalls':re_all, 'mR':mean_recall} + logger.info('{} MAP:: {}'.format(label_type, mAP)) + t1 = time.perf_counter() + logger.info('Time taken to complete evaluation {}'.format(t1-t0)) + return results \ No newline at end of file diff --git a/modules/gen_agent_paths.py b/modules/gen_agent_paths.py new file mode 100644 index 0000000..dce06b4 --- /dev/null +++ b/modules/gen_agent_paths.py @@ -0,0 +1,276 @@ +import numpy as np +import pdb + +def update_agent_paths(live_paths, dead_paths, dets, num_classes_to_use, time_stamp, iouth=0.1, costtype='scoreiou', jumpgap=5, min_len=5): ## trim_threshold=100, keep_num=60, + num_box = dets['boxes'].shape[0] + if len(live_paths) == 0: + # Start a path for each box in first frame + for b in range(num_box): + 
live_paths.append({'boxes': None, 'scores': [], 'allScores': None, 'foundAt': [], 'count': 1}) + live_paths[b]['boxes'] = dets['boxes'][b, :].reshape(1,-1) # bth box x0,y0,x1,y1 at frame t + live_paths[b]['scores'].append(dets['scores'][b]) # action score of bth box at frame t + live_paths[b]['allScores'] = dets['allScores'][b, :].reshape(1,-1) # scores for all action for bth box at frame t + live_paths[b]['foundAt'].append(time_stamp) # frame box was found in + live_paths[b]['count'] = 1 # current box count for bth box tube + else: + # Link each path to detections at frame t + lp_count = len(live_paths) # total paths at time t + dead_count = 0 + covered_boxes = np.zeros(num_box) + path_order_score = np.zeros(lp_count) + avoid_dets = [] + for lp in range(lp_count): + # Check whether path has gone stale + if time_stamp - live_paths[lp]['foundAt'][-1] <= jumpgap: + # IoU scores for path lp + as1 = live_paths[lp]['allScores'][-1,:num_classes_to_use] + as2 = dets['allScores'][:,:num_classes_to_use] + box_to_lp_score = score_of_edge(live_paths[lp], dets, iouth, costtype, avoid_dets, as1, as2, jumpgap) + + if np.sum(box_to_lp_score) > 0.1: + # print('We are here', np.sum(box_to_lp_score)) + # check if there's at least one match to detection in this frame + maxInd = np.argmax(box_to_lp_score) + # m_score = np.max(box_to_lp_score) + live_paths[lp]['count'] = live_paths[lp]['count'] + 1 + live_paths[lp]['boxes'] = np.vstack((live_paths[lp]['boxes'], dets['boxes'][maxInd, :])) + live_paths[lp]['scores'].append(dets['scores'][maxInd]) + live_paths[lp]['allScores'] = np.vstack((live_paths[lp]['allScores'], dets['allScores'][maxInd, :])) + live_paths[lp]['foundAt'].append(time_stamp) + avoid_dets.append(maxInd) + covered_boxes[maxInd] = 1 + + # else: + # live_paths[lp]['lastfound'] += 1 + scores = sorted(np.asarray(live_paths[lp]['scores'])) + num_sc = len(scores) + path_order_score[lp] = np.mean(np.asarray(scores[int(max(0, num_sc - jumpgap-1)):num_sc])) + else: + # If the path is stale, increment the dead_count + dead_count += 1 + + # Sort the path based on score of the boxes and terminate dead path + if len(path_order_score)>1 or dead_count>0: + # print('sorting path') + live_paths, dead_paths = sort_live_paths(live_paths, path_order_score, dead_paths, jumpgap, time_stamp) + + + # start new paths using boxes that are not assigned + lp_count = len(live_paths) + if np.sum(covered_boxes) < num_box: + for b in range(num_box): + if covered_boxes[b] < 0.99: + # print('numb and covered ', num_box, covered_boxes) + live_paths.append({'boxes': [], 'scores': [], 'allScores': None, 'foundAt': [], 'count': 1}) + live_paths[lp_count]['boxes'] = dets['boxes'][b, :].reshape(1,-1) # bth box x0,y0,x1,y1 at frame t + live_paths[lp_count]['scores'].append(dets['scores'][b]) # action score of bth box at frame t + live_paths[lp_count]['allScores'] = dets['allScores'][b, :].reshape(1,-1) # scores for all action for bth box at frame t + live_paths[lp_count]['count'] = 1 # current box count for bth box tube + live_paths[lp_count]['foundAt'].append(time_stamp) # frame box was found in + lp_count += 1 + + # live_paths = trim_paths(live_paths, trim_threshold, keep_num) + # dead_paths = remove_dead_paths(dead_paths, min_len, time_stamp) + + return live_paths, dead_paths + +def trim_paths(live_paths, trim_threshold, keep_num): + lp_count = len(live_paths) + for lp in range(lp_count): + # print(live_paths[lp]['boxes'].shape, live_paths[lp]['allScores'].shape) + if len(live_paths[lp]['boxes']) > trim_threshold: + 
live_paths[lp]['boxes'] = live_paths[lp]['boxes'][-keep_num:, :] + live_paths[lp]['scores'] = live_paths[lp]['scores'][-keep_num:] + live_paths[lp]['allScores'] = live_paths[lp]['allScores'][-keep_num:, :] + live_paths[lp]['foundAt'] = live_paths[lp]['foundAt'][-keep_num:] + return live_paths + + +def remove_dead_paths(live_paths, min_len, time_stamp): + dead_paths = [] + dp_count = 0 + for olp in range(len(dead_paths)): + if len(dead_paths[olp]['boxes']) >= min_len: + dead_paths.append({'boxes': None, 'scores': None, 'allScores': None, + 'foundAt': None, 'count': None}) + dead_paths[dp_count]['boxes'] = live_paths[olp]['boxes'] + dead_paths[dp_count]['scores'] = live_paths[olp]['scores'] + dead_paths[dp_count]['allScores'] = live_paths[olp]['allScores'] + dead_paths[dp_count]['foundAt'] = live_paths[olp]['foundAt'] + dead_paths[dp_count]['count'] = live_paths[olp]['count'] + dp_count += 1 + + return dead_paths + +def sort_live_paths(live_paths, path_order_score, dead_paths, jumpgap, time_stamp): + inds = path_order_score.flatten().argsort()[::-1] + sorted_live_paths = [] + lpc = 0 + dp_count = len(dead_paths) + for lp in range(len(live_paths)): + olp = inds[lp] + if time_stamp-live_paths[olp]['foundAt'][-1] <= jumpgap: + sorted_live_paths.append({'boxes': None, 'scores': None, 'allScores': None, + 'foundAt': None, 'count': None}) + sorted_live_paths[lpc]['boxes'] = live_paths[olp]['boxes'] + sorted_live_paths[lpc]['scores'] = live_paths[olp]['scores'] + sorted_live_paths[lpc]['allScores'] = live_paths[olp]['allScores'] + sorted_live_paths[lpc]['foundAt'] = live_paths[olp]['foundAt'] + sorted_live_paths[lpc]['count'] = live_paths[olp]['count'] + lpc += 1 + else: + dead_paths.append({'boxes': None, 'scores': None, 'allScores': None, + 'foundAt': None, 'count': None}) + dead_paths[dp_count]['boxes'] = live_paths[olp]['boxes'] + dead_paths[dp_count]['scores'] = live_paths[olp]['scores'] + dead_paths[dp_count]['allScores'] = live_paths[olp]['allScores'] + dead_paths[dp_count]['foundAt'] = live_paths[olp]['foundAt'] + dead_paths[dp_count]['count'] = live_paths[olp]['count'] + dp_count = dp_count + 1 + + return sorted_live_paths, dead_paths + +def copy_live_to_dead(live_paths, dead_paths, min_len): + dp_count = len(dead_paths) + for lp in range(len(live_paths)): + # path_score = np.mean(live_paths[lp]['scores']) + # if len(live_paths[lp]['boxes']) >= min_len or path_score > 0.01: + dead_paths.append({'boxes': None, 'scores': None, 'allScores': None, + 'foundAt': None, 'count': None}) + dead_paths[dp_count]['boxes'] = live_paths[lp]['boxes'] + dead_paths[dp_count]['scores'] = live_paths[lp]['scores'] + dead_paths[dp_count]['allScores'] = live_paths[lp]['allScores'] + dead_paths[dp_count]['foundAt'] = live_paths[lp]['foundAt'] + dead_paths[dp_count]['count'] = live_paths[lp]['count'] + dp_count = dp_count + 1 + + return dead_paths + + +def score_of_edge(v1, v2, iouth, costtype, avoid_dets, as1, as2, jumpgap): + + N2 = v2['boxes'].shape[0] + score = np.zeros(N2) + curent_boxes = v1['boxes'][-1,:] + tm = min(jumpgap+1, v1['boxes'].shape[0]) + past_boxes = v1['boxes'][-tm, :] + expected_boxes = curent_boxes + (curent_boxes-past_boxes)/max(1,tm-1) + ious = bbox_overlaps(expected_boxes, v2['boxes']) + if ious.any()>1: + print(ious) + # pdb.set_trace() + for i in range(0, N2): + if ious[i] >= iouth and i not in avoid_dets: + scores2 = v2['scores'][i] + if costtype == 'score': + score[i] = scores2 + elif costtype == 'scoreiou': + score[i] = (scores2 + ious[i])/2 + elif costtype == 'ioul2': + score[i] = 
(scores2 + ious[i])/2 + invl2_diff = 1.0/np.sqrt(np.sum((as1-as2[i,:])**2)) + score[i] += invl2_diff + elif costtype == 'iou': + score[i] = ious[i] + return score + + +def intersect(box_a, box_b): + # A = box_a.size(0) + B = box_b.shape[0] + inters = np.zeros(B) + for b in range(B): + max_x = min(box_a[2], box_b[b, 2]) + max_y = min(box_a[3], box_b[b, 3]) + min_x = max(box_a[0], box_b[b, 0]) + min_y = max(box_a[1], box_b[b, 1]) + inters[b] = (max_x-min_x)*(max_y-min_y) + return inters + +def bbox_overlaps(box_a, box_b): + + inter = intersect(box_a, box_b) + area_a = (box_a[2]-box_a[0])*(box_a[3]-box_a[1]) + B = box_b.shape[0] + ious = np.zeros(B) + for b in range(B): + if inter[b]>0: + area_b = (box_b[b,2] - box_b[b,0]) * (box_b[b,3] - box_b[b,1]) + union = area_a + area_b - inter[b] + ious[b] = inter[b]/union + return ious + +def check_if_sorted(array): + sorted = True + for i in range(len(array)-1): + if array[i]>array[i+1]: + sorted = False + break + return sorted + +def are_there_gaps(array): + gaps = False + for i in range(len(array)-1): + if array[i+1] - array[i] > 1 : + gaps = True + # print(array[i+1], array[i]) + break + return gaps + + +def fill_gaps(paths, min_len_with_gaps=8, minscore=0.3): + lp_count = len(paths) + new_paths = [] + filling_gaps = 0 + for lp in range(lp_count): + path = paths[lp] + path_score = np.mean(path['scores']) + if len(path['boxes']) >= min_len_with_gaps or path_score > minscore: + foundAt = path['foundAt'] + assert sorted(foundAt), 'foundAt should have been sorted i.e., paths should be built incremently' + if are_there_gaps(foundAt): + if len(foundAt)<=min_len_with_gaps: + continue + filling_gaps += 1 + numb = foundAt[-1] - foundAt[0] + 1 + new_path = {'boxes': np.zeros((numb,4)), 'scores': np.zeros(numb), + 'allScores': np.zeros((numb, path['allScores'].shape[1])), + 'foundAt': np.zeros(numb, dtype=np.int32)} + + count = 0 + fn = foundAt[0] + for n in range(len(foundAt)): + next_ = foundAt[n] + if fn == next_ : + new_path['foundAt'][count] = foundAt[n] + new_path['boxes'][count, :] = path['boxes'][n, :] + new_path['scores'][count] = path['scores'][n] + new_path['allScores'][count, :] = path['allScores'][n, :] + count += 1 + fn += 1 + else: + pfn = fn-1 + pcount = count -1 + while fn <= next_: + weight = (fn - pfn) / (next_ - pfn) + new_path['foundAt'][count] = fn + new_path['boxes'][count,:] = new_path['boxes'][pcount,:] + weight*(path['boxes'][n,:] - new_path['boxes'][pcount,:]) + new_path['allScores'][count,:] = new_path['allScores'][pcount,:] + weight*(path['allScores'][n,:] - new_path['allScores'][pcount,:]) + new_path['scores'][count] = new_path['scores'][pcount] + weight*(path['scores'][n] - new_path['scores'][pcount]) + # print(fn, weight, path['boxes'][n,:] - new_path['boxes'][pcount,:], foundAt) + # pdb.set_trace() + fn += 1 + count += 1 + # pdb.set_trace() + assert count == numb, 'count {:d} numb {:d} are not equal'.format(count, numb) + else: + new_path = {'boxes': path['boxes'], 'scores': path['scores'], + 'allScores': path['allScores'], + 'foundAt': path['foundAt']} + + new_paths.append(new_path) + + # paths[lp]['labels'] = paths[lp]['labels'][-keep_num:] + # print('Number of tube paths with gaps are ', filling_gaps) + + return paths diff --git a/modules/solver.py b/modules/solver.py new file mode 100644 index 0000000..4ffff27 --- /dev/null +++ b/modules/solver.py @@ -0,0 +1,76 @@ +import torch, pdb +import torch.optim as optim +# from .madamw import Adam as AdamM +# from .adamw import Adam as AdamW + +from torch.optim.lr_scheduler import 
MultiStepLR + +class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler): + def __init__(self, optimizer, MILESTONES, GAMMAS, last_epoch=-1): + self.MILESTONES = MILESTONES + self.GAMMAS = GAMMAS + assert len(GAMMAS) == len(MILESTONES), 'MILESTONES and GAMMAS should be of same length GAMMAS are of len ' + (len(GAMMAS)) + ' and MILESTONES '+ str(len(MILESTONES)) + super(WarmupMultiStepLR, self).__init__(optimizer, last_epoch) + + def get_lr(self): + if self.last_epoch not in self.MILESTONES: + return [group['lr'] for group in self.optimizer.param_groups] + else: + index = self.MILESTONES.index(self.last_epoch) + return [group['lr'] * self.GAMMAS[index] for group in self.optimizer.param_groups] + + #def print_lr(self): + # print([[group['name'], group['lr']] for group in self.optimizer.param_groups]) + +def get_optim(args, net): + freeze_layers = ['backbone_net.layer'+str(n) for n in range(1, args.FREEZE_UPTO+1)] + params = [] + solver_print_str = '\n\nSolver configs are as follow \n\n\n' + for key, value in net.named_parameters(): + + if args.FREEZE_UPTO>0 and (key.find('backbone.conv1')>-1 or key.find('backbone.bn1')>-1): # Freeze first conv layer and bn layer in resnet + value.requires_grad = False + continue + + if key.find('backbone')>-1: + for layer_id in freeze_layers: + if key.find(layer_id)>-1: + value.requires_grad = False + continue + + if not value.requires_grad: + continue + + lr = args.LR + wd = args.WEIGHT_DECAY + + if args.OPTIM == 'ADAM': + wd = 0.0 + + if "bias" in key: + lr = lr*2.0 + + if args.OPTIM == 'SGD': + params += [{"params": [value], "name":key, "lr": lr, "weight_decay":wd, "momentum":args.MOMENTUM}] + else: + params += [{"params": [value], "name":key, "lr": lr, "weight_decay":wd}] + + print_l = key +' is trained at the rate of ' + str(lr) + print(print_l) + solver_print_str += print_l + '\n' + + + if args.OPTIM == 'SGD': + optimizer = optim.SGD(params) + elif args.OPTIM == 'ADAM': + optimizer = optim.Adam(params) + else: + raise NotImplementedError('Define optimiser type') + + solver_print_str += 'optimizer is '+ args.OPTIM + '\nDone solver configs\n\n' + + #print(args.MILSTONES, args.GAMMAS) + #scheduler = WarmupMultiStepLR(optimizer, args.MILESTONES, args.GAMMAS) + scheduler = MultiStepLR(optimizer, args.MILESTONES, args.GAMMA) + + return optimizer, scheduler, solver_print_str diff --git a/modules/tube_helper.py b/modules/tube_helper.py new file mode 100644 index 0000000..280fccc --- /dev/null +++ b/modules/tube_helper.py @@ -0,0 +1,375 @@ +import numpy as np +import pdb +from modules import utils +import scipy.signal as signal +logger = utils.get_logger(__name__) +from scipy.signal import savgol_filter +# from gen_dets import make_joint_probs_from_marginals +from modules.utils import make_joint_probs_from_marginals + +over_s = 0.0 +under_s = 0.0 +over_e = 0.0 +under_e = 0.0 +oa_s = 0.0 +ua_s = 0.0 +oa_e = 0.0 +ua_e = 0.0 + +def make_det_tube(scores, boxes, frames, label_id): + tube = {} + tube['label_id'] =label_id + tube['scores'] = np.asarray(scores) + tube['boxes'] = np.asarray(boxes) + tube['score'] = np.mean(scores) + tube['frames'] = np.asarray(frames) + # assert tube['frames'].shape[0] == tube['boxes'].shape[0], 'must be equal' + return tube + +def get_nonnp_det_tube(scores, boxes, start, end, label_id, score=None): + tube = {} + tube['label_id'] =label_id + tube['scores'] = scores + tube['boxes'] = boxes + + if score is not None: + tube['score'] = score + else: + tube['score'] = float(np.mean(scores)) + + tube['frames'] = np.asarray([i for i in 
range(start, end)]) + assert len(tube['frames']) == len(tube['boxes']), 'must be equal' + + return tube + +def make_gt_tube(frames, boxes, label_id): + frames = np.asarray(frames) + indexs = np.argsort(frames) + frames = frames[indexs] + boxes = np.asarray(boxes) + if boxes.shape[0]>0: + boxes = boxes[indexs,:] + tube = {} + tube['frames'] = frames + tube['boxes'] = boxes + tube['label_id'] = label_id + return tube + +def trim_tubes(start_id, numc, paths, childs, num_classes_list, topk=5, alpha=3, min_len=3, trim_method='None'): + """ Trim the paths into tubes using DP""" + tubes = [] + for path in paths: + if len(childs)>0: + allScores = make_joint_probs_from_marginals(path['allScores'], childs, num_classes_list, start_id=0) + else: + allScores = path['allScores'] + allScores = allScores[:,start_id:start_id+numc] + path_start_frame = path['foundAt'][0] + if allScores.shape[0]<=min_len: + continue + + # print(allScores.shape) + if trim_method == 'none': # + # print('no trimming') + topk_classes, topk_scores = get_topk_classes(allScores, topk) + for i in range(topk): + label, start, end = topk_classes[i], path_start_frame, allScores.shape[0] + path_start_frame + if end-start+1 > min_len: + # tube = get_nonnp_det_tube(allScores[:,label], path['boxes'], int(start), int(end), int(label)) + tube = get_nonnp_det_tube(allScores[:,label], path['boxes'], int(start), int(end), int(label), score=topk_scores[i]) + tubes.append(tube) + elif trim_method == 'dpscores': ## standarded method Multi class-DP + allScores = path['allScores'][:,start_id:start_id+numc] + score_mat = np.transpose(allScores.copy()) + for _ in range(topk): + (segments, _) = dpEMmax(score_mat, alpha) + # print(segments) + labels, starts, ends = getLabels(segments) + # print(labels, starts, ends) + for i in range(len(labels)): + if ends[i] - starts[i] >= min_len: + scores = score_mat[labels[i], starts[i]:ends[i]+1] + boxes = path['boxes'][starts[i]:ends[i]+1, :] + start = starts[i] + path_start_frame + end = ends[i] + path_start_frame + 1 + tube = get_nonnp_det_tube(scores, boxes, int(start), int(end), int(labels[i])) + tubes.append(tube) + score_mat[labels[i], starts[i]:ends[i]+1] = 0.0 + + elif trim_method == 'dpscorestopn': ## bit fancy only select top segments + score_mat = np.transpose(allScores.copy()) + for _ in range(topk): + (segments, _) = dpEMmax(score_mat, alpha) + # print(segments) + labels, starts, ends = getLabels(segments) + # print(labels, starts, ends) + num_seg = labels.shape[0] + seg_scores = np.zeros(num_seg) + for i in range(min(2,len(labels))): + if ends[i] - starts[i] >= min_len: + scores = score_mat[labels[i], starts[i]:ends[i]+1] + seg_scores[i] = np.mean(scores) + else: + score_mat[labels[i], starts[i]:ends[i]+1] = 0.0 + seg_scores[i] = 0.0 + + inds = np.argsort(-seg_scores) + for ii in range(min(2, num_seg)): + i = inds[ii] + # if ends[i] - starts[i] >= min_len: + scores = score_mat[labels[i], starts[i]:ends[i]+1] + boxes = path['boxes'][starts[i]:ends[i]+1, :] + start = starts[i] + path_start_frame + if boxes.shape[0] != -starts[i] + ends[i] + 1: + print('We have exceptions', boxes.shape[0], -starts[i] + ends[i]+1) + end = ends[i] + path_start_frame + 1 + tube = get_nonnp_det_tube(scores, boxes, int(start), int(end), int(labels[i])) + tubes.append(tube) + score_mat[labels[i], starts[i]:ends[i]+1] = 0.0 + else: #indvidual class-wise dp + aa = 0 + if alpha == 0 and numc == 24: + # alphas = [1, 1, 16, 1, 1, 2, 16, 8, 4, 16, 6, 16, 20, 16, 1, 16, 16, 20, 16, 2, 4, 8, 1, 20] + # alphas = [1, 1, 8, 1, 1, 3, 16, 
16, 2, 16, 3, 16, 20, 16, 1, 8, 8, 8, 16, 2, 2, 8, 1, 20] + # alphas = [1, 5, 16, 8, 1, 3, 16, 16, 16, 3, 8, 16, 16, 16, 1, 5, 16, 16, 5, 2, 1, 8, 3, 16] + # alphas = [1, 3, 16, 2, 1, 3, 8, 16, 16, 3, 3, 16, 16, 16, 1, 5, 16, 8, 5, 2, 1, 16, 2, 16] + alphas = [1, 1, 16, 3, 1, 8, 16, 16, 10, 10, 3, 16, 16, 10, 1, 8, 16, 16, 16, 2, 1, 8, 2, 16] + else: + alphas = np.zeros(numc)+alpha + + topk_classes, topk_scores = get_topk_classes(allScores, topk) + for idx in range(topk_classes.shape[0]): + current_label = int(topk_classes[idx]) + if numc == 24: + in_scores = path['allScores'][:,start_id-1] + else: + in_scores = allScores[:,current_label] + + smooth_scores = signal.medfilt(in_scores, 5) + smooth_scores = in_scores/np.max(smooth_scores) + score_mat = np.hstack((smooth_scores[:, np.newaxis], 1 - smooth_scores[:, np.newaxis])) + score_mat = np.transpose(score_mat.copy()) + (segments, _) = dpEMmax(score_mat, alphas[current_label]) + labels, starts, ends = getLabels(segments) + for i in range(len(labels)): + if ends[i] - starts[i] >= min_len and labels[i]==0: + scores = allScores[starts[i]:ends[i]+1, current_label] + sorted_classes = np.argsort(-scores) + sorted_scores = scores[sorted_classes] + topn = max(1,int(sorted_scores.shape[0]/2)) + mscore = np.mean(sorted_scores[:topn]) + boxes = path['boxes'][starts[i]:ends[i]+1, :] + start = starts[i] + path_start_frame + end = ends[i] + path_start_frame + 1 + sf = max(1,int(start)-aa) + ef = int(end)-(start-sf) + tube = get_nonnp_det_tube(scores, boxes, sf, ef, int(current_label), score=mscore) #topk_scores[idx]) + tubes.append(tube) + # score_mat[labels[i], starts[i]:ends[i]+1] = 0.0 + return tubes + +def getLabels(segments, cls=1): + starts = np.zeros(len(segments), dtype='int32') + ends = np.zeros(len(segments), dtype='int32') + labels = np.zeros(len(segments), dtype='int32') + fl = 0 + i=0 + starts[i]=0 + fl = segments[0] + labels[i] = segments[0] +# print segments[0] +# pdb.set_trace() + for ii in range(len(segments)): + if abs(segments[ii] -fl)>0: + ends[i]=ii-1 + fl = segments[ii] + i+=1 + starts[i]=ii + labels[i] = fl + ends[i] = len(segments)-1 + return labels[:i+1],starts[:i+1],ends[:i+1] + +def get_topk_classes(allScores, topk): + scores = np.zeros(allScores.shape[1]) + # print(scores.shape) + topn = max(1, allScores.shape[1]//4) + for k in range(scores.shape[0]): + temp_scores = allScores[:,k] + sorted_score = np.sort(-temp_scores) + # print(sorted_score[:topn]) + scores[k] = np.mean(-sorted_score[:topn]) + sorted_classes = np.argsort(-scores) + sorted_scores = scores[sorted_classes] + # sorted_scores = sorted_scores/np.sum(sorted_scores) + # print(sorted_scores) + return sorted_classes[:topk], sorted_scores[:topk] + + +def dpEMmax(M, alpha=3): + (r,c) = np.shape(M) + D = np.zeros((r, c+1)) # add an extra column + D[:,0] = 1 # % put the maximum cost + D[:, 1:(c+1)] = M + phi = np.zeros((r,c)) + for j in range(1,c): + for i in range(r): + v1 = np.ones(r)*alpha + v1[i] = 0 + values= D[:, j-1] - v1 + tb = np.argmax(values) + dmax = max(values) + D[i,j] = D[i,j]+dmax + phi[i,j] = tb + + q = c-1 + values= D[:, c-1] + p = np.argmax(values) + i = p + j = q + ps = np.zeros(c) + ps[q] = p + while j>0: + tb = phi[i,j] + j = int(j-1) + q = j + ps[q] = tb + i = int(tb) + + D = D[:,1:] + return (ps,D) + + +def intersect(box_a, box_b): + # A = box_a.size(0) + B = box_b.shape[0] + inters = np.zeros(B) + for b in range(B): + max_x = min(box_a[2], box_b[b, 2]) + max_y = min(box_a[3], box_b[b, 3]) + min_x = max(box_a[0], box_b[b, 0]) + min_y = 
max(box_a[1], box_b[b, 1]) + inters[b] = (max_x-min_x)*(max_y-min_y) + return inters + + +def bbox_overlaps(box_a, box_b): + + inter = intersect(box_a, box_b) + area_a = (box_a[2]-box_a[0])*(box_a[3]-box_a[1]) + B = box_b.shape[0] + ious = np.zeros(B) + for b in range(B): + if inter[b]>0: + area_b = (box_b[b,2] - box_b[b,0]) * (box_b[b,3] - box_b[b,1]) + union = area_a + area_b - inter[b] + ious[b] = inter[b]/union + return ious + + +def get_tube_3Diou(tube_a, tube_b , metric_type='stiou'): + """Compute the spatio-temporal IoU between two tubes""" + + + + tmin = max(tube_a['frames'][0], tube_b['frames'][0]) + tmax = min(tube_a['frames'][-1], tube_b['frames'][-1]) + + if tmax < tmin: return 0.0 + + temporal_inter = tmax - tmin + 1 + temporal_union = max(tube_a['frames'][-1], tube_b['frames'][-1]) - min(tube_a['frames'][0], tube_b['frames'][0]) + 1 + tiou = temporal_inter / temporal_union + if metric_type == 'tiou': + return tiou + # try: + + tube_a_boxes = tube_a['boxes'][int(np.where(tube_a['frames'] == tmin)[0][0]): int( + np.where(tube_a['frames'] == tmax)[0][0]) + 1, :] + tube_b_boxes = tube_b['boxes'][int(np.where(tube_b['frames'] == tmin)[0][0]): int( + np.where(tube_b['frames'] == tmax)[0][0]) + 1, :] + # except: + # pdb.set_trace() print('something', tube_a_boxes, tube_b_boxes, iou) + + siou = iou3d(tube_a_boxes, tube_b_boxes) + + global over_s, over_e, under_s, under_e, oa_s, oa_e, ua_s, ua_e + + if tube_a['frames'][-1]>= tube_b['frames'][-1]: + over_e += 1 + oa_e += tube_a['frames'][-1] - tube_b['frames'][-1] + else: + under_e += 1 + ua_e += tube_a['frames'][-1] - tube_b['frames'][-1] + + if tube_a['frames'][0]<= tube_b['frames'][0]: + over_s += 1 + oa_s += tube_a['frames'][0] - tube_b['frames'][0] + else: + under_s += 1 + ua_s += tube_a['frames'][0] - tube_b['frames'][0] + + # if not (tube_a['frames'][-1]>= tube_b['frames'][-1] and tube_a['frames'][0]<= tube_b['frames'][0]): + # tiou = 1.0 + # logger.info('over_s {} over_e {} under_s {} under_e {} oa_s {} oa_e {} ua_s {} ua_e {}'.format(over_s, over_e, under_s, under_e, oa_s, oa_e, ua_s, ua_e)) + # if siou>0.5 and temporal_inter>= tube_b['frames'][-1]-tube_b['frames'][0]: + # print(tube_b['frames'][0],tube_b['frames'][-1], tube_a['frames'][0],tube_a['frames'][-1], tube_a['scores']) + if metric_type == 'siou': + return siou + else: + return siou * tiou + + +def iou3d(tube_a, tube_b): + """Compute the IoU between two tubes with same temporal extent""" + + assert tube_a.shape[0] == tube_b.shape[0] + # assert np.all(b1[:, 0] == b2[:, 0]) + + ov = overlap2d(tube_a,tube_b) + + return np.mean(ov / (area2d(tube_a) + area2d(tube_b) - ov) ) + + +def area2d(b): + """Compute the areas for a set of 2D boxes""" + + return (b[:,2]-b[:,0]+1) * (b[:,3]-b[:,1]+1) + + +def overlap2d(b1, b2): + """Compute the overlaps between a set of boxes b1 and one box b2""" + + xmin = np.maximum(b1[:,0], b2[:,0]) + ymin = np.maximum(b1[:,1], b2[:,1]) + xmax = np.minimum(b1[:,2] + 1, b2[:,2] + 1) + ymax = np.minimum(b1[:,3] + 1, b2[:,3] + 1) + + width = np.maximum(0, xmax - xmin) + height = np.maximum(0, ymax - ymin) + + return width * height + +def nms3dt(tubes, overlap=0.6): + """Compute NMS of scored tubes. 
Tubes are given as list of (tube, score) + return the list of indices to keep + """ + + if not tubes: + return np.array([], dtype=np.int32) + + I = np.argsort([t['score'] for t in tubes]) + indices = np.zeros(I.size, dtype=np.int32) + counter = 0 + + while I.size > 0: + i = I[-1] + indices[counter] = i + counter += 1 + ious = np.array([get_tube_3Diou(tubes[ii], tubes[i]) for ii in I[:-1]]) + I = I[np.where(ious <= overlap)[0]] + indices = indices[:counter] + final_tubes = [] + for ind in indices: + final_tubes.append(tubes[ind]) + + return final_tubes diff --git a/modules/utils.py b/modules/utils.py new file mode 100644 index 0000000..df6b292 --- /dev/null +++ b/modules/utils.py @@ -0,0 +1,288 @@ +import os, sys +import shutil +import socket +import getpass +import copy +import numpy as np +from modules.box_utils import nms +import datetime +import logging +import torch +import pdb +import torchvision + +# from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/modeling/rpn/anchor_generator.py +class BufferList(torch.nn.Module): + """ + Similar to nn.ParameterList, but for buffers + """ + + def __init__(self, buffers=None): + super(BufferList, self).__init__() + if buffers is not None: + self.extend(buffers) + + def extend(self, buffers): + offset = len(self) + for i, buffer in enumerate(buffers): + self.register_buffer(str(offset + i), buffer) + return self + + def __len__(self): + return len(self._buffers) + + def __iter__(self): + return iter(self._buffers.values()) + +def setup_logger(args): + """ + Sets up the logging. + """ + log_file_name = '{:s}/{:s}-{date:%m-%d-%Hx}.log'.format(args.SAVE_ROOT, args.MODE, date=datetime.datetime.now()) + args.log_dir = 'logs/'+args.exp_name+'/' + if not os.path.isdir(args.log_dir): + os.makedirs(args.log_dir) + + added_log_file = '{}{}-{date:%m-%d-%Hx}.log'.format(args.log_dir, args.MODE, date=datetime.datetime.now()) + + + # Set up logging format. + _FORMAT = "[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s" + + logging.root.handlers = [] + logging.basicConfig( + level=logging.INFO, format=_FORMAT, stream=sys.stdout + ) + logging.getLogger().addHandler(logging.FileHandler(log_file_name, mode='a')) + # logging.getLogger().addHandler(logging.FileHandler(added_log_file, mode='a')) + + +def get_logger(name): + """ + Retrieve the logger with the specified name or, if name is None, return a + logger which is the root logger of the hierarchy. + Args: + name (string): name of the logger. 
+ """ + return logging.getLogger(name) + +def copy_source(source_dir): + if not os.path.isdir(source_dir): + os.system('mkdir -p ' + source_dir) + + for dirpath, dirs, files in os.walk('./', topdown=True): + for file in files: + if file.endswith('.py'): #fnmatch.filter(files, filepattern): + shutil.copy2(os.path.join(dirpath, file), source_dir) + + +def set_args(args): + args.MAX_SIZE = int(args.MIN_SIZE*1.35) + args.MILESTONES = [int(val) for val in args.MILESTONES.split(',')] + #args.GAMMAS = [float(val) for val in args.GAMMAS.split(',')] + args.EVAL_EPOCHS = [int(val) for val in args.EVAL_EPOCHS.split(',')] + + args.TRAIN_SUBSETS = [val for val in args.TRAIN_SUBSETS.split(',') if len(val)>1] + args.VAL_SUBSETS = [val for val in args.VAL_SUBSETS.split(',') if len(val)>1] + args.TEST_SUBSETS = [val for val in args.TEST_SUBSETS.split(',') if len(val)>1] + args.TUBES_EVAL_THRESHS = [ float(val) for val in args.TUBES_EVAL_THRESHS.split(',') if len(val)>0.0001] + args.model_subtype = args.MODEL_TYPE.split('-')[0] + ## check if subsets are okay + possible_subets = ['test', 'train','val'] + for idx in range(1,4): + possible_subets.append('train_'+str(idx)) + possible_subets.append('val_'+str(idx)) + + if len(args.VAL_SUBSETS) < 1 and args.DATASET == 'road': + args.VAL_SUBSETS = [ss.replace('train', 'val') for ss in args.TRAIN_SUBSETS] + if len(args.TEST_SUBSETS) < 1: + # args.TEST_SUBSETS = [ss.replace('train', 'val') for ss in args.TRAIN_SUBSETS] + args.TEST_SUBSETS = args.VAL_SUBSETS + + for subsets in [args.TRAIN_SUBSETS, args.VAL_SUBSETS, args.TEST_SUBSETS]: + for subset in subsets: + assert subset in possible_subets, 'subest should from one of these '+''.join(possible_subets) + + args.DATASET = args.DATASET.lower() + args.ARCH = args.ARCH.lower() + + args.MEANS =[0.485, 0.456, 0.406] + args.STDS = [0.229, 0.224, 0.225] + + username = getpass.getuser() + hostname = socket.gethostname() + args.hostname = hostname + args.user = username + + args.model_init = 'kinetics' + + args.MODEL_PATH = args.MODEL_PATH[:-1] if args.MODEL_PATH.endswith('/') else args.MODEL_PATH + + assert args.MODEL_PATH.endswith('kinetics-pt') or args.MODEL_PATH.endswith('imagenet-pt') + args.model_init = 'imagenet' if args.MODEL_PATH.endswith('imagenet-pt') else 'kinetics' + + if args.MODEL_PATH == 'imagenet': + args.MODEL_PATH = os.path.join(args.MODEL_PATH, args.ARCH+'.pth') + else: + args.MODEL_PATH = os.path.join(args.MODEL_PATH, args.ARCH+args.MODEL_TYPE+'.pth') + + + print('Your working directories are::\nLOAD::> ', args.DATA_ROOT, '\nSAVE::> ', args.SAVE_ROOT) + print('Your model will be initialized using', args.MODEL_PATH) + + return args + + +def create_exp_name(args): + """Create name of experiment using training parameters """ + splits = ''.join([split[0]+split[-1] for split in args.TRAIN_SUBSETS]) + args.exp_name = '{:s}{:s}{:d}-P{:s}-b{:0d}s{:d}x{:d}x{:d}-{:s}{:s}-h{:d}x{:d}x{:d}'.format( + args.ARCH, args.MODEL_TYPE, + args.MIN_SIZE, args.model_init, args.BATCH_SIZE, + args.SEQ_LEN, args.MIN_SEQ_STEP, args.MAX_SEQ_STEP, + args.DATASET, splits, + args.HEAD_LAYERS, args.CLS_HEAD_TIME_SIZE, + args.REG_HEAD_TIME_SIZE, + ) + + args.SAVE_ROOT += args.DATASET+'/' + args.SAVE_ROOT = args.SAVE_ROOT+'cache/'+args.exp_name+'/' + if not os.path.isdir(args.SAVE_ROOT): + print('Create: ', args.SAVE_ROOT) + os.makedirs(args.SAVE_ROOT) + + return args + +# Freeze batch normlisation layers +def set_bn_eval(m): + classname = m.__class__.__name__ + if classname.find('BatchNorm') > -1: + m.eval() + if m.affine: + 
m.weight.requires_grad = False + m.bias.requires_grad = False + + +def get_individual_labels(gt_boxes, tgt_labels): + # print(gt_boxes.shape, tgt_labels.shape) + new_gts = np.zeros((gt_boxes.shape[0]*20, 5)) + ccc = 0 + for n in range(tgt_labels.shape[0]): + for t in range(tgt_labels.shape[1]): + if tgt_labels[n,t]>0: + new_gts[ccc, :4] = gt_boxes[n,:] + new_gts[ccc, 4] = t + ccc += 1 + return new_gts[:ccc,:] + + +def get_individual_location_labels(gt_boxes, tgt_labels): + return [gt_boxes, tgt_labels] + + +def filter_detections(args, scores, decoded_boxes_batch): + c_mask = scores.gt(args.CONF_THRESH) # greater than minmum threshold + scores = scores[c_mask].squeeze() + if scores.dim() == 0 or scores.shape[0] == 0: + return np.asarray([]) + + boxes = decoded_boxes_batch[c_mask, :].view(-1, 4) + ids, counts = nms(boxes, scores, args.NMS_THRESH, args.TOPK*5) # idsn - ids after nms + scores = scores[ids[:min(args.TOPK,counts)]].cpu().numpy() + boxes = boxes[ids[:min(args.TOPK,counts)]].cpu().numpy() + cls_dets = np.hstack((boxes, scores[:, np.newaxis])).astype(np.float32, copy=True) + + return cls_dets + + +def filter_detections_for_tubing(args, scores, decoded_boxes_batch, confidences): + c_mask = scores.gt(args.CONF_THRESH) # greater than minmum threshold + scores = scores[c_mask].squeeze() + if scores.dim() == 0 or scores.shape[0] == 0: + return np.zeros((0,200)) + + boxes = decoded_boxes_batch[c_mask, :].clone().view(-1, 4) + numc = confidences.shape[-1] + confidences = confidences[c_mask,:].clone().view(-1, numc) + + max_k = min(args.TOPK*60, scores.shape[0]) + ids, counts = nms(boxes, scores, args.NMS_THRESH, max_k) # idsn - ids after nms + scores = scores[ids[:min(args.TOPK,counts)]].cpu().numpy() + boxes = boxes[ids[:min(args.TOPK,counts)],:].cpu().numpy() + confidences = confidences[ids[:min(args.TOPK, counts)],:].cpu().numpy() + cls_dets = np.hstack((boxes, scores[:, np.newaxis])).astype(np.float32, copy=True) + save_data = np.hstack((cls_dets, confidences[:,1:])).astype(np.float32) + #print(save_data.shape) + return save_data + + +def filter_detections_for_dumping(args, scores, decoded_boxes_batch, confidences): + c_mask = scores.gt(args.GEN_CONF_THRESH) # greater than minmum threshold + scores = scores[c_mask].squeeze() + if scores.dim() == 0 or scores.shape[0] == 0: + return np.zeros((0,5)), np.zeros((0,200)) + + boxes = decoded_boxes_batch[c_mask, :].clone().view(-1, 4) + numc = confidences.shape[-1] + confidences = confidences[c_mask,:].clone().view(-1, numc) + + # sorted_ind = np.argsort(-scores.cpu().numpy()) + # sorted_ind = sorted_ind[:topk*10] + # boxes_np = boxes.cpu().numpy() + # confidences_np = confidences.cpu().numpy() + # save_data = np.hstack((boxes_np[sorted_ind,:], confidences_np[sorted_ind, :])) + # args.GEN_TOPK, args.GEN_NMS + + max_k = min(args.GEN_TOPK*500, scores.shape[0]) + ids, counts = nms(boxes, scores, args.GEN_NMS, max_k) # idsn - ids after nms + # keepids = torchvision.ops.nms(boxes, scores, args.GEN_NMS) + # pdb.set_trace() + scores = scores[ids[:min(args.GEN_TOPK,counts)]].cpu().numpy() + boxes = boxes[ids[:min(args.GEN_TOPK,counts)],:].cpu().numpy() + confidences = confidences[ids[:min(args.GEN_TOPK, counts)],:].cpu().numpy() + cls_dets = np.hstack((boxes, scores[:, np.newaxis])).astype(np.float32, copy=True) + save_data = np.hstack((cls_dets, confidences[:,1:])).astype(np.float32) + #print(save_data.shape) + return cls_dets, save_data + +def make_joint_probs_from_marginals(frame_dets, childs, num_classes_list, start_id=4): + + # pdb.set_trace() + 
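+    # Illustrative note (assumed column layout, inferred from filter_detections_for_dumping above,
+    # not stated explicitly in the code): each row of frame_dets is laid out as
+    # [x1, y1, x2, y2, objectness, <agent scores>, <action scores>, <location scores>, <duplex scores>, <triplet scores>].
+    # add_list below records where the agent/action/location score blocks start; for every composite
+    # duplex/triplet class, the marginal scores of its child classes (given by childs['duplex_childs']
+    # and childs['triplet_childs']) are multiplied to approximate a joint score, which then overwrites
+    # that composite class's column.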
+ add_list = copy.deepcopy(num_classes_list[:3]) + add_list[0] = start_id+1 + add_list[1] = add_list[0]+add_list[1] + add_list[2] = add_list[1]+add_list[2] + # for ind in range(frame_dets.shape[0]): + for nlt, ltype in enumerate(['duplex','triplet']): + lchilds = childs[ltype+'_childs'] + lstart = start_id + for num in num_classes_list[:4+nlt]: + lstart += num + + for c in range(num_classes_list[4+nlt]): + tmp_scores = [] + for chid, ch in enumerate(lchilds[c]): + if len(tmp_scores)<1: + tmp_scores = copy.deepcopy(frame_dets[:,add_list[chid]+ch]) + else: + tmp_scores *= frame_dets[:,add_list[chid]+ch] + frame_dets[:,lstart+c] = tmp_scores + + return frame_dets + + + +def eval_strings(): + return ["Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = ", + "Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = ", + "Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = ", + "Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = ", + "Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = ", + "Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = ", + "Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = ", + "Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = ", + "Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = ", + "Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = ", + "Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = ", + "Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = "] diff --git a/train_tuber_road.py b/train_tuber_road.py new file mode 100644 index 0000000..083b652 --- /dev/null +++ b/train_tuber_road.py @@ -0,0 +1,104 @@ +import argparse +import datetime +import time + +import torch +import torch.optim +from tensorboardX import SummaryWriter + +from models.tuber_ava import build_model +from utils.model_utils import deploy_model, load_model, save_checkpoint +from utils.video_action_recognition import train_tuber_detection, validate_tuber_detection +from pipelines.video_action_recognition_config import get_cfg_defaults +from pipelines.launch import spawn_workers +from utils.utils import build_log_dir +from datasets.road_frames import build_dataloader +from utils.lr_scheduler import build_scheduler + + +def main_worker(cfg): + # create tensorboard and logs + if cfg.DDP_CONFIG.GPU_WORLD_RANK == 0: + tb_logdir = build_log_dir(cfg) + writer = SummaryWriter(log_dir=tb_logdir) + else: + writer = None + # cfg.freeze() + + # create model + print('Creating TubeR model: %s' % cfg.CONFIG.MODEL.NAME) + model, criterion, postprocessors = build_model(cfg) + model = deploy_model(model, cfg, is_tuber=True) + num_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) + print('Number of parameters in the model: %6.2fM' % (num_parameters / 1000000)) + + # create dataset and dataloader + train_loader, val_loader, train_sampler, val_sampler, mg_sampler = build_dataloader(cfg) + + + # create criterion + criterion = criterion.cuda() + + param_dicts = [ + {"params": [p for n, p in model.named_parameters() if "backbone" not in n and "class_embed" not in n and "query_embed" not in n and p.requires_grad]}, + { + "params": [p for n, p in model.named_parameters() if "backbone" in n and p.requires_grad], + "lr": cfg.CONFIG.TRAIN.LR_BACKBONE, + }, + { + "params": [p for n, p in model.named_parameters() if "class_embed" in n and p.requires_grad], + "lr": cfg.CONFIG.TRAIN.LR, #10 + }, + { + 
"params": [p for n, p in model.named_parameters() if "query_embed" in n and p.requires_grad], + "lr": cfg.CONFIG.TRAIN.LR, #10 + }, + ] + + # create optimizer + optimizer = torch.optim.AdamW(param_dicts, lr=cfg.CONFIG.TRAIN.LR, weight_decay=cfg.CONFIG.TRAIN.W_DECAY) + + # create lr scheduler + if cfg.CONFIG.TRAIN.LR_POLICY == "step": + lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[30,60], gamma=0.1) + else: + lr_scheduler = build_scheduler(cfg, optimizer, len(train_loader)) + + # docs: add resume option + if cfg.CONFIG.MODEL.LOAD: + model, _ = load_model(model, cfg, load_fc=cfg.CONFIG.MODEL.LOAD_FC) + + print('Start training...') + start_time = time.time() + max_accuracy = 0.0 + for epoch in range(cfg.CONFIG.TRAIN.START_EPOCH, cfg.CONFIG.TRAIN.EPOCH_NUM): + if cfg.DDP_CONFIG.DISTRIBUTED: + train_sampler.set_epoch(epoch) + + train_tuber_detection(cfg, model, criterion, train_loader, optimizer, epoch, cfg.CONFIG.LOSS_COFS.CLIPS_MAX_NORM, lr_scheduler, writer) + + if cfg.DDP_CONFIG.GPU_WORLD_RANK == 0 and ( + epoch % cfg.CONFIG.LOG.SAVE_FREQ == 0 or epoch == cfg.CONFIG.TRAIN.EPOCH_NUM - 1): + save_checkpoint(cfg, epoch, model, max_accuracy, optimizer, lr_scheduler) + + if epoch % cfg.CONFIG.VAL.FREQ == 0 or epoch == cfg.CONFIG.TRAIN.EPOCH_NUM - 1: + validate_tuber_detection(cfg, model, criterion, postprocessors, val_loader, epoch, writer) + + if writer is not None: + writer.close() + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('Training time {}'.format(total_time_str)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Train video action recognition transformer models.') + parser.add_argument('--config-file', + default='/xxx/TubeR_AVA_v2.1_CSN-152.yaml', + help='path to config file.') + args = parser.parse_args() + + cfg = get_cfg_defaults() + cfg.merge_from_file(args.config_file) + spawn_workers(main_worker, cfg) diff --git a/train_tuber_roadpp.py b/train_tuber_roadpp.py new file mode 100644 index 0000000..083b652 --- /dev/null +++ b/train_tuber_roadpp.py @@ -0,0 +1,104 @@ +import argparse +import datetime +import time + +import torch +import torch.optim +from tensorboardX import SummaryWriter + +from models.tuber_ava import build_model +from utils.model_utils import deploy_model, load_model, save_checkpoint +from utils.video_action_recognition import train_tuber_detection, validate_tuber_detection +from pipelines.video_action_recognition_config import get_cfg_defaults +from pipelines.launch import spawn_workers +from utils.utils import build_log_dir +from datasets.road_frames import build_dataloader +from utils.lr_scheduler import build_scheduler + + +def main_worker(cfg): + # create tensorboard and logs + if cfg.DDP_CONFIG.GPU_WORLD_RANK == 0: + tb_logdir = build_log_dir(cfg) + writer = SummaryWriter(log_dir=tb_logdir) + else: + writer = None + # cfg.freeze() + + # create model + print('Creating TubeR model: %s' % cfg.CONFIG.MODEL.NAME) + model, criterion, postprocessors = build_model(cfg) + model = deploy_model(model, cfg, is_tuber=True) + num_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) + print('Number of parameters in the model: %6.2fM' % (num_parameters / 1000000)) + + # create dataset and dataloader + train_loader, val_loader, train_sampler, val_sampler, mg_sampler = build_dataloader(cfg) + + + # create criterion + criterion = criterion.cuda() + + param_dicts = [ + {"params": [p for n, p in model.named_parameters() if 
"backbone" not in n and "class_embed" not in n and "query_embed" not in n and p.requires_grad]}, + { + "params": [p for n, p in model.named_parameters() if "backbone" in n and p.requires_grad], + "lr": cfg.CONFIG.TRAIN.LR_BACKBONE, + }, + { + "params": [p for n, p in model.named_parameters() if "class_embed" in n and p.requires_grad], + "lr": cfg.CONFIG.TRAIN.LR, #10 + }, + { + "params": [p for n, p in model.named_parameters() if "query_embed" in n and p.requires_grad], + "lr": cfg.CONFIG.TRAIN.LR, #10 + }, + ] + + # create optimizer + optimizer = torch.optim.AdamW(param_dicts, lr=cfg.CONFIG.TRAIN.LR, weight_decay=cfg.CONFIG.TRAIN.W_DECAY) + + # create lr scheduler + if cfg.CONFIG.TRAIN.LR_POLICY == "step": + lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[30,60], gamma=0.1) + else: + lr_scheduler = build_scheduler(cfg, optimizer, len(train_loader)) + + # docs: add resume option + if cfg.CONFIG.MODEL.LOAD: + model, _ = load_model(model, cfg, load_fc=cfg.CONFIG.MODEL.LOAD_FC) + + print('Start training...') + start_time = time.time() + max_accuracy = 0.0 + for epoch in range(cfg.CONFIG.TRAIN.START_EPOCH, cfg.CONFIG.TRAIN.EPOCH_NUM): + if cfg.DDP_CONFIG.DISTRIBUTED: + train_sampler.set_epoch(epoch) + + train_tuber_detection(cfg, model, criterion, train_loader, optimizer, epoch, cfg.CONFIG.LOSS_COFS.CLIPS_MAX_NORM, lr_scheduler, writer) + + if cfg.DDP_CONFIG.GPU_WORLD_RANK == 0 and ( + epoch % cfg.CONFIG.LOG.SAVE_FREQ == 0 or epoch == cfg.CONFIG.TRAIN.EPOCH_NUM - 1): + save_checkpoint(cfg, epoch, model, max_accuracy, optimizer, lr_scheduler) + + if epoch % cfg.CONFIG.VAL.FREQ == 0 or epoch == cfg.CONFIG.TRAIN.EPOCH_NUM - 1: + validate_tuber_detection(cfg, model, criterion, postprocessors, val_loader, epoch, writer) + + if writer is not None: + writer.close() + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('Training time {}'.format(total_time_str)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Train video action recognition transformer models.') + parser.add_argument('--config-file', + default='/xxx/TubeR_AVA_v2.1_CSN-152.yaml', + help='path to config file.') + args = parser.parse_args() + + cfg = get_cfg_defaults() + cfg.merge_from_file(args.config_file) + spawn_workers(main_worker, cfg) diff --git a/utils/video_action_recognition.py b/utils/video_action_recognition.py index 065f12a..20aa3e6 100644 --- a/utils/video_action_recognition.py +++ b/utils/video_action_recognition.py @@ -92,14 +92,18 @@ def train_tuber_detection(cfg, model, criterion, data_loader, optimizer, epoch, # metric_logger.add_meter('class_error', utils.SmoothedValue(window_size=1, fmt='{value:.2f}')) header = 'Epoch: [{}]'.format(epoch) print_freq = 10 - + skip_iter = False for idx, data in enumerate(data_loader): + # if idx > 10: + # break data_time.update(time.time() - end) # for samples, targets in metric_logger.log_every(data_loader, print_freq, epoch, ddp_params, writer, header): device = "cuda:" + str(cfg.DDP_CONFIG.GPU) samples = data[0] + + if cfg.CONFIG.TWO_STREAM: samples2 = data[1] targets = data[2] @@ -116,10 +120,23 @@ def train_tuber_detection(cfg, model, criterion, data_loader, optimizer, epoch, else: lfb_features = data[-1] lfb_features = lfb_features.to(device) + + + + # for target in targets: + # if len(target['boxes']) == 0: + # skip_iter = True + # break + # if skip_iter: + # print("skip iteration ...") + # skip_iter=False + # continue + for t in targets: del t["image_id"] 
samples = samples.to(device) targets = [{k: v.to(device) for k, v in t.items()} for t in targets] + if cfg.CONFIG.TWO_STREAM: if cfg.CONFIG.USE_LFB: @@ -190,7 +207,8 @@ def train_tuber_detection(cfg, model, criterion, data_loader, optimizer, epoch, class_err.update(loss_dict_reduced['class_error'], len(targets)) if cfg.CONFIG.MATCHER.BNY_LOSS: - losses_ce_b.update(loss_dict_reduced['loss_ce_b'].item(), len(targets)) + losses_ce_b.update(loss_dict_reduced['loss_ce_b'].item(), len(targets)) + if not math.isfinite(loss_value): print("Loss is {}, stopping training".format(loss_value)) @@ -254,6 +272,8 @@ def validate_tuber_detection(cfg, model, criterion, postprocessors, data_loader, print("all tmp files removed") for idx, data in enumerate(data_loader): + # if idx > 10: + # break data_time.update(time.time() - end) # for samples, targets in metric_logger.log_every(data_loader, print_freq, epoch, ddp_params, writer, header): @@ -302,6 +322,11 @@ def validate_tuber_detection(cfg, model, criterion, postprocessors, data_loader, else: outputs = model(samples) + # print(outputs) + # print(targets) + # print(rr) + + loss_dict = criterion(outputs, targets) weight_dict = criterion.weight_dict
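+        # Note (illustrative, DETR-style reduction assumed rather than shown in this hunk): criterion
+        # returns a dict of unweighted loss terms, and the scalar used for logging is typically formed as
+        #   losses = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict)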