merge uapi paddledetection (#8957)

zhangyubo0722 · web-flow · commit c84a59564cca · 2024-05-16T10:44:15.000+08:00
diff --git a/deploy/auto_compression/configs/rtdetr_r50vd_qat_dis.yaml b/deploy/auto_compression/configs/rtdetr_r50vd_qat_dis.yaml
@@ -1,6 +1,6 @@
 
 Global:
-  reader_config: conigs/rtdetr_reader.yml
+  reader_config: configs/rtdetr_reader.yml
   include_nms: True
   Evaluation: True
   model_dir: ./rtdetr_r50vd_6x_coco/ 
diff --git a/deploy/python/keypoint_infer.py b/deploy/python/keypoint_infer.py
@@ -90,7 +90,7 @@ def __init__(self,
             cpu_threads=cpu_threads,
             enable_mkldnn=enable_mkldnn,
             output_dir=output_dir,
-            threshold=threshold, 
+            threshold=threshold,
             use_fd_format=use_fd_format)
         self.use_dark = use_dark
 
diff --git a/ppdet/data/source/coco.py b/ppdet/data/source/coco.py
@@ -14,6 +14,7 @@
 
 import os
 import copy
+
 try:
     from collections.abc import Sequence
 except Exception:
@@ -23,10 +24,11 @@
 from .dataset import DetDataset
 
 from ppdet.utils.logger import setup_logger
+
 logger = setup_logger(__name__)
 
 __all__ = [
-    'COCODataSet', 'SlicedCOCODataSet', 'SemiCOCODataSet', 'COCODetDataset'
+    'COCODataSet', 'SlicedCOCODataSet', 'SemiCOCODataSet', 'COCODetDataset', 'COCOInstSegDataset'
 ]
 
 
@@ -127,7 +129,7 @@ def parse_dataset(self):
             if im_w < 0 or im_h < 0:
                 logger.warning('Illegal width: {} or height: {} in annotation, '
                                'and im_id: {} will be ignored'.format(
-                                   im_w, im_h, img_id))
+                    im_w, im_h, img_id))
                 continue
 
             coco_rec = {
@@ -334,7 +336,7 @@ def parse_dataset(self):
             if im_w < 0 or im_h < 0:
                 logger.warning('Illegal width: {} or height: {} in annotation, '
                                'and im_id: {} will be ignored'.format(
-                                   im_w, im_h, img_id))
+                    im_w, im_h, img_id))
                 continue
 
             slice_image_result = sahi.slicing.slice_image(
@@ -437,7 +439,7 @@ def parse_dataset(self):
             if im_w < 0 or im_h < 0:
                 logger.warning('Illegal width: {} or height: {} in annotation, '
                                'and im_id: {} will be ignored'.format(
-                                   im_w, im_h, img_id))
+                    im_w, im_h, img_id))
                 continue
 
             coco_rec = {
@@ -594,3 +596,10 @@ def __getitem__(self, idx):
 @serializable
 class COCODetDataset(COCODataSet):
     pass
+
+
+# Alias of COCODataSet registered under a separate name for PaddleX
+@register
+@serializable
+class COCOInstSegDataset(COCODataSet):
+    pass
diff --git a/ppdet/engine/callbacks.py b/ppdet/engine/callbacks.py
@@ -182,6 +182,8 @@ def on_epoch_end(self, status):
         weight = None
         save_name = None
         if dist.get_world_size() < 2 or dist.get_rank() == 0:
+            end_epoch = self.model.cfg.epoch
+            save_name = str(epoch_id) if epoch_id != end_epoch - 1 else "model_final"
             if mode == 'train':
                 end_epoch = self.model.cfg.epoch
                 if (
@@ -191,29 +193,36 @@ def on_epoch_end(self, status):
                         epoch_id) if epoch_id != end_epoch - 1 else "model_final"
                     weight = self.weight.state_dict()
             elif mode == 'eval':
-                if 'save_best_model' in status and status['save_best_model']:
-                    for metric in self.model._metrics:
-                        map_res = metric.get_results()
-                        eval_func = "ap"
-                        if 'pose3d' in map_res:
-                            key = 'pose3d'
-                            eval_func = "mpjpe"
-                        elif 'bbox' in map_res:
-                            key = 'bbox'
-                        elif 'keypoint' in map_res:
-                            key = 'keypoint'
-                        else:
-                            key = 'mask'
-
-                        key = self.model.cfg.get('target_metrics', key)
-
-                        if key not in map_res:
-                            logger.warning("Evaluation results empty, this may be due to " \
-                                        "training iterations being too few or not " \
-                                        "loading the correct weights.")
-                            return
-                        if map_res[key][0] >= self.best_ap:
-                            self.best_ap = map_res[key][0]
+                for metric in self.model._metrics:
+                    map_res = metric.get_results()
+                    eval_func = "ap"
+                    if 'pose3d' in map_res:
+                        key = 'pose3d'
+                        eval_func = "mpjpe"
+                    elif 'bbox' in map_res:
+                        key = 'bbox'
+                    elif 'keypoint' in map_res:
+                        key = 'keypoint'
+                    else:
+                        key = 'mask'
+
+                    key = self.model.cfg.get('target_metrics', key)
+
+                    if key not in map_res:
+                        logger.warning("Evaluation results empty, this may be due to " \
+                                    "training iterations being too few or not " \
+                                    "loading the correct weights.")
+                        return
+                    epoch_ap = map_res[key][0]
+                    epoch_metric = {
+                        'metric': abs(epoch_ap),
+                        'epoch': epoch_id + 1
+                    }
+                    save_path = os.path.join(self.save_dir, f"{save_name}.pdstates")
+                    paddle.save(epoch_metric, save_path)
+                    if 'save_best_model' in status and status['save_best_model']:
+                        if epoch_ap >= self.best_ap:
+                            self.best_ap = epoch_ap
                             save_name = 'best_model'
                             weight = self.weight.state_dict()
                             best_metric = {
diff --git a/ppdet/modeling/architectures/detr.py b/ppdet/modeling/architectures/detr.py
@@ -104,7 +104,7 @@ def _forward(self):
             else:
                 bbox, bbox_num, mask = self.post_process(
                     preds, self.inputs['im_shape'], self.inputs['scale_factor'],
-                    self.inputs['image'])[2:].shape
+                    self.inputs['image'][2:].shape)
 
             output = {'bbox': bbox, 'bbox_num': bbox_num}
             if self.with_mask:
diff --git a/ppdet/modeling/transformers/mask_rtdetr_transformer.py b/ppdet/modeling/transformers/mask_rtdetr_transformer.py
@@ -53,13 +53,15 @@ def __init__(self,
                  hidden_dim,
                  decoder_layer,
                  num_layers,
-                 eval_idx=-1):
+                 eval_idx=-1,
+                 eval_topk=100):
         super(MaskTransformerDecoder, self).__init__()
         self.layers = _get_clones(decoder_layer, num_layers)
         self.hidden_dim = hidden_dim
         self.num_layers = num_layers
         self.eval_idx = eval_idx if eval_idx >= 0 \
             else num_layers + eval_idx
+        self.eval_topk = eval_topk
 
     def forward(self,
                 tgt,
diff --git a/ppdet/utils/cam_utils.py b/ppdet/utils/cam_utils.py
@@ -121,7 +121,9 @@ def __init__(self, FLAGS, cfg):
         self.num_class = cfg.num_classes
         # set hook for extraction of featuremaps and grads
         self.set_hook(cfg)
-        self.nms_idx_need_divid_numclass_arch = ['FasterRCNN', 'MaskRCNN', 'CascadeRCNN']
+        self.nms_idx_need_divid_numclass_arch = [
+            'FasterRCNN', 'MaskRCNN', 'CascadeRCNN'
+        ]
         """
         In these networks, the bbox array shape before nms contain num_class,
         the nms_keep_idx of the bbox need to divide the num_class; 
@@ -141,7 +143,7 @@ def build_trainer(self, cfg):
         trainer.load_weights(cfg.weights)
 
         # set the flag so that extra_data from before NMS can be obtained
-        trainer.model.use_extra_data=True
+        trainer.model.use_extra_data = True
         # set the flag so that the bbox indices before NMS are recorded
         if cfg.architecture in ['FasterRCNN', 'MaskRCNN']:
             trainer.model.bbox_post_process.nms.return_index = True
@@ -152,14 +154,12 @@ def build_trainer(self, cfg):
             else:
                 # anchor free YOLOs: PP-YOLOE, PP-YOLOE+
                 trainer.model.yolo_head.nms.return_index = True
-        elif cfg.architecture=='BlazeFace' or cfg.architecture=='SSD':
+        elif cfg.architecture == 'BlazeFace' or cfg.architecture == 'SSD':
             trainer.model.post_process.nms.return_index = True
-        elif cfg.architecture=='RetinaNet':
+        elif cfg.architecture == 'RetinaNet':
             trainer.model.head.nms.return_index = True
         else:
-            print(
-                cfg.architecture+' is not supported for cam temporarily!'
-            )
+            print(cfg.architecture + ' is not supported for cam temporarily!')
             sys.exit()
         # Todo: Unify the head/post_process name in each model
 
@@ -169,19 +169,23 @@ def set_hook(self, cfg):
         # set hook for extraction of featuremaps and grads
         self.target_feats = {}
         self.target_layer_name = cfg.target_feature_layer_name
+
         # such as trainer.model.backbone, trainer.model.bbox_head.roi_extractor
 
         def hook(layer, input, output):
             self.target_feats[layer._layer_name_for_hook] = output
 
         try:
-            exec('self.trainer.'+self.target_layer_name+'._layer_name_for_hook = self.target_layer_name')
+            exec('self.trainer.' + self.target_layer_name +
+                 '._layer_name_for_hook = self.target_layer_name')
             # self.trainer.target_layer_name._layer_name_for_hook = self.target_layer_name
-            exec('self.trainer.'+self.target_layer_name+'.register_forward_post_hook(hook)')
+            exec('self.trainer.' + self.target_layer_name +
+                 '.register_forward_post_hook(hook)')
             # self.trainer.target_layer_name.register_forward_post_hook(hook)
         except:
             print("Error! "
-                  "The target_layer_name--"+self.target_layer_name+" is not in model! "
+                  "The target_layer_name--" + self.target_layer_name +
+                  " is not in model! "
                   "Please check the spelling and "
                   "the network's architecture!")
             sys.exit()
@@ -228,7 +232,7 @@ def get_bboxes_cams(self):
             # currently, this only includes the RCNN architectures (fasterrcnn, maskrcnn, cascadercnn);
             before_nms_indexes = extra_data['nms_keep_idx'].cpu().numpy(
             ) // self.num_class  # num_class
-        else :
+        else:
             before_nms_indexes = extra_data['nms_keep_idx'].cpu().numpy()
 
         # Calculate and visualize the heatmap of per predict bbox
@@ -240,7 +244,7 @@ def get_bboxes_cams(self):
 
             target_bbox_before_nms = int(before_nms_indexes[index])
 
-            if len(extra_data['scores'].shape)==2:
+            if len(extra_data['scores'].shape) == 2:
                 score_out = extra_data['scores'][target_bbox_before_nms]
             else:
                 score_out = extra_data['scores'][0, :, target_bbox_before_nms]
@@ -250,7 +254,6 @@ def get_bboxes_cams(self):
                 2) [num_of_image, num_classes, num_of_yolo_bboxes_before_nms], for example: [1, 80, 1000]
             """
 
-
             # construct one_hot label and do backward to get the gradients
             predicted_label = paddle.argmax(score_out)
             label_onehot = paddle.nn.functional.one_hot(
@@ -266,8 +269,8 @@ def get_bboxes_cams(self):
                     # when the feature map consists of multiple scales,
                     # take the feature map of the last scale
                     # Todo: fuse the cam results from multiscale feature maps
-                    if self.target_feats[self.target_layer_name][
-                            -1].shape[-1]==1:
+                    if self.target_feats[self.target_layer_name][-1].shape[
+                            -1] == 1:
                         """
                         if the last level featuremap is 1x1 size,
                         we take the second last one
@@ -286,11 +289,12 @@ def get_bboxes_cams(self):
                         self.target_layer_name].grad.squeeze().cpu().numpy()
                     cam_feat = self.target_feats[
                         self.target_layer_name].squeeze().cpu().numpy()
-            else: # roi level feature
+            else:  # roi level feature
                 cam_grad = self.target_feats[
-                    self.target_layer_name].grad.squeeze().cpu().numpy()[target_bbox_before_nms]
-                cam_feat = self.target_feats[
-                    self.target_layer_name].squeeze().cpu().numpy()[target_bbox_before_nms]
+                    self.target_layer_name].grad.squeeze().cpu().numpy()[
+                        target_bbox_before_nms]
+                cam_feat = self.target_feats[self.target_layer_name].squeeze(
+                ).cpu().numpy()[target_bbox_before_nms]
 
             # grad_cam:
             exp = grad_cam(cam_feat, cam_grad)
@@ -305,23 +309,25 @@ def get_bboxes_cams(self):
                 # reshape the cam image to the input image size
                 resized_exp = resize_cam(exp, (img.shape[1], img.shape[0]))
                 mask = np.zeros((img.shape[0], img.shape[1], 3))
-                mask[int(target_bbox[3]):int(target_bbox[5]), int(target_bbox[2]):
-                     int(target_bbox[4]), :] = 1
+                mask[int(target_bbox[3]):int(target_bbox[5]), int(target_bbox[
+                    2]):int(target_bbox[4]), :] = 1
                 resized_exp = resized_exp * mask
                 # add the bbox cam back to the input image
                 overlay_vis = np.uint8(resized_exp * 0.4 + img * 0.6)
             elif 'roi' in self.target_layer_name:
                 # get the bbox part of the image
-                bbox_img = copy.deepcopy(img[int(target_bbox[3]):int(target_bbox[5]),
-                                         int(target_bbox[2]):int(target_bbox[4]), :])
+                bbox_img = copy.deepcopy(img[int(target_bbox[3]):int(
+                    target_bbox[5]), int(target_bbox[2]):int(target_bbox[
+                        4]), :])
                 # reshape the cam image to the bbox size
-                resized_exp = resize_cam(exp, (bbox_img.shape[1], bbox_img.shape[0]))
+                resized_exp = resize_cam(exp,
+                                         (bbox_img.shape[1], bbox_img.shape[0]))
                 # add the bbox cam back to the bbox image
                 bbox_overlay_vis = np.uint8(resized_exp * 0.4 + bbox_img * 0.6)
                 # put the bbox_cam image to the original image
                 overlay_vis = copy.deepcopy(img)
-                overlay_vis[int(target_bbox[3]):int(target_bbox[5]),
-                    int(target_bbox[2]):int(target_bbox[4]), :] = bbox_overlay_vis
+                overlay_vis[int(target_bbox[3]):int(target_bbox[5]), int(
+                    target_bbox[2]):int(target_bbox[4]), :] = bbox_overlay_vis
             else:
                 print(
                     'Only supported cam for  backbone/neck feature and roi feature,  the others are not supported temporarily!'
diff --git a/ppdet/utils/visualizer.py b/ppdet/utils/visualizer.py
@@ -89,7 +89,7 @@ def draw_bbox(image, im_id, catid2name, bboxes, threshold):
     Draw bbox on image
     """
     font_url = "https://paddledet.bj.bcebos.com/simfang.ttf"
-    font_path , _ = get_path(font_url, "~/.cache/paddle/")
+    font_path, _ = get_path(font_url, "~/.cache/paddle/")
     font_size = 18
     font = ImageFont.truetype(font_path, font_size, encoding="utf-8")
 
diff --git a/tools/box_distribution.py b/tools/box_distribution.py