-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathface_utils.py
More file actions
234 lines (196 loc) · 10.7 KB
/
face_utils.py
File metadata and controls
234 lines (196 loc) · 10.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
from itertools import product
from typing import Dict, List, Optional, Tuple, Union, Any
import numpy as np
import tensorflow as tf
from detection_tools.core.post_processing import batch_multiclass_non_max_suppression
class SCRFDPostProc(object):
    """Post-processor for raw SCRFD face-detection model outputs.

    Collects the per-branch output tensors, decodes bounding boxes and
    (optionally) facial landmarks against precomputed anchors, then applies
    score thresholding and non-maximum suppression.

    Attributes:
        NUM_CLASSES (int): Number of classes predicted by the detection model.
        NUM_LANDMARKS (int): Number of landmark coordinate values (5 points * x/y).
        LABEL_OFFSET (int): Offset added to class predictions after NMS.
    """

    # The following params are corresponding to those used for training the model
    NUM_CLASSES = 1
    NUM_LANDMARKS = 10
    LABEL_OFFSET = 1

    def __init__(
        self,
        image_dims: Tuple[int, int] = (300, 300),
        nms_iou_thresh: float = 0.6,
        score_threshold: float = 0.3,
        anchors: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Initialize the SCRFD post-processor.

        Args:
            image_dims (Tuple[int, int]): Input image dimensions (height, width).
            nms_iou_thresh (float): IoU threshold for non-maximum suppression.
            score_threshold (float): Minimum confidence score for detections.
            anchors (Optional[Dict[str, Any]]): Anchor metadata with keys
                'min_sizes' (per-branch anchor sizes) and 'steps' (per-branch strides).

        Raises:
            ValueError: If the anchors dictionary is not provided.
        """
        # Fail fast *before* indexing into `anchors`: the original computed
        # len(anchors["steps"]) first, relying on an `if anchors else 0` fallback
        # to avoid a TypeError before the documented ValueError could be raised.
        if anchors is None:
            raise ValueError("Missing detection anchors metadata")
        self._image_dims = image_dims
        self._nms_iou_thresh = nms_iou_thresh
        self._score_threshold = score_threshold
        self._num_branches = len(anchors["steps"])
        self._anchors = self.extract_anchors(anchors["min_sizes"], anchors["steps"])

    def collect_box_class_predictions(
        self, output_branches: List[tf.Tensor]
    ) -> Tuple[tf.Tensor, tf.Tensor, Optional[tf.Tensor]]:
        """Collect and reshape predictions from the model output branches.

        Output tensors are expected grouped per branch, in the order
        [boxes, classes(, landmarks), boxes, classes(, landmarks), ...].

        Args:
            output_branches (List[tf.Tensor]): Ordered list of model output tensors.

        Returns:
            Tuple containing:
                - box_predictors (tf.Tensor): [batch, num_anchors, 4] box predictions.
                - class_predictors (tf.Tensor): [batch, num_anchors, NUM_CLASSES] scores.
                - landmarks_predictors (Optional[tf.Tensor]): [batch, num_anchors,
                  NUM_LANDMARKS] landmark predictions, or None when absent.

        Raises:
            AssertionError: If the branches don't share a common number of output
                nodes (NOTE: asserts are stripped under `python -O`).
        """
        box_predictors_list = []
        class_predictors_list = []
        landmarks_predictors_list = []
        num_branches = self._num_branches
        assert len(output_branches) % num_branches == 0, "All branches must have the same number of output nodes"
        num_output_nodes_per_branch = len(output_branches) // num_branches
        for branch_index in range(0, len(output_branches), num_output_nodes_per_branch):
            batch_size, _, _, _ = tf.unstack(tf.shape(output_branches[branch_index]))
            box_predictors_list.append(tf.reshape(output_branches[branch_index], shape=[batch_size, -1, 4]))
            class_predictors_list.append(
                tf.reshape(output_branches[branch_index + 1], shape=[batch_size, -1, self.NUM_CLASSES])
            )
            if num_output_nodes_per_branch > 2:
                # A third node per branch is assumed to hold the landmarks.
                landmarks_predictors_list.append(
                    tf.reshape(output_branches[branch_index + 2], shape=[batch_size, -1, self.NUM_LANDMARKS])
                )
        box_predictors = tf.concat(box_predictors_list, axis=1)
        class_predictors = tf.concat(class_predictors_list, axis=1)
        landmarks_predictors = tf.concat(landmarks_predictors_list, axis=1) if landmarks_predictors_list else None
        return box_predictors, class_predictors, landmarks_predictors

    def extract_anchors(self, min_sizes: List[List[int]], steps: List[int]) -> tf.Tensor:
        """Generate anchor boxes for the SCRFD model.

        Args:
            min_sizes (List[List[int]]): Minimum sizes for anchor boxes at each
                feature level; the per-level length fixes the anchors-per-cell count.
            steps (List[int]): Stride for each feature level.

        Returns:
            tf.Tensor: All anchor boxes, shape [num_anchors, 4], where each row is
                [center_x, center_y, width, height] in normalized coordinates.
        """
        anchors = []
        for stride, min_size in zip(steps, min_sizes):
            height = self._image_dims[0] // stride
            width = self._image_dims[1] // stride
            num_anchors = len(min_size)
            # mgrid is reversed so the last axis becomes (x, y) cell coordinates.
            anchor_centers = np.stack(np.mgrid[:height, :width][::-1], axis=-1).astype(np.float32)
            anchor_centers = (anchor_centers * stride).reshape((-1, 2))
            # NOTE(review): x is divided by image_dims[0] (height) and y by
            # image_dims[1] (width) — only equivalent for square inputs; confirm
            # intent before using non-square image_dims.
            anchor_centers[:, 0] /= self._image_dims[0]
            anchor_centers[:, 1] /= self._image_dims[1]
            if num_anchors > 1:
                # Duplicate each cell center once per anchor at that location.
                anchor_centers = np.stack([anchor_centers] * num_anchors, axis=1).reshape((-1, 2))
            anchor_scales = np.ones_like(anchor_centers, dtype=np.float32) * stride
            anchor_scales[:, 0] /= self._image_dims[0]
            anchor_scales[:, 1] /= self._image_dims[1]
            anchor = np.concatenate([anchor_centers, anchor_scales], axis=1)
            anchors.append(anchor)
        return tf.convert_to_tensor(np.concatenate(anchors, axis=0))

    def _decode_landmarks(self, landmarks_detections: tf.Tensor, anchors: tf.Tensor) -> tf.Tensor:
        """Decode landmark predictions from SCRFD model output.

        Each (x, y) offset pair is scaled by the anchor size and added to the
        anchor center.

        Args:
            landmarks_detections (tf.Tensor): Raw landmark offsets, [-1, NUM_LANDMARKS].
            anchors (tf.Tensor): Matching anchors, [-1, 4] as [cx, cy, w, h].

        Returns:
            tf.Tensor: Decoded landmark coordinates in normalized image space.
        """
        preds = []
        for i in range(0, self.NUM_LANDMARKS, 2):
            px = anchors[:, 0] + landmarks_detections[:, i] * anchors[:, 2]
            py = anchors[:, 1] + landmarks_detections[:, i + 1] * anchors[:, 3]
            preds.append(px)
            preds.append(py)
        return tf.stack(preds, axis=-1)

    def _decode_boxes(self, box_detections: tf.Tensor, anchors: tf.Tensor) -> tf.Tensor:
        """Decode bounding box predictions from SCRFD model output.

        Uses distance-from-center decoding: the four raw values are distances to
        the left/top/right/bottom edges, scaled by the anchor size.

        Args:
            box_detections (tf.Tensor): Raw box offsets, [-1, 4].
            anchors (tf.Tensor): Matching anchors, [-1, 4] as [cx, cy, w, h].

        Returns:
            tf.Tensor: Decoded boxes in normalized image space, [x1, y1, x2, y2].
        """
        x1 = anchors[:, 0] - box_detections[:, 0] * anchors[:, 2]
        y1 = anchors[:, 1] - box_detections[:, 1] * anchors[:, 3]
        x2 = anchors[:, 0] + box_detections[:, 2] * anchors[:, 2]
        y2 = anchors[:, 1] + box_detections[:, 3] * anchors[:, 3]
        return tf.stack([x1, y1, x2, y2], axis=-1)

    def tf_postproc(self, results: Dict[str, np.ndarray]) -> Dict[str, tf.Tensor]:
        """Post-process SCRFD model outputs to get final detection results.

        Steps:
            1. Match raw outputs to branches by tensor shape and reshape them.
            2. Decode bounding boxes (and landmarks, when present) against anchors.
            3. Apply score thresholding and non-maximum suppression.
            4. Offset class labels and assemble the result dictionary.

        Args:
            results (Dict[str, np.ndarray]): Raw model outputs keyed by node name.

        Returns:
            Dict[str, tf.Tensor]: Dictionary containing:
                - detection_boxes: Final bounding boxes.
                - detection_scores: Confidence scores for each detection.
                - detection_classes: Class labels for each detection.
                - num_detections: Number of valid detections.
                - face_landmarks: (Optional) Landmark coordinates if available.
        """
        # Outputs are located by shape, so every output tensor must have a unique
        # (H, W, C) shape — duplicate shapes would silently overwrite each other.
        result_from_shape = {v.shape: np.expand_dims(v, axis=0) for v in results.values()}
        # NOTE(review): these feature-map sizes correspond to a 640x640 input with
        # strides (8, 16, 32), not the (300, 300) default image_dims — confirm.
        # Channels: presumably 4 box coords / 1 score / 10 landmark values, each
        # times 2 anchors per cell — verify against the exported model.
        sizes = ((80, 80), (40, 40), (20, 20))
        channels = (8, 2, 20)
        endnodes = [result_from_shape[sz + (ch,)] for sz in sizes for ch in channels]
        with tf.name_scope("Postprocessor"):
            box_predictions, classes_predictions, landmarks_predictors = self.collect_box_class_predictions(endnodes)
            additional_fields = {}
            detection_scores = classes_predictions
            batch_size, num_proposals = tf.unstack(tf.slice(tf.shape(box_predictions), [0], [2]))
            # Broadcast the shared anchors across the batch so decoding can run flat.
            tiled_anchor_boxes = tf.tile(tf.expand_dims(self._anchors, 0), [batch_size, 1, 1])
            tiled_anchors_boxlist = tf.reshape(tiled_anchor_boxes, [-1, 4])
            decoded_boxes = self._decode_boxes(tf.reshape(box_predictions, (-1, 4)), tiled_anchors_boxlist)
            detection_boxes = tf.reshape(decoded_boxes, [batch_size, num_proposals, 4])
            decoded_landmarks = None
            if tf.is_tensor(landmarks_predictors):
                decoded_landmarks = self._decode_landmarks(
                    tf.reshape(landmarks_predictors, (-1, self.NUM_LANDMARKS)), tiled_anchors_boxlist
                )
                decoded_landmarks = tf.reshape(decoded_landmarks, [batch_size, num_proposals, self.NUM_LANDMARKS])
                additional_fields["landmarks"] = decoded_landmarks
            # NMS expects boxes shaped [batch, num_boxes, q, 4]; `axis` must be a
            # scalar int (the original passed the list [2]).
            detection_boxes = tf.identity(tf.expand_dims(detection_boxes, axis=2), "raw_box_locations")
            (nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks, nmsed_additional_fields, num_detections) = (
                batch_multiclass_non_max_suppression(
                    boxes=detection_boxes,
                    scores=detection_scores,
                    score_thresh=self._score_threshold,
                    iou_thresh=self._nms_iou_thresh,
                    additional_fields=additional_fields,
                    max_size_per_class=1000,
                    max_total_size=1000,
                )
            )
            # Adding offset to the class prediction and cast to integer.
            nmsed_classes = tf.cast(tf.add(nmsed_classes, self.LABEL_OFFSET), tf.int16)
            results = {
                "detection_boxes": nmsed_boxes,
                "detection_scores": nmsed_scores,
                "detection_classes": nmsed_classes,
                "num_detections": num_detections,
            }
            nmsed_additional_fields = nmsed_additional_fields or {}
            face_landmarks = nmsed_additional_fields.get("landmarks")
            if tf.is_tensor(face_landmarks):
                results["face_landmarks"] = face_landmarks
            return results