From 3059a7e81e9c78ea29be0ce2c841b82254965088 Mon Sep 17 00:00:00 2001
From: Zhao
Date: Tue, 20 Apr 2021 15:41:24 -0700
Subject: [PATCH 01/21] add a base tensorrt backend

---
 vcap_utils/vcap_utils/backends/__init__.py    |   1 +
 .../vcap_utils/backends/base_tensorrt.py      | 267 ++++++++++++++++++
 2 files changed, 268 insertions(+)
 create mode 100644 vcap_utils/vcap_utils/backends/base_tensorrt.py

diff --git a/vcap_utils/vcap_utils/backends/__init__.py b/vcap_utils/vcap_utils/backends/__init__.py
index 15ce457..b38a1dc 100644
--- a/vcap_utils/vcap_utils/backends/__init__.py
+++ b/vcap_utils/vcap_utils/backends/__init__.py
@@ -8,6 +8,7 @@ from .openface_encoder import OpenFaceEncoder
 from .base_encoder import BaseEncoderBackend
 from .backend_rpc_process import BackendRpcProcess
+from .base_tensorrt import BaseTensorRTBackend
 from .load_utils import parse_dataset_metadata_bytes, parse_tf_model_bytes
 from .predictions import (
     EncodingPrediction,
diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
new file mode 100644
index 0000000..9df6e4c
--- /dev/null
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -0,0 +1,267 @@
+import numpy as np
+import pycuda.driver as cuda
+import tensorrt as trt
+import pycuda.autoinit
+
+from typing import Dict, List, Tuple, Optional, Any
+
+from vcap import (
+    Crop,
+    DetectionNode,
+    Resize,
+    DETECTION_NODE_TYPE,
+    OPTION_TYPE,
+    BaseStreamState,
+    BaseBackend,
+    rect_to_coords,
+)
+
+
+class HostDeviceMem(object):
+    def __init__(self, host_mem, device_mem):
+        self.host = host_mem
+        self.device = device_mem
+
+    def __str__(self):
+        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)
+
+    def __repr__(self):
+        return self.__str__()
+
+
+def load_engine(trt_runtime, engine_data):
+    engine = trt_runtime.deserialize_cuda_engine(engine_data)
+    return engine
+
+
+class AllocatedBuffer:
+    def __init__(self, inputs_, outputs_, bindings_, stream_):
+        self.inputs = inputs_
+        self.outputs = outputs_
+        self.bindings = bindings_
+        self.stream = stream_
+
+
+class BaseTensorRTBackend(BaseBackend):
+    def __init__(self, engine_bytes, width, height):
+        super().__init__()
+        TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
+        self.trt_runtime = trt.Runtime(TRT_LOGGER)
+        # load the engine
+        self.trt_engine = load_engine(self.trt_runtime, engine_bytes)
+        # create execution context
+        self.context = self.trt_engine.create_execution_context()
+        # create buffers for inference
+        self.buffers = {}
+        for batch_size in range(1, self.trt_engine.max_batch_size + 1):
+            inputs, outputs, bindings, stream = self.allocate_buffers(
+                batch_size=batch_size)
+            self.buffers[batch_size] = AllocatedBuffer(inputs, outputs, bindings,
+                                                       stream)
+
+        self.engine_width = width
+        self.engine_height = height
+
+        self._prepare_post_process()
+
+    def batch_predict(self, input_data_list: List[Any]) -> List[Any]:
+        task_size = len(input_data_list)
+        curr_index = 0
+        while curr_index < task_size:
+            if curr_index + self.trt_engine.max_batch_size <= task_size:
+                end_index = curr_index + self.trt_engine.max_batch_size
+            else:
+                end_index = task_size
+            batch = input_data_list[curr_index:end_index]
+            curr_index = end_index
+            for result in self._process_batch(batch):
+                yield result
+
+    def _process_batch(self, input_data: List[np.array]):
+        batch_size = len(input_data)
+        batched_image = np.concatenate(input_data, axis=0)
+        prepared_buffer = self.buffers[batch_size]
+        inputs = prepared_buffer.inputs
+        outputs = prepared_buffer.outputs
+        bindings = prepared_buffer.bindings
+        stream = prepared_buffer.stream
+        np.copyto(inputs[0].host, batched_image.ravel())
+        detections = self.do_inference(
+            bindings=bindings, inputs=inputs, outputs=outputs, stream=stream,
+            batch_size=batch_size
+        )
+        return detections
+
+    def process_frame(self, frame: np.ndarray, detection_node: DETECTION_NODE_TYPE,
+                      options: Dict[str, OPTION_TYPE],
+                      state: BaseStreamState) -> DETECTION_NODE_TYPE:
+        pass
+
+    def prepare_inputs(self, frame: np.ndarray, transpose: bool, normalize: bool,
+                       mean_subtraction: Optional[Tuple] = None) -> \
+            Tuple[np.array, Resize]:
+        resize = Resize(frame).resize(self.engine_width, self.engine_height,
+                                      Resize.ResizeType.EXACT)
+
+        if transpose:
+            resize.frame = np.transpose(resize.frame, (2, 0, 1))
+        if normalize:
+            resize.frame = (1.0 / 255.0) * resize.frame
+        if mean_subtraction is not None:
+            if len(mean_subtraction) != 3:
+                raise RuntimeError("Invalid mean subtraction")
+            resize.frame = resize.frame.astype("float64")
+            resize.frame[..., 0] -= mean_subtraction[0]
+            resize.frame[..., 1] -= mean_subtraction[1]
+            resize.frame[..., 2] -= mean_subtraction[2]
+        return resize.frame, resize
+
+    def allocate_buffers(self, batch_size=1):
+        """Allocates host and device buffer for TRT engine inference.
+        This function is similar to the one in common.py, but
+        converts network outputs (which are np.float32) appropriately
+        before writing them to Python buffer. This is needed, since
+        TensorRT plugins don't support output type description, and
+        in our particular case, we use NMS plugin as network output.
+        Args:
+            engine (trt.ICudaEngine): TensorRT engine
+            batch_size: batch size for the input/output memory
+        Returns:
+            inputs [HostDeviceMem]: engine input memory
+            outputs [HostDeviceMem]: engine output memory
+            bindings [int]: buffer to device bindings
+            stream (cuda.Stream): cuda stream for engine inference synchronization
+        """
+        inputs = []
+        outputs = []
+        bindings = []
+        stream = cuda.Stream()
+        for binding in self.trt_engine:
+            size = trt.volume(self.trt_engine.get_binding_shape(binding)) * batch_size
+            dtype = trt.nptype(self.trt_engine.get_binding_dtype(binding))
+            # Allocate host and device buffers
+            host_mem = cuda.pagelocked_empty(size, dtype)
+            device_mem = cuda.mem_alloc(host_mem.nbytes)
+            # Append the device buffer to device bindings.
+            bindings.append(int(device_mem))
+            # Append to the appropriate list.
+            if self.trt_engine.binding_is_input(binding):
+                inputs.append(HostDeviceMem(host_mem, device_mem))
+            else:
+                outputs.append(HostDeviceMem(host_mem, device_mem))
+        return inputs, outputs, bindings, stream
+
+    def do_inference(self, bindings, inputs, outputs, stream, batch_size=1):
+        # Transfer input data to the GPU.
+        [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
+        # Run inference.
+        # todo: try execute synchronously
+        self.context.execute_async(
+            batch_size=batch_size, bindings=bindings, stream_handle=stream.handle
+        )
+        # Transfer predictions back from the GPU.
+        [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
+        # Synchronize the stream
+        stream.synchronize()
+        # Return only the host outputs.
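+        # Each output binding was allocated for the whole batch, so out.host
+        # holds batch_size results back to back; split it per batch item,
+        # then regroup below so each final_outputs entry maps to one input.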
+        batch_outputs = []
+        for out in outputs:
+            entire_out_array = np.array(out.host)
+            out_array_by_batch = np.split(entire_out_array, batch_size)
+            out_lists = [out_array.tolist() for out_array in out_array_by_batch]
+            batch_outputs.append(out_lists)
+        final_outputs = []
+        for i in range(len(batch_outputs[0])):
+            final_output = []
+            for batch_output in batch_outputs:
+                final_output.append(batch_output[i])
+            final_outputs.append(final_output)
+        return final_outputs
+
+    def _prepare_post_process(self):
+        self.stride = 16
+        self.box_norm = 35.0
+        self.grid_h = int(self.engine_height / self.stride)
+        self.grid_w = int(self.engine_width / self.stride)
+        self.grid_size = self.grid_h * self.grid_w
+
+        self.grid_centers_w = []
+        self.grid_centers_h = []
+
+        for i in range(self.grid_h):
+            value = (i * self.stride + 0.5) / self.box_norm
+            self.grid_centers_h.append(value)
+
+        for i in range(self.grid_w):
+            value = (i * self.stride + 0.5) / self.box_norm
+            self.grid_centers_w.append(value)
+
+    def _apply_box_norm(self, o1, o2, o3, o4, x, y):
+        """
+        Applies the GridNet box normalization
+        Args:
+            o1 (float): first argument of the result
+            o2 (float): second argument of the result
+            o3 (float): third argument of the result
+            o4 (float): fourth argument of the result
+            x: row index on the grid
+            y: column index on the grid
+
+        Returns:
+            float: rescaled first argument
+            float: rescaled second argument
+            float: rescaled third argument
+            float: rescaled fourth argument
+        """
+        o1 = (o1 - self.grid_centers_w[x]) * -self.box_norm
+        o2 = (o2 - self.grid_centers_h[y]) * -self.box_norm
+        o3 = (o3 + self.grid_centers_w[x]) * self.box_norm
+        o4 = (o4 + self.grid_centers_h[y]) * self.box_norm
+        return o1, o2, o3, o4
+
+    def postprocess(self, outputs, min_confidence, analysis_classes, wh_format=True):
+        """
+        Postprocesses the inference output
+        Args:
+            outputs (list of float): inference output
+            min_confidence (float): min confidence to accept detection
+            analysis_classes (list of int): indices of the classes to consider
+
+        Returns: list of list tuple: each element is a two list tuple (x, y) representing the corners of a bb
+        """
+        # print(len(outputs))
+        bbs = []
+        class_ids = []
+        scores = []
+        for c in analysis_classes:
+
+            x1_idx = c * 4 * self.grid_size
+            y1_idx = x1_idx + self.grid_size
+            x2_idx = y1_idx + self.grid_size
+            y2_idx = x2_idx + self.grid_size
+
+            boxes = outputs[0]
+            for h in range(self.grid_h):
+                for w in range(self.grid_w):
+                    i = w + h * self.grid_w
+                    score = outputs[1][c * self.grid_size + i]
+                    if score >= min_confidence:
+                        o1 = boxes[x1_idx + w + h * self.grid_w]
+                        o2 = boxes[y1_idx + w + h * self.grid_w]
+                        o3 = boxes[x2_idx + w + h * self.grid_w]
+                        o4 = boxes[y2_idx + w + h * self.grid_w]
+
+                        o1, o2, o3, o4 = self._apply_box_norm(o1, o2, o3, o4, w, h)
+
+                        xmin = int(o1)
+                        ymin = int(o2)
+                        xmax = int(o3)
+                        ymax = int(o4)
+                        if wh_format:
+                            bbs.append([xmin, ymin, xmax - xmin, ymax - ymin])
+                        else:
+                            bbs.append([xmin, ymin, xmax, ymax])
+                        class_ids.append(c)
+                        scores.append(float(score))
+
+        return bbs, class_ids, scores

From 17d6c48118c36724f91d3407fdf4cdc77416da60 Mon Sep 17 00:00:00 2001
From: Zhao
Date: Wed, 21 Apr 2021 18:26:23 -0700
Subject: [PATCH 02/21] ravel before concatenate to improve performance

---
 .../vcap_utils/backends/base_tensorrt.py      | 63 +++++++++++++++++--
 1 file changed, 59 insertions(+), 4 deletions(-)

diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
index 9df6e4c..8e73f37 100644
--- a/vcap_utils/vcap_utils/backends/base_tensorrt.py
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -1,7 +1,10 @@
 import numpy as np
+import cupy as cp
+
 import pycuda.driver as cuda
 import tensorrt as trt
 import pycuda.autoinit
+import time
 
 from typing import Dict, List, Tuple, Optional, Any
 
@@ -78,14 +81,54 @@ def batch_predict(self, input_data_list: List[Any]) -> List[Any]:
             for result in self._process_batch(batch):
                 yield result
 
     def _process_batch(self, input_data: List[np.array]):
+        pre_batch_time = time.time()
         batch_size = len(input_data)
-        batched_image = np.concatenate(input_data, axis=0)
+        # input_data_cuppy = [cp.array(data) for data in input_data]
+        """Ideas
+        1) Try raveling before concatenate
+        2) Instead of concatenating, try generating an array of the size and shape of what the concatenated image WOULD
+        be, then copy the raveled images into their respective places in the array
+        """
+        ravel_time = time.time()
+        raveled_input = [data.ravel() for data in input_data]
+        print("batch_size:", batch_size, "ravel time:", int(round((time.time() - ravel_time) * 1000)),
+              int(round((time.time() - ravel_time) * 1000)) / batch_size)
+
+        concatenate_time = time.time()
+        batched_image = np.concatenate(raveled_input, axis=0)
+        #image_size = len(raveled_input[0])
+        #batched_image = np.zeros((1, batch_size * image_size))
+        #for index, image in enumerate(batched_image):
+        #    batched_image[index * image_size:(index + 1) * image_size] = image
+        print("batch_size:", batch_size, "concatenate time:", int(round((time.time() - concatenate_time) * 1000)),
+              int(round((time.time() - concatenate_time) * 1000)) / batch_size)
+
+        # image_size = self.engine_height * self.engine_width
+        # batched_image = np.zeros((1, batch_size * image_size))
+        # for index, image in enumerate(batched_image):
+        #     for row_index, row in enumerate(image):
+        #         batched_image[index * image_size + row_index * self.engine_width:(index+1) * image_size + row_index * self.engine_width] = row
+
+        # for data in input_data:
+        #     batch_image_array.append(data.ravel())
+
+        # batched_image = np.concatenate(input_data, axis=0)
+        # print(type(batched_image))
         prepared_buffer = self.buffers[batch_size]
         inputs = prepared_buffer.inputs
         outputs = prepared_buffer.outputs
         bindings = prepared_buffer.bindings
         stream = prepared_buffer.stream
-        np.copyto(inputs[0].host, batched_image.ravel())
+        copy_time = time.time()
+        # raveled_image = batched_image.ravel()
+
+        # np.copyto(inputs[0].host, raveled_image)
+        np.copyto(inputs[0].host, batched_image)
+        print("batch_size:", batch_size, "copy time:", int(round((time.time() - copy_time) * 1000)),
+              int(round((time.time() - copy_time) * 1000)) / batch_size)
+
+        print("batch_size:", batch_size, "pre_batch_time:", int(round((time.time() - pre_batch_time) * 1000)),
+              int(round((time.time() - pre_batch_time) * 1000)) / batch_size)
         detections = self.do_inference(
             bindings=bindings, inputs=inputs, outputs=outputs, stream=stream,
             batch_size=batch_size
@@ -100,8 +143,12 @@ def process_frame(self, frame: np.ndarray, detection_node: DETECTION_NODE_TYPE,
     def prepare_inputs(self, frame: np.ndarray, transpose: bool, normalize: bool,
                        mean_subtraction: Optional[Tuple] = None) -> \
             Tuple[np.array, Resize]:
+        pre_process_start_time = time.time()
+        # h, w, c = frame.shape
+        # print(h, w, self.engine_height, self.engine_width)
         resize = Resize(frame).resize(self.engine_width, self.engine_height,
                                       Resize.ResizeType.EXACT)
+        # print("resize take:", int(round((time.time() - pre_process_start_time) * 1000)))
 
         if transpose:
             resize.frame = np.transpose(resize.frame, (2, 0, 1))
@@ -114,6 +161,7 @@ def prepare_inputs(self, frame: np.ndarray, transpose: bool, normalize: bool,
             resize.frame[..., 0] -= mean_subtraction[0]
             resize.frame[..., 1] -= mean_subtraction[1]
             resize.frame[..., 2] -= mean_subtraction[2]
+        # print("prepare input take:", int(round((time.time() - pre_process_start_time) * 1000)))
         return resize.frame, resize
 
     def allocate_buffers(self, batch_size=1):
@@ -152,12 +200,14 @@ def allocate_buffers(self, batch_size=1):
         return inputs, outputs, bindings, stream
 
     def do_inference(self, bindings, inputs, outputs, stream, batch_size=1):
+        inference_start_time = time.time()
+
         # Transfer input data to the GPU.
         [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
         # Run inference.
         # todo: try execute synchronously
-        self.context.execute_async(
-            batch_size=batch_size, bindings=bindings, stream_handle=stream.handle
+        self.context.execute(
+            batch_size=batch_size, bindings=bindings
         )
         # Transfer predictions back from the GPU.
         [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
@@ -176,6 +226,11 @@ def do_inference(self, bindings, inputs, outputs, stream, batch_size=1):
             for batch_output in batch_outputs:
                 final_output.append(batch_output[i])
             final_outputs.append(final_output)
+        print("batch_size:", batch_size,
+              "TensorRT inference time: {} ms".format(
+                  int(round((time.time() - inference_start_time) * 1000))
+              )
+              )
         return final_outputs
 
     def _prepare_post_process(self):

From 23196183c2f02baac9dc747ca15fa138347fa5bc Mon Sep 17 00:00:00 2001
From: Zhao Wang
Date: Tue, 27 Apr 2021 14:38:13 -0700
Subject: [PATCH 03/21] don't resize if the size of the image remains the same

---
 vcap/vcap/modifiers.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/vcap/vcap/modifiers.py b/vcap/vcap/modifiers.py
index a9cdeba..c03aeca 100644
--- a/vcap/vcap/modifiers.py
+++ b/vcap/vcap/modifiers.py
@@ -196,7 +196,6 @@ def __init__(self, frame: np.ndarray):
 
     def resize(self, resize_width: int, resize_height: int,
                resize_type: ResizeType):
-
         frame_width = self.frame.shape[1]
         frame_height = self.frame.shape[0]
 
@@ -251,11 +250,11 @@ def resize(self, resize_width: int, resize_height: int,
         # Account for scaling
         scale_width = new_width / frame_width
         scale_height = new_height / frame_height
-        self._operations.append(
-            (self._OperationType.SCALE, (scale_width, scale_height))
-        )
-
-        self.frame = cv2.resize(self.frame, (new_width, new_height))
+        if new_width != frame_width or new_height != frame_height:
+            self._operations.append(
+                (self._OperationType.SCALE, (scale_width, scale_height))
+            )
+            self.frame = cv2.resize(self.frame, (new_width, new_height))
 
         return self

From 5e12a29e06c7f259c8759e45f7b85b04ed060a10 Mon Sep 17 00:00:00 2001
From: Zhao Wang
Date: Tue, 27 Apr 2021 22:52:23 -0700
Subject: [PATCH 04/21] code cleanup, add type hints

---
 .../vcap_utils/backends/base_tensorrt.py      | 98 ++++---------------
 1 file changed, 20 insertions(+), 78 deletions(-)

diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
index 8e73f37..415b5df 100644
--- a/vcap_utils/vcap_utils/backends/base_tensorrt.py
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -1,10 +1,8 @@
 import numpy as np
-import cupy as cp
 
 import pycuda.driver as cuda
 import tensorrt as trt
 import pycuda.autoinit
-import time
 
 from typing import Dict, List, Tuple, Optional, Any
 
@@ -32,11 +30,6 @@ def __repr__(self):
         return self.__str__()
 
 
-def load_engine(trt_runtime, engine_data):
-    engine = trt_runtime.deserialize_cuda_engine(engine_data)
-    return engine
-
-
 class AllocatedBuffer:
     def __init__(self, inputs_, outputs_, bindings_, stream_):
         self.inputs = inputs_
@@ -51,7 +44,8 @@ def __init__(self, engine_bytes, width, height):
         TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
         self.trt_runtime = trt.Runtime(TRT_LOGGER)
         # load the engine
-        self.trt_engine = load_engine(self.trt_runtime, engine_bytes)
+        self.trt_engine = self.trt_runtime.deserialize_cuda_engine(engine_bytes)
+
         # create execution context
         self.context = self.trt_engine.create_execution_context()
         # create buffers for inference
@@ -65,6 +59,8 @@ def __init__(self, engine_bytes, width, height):
         self.engine_width = width
         self.engine_height = height
 
+        # preallocate resources for post process
+        # todo: post process is only needed for detectors
         self._prepare_post_process()
 
     def batch_predict(self, input_data_list: List[Any]) -> List[Any]:
@@ -80,58 +76,18 @@ def batch_predict(self, input_data_list: List[Any]) -> List[Any]:
             for result in self._process_batch(batch):
                 yield result
 
-    def _process_batch(self, input_data: List[np.array]):
-        pre_batch_time = time.time()
+    def _process_batch(self, input_data: List[np.array]) -> List[List[float]]:
         batch_size = len(input_data)
-        # input_data_cuppy = [cp.array(data) for data in input_data]
-        """Ideas
-        1) Try raveling before concatenate
-        2) Instead of concatenating, try generating an array of the size and shape of what the concatenated image WOULD
-        be, then copy the raveled images into their respective places in the array
-        """
-        ravel_time = time.time()
-        raveled_input = [data.ravel() for data in input_data]
-        print("batch_size:", batch_size, "ravel time:", int(round((time.time() - ravel_time) * 1000)),
-              int(round((time.time() - ravel_time) * 1000)) / batch_size)
-
-        concatenate_time = time.time()
-        batched_image = np.concatenate(raveled_input, axis=0)
-        #image_size = len(raveled_input[0])
-        #batched_image = np.zeros((1, batch_size * image_size))
-        #for index, image in enumerate(batched_image):
-        #    batched_image[index * image_size:(index + 1) * image_size] = image
-        print("batch_size:", batch_size, "concatenate time:", int(round((time.time() - concatenate_time) * 1000)),
-              int(round((time.time() - concatenate_time) * 1000)) / batch_size)
-
-        # image_size = self.engine_height * self.engine_width
-        # batched_image = np.zeros((1, batch_size * image_size))
-        # for index, image in enumerate(batched_image):
-        #     for row_index, row in enumerate(image):
-        #         batched_image[index * image_size + row_index * self.engine_width:(index+1) * image_size + row_index * self.engine_width] = row
-
-        # for data in input_data:
-        #     batch_image_array.append(data.ravel())
-
-        # batched_image = np.concatenate(input_data, axis=0)
-        # print(type(batched_image))
         prepared_buffer = self.buffers[batch_size]
         inputs = prepared_buffer.inputs
         outputs = prepared_buffer.outputs
         bindings = prepared_buffer.bindings
         stream = prepared_buffer.stream
-        copy_time = time.time()
-        # raveled_image = batched_image.ravel()
+        # todo: get dtype from engine
+        inputs[0].host = np.ascontiguousarray(input_data, dtype=np.float32)
 
-        # np.copyto(inputs[0].host, raveled_image)
-        np.copyto(inputs[0].host, batched_image)
-        print("batch_size:", batch_size, "copy time:", int(round((time.time() - copy_time) * 1000)),
-              int(round((time.time() - copy_time) * 1000)) / batch_size)
-
-        print("batch_size:", batch_size, "pre_batch_time:", int(round((time.time() - pre_batch_time) * 1000)),
-              int(round((time.time() - pre_batch_time) * 1000)) / batch_size)
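+        # note: the assignment above rebinds inputs[0].host to a fresh
+        # ndarray, so the pagelocked buffer from allocate_buffers is no
+        # longer the array handed to memcpy_htod_async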
         detections = self.do_inference(
-            bindings=bindings, inputs=inputs, outputs=outputs, stream=stream,
-            batch_size=batch_size
+            bindings=bindings, inputs=inputs, outputs=outputs, stream=stream, batch_size=batch_size
         )
         return detections
@@ -143,13 +99,8 @@ def process_frame(self, frame: np.ndarray, detection_node: DETECTION_NODE_TYPE,
     def prepare_inputs(self, frame: np.ndarray, transpose: bool, normalize: bool,
                        mean_subtraction: Optional[Tuple] = None) -> \
             Tuple[np.array, Resize]:
-        pre_process_start_time = time.time()
-        # h, w, c = frame.shape
-        # print(h, w, self.engine_height, self.engine_width)
         resize = Resize(frame).resize(self.engine_width, self.engine_height,
                                       Resize.ResizeType.EXACT)
-        # print("resize take:", int(round((time.time() - pre_process_start_time) * 1000)))
-
         if transpose:
             resize.frame = np.transpose(resize.frame, (2, 0, 1))
         if normalize:
@@ -161,18 +112,12 @@ def prepare_inputs(self, frame: np.ndarray, transpose: bool, normalize: bool,
             resize.frame[..., 0] -= mean_subtraction[0]
             resize.frame[..., 1] -= mean_subtraction[1]
             resize.frame[..., 2] -= mean_subtraction[2]
-        # print("prepare input take:", int(round((time.time() - pre_process_start_time) * 1000)))
         return resize.frame, resize
 
-    def allocate_buffers(self, batch_size=1):
+    def allocate_buffers(self, batch_size: int = 1) -> \
+            Tuple[List[HostDeviceMem], List[HostDeviceMem], List[int], cuda.Stream]:
         """Allocates host and device buffer for TRT engine inference.
-        This function is similar to the one in common.py, but
-        converts network outputs (which are np.float32) appropriately
-        before writing them to Python buffer. This is needed, since
-        TensorRT plugins don't support output type description, and
-        in our particular case, we use NMS plugin as network output.
         Args:
-            engine (trt.ICudaEngine): TensorRT engine
             batch_size: batch size for the input/output memory
         Returns:
            inputs [HostDeviceMem]: engine input memory
           outputs [HostDeviceMem]: engine output memory
           bindings [int]: buffer to device bindings
           stream (cuda.Stream): cuda stream for engine inference synchronization
        """
        inputs = []
        outputs = []
        bindings = []
        stream = cuda.Stream()
        for binding in self.trt_engine:
            size = trt.volume(self.trt_engine.get_binding_shape(binding)) * batch_size
            dtype = trt.nptype(self.trt_engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(device_mem))
            # Append to the appropriate list.
            if self.trt_engine.binding_is_input(binding):
                inputs.append(HostDeviceMem(host_mem, device_mem))
            else:
                outputs.append(HostDeviceMem(host_mem, device_mem))
        return inputs, outputs, bindings, stream
 
-    def do_inference(self, bindings, inputs, outputs, stream, batch_size=1):
-        inference_start_time = time.time()
-
+    def do_inference(self, bindings: List[int], inputs: List[HostDeviceMem], outputs: List[HostDeviceMem],
+                     stream: cuda.Stream, batch_size: int = 1) -> List[List[float]]:
         # Transfer input data to the GPU.
         [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
         # Run inference.
-        # todo: try execute synchronously
+        # todo: use async or sync api?
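+        # (execute() blocks the calling thread until the batch completes,
+        #  while execute_async() only enqueues work on the stream and relies
+        #  on the stream synchronization below)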
+        # According to https://docs.nvidia.com/deeplearning/tensorrt/best-practices/index.html#optimize-python
+        # the performance should be almost identical
         self.context.execute(
             batch_size=batch_size, bindings=bindings
         )
         # Transfer predictions back from the GPU.
         [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
@@ -226,11 +172,6 @@ def do_inference(self, bindings, inputs, outputs, stream, batch_size=1):
             for batch_output in batch_outputs:
                 final_output.append(batch_output[i])
             final_outputs.append(final_output)
-        print("batch_size:", batch_size,
-              "TensorRT inference time: {} ms".format(
-                  int(round((time.time() - inference_start_time) * 1000))
-              )
-              )
         return final_outputs
 
     def _prepare_post_process(self):
@@ -251,7 +192,8 @@ def _prepare_post_process(self):
             value = (i * self.stride + 0.5) / self.box_norm
             self.grid_centers_w.append(value)
 
-    def _apply_box_norm(self, o1, o2, o3, o4, x, y):
+    def _apply_box_norm(self, o1: float, o2: float, o3: float, o4: float, x: int, y: int) -> \
+            Tuple[float, float, float, float]:
         """
         Applies the GridNet box normalization
         Args:
@@ -274,17 +216,17 @@ def _apply_box_norm(self, o1, o2, o3, o4, x, y):
         o4 = (o4 + self.grid_centers_h[y]) * self.box_norm
         return o1, o2, o3, o4
 
-    def postprocess(self, outputs, min_confidence, analysis_classes, wh_format=True):
+    def postprocess(self, outputs: List[float], min_confidence: float, analysis_classes: List[int], wh_format=True)-> \
+            Tuple[List[List[int]], List[int], List[float]]:
         """
         Postprocesses the inference output
         Args:
             outputs (list of float): inference output
             min_confidence (float): min confidence to accept detection
             analysis_classes (list of int): indices of the classes to consider
-
+            wh_format (bool): return bbox in (xmin, ymin, w, h) or (xmin, ymin, xmax, ymax)
         Returns: list of list tuple: each element is a two list tuple (x, y) representing the corners of a bb
         """
-        # print(len(outputs))
         bbs = []
         class_ids = []
         scores = []

From d2a00a0be54be77c127b04c7b264b18e0fd63a99 Mon Sep 17 00:00:00 2001
From: Zhao Wang
Date: Tue, 27 Apr 2021 22:57:30 -0700
Subject: [PATCH 05/21] update dependency list

---
 vcap/setup.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vcap/setup.py b/vcap/setup.py
index fd04d33..ceba2da 100644
--- a/vcap/setup.py
+++ b/vcap/setup.py
@@ -25,6 +25,8 @@
         "scikit-learn==0.22.2",
         "numpy>=1.16,<2",
         "tensorflow-gpu==1.15.4",
+        "pycuda>=2019.1.1",
+        "tensorrt==7.2.3.4",
     ],
     extras_require={
         "tests": test_packages,

From 659d003f1b994691bafd49ae2f96bb292fe3109e Mon Sep 17 00:00:00 2001
From: Zhao Wang
Date: Thu, 29 Apr 2021 18:32:57 -0700
Subject: [PATCH 06/21] add support for multi-GPU

---
 .../vcap_utils/backends/base_tensorrt.py      | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
index 415b5df..22d8349 100644
--- a/vcap_utils/vcap_utils/backends/base_tensorrt.py
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -2,19 +2,15 @@ import numpy as np
 
 import pycuda.driver as cuda
 import tensorrt as trt
-import pycuda.autoinit
 
 from typing import Dict, List, Tuple, Optional, Any
 
 from vcap import (
-    Crop,
-    DetectionNode,
     Resize,
     DETECTION_NODE_TYPE,
     OPTION_TYPE,
     BaseStreamState,
     BaseBackend,
-    rect_to_coords,
 )
 
 
@@ -39,13 +35,16 @@ def __init__(self, inputs_, outputs_, bindings_, stream_):
 
 class BaseTensorRTBackend(BaseBackend):
-    def __init__(self, engine_bytes, width, height):
+    def __init__(self, engine_bytes, width, height, device_id):
         super().__init__()
+        gpu_device_id = int(device_id[4:])
+        cuda.init()
+        dev = cuda.Device(gpu_device_id)
+        self.ctx = dev.make_context()
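+        # make_context() also pushes the new context onto this thread's
+        # context stack, leaving it current while the engine and buffers
+        # are created below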
         TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
         self.trt_runtime = trt.Runtime(TRT_LOGGER)
         # load the engine
         self.trt_engine = self.trt_runtime.deserialize_cuda_engine(engine_bytes)
-
         # create execution context
         self.context = self.trt_engine.create_execution_context()
         # create buffers for inference
@@ -147,6 +146,7 @@ def allocate_buffers(self, batch_size: int = 1) -> \
     def do_inference(self, bindings: List[int], inputs: List[HostDeviceMem], outputs: List[HostDeviceMem],
                      stream: cuda.Stream, batch_size: int = 1) -> List[List[float]]:
         # Transfer input data to the GPU.
+        self.ctx.push()
         [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
         # Run inference.
         # todo: use async or sync api?
@@ -172,6 +172,7 @@ def do_inference(self, bindings: List[int], inputs: List[HostDeviceMem], outputs
             for batch_output in batch_outputs:
                 final_output.append(batch_output[i])
             final_outputs.append(final_output)
+        self.ctx.pop()
         return final_outputs
 
     def _prepare_post_process(self):
@@ -216,7 +217,7 @@ def _apply_box_norm(self, o1: float, o2: float, o3: float, o4: float, x: int, y:
         o4 = (o4 + self.grid_centers_h[y]) * self.box_norm
         return o1, o2, o3, o4
 
-    def postprocess(self, outputs: List[float], min_confidence: float, analysis_classes: List[int], wh_format=True)-> \
+    def postprocess(self, outputs: List[float], min_confidence: float, analysis_classes: List[int], wh_format=True) -> \
             Tuple[List[List[int]], List[int], List[float]]:
         """
         Postprocesses the inference output
@@ -262,3 +263,7 @@ def postprocess(self, outputs: List[float], min_confidence: float, analysis_clas
                         scores.append(float(score))
 
         return bbs, class_ids, scores
+
+    def close(self):
+        super().close()
+        self.ctx.pop()

From 4f28496399bd9b07ea06b94840acb257ab04ce03 Mon Sep 17 00:00:00 2001
From: Zhao Wang
Date: Fri, 30 Apr 2021 00:52:03 -0700
Subject: [PATCH 07/21] update post processing api

---
 .../vcap_utils/backends/base_tensorrt.py      | 57 ++++++++++---------
 1 file changed, 31 insertions(+), 26 deletions(-)

diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
index 22d8349..9554559 100644
--- a/vcap_utils/vcap_utils/backends/base_tensorrt.py
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -2,6 +2,7 @@ import numpy as np
 
 import pycuda.driver as cuda
 import tensorrt as trt
+import cv2
 
 from typing import Dict, List, Tuple, Optional, Any
 
@@ -11,6 +12,8 @@
     OPTION_TYPE,
     BaseStreamState,
     BaseBackend,
+    rect_to_coords,
+    DetectionNode,
 )
 
 
@@ -217,53 +220,55 @@ def _apply_box_norm(self, o1: float, o2: float, o3: float, o4: float, x: int, y:
         o4 = (o4 + self.grid_centers_h[y]) * self.box_norm
         return o1, o2, o3, o4
 
-    def postprocess(self, outputs: List[float], min_confidence: float, analysis_classes: List[int], wh_format=True) -> \
-            Tuple[List[List[int]], List[int], List[float]]:
-        """
-        Postprocesses the inference output
-        Args:
-            outputs (list of float): inference output
-            min_confidence (float): min confidence to accept detection
-            analysis_classes (list of int): indices of the classes to consider
-            wh_format (bool): return bbox in (xmin, ymin, w, h) or (xmin, ymin, xmax, ymax)
-        Returns: list of list tuple: each element is a two list tuple (x, y) representing the corners of a bb
-        """
+    def parse_detection_results(
+            self, results: List[List[float]],
+            resize: Resize,
+            label_map: Dict[int, str],
+            min_confidence: float = 0.0,
+    ) -> List[DetectionNode]:
         bbs = []
         class_ids = []
         scores = []
-        for c in analysis_classes:
+        for c in label_map.keys():
 
             x1_idx = c * 4 * self.grid_size
             y1_idx = x1_idx + self.grid_size
             x2_idx = y1_idx + self.grid_size
             y2_idx = x2_idx + self.grid_size
 
-            boxes = outputs[0]
+            boxes = results[0]
             for h in range(self.grid_h):
                 for w in range(self.grid_w):
                     i = w + h * self.grid_w
-                    score = outputs[1][c * self.grid_size + i]
+                    score = results[1][c * self.grid_size + i]
                     if score >= min_confidence:
                         o1 = boxes[x1_idx + w + h * self.grid_w]
                         o2 = boxes[y1_idx + w + h * self.grid_w]
                         o3 = boxes[x2_idx + w + h * self.grid_w]
                         o4 = boxes[y2_idx + w + h * self.grid_w]
                         o1, o2, o3, o4 = self._apply_box_norm(o1, o2, o3, o4, w, h)
                         xmin = int(o1)
                         ymin = int(o2)
                         xmax = int(o3)
                         ymax = int(o4)
-                        if wh_format:
-                            bbs.append([xmin, ymin, xmax - xmin, ymax - ymin])
-                        else:
-                            bbs.append([xmin, ymin, xmax, ymax])
+                        bbs.append([xmin, ymin, xmax - xmin, ymax - ymin])
                         class_ids.append(c)
                         scores.append(float(score))
-
-        return bbs, class_ids, scores
-
-    def close(self):
-        super().close()
-        self.ctx.pop()
+        indexes = cv2.dnn.NMSBoxes(bbs, scores, min_confidence, 0.5)
+        detections = []
+        for idx in indexes:
+            idx = int(idx)
+            xmin, ymin, w, h = bbs[idx]
+            class_id = class_ids[idx]
+            class_name = label_map[class_id]
+            detections.append(
+                DetectionNode(
+                    name=class_name,
+                    coords=rect_to_coords(
+                        [xmin, ymin, (xmin + w), (ymin + h)]
+                    ),
+                    extra_data={"detection_confidence": scores[idx]},
+                )
+            )
+        resize.scale_and_offset_detection_nodes(detections)
+        return detections

From 99e9248830e4b1bb063b97616ebd21c008516c7c Mon Sep 17 00:00:00 2001
From: Zhao Wang
Date: Fri, 30 Apr 2021 11:41:37 -0700
Subject: [PATCH 08/21] add close() function back, was deleted by accident

---
 vcap_utils/vcap_utils/backends/base_tensorrt.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
index 9554559..d119283 100644
--- a/vcap_utils/vcap_utils/backends/base_tensorrt.py
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -272,3 +272,7 @@ def parse_detection_results(
             )
         resize.scale_and_offset_detection_nodes(detections)
         return detections
+
+    def close(self) -> None:
+        super().close()
+        self.ctx.pop()
\ No newline at end of file

From 1901c879732e65f78baf4b7ed9b0cc6ad6091c26 Mon Sep 17 00:00:00 2001
From: Zhao Wang
Date: Fri, 30 Apr 2021 21:21:31 -0700
Subject: [PATCH 09/21] remove process_frame, add type hint for __init__

---
 vcap_utils/vcap_utils/backends/base_tensorrt.py | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
index d119283..29b0005 100644
--- a/vcap_utils/vcap_utils/backends/base_tensorrt.py
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -8,9 +8,6 @@
 from vcap import (
     Resize,
-    DETECTION_NODE_TYPE,
-    OPTION_TYPE,
-    BaseStreamState,
     BaseBackend,
     rect_to_coords,
     DetectionNode,
@@ -38,7 +35,7 @@
 
 class BaseTensorRTBackend(BaseBackend):
-    def __init__(self, engine_bytes, width, height, device_id):
+    def __init__(self, engine_bytes: bytes, width: int, height: int, device_id: str):
         super().__init__()
         gpu_device_id = int(device_id[4:])
         cuda.init()
@@ -93,11 +90,6 @@ def _process_batch(self, input_data: List[np.array]) -> List[List[float]]:
         )
         return detections
 
-    def process_frame(self, frame: np.ndarray, detection_node: DETECTION_NODE_TYPE,
-                      options: Dict[str, OPTION_TYPE],
-                      state: BaseStreamState) -> DETECTION_NODE_TYPE:
-        pass
-
     def prepare_inputs(self, frame: np.ndarray, transpose: bool, normalize: bool,
                        mean_subtraction: Optional[Tuple] = None) -> \
             Tuple[np.array, Resize]:
@@ -275,4 +267,4 @@ def parse_detection_results(
 
     def close(self) -> None:
         super().close()
-        self.ctx.pop()
\ No newline at end of file
+        self.ctx.pop()

From d51139b95d9d5652afdce2b67effa5ca994c86af Mon Sep 17 00:00:00 2001
From: Zhao Wang
Date: Fri, 30 Apr 2021 21:27:55 -0700
Subject: [PATCH 10/21] rename cuda and trt context attributes

---
 vcap_utils/vcap_utils/backends/base_tensorrt.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
index 29b0005..f576cfd 100644
--- a/vcap_utils/vcap_utils/backends/base_tensorrt.py
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -40,13 +40,13 @@ def __init__(self, engine_bytes: bytes, width: int, height: int, device_id: str)
         gpu_device_id = int(device_id[4:])
         cuda.init()
         dev = cuda.Device(gpu_device_id)
-        self.ctx = dev.make_context()
+        self.cuda_context = dev.make_context()
         TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
         self.trt_runtime = trt.Runtime(TRT_LOGGER)
         # load the engine
         self.trt_engine = self.trt_runtime.deserialize_cuda_engine(engine_bytes)
         # create execution context
-        self.context = self.trt_engine.create_execution_context()
+        self.trt_context = self.trt_engine.create_execution_context()
         # create buffers for inference
@@ -141,13 +141,13 @@ def do_inference(self, bindings: List[int], inputs: List[HostDeviceMem], outputs
     def do_inference(self, bindings: List[int], inputs: List[HostDeviceMem], outputs: List[HostDeviceMem],
                      stream: cuda.Stream, batch_size: int = 1) -> List[List[float]]:
         # Transfer input data to the GPU.
-        self.ctx.push()
+        self.cuda_context.push()
         [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
         # Run inference.
         # todo: use async or sync api?
         # According to https://docs.nvidia.com/deeplearning/tensorrt/best-practices/index.html#optimize-python
         # the performance should be almost identical
-        self.context.execute(
+        self.trt_context.execute(
             batch_size=batch_size, bindings=bindings
         )
         # Transfer predictions back from the GPU.
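+        # (the device-to-host copies that follow are asynchronous; the
+        #  stream.synchronize() call below must finish before the host
+        #  buffers are read)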
@@ -167,7 +167,7 @@ def do_inference(self, bindings: List[int], inputs: List[HostDeviceMem], outputs
             for batch_output in batch_outputs:
                 final_output.append(batch_output[i])
             final_outputs.append(final_output)
-        self.ctx.pop()
+        self.cuda_context.pop()
         return final_outputs
 
@@ -267,4 +267,4 @@ def parse_detection_results(
     def close(self) -> None:
         super().close()
-        self.ctx.pop()
+        self.cuda_context.pop()

From 07af9dd37d0a62483605e0b5718b29eb22732fa1 Mon Sep 17 00:00:00 2001
From: Zhao Wang
Date: Fri, 30 Apr 2021 21:38:45 -0700
Subject: [PATCH 11/21] make stride a local variable

---
 vcap_utils/vcap_utils/backends/base_tensorrt.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
index f576cfd..efd8784 100644
--- a/vcap_utils/vcap_utils/backends/base_tensorrt.py
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -171,21 +171,21 @@ def do_inference(self, bindings: List[int], inputs: List[HostDeviceMem], outputs
     def _prepare_post_process(self):
-        self.stride = 16
+        stride = 16
         self.box_norm = 35.0
-        self.grid_h = int(self.engine_height / self.stride)
-        self.grid_w = int(self.engine_width / self.stride)
+        self.grid_h = int(self.engine_height / stride)
+        self.grid_w = int(self.engine_width / stride)
         self.grid_size = self.grid_h * self.grid_w
 
         self.grid_centers_w = []
         self.grid_centers_h = []
 
         for i in range(self.grid_h):
-            value = (i * self.stride + 0.5) / self.box_norm
+            value = (i * stride + 0.5) / self.box_norm
             self.grid_centers_h.append(value)
 
         for i in range(self.grid_w):
-            value = (i * self.stride + 0.5) / self.box_norm
+            value = (i * stride + 0.5) / self.box_norm
             self.grid_centers_w.append(value)

From f661509ff75d442222648943f2e4a64dffd7b9a4 Mon Sep 17 00:00:00 2001
From: Zhao Wang
Date: Fri, 30 Apr 2021 21:44:18 -0700
Subject: [PATCH 12/21] update type hint for batch_predict

---
 vcap_utils/vcap_utils/backends/base_tensorrt.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
index efd8784..7ff4702 100644
--- a/vcap_utils/vcap_utils/backends/base_tensorrt.py
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -4,7 +4,7 @@
 import tensorrt as trt
 import cv2
 
-from typing import Dict, List, Tuple, Optional, Any
+from typing import Dict, List, Tuple, Optional, Generator
 
 from vcap import (
     Resize,
@@ -62,7 +62,8 @@ def __init__(self, engine_bytes: bytes, width: int, height: int, device_id: str)
         self._prepare_post_process()
 
-    def batch_predict(self, input_data_list: List[Any]) -> List[Any]:
+    def batch_predict(self, input_data_list: List[np.ndarray]) \
+            -> Generator[List[DetectionNode], None, None]:
         task_size = len(input_data_list)

From 874414b6fc00eddb8d25b2d8ce4f8a5cd3cc2e4c Mon Sep 17 00:00:00 2001
From: Zhao Wang
Date: Fri, 30 Apr 2021 21:46:18 -0700
Subject: [PATCH 13/21] formatting

---
 vcap_utils/vcap_utils/backends/base_tensorrt.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
index 7ff4702..cb2c449 100644
--- a/vcap_utils/vcap_utils/backends/base_tensorrt.py
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -86,7 +86,7 @@ def _process_batch(self, input_data: List[np.array]) -> List[List[float]]:
         # todo: get dtype from engine
         inputs[0].host = np.ascontiguousarray(input_data, dtype=np.float32)
 
-        detections = self.do_inference(
+        detections = self._do_inference(
             bindings=bindings, inputs=inputs, outputs=outputs, stream=stream, batch_size=batch_size
         )
         return detections
@@ -139,8 +139,11 @@ def allocate_buffers(self, batch_size: int = 1) -> \
         return inputs, outputs, bindings, stream
 
-    def do_inference(self, bindings: List[int], inputs: List[HostDeviceMem], outputs: List[HostDeviceMem],
-                     stream: cuda.Stream, batch_size: int = 1) -> List[List[float]]:
+    def _do_inference(self, bindings: List[int],
+                      inputs: List[HostDeviceMem],
+                      outputs: List[HostDeviceMem],
+                      stream: cuda.Stream,
+                      batch_size: int = 1) -> List[List[float]]:
         # Transfer input data to the GPU.
         self.cuda_context.push()

From 7255b8c2112d6493753deb8670e6410a1b521d8e Mon Sep 17 00:00:00 2001
From: Zhao Wang
Date: Fri, 30 Apr 2021 21:55:48 -0700
Subject: [PATCH 14/21] formatting

---
 vcap_utils/vcap_utils/backends/base_tensorrt.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
index cb2c449..dcb7b38 100644
--- a/vcap_utils/vcap_utils/backends/base_tensorrt.py
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -80,16 +80,16 @@ def _process_batch(self, input_data: List[np.array]) -> List[List[float]]:
         batch_size = len(input_data)
         prepared_buffer = self.buffers[batch_size]
         inputs = prepared_buffer.inputs
-        outputs = prepared_buffer.outputs
-        bindings = prepared_buffer.bindings
-        stream = prepared_buffer.stream
         # todo: get dtype from engine
         inputs[0].host = np.ascontiguousarray(input_data, dtype=np.float32)
 
-        detections = self._do_inference(
-            bindings=bindings, inputs=inputs, outputs=outputs, stream=stream, batch_size=batch_size
+        return self._do_inference(
+            bindings=prepared_buffer.bindings,
+            inputs=inputs,
+            outputs=prepared_buffer.outputs,
+            stream=prepared_buffer.stream,
+            batch_size=batch_size
         )
-        return detections

From 14d88f96d52e2abdebcdd530589e94125598c25b Mon Sep 17 00:00:00 2001
From: Zhao Wang
Date: Mon, 3 May 2021 14:39:01 -0700
Subject: [PATCH 15/21] rename device_id to device_name

---
 vcap_utils/vcap_utils/backends/base_tensorrt.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
index dcb7b38..460df1b 100644
--- a/vcap_utils/vcap_utils/backends/base_tensorrt.py
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -35,9 +35,9 @@ def __init__(self, inputs_, outputs_, bindings_, stream_):
 
 class BaseTensorRTBackend(BaseBackend):
-    def __init__(self, engine_bytes: bytes, width: int, height: int, device_id: str):
+    def __init__(self, engine_bytes: bytes, width: int, height: int, device_name: str):
         super().__init__()
-        gpu_device_id = int(device_id[4:])
+        gpu_device_id = int(device_name[4:])
         cuda.init()
         dev = cuda.Device(gpu_device_id)
         self.cuda_context = dev.make_context()

From 1b936981a61ac708ff064f9fed9d2f8a18cdd9ab Mon Sep 17 00:00:00 2001
From: Zhao Wang <65635224+BestDriverCN@users.noreply.github.com>
Date: Mon, 3 May 2021 15:03:11 -0700
Subject: [PATCH 16/21] _apply_box_norm now returns ints

Co-authored-by: Alex Thiel
---
 .../vcap_utils/backends/base_tensorrt.py      | 22 +++++++++----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
index d119283..ccc2e14 100644
--- a/vcap_utils/vcap_utils/backends/base_tensorrt.py
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -197,7 +197,7 @@ def _prepare_post_process(self):
             self.grid_centers_w.append(value)
 
     def _apply_box_norm(self, o1: float, o2: float, o3: float, o4: float, x: int, y: int) -> \
-            Tuple[float, float, float, float]:
+            Tuple[int, int, int, int]:
         """
         Applies the GridNet box normalization
         Args:
@@ -209,16 +209,16 @@ def _apply_box_norm(self, o1: float, o2: float, o3: float, o4: float, x: int, y:
             y: column index on the grid
 
         Returns:
-            float: rescaled first argument
-            float: rescaled second argument
-            float: rescaled third argument
-            float: rescaled fourth argument
+            int: rescaled first argument
+            int: rescaled second argument
+            int: rescaled third argument
+            int: rescaled fourth argument
         """
-        o1 = (o1 - self.grid_centers_w[x]) * -self.box_norm
-        o2 = (o2 - self.grid_centers_h[y]) * -self.box_norm
-        o3 = (o3 + self.grid_centers_w[x]) * self.box_norm
-        o4 = (o4 + self.grid_centers_h[y]) * self.box_norm
-        return o1, o2, o3, o4
+        xmin = int((o1 - self.grid_centers_w[x]) * -self.box_norm)
+        ymin = int((o2 - self.grid_centers_h[y]) * -self.box_norm)
+        xmax = int((o3 + self.grid_centers_w[x]) * self.box_norm)
+        ymax = int((o4 + self.grid_centers_h[y]) * self.box_norm)
+        return xmin, ymin, xmax, ymax
 
     def parse_detection_results(
             self, results: List[List[float]],
@@ -275,4 +275,4 @@ def parse_detection_results(
 
     def close(self) -> None:
         super().close()
-        self.ctx.pop()
\ No newline at end of file
+        self.ctx.pop()

From 57d3eeb3c4bf08001084213174dc58765a1830a7 Mon Sep 17 00:00:00 2001
From: Zhao Wang
Date: Mon, 3 May 2021 15:06:50 -0700
Subject: [PATCH 17/21] use _apply_box_norm's int return values directly

---
 vcap_utils/vcap_utils/backends/base_tensorrt.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
index b5b3863..b91037e 100644
--- a/vcap_utils/vcap_utils/backends/base_tensorrt.py
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -226,7 +226,6 @@ def parse_detection_results(
         class_ids = []
         scores = []
         for c in label_map.keys():
-
             x1_idx = c * 4 * self.grid_size
             y1_idx = x1_idx + self.grid_size
             x2_idx = y1_idx + self.grid_size
@@ -242,11 +241,7 @@ def parse_detection_results(
                         o2 = boxes[y1_idx + w + h * self.grid_w]
                         o3 = boxes[x2_idx + w + h * self.grid_w]
                         o4 = boxes[y2_idx + w + h * self.grid_w]
-                        o1, o2, o3, o4 = self._apply_box_norm(o1, o2, o3, o4, w, h)
-                        xmin = int(o1)
-                        ymin = int(o2)
-                        xmax = int(o3)
-                        ymax = int(o4)
+                        xmin, ymin, xmax, ymax = self._apply_box_norm(o1, o2, o3, o4, w, h)
                         bbs.append([xmin, ymin, xmax - xmin, ymax - ymin])
                         class_ids.append(c)
                         scores.append(float(score))

From ef70e0d9f3a8cb1e843be5ef6f896d38004a78e2 Mon Sep 17 00:00:00 2001
From: Zhao Wang
Date: Mon, 3 May 2021 15:25:11 -0700
Subject: [PATCH 18/21] refactor parse_detection_results

---
 .../vcap_utils/backends/base_tensorrt.py      | 44 +++++++------------
 1 file changed, 16 insertions(+), 28 deletions(-)

diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
index b91037e..4038119 100644
--- a/vcap_utils/vcap_utils/backends/base_tensorrt.py
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -13,6 +13,8 @@
     DetectionNode,
 )
 
+from vcap_utils import non_max_suppression
+
 
 class HostDeviceMem(object):
     def __init__(self, host_mem, device_mem):
@@ -222,11 +224,9 @@ def parse_detection_results(
             label_map: Dict[int, str],
             min_confidence: float = 0.0,
     ) -> List[DetectionNode]:
-        bbs = []
-        class_ids = []
-        scores = []
-        for c in label_map.keys():
-            x1_idx = c * 4 * self.grid_size
+        detection_nodes: List[DetectionNode] = []
+        for class_id, class_name in label_map.items():
+            x1_idx = class_id * 4 * self.grid_size
             y1_idx = x1_idx + self.grid_size
             x2_idx = y1_idx + self.grid_size
             y2_idx = x2_idx + self.grid_size
@@ -235,36 +235,24 @@ def parse_detection_results(
             boxes = results[0]
             for h in range(self.grid_h):
                 for w in range(self.grid_w):
                     i = w + h * self.grid_w
-                    score = results[1][c * self.grid_size + i]
+                    score = results[1][class_id * self.grid_size + i]
                     if score >= min_confidence:
                         o1 = boxes[x1_idx + w + h * self.grid_w]
                         o2 = boxes[y1_idx + w + h * self.grid_w]
                         o3 = boxes[x2_idx + w + h * self.grid_w]
                         o4 = boxes[y2_idx + w + h * self.grid_w]
                         xmin, ymin, xmax, ymax = self._apply_box_norm(o1, o2, o3, o4, w, h)
-                        bbs.append([xmin, ymin, xmax - xmin, ymax - ymin])
-                        class_ids.append(c)
-                        scores.append(float(score))
-        indexes = cv2.dnn.NMSBoxes(bbs, scores, min_confidence, 0.5)
-        detections = []
-        for idx in indexes:
-            idx = int(idx)
-            xmin, ymin, w, h = bbs[idx]
-            class_id = class_ids[idx]
-            class_name = label_map[class_id]
-            detections.append(
-                DetectionNode(
-                    name=class_name,
-                    coords=rect_to_coords(
-                        [xmin, ymin, (xmin + w), (ymin + h)]
-                    ),
-                    extra_data={"detection_confidence": scores[idx]},
-                )
-            )
-        resize.scale_and_offset_detection_nodes(detections)
-        return detections
+                        detection_nodes.append(DetectionNode(
+                            name=class_name,
+                            coords=rect_to_coords(
+                                [xmin, ymin, xmax, ymax]
+                            ),
+                            extra_data={"detection_confidence": score},
+                        ))
+        nodes = non_max_suppression(detection_nodes, max_bbox_overlap=0.5)
+        resize.scale_and_offset_detection_nodes(detection_nodes)
+        return detection_nodes
 
     def close(self) -> None:
         super().close()
         self.cuda_context.pop()
-

From a33112d3bd08415e3478047c75770fa62e73084a Mon Sep 17 00:00:00 2001
From: Zhao Wang
Date: Mon, 3 May 2021 17:51:48 -0700
Subject: [PATCH 19/21] fix import path, fix variable name, simplify code

---
 .../vcap_utils/backends/base_tensorrt.py      | 26 +++++--------------
 1 file changed, 6 insertions(+), 20 deletions(-)

diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
index 4038119..a3daf04 100644
--- a/vcap_utils/vcap_utils/backends/base_tensorrt.py
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -2,7 +2,6 @@ import numpy as np
 
 import pycuda.driver as cuda
 import tensorrt as trt
-import cv2
 
 from typing import Dict, List, Tuple, Optional, Generator
 
@@ -12,8 +11,7 @@
     rect_to_coords,
     DetectionNode,
 )
-
-from vcap_utils import non_max_suppression
+from vcap_utils.algorithms import non_max_suppression
 
 
 class HostDeviceMem(object):
@@ -167,12 +165,8 @@ def _do_inference(self, bindings: List[int],
             out_array_by_batch = np.split(entire_out_array, batch_size)
             out_lists = [out_array.tolist() for out_array in out_array_by_batch]
             batch_outputs.append(out_lists)
-        final_outputs = []
-        for i in range(len(batch_outputs[0])):
-            final_output = []
-            for batch_output in batch_outputs:
-                final_output.append(batch_output[i])
-            final_outputs.append(final_output)
+        final_outputs = list(zip(*batch_outputs))
+        final_outputs = [list(item) for item in final_outputs]
         self.cuda_context.pop()
         return final_outputs
 
@@ -183,16 +177,8 @@ def _prepare_post_process(self):
         self.grid_w = int(self.engine_width / stride)
         self.grid_size = self.grid_h * self.grid_w
 
-        self.grid_centers_w = []
-        self.grid_centers_h = []
-
-        for i in range(self.grid_h):
-            value = (i * stride + 0.5) / self.box_norm
-            self.grid_centers_h.append(value)
-
-        for i in range(self.grid_w):
-            value = (i * stride + 0.5) / self.box_norm
-            self.grid_centers_w.append(value)
+        self.grid_centers_h = [(i * stride + 0.5) / self.box_norm for i in range(self.grid_h)]
+        self.grid_centers_w = [(i * stride + 0.5) / self.box_norm for i in range(self.grid_w)]
 
@@ -249,7 +235,7 @@ def parse_detection_results(
                             extra_data={"detection_confidence": score},
                         ))
-        nodes = non_max_suppression(detection_nodes, max_bbox_overlap=0.5)
+        detection_nodes = non_max_suppression(detection_nodes, max_bbox_overlap=0.5)
         resize.scale_and_offset_detection_nodes(detection_nodes)
         return detection_nodes

From 8fe05983a25b2f0cb287785a7bccc34bdf154e6b Mon Sep 17 00:00:00 2001
From: Zhao Wang
Date: Mon, 3 May 2021 18:14:26 -0700
Subject: [PATCH 20/21] simplify code

---
 vcap_utils/vcap_utils/backends/base_tensorrt.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
index a3daf04..b66c83f 100644
--- a/vcap_utils/vcap_utils/backends/base_tensorrt.py
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -79,13 +79,12 @@ def batch_predict(self, input_data_list: List[np.ndarray]) \
     def _process_batch(self, input_data: List[np.array]) -> List[List[float]]:
         batch_size = len(input_data)
         prepared_buffer = self.buffers[batch_size]
-        inputs = prepared_buffer.inputs
         # todo: get dtype from engine
-        inputs[0].host = np.ascontiguousarray(input_data, dtype=np.float32)
+        prepared_buffer.inputs[0].host = np.ascontiguousarray(input_data, dtype=np.float32)
 
         return self._do_inference(
             bindings=prepared_buffer.bindings,
-            inputs=inputs,
+            inputs=prepared_buffer.inputs,
             outputs=prepared_buffer.outputs,
             stream=prepared_buffer.stream,
             batch_size=batch_size

From 1da54367b4aaa08385f0072e66e838114a9f353a Mon Sep 17 00:00:00 2001
From: Zhao Wang
Date: Wed, 5 May 2021 15:21:53 -0700
Subject: [PATCH 21/21] change log level to INFO, cast score to float

---
 vcap_utils/vcap_utils/backends/base_tensorrt.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
index b66c83f..596e8c0 100644
--- a/vcap_utils/vcap_utils/backends/base_tensorrt.py
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -41,7 +41,7 @@ def __init__(self, engine_bytes: bytes, width: int, height: int, device_name: st
         cuda.init()
         dev = cuda.Device(gpu_device_id)
         self.cuda_context = dev.make_context()
-        TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
+        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
         self.trt_runtime = trt.Runtime(TRT_LOGGER)
         # load the engine
         self.trt_engine = self.trt_runtime.deserialize_cuda_engine(engine_bytes)
@@ -232,7 +232,7 @@ def parse_detection_results(
                             coords=rect_to_coords(
                                 [xmin, ymin, xmax, ymax]
                             ),
-                            extra_data={"detection_confidence": score},
+                            extra_data={"detection_confidence": float(score)},
                         ))
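+        # suppress overlapping boxes before mapping coordinates back onto
+        # the original frame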
         detection_nodes = non_max_suppression(detection_nodes, max_bbox_overlap=0.5)
         resize.scale_and_offset_detection_nodes(detection_nodes)
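
Usage note (reviewer sketch, not part of the patch series): a concrete capsule
would subclass BaseTensorRTBackend and wire prepare_inputs, batch_predict, and
parse_detection_results together, roughly as below. The capsule class, label
map, input size, and option name are hypothetical placeholders, and a real
capsule would normally feed frames through vcap's batching machinery rather
than calling batch_predict directly.

    import numpy as np

    from vcap_utils.backends import BaseTensorRTBackend


    class PersonDetector(BaseTensorRTBackend):
        # hypothetical single-class detectnet-style engine
        LABEL_MAP = {0: "person"}

        def __init__(self, engine_bytes: bytes, device_name: str):
            # width/height must match the dimensions the engine was built with
            super().__init__(engine_bytes=engine_bytes,
                             width=960, height=544,
                             device_name=device_name)

        def process_frame(self, frame: np.ndarray, detection_node, options, state):
            # channels-first, [0, 1]-normalized input, as prepare_inputs supports
            prepared, resize = self.prepare_inputs(
                frame, transpose=True, normalize=True)
            # batch_predict is a generator yielding one result per input image
            results = next(self.batch_predict([prepared]))
            return self.parse_detection_results(
                results, resize, self.LABEL_MAP,
                min_confidence=options.get("threshold", 0.5))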