From 3059a7e81e9c78ea29be0ce2c841b82254965088 Mon Sep 17 00:00:00 2001
From: Zhao
Date: Tue, 20 Apr 2021 15:41:24 -0700
Subject: [PATCH 01/21] add a base tensorrt backend

---
 vcap_utils/vcap_utils/backends/__init__.py    |   1 +
 .../vcap_utils/backends/base_tensorrt.py      | 267 ++++++++++++++++++
 2 files changed, 268 insertions(+)
 create mode 100644 vcap_utils/vcap_utils/backends/base_tensorrt.py

diff --git a/vcap_utils/vcap_utils/backends/__init__.py b/vcap_utils/vcap_utils/backends/__init__.py
index 15ce457..b38a1dc 100644
--- a/vcap_utils/vcap_utils/backends/__init__.py
+++ b/vcap_utils/vcap_utils/backends/__init__.py
@@ -8,6 +8,7 @@ from .openface_encoder import OpenFaceEncoder
 from .base_encoder import BaseEncoderBackend
 from .backend_rpc_process import BackendRpcProcess
+from .base_tensorrt import BaseTensorRTBackend
 from .load_utils import parse_dataset_metadata_bytes, parse_tf_model_bytes
 from .predictions import (
     EncodingPrediction,
diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
new file mode 100644
index 0000000..9df6e4c
--- /dev/null
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -0,0 +1,267 @@
+import numpy as np
+import pycuda.driver as cuda
+import tensorrt as trt
+import pycuda.autoinit
+
+from typing import Dict, List, Tuple, Optional, Any
+
+from vcap import (
+    Crop,
+    DetectionNode,
+    Resize,
+    DETECTION_NODE_TYPE,
+    OPTION_TYPE,
+    BaseStreamState,
+    BaseBackend,
+    rect_to_coords,
+)
+
+
+class HostDeviceMem(object):
+    def __init__(self, host_mem, device_mem):
+        self.host = host_mem
+        self.device = device_mem
+
+    def __str__(self):
+        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)
+
+    def __repr__(self):
+        return self.__str__()
+
+
+def load_engine(trt_runtime, engine_data):
+    engine = trt_runtime.deserialize_cuda_engine(engine_data)
+    return engine
+
+
+class AllocatedBuffer:
+    def __init__(self, inputs_, outputs_, bindings_, stream_):
+        self.inputs = inputs_
+        self.outputs = outputs_
+        self.bindings = bindings_
+        self.stream = stream_
+
+
+class BaseTensorRTBackend(BaseBackend):
+    def __init__(self, engine_bytes, width, height):
+        super().__init__()
+        TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
+        self.trt_runtime = trt.Runtime(TRT_LOGGER)
+        # load the engine
+        self.trt_engine = load_engine(self.trt_runtime, engine_bytes)
+        # create execution context
+        self.context = self.trt_engine.create_execution_context()
+        # create buffers for inference
+        self.buffers = {}
+        for batch_size in range(1, self.trt_engine.max_batch_size + 1):
+            inputs, outputs, bindings, stream = self.allocate_buffers(
+                batch_size=batch_size)
+            self.buffers[batch_size] = AllocatedBuffer(inputs, outputs, bindings,
+                                                       stream)
+
+        self.engine_width = width
+        self.engine_height = height
+
+        self._prepare_post_process()
+
+    def batch_predict(self, input_data_list: List[Any]) -> List[Any]:
+        task_size = len(input_data_list)
+        curr_index = 0
+        while curr_index < task_size:
+            if curr_index + self.trt_engine.max_batch_size <= task_size:
+                end_index = curr_index + self.trt_engine.max_batch_size
+            else:
+                end_index = task_size
+            batch = input_data_list[curr_index:end_index]
+            curr_index = end_index
+            for result in self._process_batch(batch):
+                yield result
+
+    def _process_batch(self, input_data: List[np.array]):
+        batch_size = len(input_data)
+        batched_image = np.concatenate(input_data, axis=0)
+        prepared_buffer = self.buffers[batch_size]
+        inputs = prepared_buffer.inputs
+        outputs = prepared_buffer.outputs
+        bindings = prepared_buffer.bindings
+        stream = prepared_buffer.stream
+        np.copyto(inputs[0].host, batched_image.ravel())
+        detections = self.do_inference(
+            bindings=bindings, inputs=inputs, outputs=outputs, stream=stream,
+            batch_size=batch_size
+        )
+        return detections
+
+    def process_frame(self, frame: np.ndarray, detection_node: DETECTION_NODE_TYPE,
+                      options: Dict[str, OPTION_TYPE],
+                      state: BaseStreamState) -> DETECTION_NODE_TYPE:
+        pass
+
+    def prepare_inputs(self, frame: np.ndarray, transpose: bool, normalize: bool,
+                       mean_subtraction: Optional[Tuple] = None) -> \
+            Tuple[np.array, Resize]:
+        resize = Resize(frame).resize(self.engine_width, self.engine_height,
+                                      Resize.ResizeType.EXACT)
+
+        if transpose:
+            resize.frame = np.transpose(resize.frame, (2, 0, 1))
+        if normalize:
+            resize.frame = (1.0 / 255.0) * resize.frame
+        if mean_subtraction is not None:
+            if len(mean_subtraction) != 3:
+                raise RuntimeError("Invalid mean subtraction")
+            resize.frame = resize.frame.astype("float64")
+            resize.frame[..., 0] -= mean_subtraction[0]
+            resize.frame[..., 1] -= mean_subtraction[1]
+            resize.frame[..., 2] -= mean_subtraction[2]
+        return resize.frame, resize
+
+    def allocate_buffers(self, batch_size=1):
+        """Allocates host and device buffer for TRT engine inference.
+        This function is similar to the one in common.py, but
+        converts network outputs (which are np.float32) appropriately
+        before writing them to Python buffer. This is needed, since
+        TensorRT plugins don't support output type description, and
+        in our particular case, we use NMS plugin as network output.
+        Args:
+            engine (trt.ICudaEngine): TensorRT engine
+            batch_size: batch size for the input/output memory
+        Returns:
+            inputs [HostDeviceMem]: engine input memory
+            outputs [HostDeviceMem]: engine output memory
+            bindings [int]: buffer to device bindings
+            stream (cuda.Stream): cuda stream for engine inference synchronization
+        """
+        inputs = []
+        outputs = []
+        bindings = []
+        stream = cuda.Stream()
+        for binding in self.trt_engine:
+            size = trt.volume(self.trt_engine.get_binding_shape(binding)) * batch_size
+            dtype = trt.nptype(self.trt_engine.get_binding_dtype(binding))
+            # Allocate host and device buffers
+            host_mem = cuda.pagelocked_empty(size, dtype)
+            device_mem = cuda.mem_alloc(host_mem.nbytes)
+            # Append the device buffer to device bindings.
+            bindings.append(int(device_mem))
+            # Append to the appropriate list.
+            if self.trt_engine.binding_is_input(binding):
+                inputs.append(HostDeviceMem(host_mem, device_mem))
+            else:
+                outputs.append(HostDeviceMem(host_mem, device_mem))
+        return inputs, outputs, bindings, stream
+
+    def do_inference(self, bindings, inputs, outputs, stream, batch_size=1):
+        # Transfer input data to the GPU.
+        [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
+        # Run inference.
+        # todo: try execute synchronously
+        self.context.execute_async(
+            batch_size=batch_size, bindings=bindings, stream_handle=stream.handle
+        )
+        # Transfer predictions back from the GPU.
+        [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
+        # Synchronize the stream
+        stream.synchronize()
+        # Return only the host outputs.
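+        # Each output binding was allocated for the whole batch, so out.host
+        # holds batch_size results back to back; split it per batch item,
+        # then regroup below so each final_outputs entry maps to one input.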
+        batch_outputs = []
+        for out in outputs:
+            entire_out_array = np.array(out.host)
+            out_array_by_batch = np.split(entire_out_array, batch_size)
+            out_lists = [out_array.tolist() for out_array in out_array_by_batch]
+            batch_outputs.append(out_lists)
+        final_outputs = []
+        for i in range(len(batch_outputs[0])):
+            final_output = []
+            for batch_output in batch_outputs:
+                final_output.append(batch_output[i])
+            final_outputs.append(final_output)
+        return final_outputs
+
+    def _prepare_post_process(self):
+        self.stride = 16
+        self.box_norm = 35.0
+        self.grid_h = int(self.engine_height / self.stride)
+        self.grid_w = int(self.engine_width / self.stride)
+        self.grid_size = self.grid_h * self.grid_w
+
+        self.grid_centers_w = []
+        self.grid_centers_h = []
+
+        for i in range(self.grid_h):
+            value = (i * self.stride + 0.5) / self.box_norm
+            self.grid_centers_h.append(value)
+
+        for i in range(self.grid_w):
+            value = (i * self.stride + 0.5) / self.box_norm
+            self.grid_centers_w.append(value)
+
+    def _apply_box_norm(self, o1, o2, o3, o4, x, y):
+        """
+        Applies the GridNet box normalization
+        Args:
+            o1 (float): first argument of the result
+            o2 (float): second argument of the result
+            o3 (float): third argument of the result
+            o4 (float): fourth argument of the result
+            x: row index on the grid
+            y: column index on the grid
+
+        Returns:
+            float: rescaled first argument
+            float: rescaled second argument
+            float: rescaled third argument
+            float: rescaled fourth argument
+        """
+        o1 = (o1 - self.grid_centers_w[x]) * -self.box_norm
+        o2 = (o2 - self.grid_centers_h[y]) * -self.box_norm
+        o3 = (o3 + self.grid_centers_w[x]) * self.box_norm
+        o4 = (o4 + self.grid_centers_h[y]) * self.box_norm
+        return o1, o2, o3, o4
+
+    def postprocess(self, outputs, min_confidence, analysis_classes, wh_format=True):
+        """
+        Postprocesses the inference output
+        Args:
+            outputs (list of float): inference output
+            min_confidence (float): min confidence to accept detection
+            analysis_classes (list of int): indices of the classes to consider
+
+        Returns: list of list tuple: each element is a two list tuple (x, y) representing the corners of a bb
+        """
+        # print(len(outputs))
+        bbs = []
+        class_ids = []
+        scores = []
+        for c in analysis_classes:
+
+            x1_idx = c * 4 * self.grid_size
+            y1_idx = x1_idx + self.grid_size
+            x2_idx = y1_idx + self.grid_size
+            y2_idx = x2_idx + self.grid_size
+
+            boxes = outputs[0]
+            for h in range(self.grid_h):
+                for w in range(self.grid_w):
+                    i = w + h * self.grid_w
+                    score = outputs[1][c * self.grid_size + i]
+                    if score >= min_confidence:
+                        o1 = boxes[x1_idx + w + h * self.grid_w]
+                        o2 = boxes[y1_idx + w + h * self.grid_w]
+                        o3 = boxes[x2_idx + w + h * self.grid_w]
+                        o4 = boxes[y2_idx + w + h * self.grid_w]
+
+                        o1, o2, o3, o4 = self._apply_box_norm(o1, o2, o3, o4, w, h)
+
+                        xmin = int(o1)
+                        ymin = int(o2)
+                        xmax = int(o3)
+                        ymax = int(o4)
+                        if wh_format:
+                            bbs.append([xmin, ymin, xmax - xmin, ymax - ymin])
+                        else:
+                            bbs.append([xmin, ymin, xmax, ymax])
+                        class_ids.append(c)
+                        scores.append(float(score))
+
+        return bbs, class_ids, scores

From 17d6c48118c36724f91d3407fdf4cdc77416da60 Mon Sep 17 00:00:00 2001
From: Zhao
Date: Wed, 21 Apr 2021 18:26:23 -0700
Subject: [PATCH 02/21] ravel before concatenate to improve performance

---
 .../vcap_utils/backends/base_tensorrt.py      | 63 +++++++++++++++++--
 1 file changed, 59 insertions(+), 4 deletions(-)

diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
index 9df6e4c..8e73f37 100644
--- a/vcap_utils/vcap_utils/backends/base_tensorrt.py
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -1,7 +1,10 @@
 import numpy as np
+import cupy as cp
+
 import pycuda.driver as cuda
 import tensorrt as trt
 import pycuda.autoinit
+import time
 
 from typing import Dict, List, Tuple, Optional, Any
 
@@ -78,14 +81,54 @@ def batch_predict(self, input_data_list: List[Any]) -> List[Any]:
             for result in self._process_batch(batch):
                 yield result
 
     def _process_batch(self, input_data: List[np.array]):
+        pre_batch_time = time.time()
         batch_size = len(input_data)
-        batched_image = np.concatenate(input_data, axis=0)
+        # input_data_cuppy = [cp.array(data) for data in input_data]
+        """Ideas
+        1) Try raveling before concatenate
+        2) Instead of concatenating, try generating an array of the size and shape of what the concatenated image WOULD
+        be, then copy the raveled images into their respective places in the array
+        """
+        ravel_time = time.time()
+        raveled_input = [data.ravel() for data in input_data]
+        print("batch_size:", batch_size, "ravel time:", int(round((time.time() - ravel_time) * 1000)),
+              int(round((time.time() - ravel_time) * 1000)) / batch_size)
+
+        concatenate_time = time.time()
+        batched_image = np.concatenate(raveled_input, axis=0)
+        #image_size = len(raveled_input[0])
+        #batched_image = np.zeros((1, batch_size * image_size))
+        #for index, image in enumerate(batched_image):
+        #    batched_image[index * image_size:(index + 1) * image_size] = image
+        print("batch_size:", batch_size, "concatenate time:", int(round((time.time() - concatenate_time) * 1000)),
+              int(round((time.time() - concatenate_time) * 1000)) / batch_size)
+
+        # image_size = self.engine_height * self.engine_width
+        # batched_image = np.zeros((1, batch_size * image_size))
+        # for index, image in enumerate(batched_image):
+        #     for row_index, row in enumerate(image):
+        #         batched_image[index * image_size + row_index * self.engine_width:(index+1) * image_size + row_index * self.engine_width] = row
+
+        # for data in input_data:
+        #     batch_image_array.append(data.ravel())
+
+        # batched_image = np.concatenate(input_data, axis=0)
+        # print(type(batched_image))
         prepared_buffer = self.buffers[batch_size]
         inputs = prepared_buffer.inputs
         outputs = prepared_buffer.outputs
         bindings = prepared_buffer.bindings
         stream = prepared_buffer.stream
-        np.copyto(inputs[0].host, batched_image.ravel())
+        copy_time = time.time()
+        # raveled_image = batched_image.ravel()
+
+        # np.copyto(inputs[0].host, raveled_image)
+        np.copyto(inputs[0].host, batched_image)
+        print("batch_size:", batch_size, "copy time:", int(round((time.time() - copy_time) * 1000)),
+              int(round((time.time() - copy_time) * 1000)) / batch_size)
+
+        print("batch_size:", batch_size, "pre_batch_time:", int(round((time.time() - pre_batch_time) * 1000)),
+              int(round((time.time() - pre_batch_time) * 1000)) / batch_size)
         detections = self.do_inference(
             bindings=bindings, inputs=inputs, outputs=outputs, stream=stream,
             batch_size=batch_size
@@ -100,8 +143,12 @@ def process_frame(self, frame: np.ndarray, detection_node: DETECTION_NODE_TYPE,
     def prepare_inputs(self, frame: np.ndarray, transpose: bool, normalize: bool,
                        mean_subtraction: Optional[Tuple] = None) -> \
             Tuple[np.array, Resize]:
+        pre_process_start_time = time.time()
+        # h, w, c = frame.shape
+        # print(h, w, self.engine_height, self.engine_width)
         resize = Resize(frame).resize(self.engine_width, self.engine_height,
                                       Resize.ResizeType.EXACT)
+        # print("resize take:", int(round((time.time() - pre_process_start_time) * 1000)))
 
         if transpose:
             resize.frame = np.transpose(resize.frame, (2, 0, 1))
@@ -114,6 +161,7 @@ def prepare_inputs(self, frame: np.ndarray, transpose: bool, normalize: bool,
             resize.frame[..., 0] -= mean_subtraction[0]
             resize.frame[..., 1] -= mean_subtraction[1]
             resize.frame[..., 2] -= mean_subtraction[2]
+        # print("prepare input take:", int(round((time.time() - pre_process_start_time) * 1000)))
         return resize.frame, resize
 
     def allocate_buffers(self, batch_size=1):
@@ -152,12 +200,14 @@ def allocate_buffers(self, batch_size=1):
         return inputs, outputs, bindings, stream
 
     def do_inference(self, bindings, inputs, outputs, stream, batch_size=1):
+        inference_start_time = time.time()
+
         # Transfer input data to the GPU.
         [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
         # Run inference.
         # todo: try execute synchronously
-        self.context.execute_async(
-            batch_size=batch_size, bindings=bindings, stream_handle=stream.handle
+        self.context.execute(
+            batch_size=batch_size, bindings=bindings
         )
         # Transfer predictions back from the GPU.
         [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
@@ -176,6 +226,11 @@ def do_inference(self, bindings, inputs, outputs, stream, batch_size=1):
             for batch_output in batch_outputs:
                 final_output.append(batch_output[i])
             final_outputs.append(final_output)
+        print("batch_size:", batch_size,
+              "TensorRT inference time: {} ms".format(
+                  int(round((time.time() - inference_start_time) * 1000))
+              )
+              )
         return final_outputs
 
     def _prepare_post_process(self):

From 23196183c2f02baac9dc747ca15fa138347fa5bc Mon Sep 17 00:00:00 2001
From: Zhao Wang
Date: Tue, 27 Apr 2021 14:38:13 -0700
Subject: [PATCH 03/21] don't resize if the size of the image remains the same

---
 vcap/vcap/modifiers.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/vcap/vcap/modifiers.py b/vcap/vcap/modifiers.py
index a9cdeba..c03aeca 100644
--- a/vcap/vcap/modifiers.py
+++ b/vcap/vcap/modifiers.py
@@ -196,7 +196,6 @@ def __init__(self, frame: np.ndarray):
 
     def resize(self, resize_width: int, resize_height: int,
                resize_type: ResizeType):
-
         frame_width = self.frame.shape[1]
         frame_height = self.frame.shape[0]
 
@@ -251,11 +250,11 @@ def resize(self, resize_width: int, resize_height: int,
         # Account for scaling
         scale_width = new_width / frame_width
         scale_height = new_height / frame_height
-        self._operations.append(
-            (self._OperationType.SCALE, (scale_width, scale_height))
-        )
-
-        self.frame = cv2.resize(self.frame, (new_width, new_height))
+        if new_width != frame_width or new_height != frame_height:
+            self._operations.append(
+                (self._OperationType.SCALE, (scale_width, scale_height))
+            )
+            self.frame = cv2.resize(self.frame, (new_width, new_height))
 
         return self

From 5e12a29e06c7f259c8759e45f7b85b04ed060a10 Mon Sep 17 00:00:00 2001
From: Zhao Wang
Date: Tue, 27 Apr 2021 22:52:23 -0700
Subject: [PATCH 04/21] code cleanup, add type hints

---
 .../vcap_utils/backends/base_tensorrt.py      | 98 ++++---------------
 1 file changed, 20 insertions(+), 78 deletions(-)

diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
index 8e73f37..415b5df 100644
--- a/vcap_utils/vcap_utils/backends/base_tensorrt.py
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -1,10 +1,8 @@
 import numpy as np
-import cupy as cp
 
 import pycuda.driver as cuda
 import tensorrt as trt
 import pycuda.autoinit
-import time
 
 from typing import Dict, List, Tuple, Optional, Any
 
@@ -32,11 +30,6 @@ def __repr__(self):
         return self.__str__()
 
 
-def load_engine(trt_runtime, engine_data):
-    engine = trt_runtime.deserialize_cuda_engine(engine_data)
-    return engine
-
-
 class AllocatedBuffer:
     def __init__(self, inputs_, outputs_, bindings_, stream_):
         self.inputs = inputs_
@@ -51,7 +44,8 @@ def __init__(self, engine_bytes, width, height):
         TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
         self.trt_runtime = trt.Runtime(TRT_LOGGER)
         # load the engine
-        self.trt_engine = load_engine(self.trt_runtime, engine_bytes)
+        self.trt_engine = self.trt_runtime.deserialize_cuda_engine(engine_bytes)
+
         # create execution context
         self.context = self.trt_engine.create_execution_context()
         # create buffers for inference
@@ -65,6 +59,8 @@ def __init__(self, engine_bytes, width, height):
         self.engine_width = width
         self.engine_height = height
 
+        # preallocate resources for post process
+        # todo: post process is only needed for detectors
         self._prepare_post_process()
 
     def batch_predict(self, input_data_list: List[Any]) -> List[Any]:
@@ -80,58 +76,18 @@ def batch_predict(self, input_data_list: List[Any]) -> List[Any]:
             for result in self._process_batch(batch):
                 yield result
 
-    def _process_batch(self, input_data: List[np.array]):
-        pre_batch_time = time.time()
+    def _process_batch(self, input_data: List[np.array]) -> List[List[float]]:
         batch_size = len(input_data)
-        # input_data_cuppy = [cp.array(data) for data in input_data]
-        """Ideas
-        1) Try raveling before concatenate
-        2) Instead of concatenating, try generating an array of the size and shape of what the concatenated image WOULD
-        be, then copy the raveled images into their respective places in the array
-        """
-        ravel_time = time.time()
-        raveled_input = [data.ravel() for data in input_data]
-        print("batch_size:", batch_size, "ravel time:", int(round((time.time() - ravel_time) * 1000)),
-              int(round((time.time() - ravel_time) * 1000)) / batch_size)
-
-        concatenate_time = time.time()
-        batched_image = np.concatenate(raveled_input, axis=0)
-        #image_size = len(raveled_input[0])
-        #batched_image = np.zeros((1, batch_size * image_size))
-        #for index, image in enumerate(batched_image):
-        #    batched_image[index * image_size:(index + 1) * image_size] = image
-        print("batch_size:", batch_size, "concatenate time:", int(round((time.time() - concatenate_time) * 1000)),
-              int(round((time.time() - concatenate_time) * 1000)) / batch_size)
-
-        # image_size = self.engine_height * self.engine_width
-        # batched_image = np.zeros((1, batch_size * image_size))
-        # for index, image in enumerate(batched_image):
-        #     for row_index, row in enumerate(image):
-        #         batched_image[index * image_size + row_index * self.engine_width:(index+1) * image_size + row_index * self.engine_width] = row
-
-        # for data in input_data:
-        #     batch_image_array.append(data.ravel())
-
-        # batched_image = np.concatenate(input_data, axis=0)
-        # print(type(batched_image))
         prepared_buffer = self.buffers[batch_size]
         inputs = prepared_buffer.inputs
         outputs = prepared_buffer.outputs
         bindings = prepared_buffer.bindings
         stream = prepared_buffer.stream
-        copy_time = time.time()
-        # raveled_image = batched_image.ravel()
+        # todo: get dtype from engine
+        inputs[0].host = np.ascontiguousarray(input_data, dtype=np.float32)
 
-        # np.copyto(inputs[0].host, raveled_image)
-        np.copyto(inputs[0].host, batched_image)
-        print("batch_size:", batch_size, "copy time:", int(round((time.time() - copy_time) * 1000)),
-              int(round((time.time() - copy_time) * 1000)) / batch_size)
-
-        print("batch_size:", batch_size, "pre_batch_time:", int(round((time.time() - pre_batch_time) * 1000)),
-              int(round((time.time() - pre_batch_time) * 1000)) / batch_size)
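+        # note: the assignment above rebinds inputs[0].host to a fresh
+        # ndarray, so the pagelocked buffer from allocate_buffers is no
+        # longer the array handed to memcpy_htod_async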
         detections = self.do_inference(
-            bindings=bindings, inputs=inputs, outputs=outputs, stream=stream,
-            batch_size=batch_size
+            bindings=bindings, inputs=inputs, outputs=outputs, stream=stream, batch_size=batch_size
         )
         return detections
@@ -143,13 +99,8 @@ def process_frame(self, frame: np.ndarray, detection_node: DETECTION_NODE_TYPE,
     def prepare_inputs(self, frame: np.ndarray, transpose: bool, normalize: bool,
                        mean_subtraction: Optional[Tuple] = None) -> \
             Tuple[np.array, Resize]:
-        pre_process_start_time = time.time()
-        # h, w, c = frame.shape
-        # print(h, w, self.engine_height, self.engine_width)
         resize = Resize(frame).resize(self.engine_width, self.engine_height,
                                       Resize.ResizeType.EXACT)
-        # print("resize take:", int(round((time.time() - pre_process_start_time) * 1000)))
-
         if transpose:
             resize.frame = np.transpose(resize.frame, (2, 0, 1))
         if normalize:
@@ -161,18 +112,12 @@ def prepare_inputs(self, frame: np.ndarray, transpose: bool, normalize: bool,
             resize.frame[..., 0] -= mean_subtraction[0]
             resize.frame[..., 1] -= mean_subtraction[1]
             resize.frame[..., 2] -= mean_subtraction[2]
-        # print("prepare input take:", int(round((time.time() - pre_process_start_time) * 1000)))
         return resize.frame, resize
 
-    def allocate_buffers(self, batch_size=1):
+    def allocate_buffers(self, batch_size: int = 1) -> \
+            Tuple[List[HostDeviceMem], List[HostDeviceMem], List[int], cuda.Stream]:
         """Allocates host and device buffer for TRT engine inference.
-        This function is similar to the one in common.py, but
-        converts network outputs (which are np.float32) appropriately
-        before writing them to Python buffer. This is needed, since
-        TensorRT plugins don't support output type description, and
-        in our particular case, we use NMS plugin as network output.
         Args:
-            engine (trt.ICudaEngine): TensorRT engine
             batch_size: batch size for the input/output memory
         Returns:
            inputs [HostDeviceMem]: engine input memory
           outputs [HostDeviceMem]: engine output memory
           bindings [int]: buffer to device bindings
           stream (cuda.Stream): cuda stream for engine inference synchronization
        """
        inputs = []
        outputs = []
        bindings = []
        stream = cuda.Stream()
        for binding in self.trt_engine:
            size = trt.volume(self.trt_engine.get_binding_shape(binding)) * batch_size
            dtype = trt.nptype(self.trt_engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(device_mem))
            # Append to the appropriate list.
            if self.trt_engine.binding_is_input(binding):
                inputs.append(HostDeviceMem(host_mem, device_mem))
            else:
                outputs.append(HostDeviceMem(host_mem, device_mem))
        return inputs, outputs, bindings, stream
 
-    def do_inference(self, bindings, inputs, outputs, stream, batch_size=1):
-        inference_start_time = time.time()
-
+    def do_inference(self, bindings: List[int], inputs: List[HostDeviceMem], outputs: List[HostDeviceMem],
+                     stream: cuda.Stream, batch_size: int = 1) -> List[List[float]]:
         # Transfer input data to the GPU.
         [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
         # Run inference.
-        # todo: try execute synchronously
+        # todo: use async or sync api?
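+        # (execute() blocks the calling thread until the batch completes,
+        #  while execute_async() only enqueues work on the stream and relies
+        #  on the stream synchronization below)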
+        # According to https://docs.nvidia.com/deeplearning/tensorrt/best-practices/index.html#optimize-python
+        # the performance should be almost identical
         self.context.execute(
             batch_size=batch_size, bindings=bindings
         )
         # Transfer predictions back from the GPU.
         [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
@@ -226,11 +172,6 @@ def do_inference(self, bindings, inputs, outputs, stream, batch_size=1):
             for batch_output in batch_outputs:
                 final_output.append(batch_output[i])
             final_outputs.append(final_output)
-        print("batch_size:", batch_size,
-              "TensorRT inference time: {} ms".format(
-                  int(round((time.time() - inference_start_time) * 1000))
-              )
-              )
         return final_outputs
 
     def _prepare_post_process(self):
@@ -251,7 +192,8 @@ def _prepare_post_process(self):
             value = (i * self.stride + 0.5) / self.box_norm
             self.grid_centers_w.append(value)
 
-    def _apply_box_norm(self, o1, o2, o3, o4, x, y):
+    def _apply_box_norm(self, o1: float, o2: float, o3: float, o4: float, x: int, y: int) -> \
+            Tuple[float, float, float, float]:
         """
         Applies the GridNet box normalization
         Args:
@@ -274,17 +216,17 @@ def _apply_box_norm(self, o1, o2, o3, o4, x, y):
         o4 = (o4 + self.grid_centers_h[y]) * self.box_norm
         return o1, o2, o3, o4
 
-    def postprocess(self, outputs, min_confidence, analysis_classes, wh_format=True):
+    def postprocess(self, outputs: List[float], min_confidence: float, analysis_classes: List[int], wh_format=True)-> \
+            Tuple[List[List[int]], List[int], List[float]]:
         """
         Postprocesses the inference output
         Args:
             outputs (list of float): inference output
             min_confidence (float): min confidence to accept detection
             analysis_classes (list of int): indices of the classes to consider
-
+            wh_format (bool): return bbox in (xmin, ymin, w, h) or (xmin, ymin, xmax, ymax)
         Returns: list of list tuple: each element is a two list tuple (x, y) representing the corners of a bb
         """
-        # print(len(outputs))
         bbs = []
         class_ids = []
         scores = []

From d2a00a0be54be77c127b04c7b264b18e0fd63a99 Mon Sep 17 00:00:00 2001
From: Zhao Wang
Date: Tue, 27 Apr 2021 22:57:30 -0700
Subject: [PATCH 05/21] update dependency list

---
 vcap/setup.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vcap/setup.py b/vcap/setup.py
index fd04d33..ceba2da 100644
--- a/vcap/setup.py
+++ b/vcap/setup.py
@@ -25,6 +25,8 @@
         "scikit-learn==0.22.2",
         "numpy>=1.16,<2",
         "tensorflow-gpu==1.15.4",
+        "pycuda>=2019.1.1",
+        "tensorrt==7.2.3.4",
     ],
     extras_require={
         "tests": test_packages,

From 659d003f1b994691bafd49ae2f96bb292fe3109e Mon Sep 17 00:00:00 2001
From: Zhao Wang
Date: Thu, 29 Apr 2021 18:32:57 -0700
Subject: [PATCH 06/21] add support for multi-GPU

---
 .../vcap_utils/backends/base_tensorrt.py      | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
index 415b5df..22d8349 100644
--- a/vcap_utils/vcap_utils/backends/base_tensorrt.py
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -2,19 +2,15 @@ import numpy as np
 
 import pycuda.driver as cuda
 import tensorrt as trt
-import pycuda.autoinit
 
 from typing import Dict, List, Tuple, Optional, Any
 
 from vcap import (
-    Crop,
-    DetectionNode,
     Resize,
     DETECTION_NODE_TYPE,
     OPTION_TYPE,
     BaseStreamState,
     BaseBackend,
-    rect_to_coords,
 )
 
 
@@ -39,13 +35,16 @@ def __init__(self, inputs_, outputs_, bindings_, stream_):
 
 class BaseTensorRTBackend(BaseBackend):
-    def __init__(self, engine_bytes, width, height):
+    def __init__(self, engine_bytes, width, height, device_id):
         super().__init__()
+        gpu_device_id = int(device_id[4:])
+        cuda.init()
+        dev = cuda.Device(gpu_device_id)
+        self.ctx = dev.make_context()
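+        # make_context() also pushes the new context onto this thread's
+        # context stack, leaving it current while the engine and buffers
+        # are created below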
         TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
         self.trt_runtime = trt.Runtime(TRT_LOGGER)
         # load the engine
         self.trt_engine = self.trt_runtime.deserialize_cuda_engine(engine_bytes)
-
         # create execution context
         self.context = self.trt_engine.create_execution_context()
         # create buffers for inference
@@ -147,6 +146,7 @@ def allocate_buffers(self, batch_size: int = 1) -> \
     def do_inference(self, bindings: List[int], inputs: List[HostDeviceMem], outputs: List[HostDeviceMem],
                      stream: cuda.Stream, batch_size: int = 1) -> List[List[float]]:
         # Transfer input data to the GPU.
+        self.ctx.push()
         [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
         # Run inference.
         # todo: use async or sync api?
@@ -172,6 +172,7 @@ def do_inference(self, bindings: List[int], inputs: List[HostDeviceMem], outputs
             for batch_output in batch_outputs:
                 final_output.append(batch_output[i])
             final_outputs.append(final_output)
+        self.ctx.pop()
         return final_outputs
 
     def _prepare_post_process(self):
@@ -216,7 +217,7 @@ def _apply_box_norm(self, o1: float, o2: float, o3: float, o4: float, x: int, y:
         o4 = (o4 + self.grid_centers_h[y]) * self.box_norm
         return o1, o2, o3, o4
 
-    def postprocess(self, outputs: List[float], min_confidence: float, analysis_classes: List[int], wh_format=True)-> \
+    def postprocess(self, outputs: List[float], min_confidence: float, analysis_classes: List[int], wh_format=True) -> \
             Tuple[List[List[int]], List[int], List[float]]:
         """
         Postprocesses the inference output
@@ -262,3 +263,7 @@ def postprocess(self, outputs: List[float], min_confidence: float, analysis_clas
                         scores.append(float(score))
 
         return bbs, class_ids, scores
+
+    def close(self):
+        super().close()
+        self.ctx.pop()

From 4f28496399bd9b07ea06b94840acb257ab04ce03 Mon Sep 17 00:00:00 2001
From: Zhao Wang
Date: Fri, 30 Apr 2021 00:52:03 -0700
Subject: [PATCH 07/21] update post processing api

---
 .../vcap_utils/backends/base_tensorrt.py      | 57 ++++++++++---------
 1 file changed, 31 insertions(+), 26 deletions(-)

diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
index 22d8349..9554559 100644
--- a/vcap_utils/vcap_utils/backends/base_tensorrt.py
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -2,6 +2,7 @@ import numpy as np
 
 import pycuda.driver as cuda
 import tensorrt as trt
+import cv2
 
 from typing import Dict, List, Tuple, Optional, Any
 
@@ -11,6 +12,8 @@
     OPTION_TYPE,
     BaseStreamState,
     BaseBackend,
+    rect_to_coords,
+    DetectionNode,
 )
 
 
@@ -217,53 +220,55 @@ def _apply_box_norm(self, o1: float, o2: float, o3: float, o4: float, x: int, y:
         o4 = (o4 + self.grid_centers_h[y]) * self.box_norm
         return o1, o2, o3, o4
 
-    def postprocess(self, outputs: List[float], min_confidence: float, analysis_classes: List[int], wh_format=True) -> \
-            Tuple[List[List[int]], List[int], List[float]]:
-        """
-        Postprocesses the inference output
-        Args:
-            outputs (list of float): inference output
-            min_confidence (float): min confidence to accept detection
-            analysis_classes (list of int): indices of the classes to consider
-            wh_format (bool): return bbox in (xmin, ymin, w, h) or (xmin, ymin, xmax, ymax)
-        Returns: list of list tuple: each element is a two list tuple (x, y) representing the corners of a bb
-        """
+    def parse_detection_results(
+            self, results: List[List[float]],
+            resize: Resize,
+            label_map: Dict[int, str],
+            min_confidence: float = 0.0,
+    ) -> List[DetectionNode]:
         bbs = []
         class_ids = []
         scores = []
-        for c in analysis_classes:
+        for c in label_map.keys():
 
             x1_idx = c * 4 * self.grid_size
             y1_idx = x1_idx + self.grid_size
             x2_idx = y1_idx + self.grid_size
             y2_idx = x2_idx + self.grid_size
 
-            boxes = outputs[0]
+            boxes = results[0]
             for h in range(self.grid_h):
                 for w in range(self.grid_w):
                     i = w + h * self.grid_w
-                    score = outputs[1][c * self.grid_size + i]
+                    score = results[1][c * self.grid_size + i]
                     if score >= min_confidence:
                         o1 = boxes[x1_idx + w + h * self.grid_w]
                         o2 = boxes[y1_idx + w + h * self.grid_w]
                         o3 = boxes[x2_idx + w + h * self.grid_w]
                         o4 = boxes[y2_idx + w + h * self.grid_w]
                         o1, o2, o3, o4 = self._apply_box_norm(o1, o2, o3, o4, w, h)
                         xmin = int(o1)
                         ymin = int(o2)
                         xmax = int(o3)
                         ymax = int(o4)
-                        if wh_format:
-                            bbs.append([xmin, ymin, xmax - xmin, ymax - ymin])
-                        else:
-                            bbs.append([xmin, ymin, xmax, ymax])
+                        bbs.append([xmin, ymin, xmax - xmin, ymax - ymin])
                         class_ids.append(c)
                         scores.append(float(score))
-
-        return bbs, class_ids, scores
-
-    def close(self):
-        super().close()
-        self.ctx.pop()
+        indexes = cv2.dnn.NMSBoxes(bbs, scores, min_confidence, 0.5)
+        detections = []
+        for idx in indexes:
+            idx = int(idx)
+            xmin, ymin, w, h = bbs[idx]
+            class_id = class_ids[idx]
+            class_name = label_map[class_id]
+            detections.append(
+                DetectionNode(
+                    name=class_name,
+                    coords=rect_to_coords(
+                        [xmin, ymin, (xmin + w), (ymin + h)]
+                    ),
+                    extra_data={"detection_confidence": scores[idx]},
+                )
+            )
+        resize.scale_and_offset_detection_nodes(detections)
+        return detections

From 99e9248830e4b1bb063b97616ebd21c008516c7c Mon Sep 17 00:00:00 2001
From: Zhao Wang
Date: Fri, 30 Apr 2021 11:41:37 -0700
Subject: [PATCH 08/21] add close() function back, was deleted by accident

---
 vcap_utils/vcap_utils/backends/base_tensorrt.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
index 9554559..d119283 100644
--- a/vcap_utils/vcap_utils/backends/base_tensorrt.py
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -272,3 +272,7 @@ def parse_detection_results(
             )
         resize.scale_and_offset_detection_nodes(detections)
         return detections
+
+    def close(self) -> None:
+        super().close()
+        self.ctx.pop()
\ No newline at end of file

From 1901c879732e65f78baf4b7ed9b0cc6ad6091c26 Mon Sep 17 00:00:00 2001
From: Zhao Wang
Date: Fri, 30 Apr 2021 21:21:31 -0700
Subject: [PATCH 09/21] remove process_frame, add type hint for __init__

---
 vcap_utils/vcap_utils/backends/base_tensorrt.py | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
index d119283..29b0005 100644
--- a/vcap_utils/vcap_utils/backends/base_tensorrt.py
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -8,9 +8,6 @@
 from vcap import (
     Resize,
-    DETECTION_NODE_TYPE,
-    OPTION_TYPE,
-    BaseStreamState,
     BaseBackend,
     rect_to_coords,
     DetectionNode,
@@ -38,7 +35,7 @@
 
 class BaseTensorRTBackend(BaseBackend):
-    def __init__(self, engine_bytes, width, height, device_id):
+    def __init__(self, engine_bytes: bytes, width: int, height: int, device_id: str):
         super().__init__()
         gpu_device_id = int(device_id[4:])
         cuda.init()
@@ -93,11 +90,6 @@ def _process_batch(self, input_data: List[np.array]) -> List[List[float]]:
         )
         return detections
 
-    def process_frame(self, frame: np.ndarray, detection_node: DETECTION_NODE_TYPE,
-                      options: Dict[str, OPTION_TYPE],
-                      state: BaseStreamState) -> DETECTION_NODE_TYPE:
-        pass
-
     def prepare_inputs(self, frame: np.ndarray, transpose: bool, normalize: bool,
                        mean_subtraction: Optional[Tuple] = None) -> \
             Tuple[np.array, Resize]:
@@ -275,4 +267,4 @@ def parse_detection_results(
 
     def close(self) -> None:
         super().close()
-        self.ctx.pop()
\ No newline at end of file
+        self.ctx.pop()

From d51139b95d9d5652afdce2b67effa5ca994c86af Mon Sep 17 00:00:00 2001
From: Zhao Wang
Date: Fri, 30 Apr 2021 21:27:55 -0700
Subject: [PATCH 10/21] rename cuda and trt context attributes

---
 vcap_utils/vcap_utils/backends/base_tensorrt.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
index 29b0005..f576cfd 100644
--- a/vcap_utils/vcap_utils/backends/base_tensorrt.py
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -40,13 +40,13 @@ def __init__(self, engine_bytes: bytes, width: int, height: int, device_id: str)
         gpu_device_id = int(device_id[4:])
         cuda.init()
         dev = cuda.Device(gpu_device_id)
-        self.ctx = dev.make_context()
+        self.cuda_context = dev.make_context()
         TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
         self.trt_runtime = trt.Runtime(TRT_LOGGER)
         # load the engine
         self.trt_engine = self.trt_runtime.deserialize_cuda_engine(engine_bytes)
         # create execution context
-        self.context = self.trt_engine.create_execution_context()
+        self.trt_context = self.trt_engine.create_execution_context()
         # create buffers for inference
@@ -141,13 +141,13 @@ def do_inference(self, bindings: List[int], inputs: List[HostDeviceMem], outputs
     def do_inference(self, bindings: List[int], inputs: List[HostDeviceMem], outputs: List[HostDeviceMem],
                      stream: cuda.Stream, batch_size: int = 1) -> List[List[float]]:
         # Transfer input data to the GPU.
-        self.ctx.push()
+        self.cuda_context.push()
         [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
         # Run inference.
         # todo: use async or sync api?
         # According to https://docs.nvidia.com/deeplearning/tensorrt/best-practices/index.html#optimize-python
         # the performance should be almost identical
-        self.context.execute(
+        self.trt_context.execute(
             batch_size=batch_size, bindings=bindings
         )
         # Transfer predictions back from the GPU.
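+        # (the device-to-host copies that follow are asynchronous; the
+        #  stream.synchronize() call below must finish before the host
+        #  buffers are read)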
@@ -167,7 +167,7 @@ def do_inference(self, bindings: List[int], inputs: List[HostDeviceMem], outputs
             for batch_output in batch_outputs:
                 final_output.append(batch_output[i])
             final_outputs.append(final_output)
-        self.ctx.pop()
+        self.cuda_context.pop()
         return final_outputs
 
@@ -267,4 +267,4 @@ def parse_detection_results(
     def close(self) -> None:
         super().close()
-        self.ctx.pop()
+        self.cuda_context.pop()

From 07af9dd37d0a62483605e0b5718b29eb22732fa1 Mon Sep 17 00:00:00 2001
From: Zhao Wang
Date: Fri, 30 Apr 2021 21:38:45 -0700
Subject: [PATCH 11/21] make stride a local variable

---
 vcap_utils/vcap_utils/backends/base_tensorrt.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
index f576cfd..efd8784 100644
--- a/vcap_utils/vcap_utils/backends/base_tensorrt.py
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -171,21 +171,21 @@ def do_inference(self, bindings: List[int], inputs: List[HostDeviceMem], outputs
     def _prepare_post_process(self):
-        self.stride = 16
+        stride = 16
         self.box_norm = 35.0
-        self.grid_h = int(self.engine_height / self.stride)
-        self.grid_w = int(self.engine_width / self.stride)
+        self.grid_h = int(self.engine_height / stride)
+        self.grid_w = int(self.engine_width / stride)
         self.grid_size = self.grid_h * self.grid_w
 
         self.grid_centers_w = []
         self.grid_centers_h = []
 
         for i in range(self.grid_h):
-            value = (i * self.stride + 0.5) / self.box_norm
+            value = (i * stride + 0.5) / self.box_norm
             self.grid_centers_h.append(value)
 
         for i in range(self.grid_w):
-            value = (i * self.stride + 0.5) / self.box_norm
+            value = (i * stride + 0.5) / self.box_norm
             self.grid_centers_w.append(value)

From f661509ff75d442222648943f2e4a64dffd7b9a4 Mon Sep 17 00:00:00 2001
From: Zhao Wang
Date: Fri, 30 Apr 2021 21:44:18 -0700
Subject: [PATCH 12/21] update type hint for batch_predict

---
 vcap_utils/vcap_utils/backends/base_tensorrt.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
index efd8784..7ff4702 100644
--- a/vcap_utils/vcap_utils/backends/base_tensorrt.py
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -4,7 +4,7 @@
 import tensorrt as trt
 import cv2
 
-from typing import Dict, List, Tuple, Optional, Any
+from typing import Dict, List, Tuple, Optional, Generator
 
 from vcap import (
     Resize,
@@ -62,7 +62,8 @@ def __init__(self, engine_bytes: bytes, width: int, height: int, device_id: str)
         self._prepare_post_process()
 
-    def batch_predict(self, input_data_list: List[Any]) -> List[Any]:
+    def batch_predict(self, input_data_list: List[np.ndarray]) \
+            -> Generator[List[DetectionNode], None, None]:
         task_size = len(input_data_list)

From 874414b6fc00eddb8d25b2d8ce4f8a5cd3cc2e4c Mon Sep 17 00:00:00 2001
From: Zhao Wang
Date: Fri, 30 Apr 2021 21:46:18 -0700
Subject: [PATCH 13/21] formatting

---
 vcap_utils/vcap_utils/backends/base_tensorrt.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
index 7ff4702..cb2c449 100644
--- a/vcap_utils/vcap_utils/backends/base_tensorrt.py
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -86,7 +86,7 @@ def _process_batch(self, input_data: List[np.array]) -> List[List[float]]:
         # todo: get dtype from engine
         inputs[0].host = np.ascontiguousarray(input_data, dtype=np.float32)
 
-        detections = self.do_inference(
+        detections = self._do_inference(
             bindings=bindings, inputs=inputs, outputs=outputs, stream=stream, batch_size=batch_size
         )
         return detections
@@ -139,8 +139,11 @@ def allocate_buffers(self, batch_size: int = 1) -> \
         return inputs, outputs, bindings, stream
 
-    def do_inference(self, bindings: List[int], inputs: List[HostDeviceMem], outputs: List[HostDeviceMem],
-                     stream: cuda.Stream, batch_size: int = 1) -> List[List[float]]:
+    def _do_inference(self, bindings: List[int],
+                      inputs: List[HostDeviceMem],
+                      outputs: List[HostDeviceMem],
+                      stream: cuda.Stream,
+                      batch_size: int = 1) -> List[List[float]]:
         # Transfer input data to the GPU.
         self.cuda_context.push()

From 7255b8c2112d6493753deb8670e6410a1b521d8e Mon Sep 17 00:00:00 2001
From: Zhao Wang
Date: Fri, 30 Apr 2021 21:55:48 -0700
Subject: [PATCH 14/21] formatting

---
 vcap_utils/vcap_utils/backends/base_tensorrt.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
index cb2c449..dcb7b38 100644
--- a/vcap_utils/vcap_utils/backends/base_tensorrt.py
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -80,16 +80,16 @@ def _process_batch(self, input_data: List[np.array]) -> List[List[float]]:
         batch_size = len(input_data)
         prepared_buffer = self.buffers[batch_size]
         inputs = prepared_buffer.inputs
-        outputs = prepared_buffer.outputs
-        bindings = prepared_buffer.bindings
-        stream = prepared_buffer.stream
         # todo: get dtype from engine
         inputs[0].host = np.ascontiguousarray(input_data, dtype=np.float32)
 
-        detections = self._do_inference(
-            bindings=bindings, inputs=inputs, outputs=outputs, stream=stream, batch_size=batch_size
+        return self._do_inference(
+            bindings=prepared_buffer.bindings,
+            inputs=inputs,
+            outputs=prepared_buffer.outputs,
+            stream=prepared_buffer.stream,
+            batch_size=batch_size
         )
-        return detections

From 14d88f96d52e2abdebcdd530589e94125598c25b Mon Sep 17 00:00:00 2001
From: Zhao Wang
Date: Mon, 3 May 2021 14:39:01 -0700
Subject: [PATCH 15/21] rename device_id to device_name

---
 vcap_utils/vcap_utils/backends/base_tensorrt.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
index dcb7b38..460df1b 100644
--- a/vcap_utils/vcap_utils/backends/base_tensorrt.py
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -35,9 +35,9 @@ def __init__(self, inputs_, outputs_, bindings_, stream_):
 
 class BaseTensorRTBackend(BaseBackend):
-    def __init__(self, engine_bytes: bytes, width: int, height: int, device_id: str):
+    def __init__(self, engine_bytes: bytes, width: int, height: int, device_name: str):
         super().__init__()
-        gpu_device_id = int(device_id[4:])
+        gpu_device_id = int(device_name[4:])
         cuda.init()
         dev = cuda.Device(gpu_device_id)
         self.cuda_context = dev.make_context()

From 1b936981a61ac708ff064f9fed9d2f8a18cdd9ab Mon Sep 17 00:00:00 2001
From: Zhao Wang <65635224+BestDriverCN@users.noreply.github.com>
Date: Mon, 3 May 2021 15:03:11 -0700
Subject: [PATCH 16/21] _apply_box_norm now returns ints

Co-authored-by: Alex Thiel
---
 .../vcap_utils/backends/base_tensorrt.py      | 22 +++++++++----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
index d119283..ccc2e14 100644
--- a/vcap_utils/vcap_utils/backends/base_tensorrt.py
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -197,7 +197,7 @@ def _prepare_post_process(self):
             self.grid_centers_w.append(value)
 
     def _apply_box_norm(self, o1: float, o2: float, o3: float, o4: float, x: int, y: int) -> \
-            Tuple[float, float, float, float]:
+            Tuple[int, int, int, int]:
         """
         Applies the GridNet box normalization
         Args:
@@ -209,16 +209,16 @@ def _apply_box_norm(self, o1: float, o2: float, o3: float, o4: float, x: int, y:
             y: column index on the grid
 
         Returns:
-            float: rescaled first argument
-            float: rescaled second argument
-            float: rescaled third argument
-            float: rescaled fourth argument
+            int: rescaled first argument
+            int: rescaled second argument
+            int: rescaled third argument
+            int: rescaled fourth argument
         """
-        o1 = (o1 - self.grid_centers_w[x]) * -self.box_norm
-        o2 = (o2 - self.grid_centers_h[y]) * -self.box_norm
-        o3 = (o3 + self.grid_centers_w[x]) * self.box_norm
-        o4 = (o4 + self.grid_centers_h[y]) * self.box_norm
-        return o1, o2, o3, o4
+        xmin = int((o1 - self.grid_centers_w[x]) * -self.box_norm)
+        ymin = int((o2 - self.grid_centers_h[y]) * -self.box_norm)
+        xmax = int((o3 + self.grid_centers_w[x]) * self.box_norm)
+        ymax = int((o4 + self.grid_centers_h[y]) * self.box_norm)
+        return xmin, ymin, xmax, ymax
 
     def parse_detection_results(
             self, results: List[List[float]],
@@ -275,4 +275,4 @@ def parse_detection_results(
 
     def close(self) -> None:
         super().close()
-        self.ctx.pop()
\ No newline at end of file
+        self.ctx.pop()

From 57d3eeb3c4bf08001084213174dc58765a1830a7 Mon Sep 17 00:00:00 2001
From: Zhao Wang
Date: Mon, 3 May 2021 15:06:50 -0700
Subject: [PATCH 17/21] use _apply_box_norm's int return values directly

---
 vcap_utils/vcap_utils/backends/base_tensorrt.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
index b5b3863..b91037e 100644
--- a/vcap_utils/vcap_utils/backends/base_tensorrt.py
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -226,7 +226,6 @@ def parse_detection_results(
         class_ids = []
         scores = []
         for c in label_map.keys():
-
             x1_idx = c * 4 * self.grid_size
             y1_idx = x1_idx + self.grid_size
             x2_idx = y1_idx + self.grid_size
@@ -242,11 +241,7 @@ def parse_detection_results(
                         o2 = boxes[y1_idx + w + h * self.grid_w]
                         o3 = boxes[x2_idx + w + h * self.grid_w]
                         o4 = boxes[y2_idx + w + h * self.grid_w]
-                        o1, o2, o3, o4 = self._apply_box_norm(o1, o2, o3, o4, w, h)
-                        xmin = int(o1)
-                        ymin = int(o2)
-                        xmax = int(o3)
-                        ymax = int(o4)
+                        xmin, ymin, xmax, ymax = self._apply_box_norm(o1, o2, o3, o4, w, h)
                         bbs.append([xmin, ymin, xmax - xmin, ymax - ymin])
                         class_ids.append(c)
                         scores.append(float(score))

From ef70e0d9f3a8cb1e843be5ef6f896d38004a78e2 Mon Sep 17 00:00:00 2001
From: Zhao Wang
Date: Mon, 3 May 2021 15:25:11 -0700
Subject: [PATCH 18/21] refactor parse_detection_results

---
 .../vcap_utils/backends/base_tensorrt.py      | 44 +++++++------------
 1 file changed, 16 insertions(+), 28 deletions(-)

diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
index b91037e..4038119 100644
--- a/vcap_utils/vcap_utils/backends/base_tensorrt.py
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -13,6 +13,8 @@
     DetectionNode,
 )
 
+from vcap_utils import non_max_suppression
+
 
 class HostDeviceMem(object):
     def __init__(self, host_mem, device_mem):
@@ -222,11 +224,9 @@ def parse_detection_results(
             label_map: Dict[int, str],
             min_confidence: float = 0.0,
     ) -> List[DetectionNode]:
-        bbs = []
-        class_ids = []
-        scores = []
-        for c in label_map.keys():
-            x1_idx = c * 4 * self.grid_size
+        detection_nodes: List[DetectionNode] = []
+        for class_id, class_name in label_map.items():
+            x1_idx = class_id * 4 * self.grid_size
             y1_idx = x1_idx + self.grid_size
             x2_idx = y1_idx + self.grid_size
             y2_idx = x2_idx + self.grid_size
@@ -235,36 +235,24 @@ def parse_detection_results(
             boxes = results[0]
             for h in range(self.grid_h):
                 for w in range(self.grid_w):
                     i = w + h * self.grid_w
-                    score = results[1][c * self.grid_size + i]
+                    score = results[1][class_id * self.grid_size + i]
                     if score >= min_confidence:
                         o1 = boxes[x1_idx + w + h * self.grid_w]
                         o2 = boxes[y1_idx + w + h * self.grid_w]
                         o3 = boxes[x2_idx + w + h * self.grid_w]
                         o4 = boxes[y2_idx + w + h * self.grid_w]
                         xmin, ymin, xmax, ymax = self._apply_box_norm(o1, o2, o3, o4, w, h)
-                        bbs.append([xmin, ymin, xmax - xmin, ymax - ymin])
-                        class_ids.append(c)
-                        scores.append(float(score))
-        indexes = cv2.dnn.NMSBoxes(bbs, scores, min_confidence, 0.5)
-        detections = []
-        for idx in indexes:
-            idx = int(idx)
-            xmin, ymin, w, h = bbs[idx]
-            class_id = class_ids[idx]
-            class_name = label_map[class_id]
-            detections.append(
-                DetectionNode(
-                    name=class_name,
-                    coords=rect_to_coords(
-                        [xmin, ymin, (xmin + w), (ymin + h)]
-                    ),
-                    extra_data={"detection_confidence": scores[idx]},
-                )
-            )
-        resize.scale_and_offset_detection_nodes(detections)
-        return detections
+                        detection_nodes.append(DetectionNode(
+                            name=class_name,
+                            coords=rect_to_coords(
+                                [xmin, ymin, xmax, ymax]
+                            ),
+                            extra_data={"detection_confidence": score},
+                        ))
+        nodes = non_max_suppression(detection_nodes, max_bbox_overlap=0.5)
+        resize.scale_and_offset_detection_nodes(detection_nodes)
+        return detection_nodes
 
     def close(self) -> None:
         super().close()
         self.cuda_context.pop()
-

From a33112d3bd08415e3478047c75770fa62e73084a Mon Sep 17 00:00:00 2001
From: Zhao Wang
Date: Mon, 3 May 2021 17:51:48 -0700
Subject: [PATCH 19/21] fix import path, fix variable name, simplify code

---
 .../vcap_utils/backends/base_tensorrt.py      | 26 +++++--------------
 1 file changed, 6 insertions(+), 20 deletions(-)

diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
index 4038119..a3daf04 100644
--- a/vcap_utils/vcap_utils/backends/base_tensorrt.py
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -2,7 +2,6 @@ import numpy as np
 
 import pycuda.driver as cuda
 import tensorrt as trt
-import cv2
 
 from typing import Dict, List, Tuple, Optional, Generator
 
@@ -12,8 +11,7 @@
     rect_to_coords,
     DetectionNode,
 )
-
-from vcap_utils import non_max_suppression
+from vcap_utils.algorithms import non_max_suppression
 
 
 class HostDeviceMem(object):
@@ -167,12 +165,8 @@ def _do_inference(self, bindings: List[int],
             out_array_by_batch = np.split(entire_out_array, batch_size)
             out_lists = [out_array.tolist() for out_array in out_array_by_batch]
             batch_outputs.append(out_lists)
-        final_outputs = []
-        for i in range(len(batch_outputs[0])):
-            final_output = []
-            for batch_output in batch_outputs:
-                final_output.append(batch_output[i])
-            final_outputs.append(final_output)
+        final_outputs = list(zip(*batch_outputs))
+        final_outputs = [list(item) for item in final_outputs]
         self.cuda_context.pop()
         return final_outputs
 
@@ -183,16 +177,8 @@ def _prepare_post_process(self):
         self.grid_w = int(self.engine_width / stride)
         self.grid_size = self.grid_h * self.grid_w
 
-        self.grid_centers_w = []
-        self.grid_centers_h = []
-
-        for i in range(self.grid_h):
-            value = (i * stride + 0.5) / self.box_norm
-            self.grid_centers_h.append(value)
-
-        for i in range(self.grid_w):
-            value = (i * stride + 0.5) / self.box_norm
-            self.grid_centers_w.append(value)
+        self.grid_centers_h = [(i * stride + 0.5) / self.box_norm for i in range(self.grid_h)]
+        self.grid_centers_w = [(i * stride + 0.5) / self.box_norm for i in range(self.grid_w)]
 
@@ -249,7 +235,7 @@ def parse_detection_results(
                             extra_data={"detection_confidence": score},
                         ))
-        nodes = non_max_suppression(detection_nodes, max_bbox_overlap=0.5)
+        detection_nodes = non_max_suppression(detection_nodes, max_bbox_overlap=0.5)
         resize.scale_and_offset_detection_nodes(detection_nodes)
         return detection_nodes

From 8fe05983a25b2f0cb287785a7bccc34bdf154e6b Mon Sep 17 00:00:00 2001
From: Zhao Wang
Date: Mon, 3 May 2021 18:14:26 -0700
Subject: [PATCH 20/21] simplify code

---
 vcap_utils/vcap_utils/backends/base_tensorrt.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
index a3daf04..b66c83f 100644
--- a/vcap_utils/vcap_utils/backends/base_tensorrt.py
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -79,13 +79,12 @@ def batch_predict(self, input_data_list: List[np.ndarray]) \
     def _process_batch(self, input_data: List[np.array]) -> List[List[float]]:
         batch_size = len(input_data)
         prepared_buffer = self.buffers[batch_size]
-        inputs = prepared_buffer.inputs
         # todo: get dtype from engine
-        inputs[0].host = np.ascontiguousarray(input_data, dtype=np.float32)
+        prepared_buffer.inputs[0].host = np.ascontiguousarray(input_data, dtype=np.float32)
 
         return self._do_inference(
             bindings=prepared_buffer.bindings,
-            inputs=inputs,
+            inputs=prepared_buffer.inputs,
             outputs=prepared_buffer.outputs,
             stream=prepared_buffer.stream,
             batch_size=batch_size

From 1da54367b4aaa08385f0072e66e838114a9f353a Mon Sep 17 00:00:00 2001
From: Zhao Wang
Date: Wed, 5 May 2021 15:21:53 -0700
Subject: [PATCH 21/21] change log level to INFO, cast score to float

---
 vcap_utils/vcap_utils/backends/base_tensorrt.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vcap_utils/vcap_utils/backends/base_tensorrt.py b/vcap_utils/vcap_utils/backends/base_tensorrt.py
index b66c83f..596e8c0 100644
--- a/vcap_utils/vcap_utils/backends/base_tensorrt.py
+++ b/vcap_utils/vcap_utils/backends/base_tensorrt.py
@@ -41,7 +41,7 @@ def __init__(self, engine_bytes: bytes, width: int, height: int, device_name: st
         cuda.init()
         dev = cuda.Device(gpu_device_id)
         self.cuda_context = dev.make_context()
-        TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
+        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
         self.trt_runtime = trt.Runtime(TRT_LOGGER)
         # load the engine
         self.trt_engine = self.trt_runtime.deserialize_cuda_engine(engine_bytes)
@@ -232,7 +232,7 @@ def parse_detection_results(
                             coords=rect_to_coords(
                                 [xmin, ymin, xmax, ymax]
                             ),
-                            extra_data={"detection_confidence": score},
+                            extra_data={"detection_confidence": float(score)},
                         ))
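+        # suppress overlapping boxes before mapping coordinates back onto
+        # the original frame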
         detection_nodes = non_max_suppression(detection_nodes, max_bbox_overlap=0.5)
         resize.scale_and_offset_detection_nodes(detection_nodes)
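
Usage note (reviewer sketch, not part of the patch series): a concrete capsule
would subclass BaseTensorRTBackend and wire prepare_inputs, batch_predict, and
parse_detection_results together, roughly as below. The capsule class, label
map, input size, and option name are hypothetical placeholders, and a real
capsule would normally feed frames through vcap's batching machinery rather
than calling batch_predict directly.

    import numpy as np

    from vcap_utils.backends import BaseTensorRTBackend


    class PersonDetector(BaseTensorRTBackend):
        # hypothetical single-class detectnet-style engine
        LABEL_MAP = {0: "person"}

        def __init__(self, engine_bytes: bytes, device_name: str):
            # width/height must match the dimensions the engine was built with
            super().__init__(engine_bytes=engine_bytes,
                             width=960, height=544,
                             device_name=device_name)

        def process_frame(self, frame: np.ndarray, detection_node, options, state):
            # channels-first, [0, 1]-normalized input, as prepare_inputs supports
            prepared, resize = self.prepare_inputs(
                frame, transpose=True, normalize=True)
            # batch_predict is a generator yielding one result per input image
            results = next(self.batch_predict([prepared]))
            return self.parse_detection_results(
                results, resize, self.LABEL_MAP,
                min_confidence=options.get("threshold", 0.5))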