Inference on Multiple streams #990

Open
williamhoole opened this issue Aug 12, 2024 · 0 comments
williamhoole commented Aug 12, 2024

Hi, I have done some benchmarking with the ./trtexec binary to measure the inference speed of a model and compare whether a larger batch size or multiple streams is better for inference. The results show that running a batch-size-1 model on 10 streams is faster than running a single model with batch size 10. I wanted to reproduce this with real images (because ./trtexec generates random inputs), but I don't get the same inference speed. What is the correct way to run multiple streams so that the models execute in parallel?

The multi-stream benchmark was run with ./trtexec --loadEngine=path/to/engine.engine --streams=10 --iterations=100
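
The batch-size-10 comparison would presumably be the same command run against a batch-10 engine on a single stream, along these lines (the engine filename here is only a placeholder):

./trtexec --loadEngine=path/to/engine_batch10.engine --iterations=100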

My Python code, which tries to create multiple execution contexts, one per stream:

import torch
import numpy as np
import time
import pycuda.driver as cuda
import pycuda.autoinit
import tensorrt as trt
from PIL import Image
from utils import make_query_image

def load_image(img_path):
    with open(img_path, 'rb') as f:
        img = Image.open(f).convert('RGB')
    return img

def preprocess_image(img_path, img_size=(256, 256)):
    img = load_image(img_path)
    resized_img = make_query_image(img, img_size)
    img_array = np.array(resized_img)
    img_tensor = torch.from_numpy(img_array)[None] / 255.0
    img_tensor = img_tensor.unsqueeze(0).to(device=device, dtype=torch.float32)
    return img_tensor

class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem, name):
        self.host = host_mem
        self.device = device_mem
        self.name = name

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()

class TRTInference:
    def __init__(self, engine_path, num_contexts, dtype=np.float32):
        self.engine_path = engine_path
        self.dtype = dtype
        self.logger = trt.Logger(trt.Logger.WARNING)
        self.runtime = trt.Runtime(self.logger)
        self.engine = self.load_engine(self.runtime, self.engine_path)
        # One execution context, CUDA stream, and buffer set per parallel slot.
        self.contexts = [self.engine.create_execution_context() for _ in range(num_contexts)]
        self.streams = [cuda.Stream() for _ in range(num_contexts)]
        self.buffers = [self.allocate_buffers() for _ in range(num_contexts)]

    @staticmethod
    def load_engine(trt_runtime, engine_path):
        trt.init_libnvinfer_plugins(None, "")
        with open(engine_path, 'rb') as f:
            engine_data = f.read()
        engine = trt_runtime.deserialize_cuda_engine(engine_data)
        return engine

    def allocate_buffers(self):
        inputs = []
        outputs = []
        bindings = []

        for i in range(self.engine.num_io_tensors):
            name = self.engine.get_tensor_name(i)
            size = trt.volume(self.engine.get_tensor_shape(name))
            host_mem = cuda.pagelocked_empty(size, self.dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)

            bindings.append(int(device_mem))

            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
                inputs.append(HostDeviceMem(host_mem, device_mem, name))
            else:
                outputs.append(HostDeviceMem(host_mem, device_mem, name))

        return inputs, outputs, bindings

    def infer(self, img0, img1, context_idx):
        context = self.contexts[context_idx]
        stream = self.streams[context_idx]
        inputs, outputs, bindings = self.buffers[context_idx]

        img0 = img0.cpu().numpy().astype(np.float32).ravel()
        img1 = img1.cpu().numpy().astype(np.float32).ravel()

        np.copyto(inputs[0].host, img0)
        np.copyto(inputs[1].host, img1)

        # Asynchronously copy the inputs to the device on this context's stream.
        for inp in inputs:
            cuda.memcpy_htod_async(inp.device, inp.host, stream)

        # Launch inference on the same stream.
        context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)

        # Copy the outputs back to pinned host memory.
        for out in outputs:
            cuda.memcpy_dtoh_async(out.host, out.device, stream)

        # Block until this stream has finished so the host buffers are valid.
        stream.synchronize()

        output1 = outputs[0].host.reshape((1, 1024, 1024))
        output2 = outputs[1].host.reshape((1, 1024, 1024))

        return output1, output2

# Path to the TensorRT engine
trt_engine_path = "Feature_Matching_Model.engine"

img0_path = "path/to/img0.png"
image1_paths = [
    "path/to/img1.png",
    "path/to/img2.png",
    "path/to/img3.png",
    "path/to/img4.png",
    "path/to/img5.png",
]

# Number of parallel execution contexts
num_contexts = 10

# Load and preprocess images
device = 'cuda' if torch.cuda.is_available() else 'cpu'
img0 = preprocess_image(img0_path)

# Preprocess all target images
preprocessed_images = [preprocess_image(img_path) for img_path in image1_paths]

# Instantiate the TRT model with multiple contexts
trt_model = TRTInference(trt_engine_path, num_contexts)

# Warm-up loop to discard the initial inference overhead
warmup_iterations = 100
print("Warming up...")

for _ in range(warmup_iterations):
    for idx in range(len(preprocessed_images)):
        trt_model.infer(img0, preprocessed_images[idx], context_idx=idx % num_contexts)

print("Warm-up completed. Starting timed inferences...")
start_time = time.time()

# Run inference in parallel without ThreadPoolExecutor
results = []
for idx in range(len(preprocessed_images)):
    output1, output2 = trt_model.infer(img0, preprocessed_images[idx], context_idx=idx % num_contexts)
    results.append((output1, output2))

infer_time = time.time() - start_time
print("Inference time:", infer_time)
print("Results shape:", np.shape(results))