A speed comparison of Triton client access modes


Background

The Triton server is currently accessed through plain HTTP API requests.

For a recent sbert full model (which, in addition to the mean vector, now also returns a vector for every token, adding roughly 2,567,684 bytes per response), these requests became slow.

From the office network to the development network (2.5 MB/s), one request takes about 1.2 seconds.

Even over localhost it takes 0.51 seconds.

That is poor engineering performance.

(All average times in this article are averages over 100 consecutive runs.)
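Much of that baseline latency is payload size. A rough stdlib-only sketch (the 256 × 768 FP32 per-token output shape is taken from the reference code at the end of this article; the exact JSON overhead depends on how floats are formatted) of why a text-encoded HTTP response is so much heavier than the raw tensor bytes:

```python
import json
import random
import struct

# Assumed from the reference code below: the new per-token output is a
# [256, 768] FP32 tensor (the original mean vector is only [768] and is
# negligible by comparison).
TOKENS, DIM = 256, 768
values = [random.random() for _ in range(TOKENS * DIM)]

# Raw FP32 bytes, as sent over gRPC / binary HTTP / shared memory.
binary_size = len(struct.pack(f"{len(values)}f", *values))
# The same tensor as JSON text, as in a plain HTTP API response.
json_size = len(json.dumps(values).encode("utf-8"))

print(binary_size)  # 786432
print(json_size > binary_size)  # True: text encoding is several times larger
```

The megabytes of extra text per response are what a 2.5 MB/s office link chokes on, and they are exactly what the binary and shared-memory transports below avoid.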

Improvement

Use the Triton client to access the Triton server instead.

Installing the Triton client
pip install nvidia-pyindex
pip install tritonclient[all]
Speed of the Triton client in different modes

Sample code reference: github.com/triton-infe…

Below are several runnable scripts adapted from those samples for the sbert full model.

shm means passing the model's inputs and outputs through system shared memory.

grpc means passing the model's inputs and outputs over the gRPC protocol.

cudashm means passing the model's inputs and outputs through CUDA shared memory.

http_async means passing the model's inputs and outputs via asynchronous HTTP.
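The shm mode works because client and server run on the same host, so a named shared-memory region replaces the socket transfer of the payload entirely. A minimal stdlib sketch of the same idea (using Python's `multiprocessing.shared_memory` as a stand-in for what `tritonclient.utils.shared_memory` does; the region name and the 256-INT64 input shape mirror the reference code):

```python
import struct
from multiprocessing import shared_memory

# Create a named region, analogous to shm.create_shared_memory_region(...)
# in the reference script. 256 INT64 values = 256 * 8 bytes.
region = shared_memory.SharedMemory(name="demo_input0", create=True, size=256 * 8)
try:
    # "Send" the input: write 256 token ids directly into the region.
    token_ids = list(range(256))
    region.buf[:256 * 8] = struct.pack("256q", *token_ids)

    # A consumer (the server, in Triton's case) attaches to the region by
    # name and reads the same bytes -- no serialization, no socket transfer.
    reader = shared_memory.SharedMemory(name="demo_input0")
    roundtrip = struct.unpack("256q", bytes(reader.buf[:256 * 8]))
    reader.close()
    print(roundtrip[:3])  # (0, 1, 2)
finally:
    region.close()
    region.unlink()
```

In the real scripts, only the small region name and byte size travel over HTTP/gRPC; the tensors themselves never cross the socket.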

python3 ./common_sbert_full.py — HTTP mode

With shared memory: about 0.032 s.

Without shared memory: about 0.036 s.

python3 ./common_sbert_full.py -i grpc -u localhost:8001 — gRPC mode

With shared memory: about 0.032 s.

Without shared memory: about 0.035 s. (With deflate or gzip compression it is slower, about 0.04 s.)

cudashm_sbert_full.py accesses the full sbert model through CUDA shared memory: about 0.0315 s.

http_async_sbert_full.py: with concurrency 1 it is equivalent to the synchronous mode, about 0.035 s; with concurrency 50 (100 exhausts GPU memory), about 0.014 s.
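The async gain comes from keeping many requests in flight at once so the server can batch and pipeline them. The fan-out-then-collect pattern of `async_infer` / `get_result` can be sketched with the stdlib alone (the `fake_infer` function and its 10 ms latency are stand-ins for a real inference call):

```python
import time
from concurrent.futures import ThreadPoolExecutor

REQUEST_COUNT = 50  # same concurrency as http_async_sbert_full.py


def fake_infer(i):
    # Stand-in for one inference request; in the real script this is
    # triton_client.async_infer(...), and the server does the work.
    time.sleep(0.01)
    return i


# Submit everything first, then collect results, mirroring
# async_request.get_result() in the script below.
with ThreadPoolExecutor(max_workers=REQUEST_COUNT) as pool:
    start = time.perf_counter()
    futures = [pool.submit(fake_infer, i) for i in range(REQUEST_COUNT)]
    results = [f.result() for f in futures]
    elapsed = time.perf_counter() - start

print(len(results))  # 50
# 50 overlapping 10 ms calls finish in far less than the 0.5 s they
# would take sequentially.
```

With one request in flight the latencies add up; with 50 they overlap, which is why the measured per-request time drops from ~0.035 s to ~0.014 s.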

Conclusion

System shared memory and CUDA shared memory are the fastest (nearly identical): 0.032 s.

Triton-client HTTP and gRPC come next (nearly identical): 0.036 s.

Compared with the original 0.51 s HTTP request, Triton-client HTTP or gRPC is more than 13× faster.

Shared memory and CUDA shared memory are more than 15× faster.

Asynchronous HTTP with 50-way concurrency is more than 35× faster.
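The speedup figures follow directly from the measured averages above:

```python
baseline = 0.51  # original localhost HTTP request, seconds

# Average per-request times measured in this article
modes = {
    "triton client, http/grpc": 0.036,
    "triton client, shared memory / cudashm": 0.032,
    "triton client, async http x50": 0.014,
}

for name, t in modes.items():
    print(f"{name}: {baseline / t:.1f}x faster")
# triton client, http/grpc: 14.2x faster
# triton client, shared memory / cudashm: 15.9x faster
# triton client, async http x50: 36.4x faster
```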

Reference code

common_sbert_full.py

import argparse
import sys
import time
from builtins import range

import numpy as np

import tritonclient.grpc as grpcclient
import tritonclient.http as httpclient
import tritonclient.utils as utils
import tritonclient.utils.shared_memory as shm

FLAGS = None

INPUT_NUM = 3
OUTPUT_NUM = 2
LOOP_NUM = 100


def infer_and_validata(use_shared_memory, inputs_data):
    if use_shared_memory:
        byte_size = inputs_data[0].size * inputs_data[0].itemsize
        [inputs[i].set_shared_memory(f"input{i}_data", byte_size) for i in range(INPUT_NUM)]
        [outputs[i].set_shared_memory(f"output{i}_data", outputs_byte_size[i]) for i in range(OUTPUT_NUM)]
    else:
        [inputs[i].set_data_from_numpy(inputs_data[i]) for i in range(INPUT_NUM)]
        [outputs[i].unset_shared_memory() for i in range(OUTPUT_NUM)]

    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs)

    # Read the results (from shared memory when it is in use).
    for i in range(OUTPUT_NUM):
        output = results.get_output(f"OUTPUT__{i}")
        if output is not None:
            if use_shared_memory:
                if protocol == "grpc":
                    output_data = shm.get_contents_as_numpy(
                        shm_op_handles[i],
                        utils.triton_to_np_dtype(output.datatype),
                        output.shape)
                else:
                    output_data = shm.get_contents_as_numpy(
                        shm_op_handles[i],
                        utils.triton_to_np_dtype(output['datatype']),
                        output['shape'])
            else:
                output_data = results.as_numpy(f'OUTPUT__{i}')
        else:
            print(f"OUTPUT__{i} is missing in the response.")
            sys.exit(1)


# Tests whether the same InferInput and InferRequestedOutput objects can be
# successfully reused across inferences that do and do not use shared memory.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-v',
                        '--verbose',
                        action="store_true",
                        required=False,
                        default=False,
                        help='Enable verbose output')
    parser.add_argument('-i',
                        '--protocol',
                        type=str,
                        required=False,
                        default='HTTP',
                        help='Protocol (HTTP/gRPC) used to communicate with '
                             'the inference service. Default is HTTP.')
    parser.add_argument('-u',
                        '--url',
                        type=str,
                        required=False,
                        default='localhost:8000',
                        help='Inference server URL. Default is localhost:8000.')

    FLAGS = parser.parse_args()

    protocol = FLAGS.protocol.lower()

    try:
        if protocol == "grpc":
            # Create a gRPC client for communicating with the server
            triton_client = grpcclient.InferenceServerClient(
                url=FLAGS.url, verbose=FLAGS.verbose)
        else:
            # Create an HTTP client for communicating with the server
            triton_client = httpclient.InferenceServerClient(
                url=FLAGS.url, verbose=FLAGS.verbose)
    except Exception as e:
        print("client creation failed: " + str(e))
        sys.exit(1)

    # Make sure no shared-memory regions are still registered with the server.
    triton_client.unregister_system_shared_memory()
    triton_client.unregister_cuda_shared_memory()

    # The sbert full model takes 3 input tensors of shape [1, 256] (INT64)
    # and returns 2 output tensors: the mean vector ([1, 768]) and the
    # per-token vectors ([1, 256, 768]), both FP32.
    model_name = "shansou_sbert_full"
    model_version = "1"

    input_byte_size = 256 * 8                     # 256 INT64 values
    outputs_byte_size = [768 * 4, 256 * 768 * 4]  # FP32 outputs

    # Create OUTPUT__0 and OUTPUT__1 in shared memory and store the handles
    shm_op_handles = [shm.create_shared_memory_region(f"output{i}_data",
                                                      f"/output{i}_simple",
                                                      outputs_byte_size[i]) for i in range(OUTPUT_NUM)]
    # Register the output shared-memory regions with the Triton server
    [triton_client.register_system_shared_memory(f"output{i}_data",
                                                 f"/output{i}_simple",
                                                 outputs_byte_size[i]) for i in range(OUTPUT_NUM)]
    # Create the input regions in shared memory and store the handles
    shm_ip_handles = [shm.create_shared_memory_region(f"input{i}_data",
                                                      f"/input{i}_simple",
                                                      input_byte_size) for i in range(INPUT_NUM)]
    # Put the input data into shared memory
    # TODO: is this copy necessary?
    inputs_data = [np.full(shape=(1, 256), fill_value=value, dtype=np.int64) for value in (1, 1, 0)]
    [shm.set_shared_memory_region(shm_ip_handles[i], [inputs_data[i]]) for i in range(INPUT_NUM)]
    # Register the input shared-memory regions with the Triton server
    [triton_client.register_system_shared_memory(f"input{i}_data", f"/input{i}_simple",
                                                 input_byte_size) for i in range(INPUT_NUM)]

    # Build the InferInput/InferRequestedOutput objects for the chosen protocol
    infer_input_f = grpcclient.InferInput if protocol == "grpc" else httpclient.InferInput
    inputs = [infer_input_f(f'INPUT__{i}', [1, 256], "INT64") for i in range(INPUT_NUM)]
    infer_output_f = grpcclient.InferRequestedOutput if protocol == "grpc" else httpclient.InferRequestedOutput
    outputs = [infer_output_f(f'OUTPUT__{i}') for i in range(OUTPUT_NUM)]

    # Time LOOP_NUM inferences that use shared memory
    start = time.perf_counter()
    for _ in range(LOOP_NUM):
        infer_and_validata(True, inputs_data)
    end = time.perf_counter()
    print("infer (shared memory): ", end - start)

    # Time LOOP_NUM inferences that do not use shared memory
    start = time.perf_counter()
    for _ in range(LOOP_NUM):
        infer_and_validata(False, inputs_data)
    end = time.perf_counter()
    print("infer (no shared memory): ", end - start)

    triton_client.unregister_system_shared_memory()
    for handle in (shm_ip_handles + shm_op_handles):
        shm.destroy_shared_memory_region(handle)

http_async_sbert_full.py

import argparse
import sys
import time

import numpy as np

import tritonclient.http as httpclient

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-v',
                        '--verbose',
                        action="store_true",
                        required=False,
                        default=False,
                        help='Enable verbose output')
    parser.add_argument('-u',
                        '--url',
                        type=str,
                        required=False,
                        default='localhost:8000',
                        help='Inference server URL. Default is localhost:8000.')

    FLAGS = parser.parse_args()

    request_count = 50
    try:
        # The client concurrency must be large enough to issue all the
        # inference requests to the server in parallel.
        triton_client = httpclient.InferenceServerClient(
            url=FLAGS.url, verbose=FLAGS.verbose, concurrency=request_count)
    except Exception as e:
        print("context creation failed: " + str(e))
        sys.exit(1)

    model_name = 'shansou_sbert_full'

    # Prepare the inputs and the requested outputs
    INPUT_NUM = 3
    OUTPUT_NUM = 2
    inputs = [httpclient.InferInput(f'INPUT__{i}', [1, 256], "INT64") for i in range(INPUT_NUM)]
    inputs_data = [np.full(shape=(1, 256), fill_value=value, dtype=np.int64) for value in (1, 1, 0)]
    [inputs[i].set_data_from_numpy(inputs_data[i], binary_data=True) for i in range(INPUT_NUM)]
    outputs = [httpclient.InferRequestedOutput(f'OUTPUT__{i}', binary_data=True) for i in range(OUTPUT_NUM)]

    # Issue the requests asynchronously, then wait for all the responses
    start = time.perf_counter()
    for _ in range(2):
        async_requests = [triton_client.async_infer(model_name=model_name,
                                                    inputs=inputs,
                                                    outputs=outputs) for _ in range(request_count)]
        results = [async_request.get_result().get_response() for async_request in async_requests]
    end = time.perf_counter()
    print("async infer: ", end - start)