Background
Access to the Triton server currently goes through plain HTTP API requests (a sketch of such a baseline request is given at the end of this section).
With a recent sbert full model (which, in addition to the mean vector, now also returns a vector for every character, adding roughly 2,567,684 bytes to each response), these network requests became very slow.
From the office network to the development network (about 2.5 MB/s), a single request takes about 1.2 seconds.
Even on localhost it still takes about 0.51 seconds.
This is poor performance from an engineering standpoint.
(All average times in this article are averages over 100 consecutive runs.)
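As a reference point, a minimal sketch of such a baseline request against Triton's standard HTTP/REST (KServe v2) infer endpoint is shown below. The model and tensor names follow the scripts later in this article; the exact request issued by the original service may have differed.

import numpy as np
import requests

# Baseline: one plain HTTP POST per inference against Triton's v2 REST endpoint.
# Model/tensor names are taken from the scripts below; adjust them to your deployment.
url = "http://localhost:8000/v2/models/shansou_sbert_full/infer"
payload = {
    "inputs": [
        {
            "name": f"INPUT__{i}",
            "shape": [1, 256],
            "datatype": "INT64",
            "data": np.full((1, 256), fill_value=v, dtype=np.int64).flatten().tolist(),
        }
        for i, v in enumerate((1, 1, 0))
    ],
    "outputs": [{"name": "OUTPUT__0"}, {"name": "OUTPUT__1"}],
}
result = requests.post(url, json=payload).json()
# Without binary extensions, both outputs come back as plain JSON arrays,
# so the 256 x 768 per-character matrix dominates the response size.

Returning the per-character matrix as JSON text is consistent with the roughly 2.5 MB of extra bytes per response noted above.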
Improvement
Use the Triton client to access the Triton server.
Installing the Triton client
pip install nvidia-pyindex
pip install tritonclient[all]
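After installation, a quick sanity check (assuming a Triton server is already running locally on the default HTTP port 8000) confirms that the client can reach the server and that the model is loaded:

import tritonclient.http as httpclient

# Connect to the local Triton server over HTTP
client = httpclient.InferenceServerClient(url="localhost:8000")
print(client.is_server_live())                      # True if the server is reachable
print(client.is_model_ready("shansou_sbert_full"))  # True once the model is loaded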
Runtime of the Triton client in different modes
Sample code reference: github.com/triton-infe…
Below are several runnable scripts adapted from those samples for the sbert full model.
shm means passing the model inputs and outputs through system shared memory.
grpc means passing the model inputs and outputs over the gRPC protocol.
cudashm means passing the model inputs and outputs through CUDA (GPU) shared memory.
http_async means passing the model inputs and outputs via asynchronous HTTP.
python3 ./common_sbert_full.py (HTTP mode)
With shared memory: about 0.032 seconds.
Without shared memory: about 0.036 seconds.
python3 ./common_sbert_full.py -i grpc -u localhost:8001 (gRPC mode)
With shared memory: about 0.032 seconds.
Without shared memory: about 0.035 seconds (with deflate or gzip compression it is slower, about 0.04 seconds; see the compression sketch after this list).
cudashm_sbert_full.py accesses the full sbert model through CUDA shared memory: about 0.0315 seconds (the key difference from common_sbert_full.py is sketched in the reference code section below).
http_async_sbert_full.py: with concurrency 1 it is equivalent to the synchronous mode, about 0.035 seconds; with concurrency 50 (concurrency 100 runs out of GPU memory), about 0.014 seconds.
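For reference, the compression figure above comes from asking the gRPC client to compress the payload. With recent tritonclient versions this is roughly the following call, reusing the triton_client, inputs, and outputs objects from common_sbert_full.py below:

# gRPC request with on-the-wire compression ("deflate" or "gzip");
# in the measurement above this was slightly slower (~0.04 s) than no compression.
results = triton_client.infer(model_name=model_name,
                              inputs=inputs,
                              outputs=outputs,
                              compression_algorithm="gzip")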
Conclusion
System shared memory and CUDA shared memory are the fastest (and almost identical), at about 0.032 seconds.
Plain HTTP and gRPC come next (also almost identical), at about 0.036 seconds.
Compared with the original HTTP request at 0.51 seconds, the HTTP or gRPC client is more than 13x faster.
With system shared memory or CUDA shared memory, it is more than 15x faster.
With asynchronous HTTP at 50 concurrent requests, it is more than 35x faster.
Reference code
common_sbert_full.py
import argparse
import sys
import time

import numpy as np

import tritonclient.grpc as grpcclient
import tritonclient.http as httpclient
import tritonclient.utils as utils
import tritonclient.utils.shared_memory as shm

FLAGS = None

INPUT_NUM = 3
OUTPUT_NUM = 2
LOOP_NUM = 100
def infer_and_validata(use_shared_memory, inputs_data):
    if use_shared_memory:
        byte_size = inputs_data[0].size * inputs_data[0].itemsize
        [inputs[i].set_shared_memory(f"input{i}_data", byte_size) for i in range(INPUT_NUM)]
        [outputs[i].set_shared_memory(f"output{i}_data", outputs_byte_size[i]) for i in range(OUTPUT_NUM)]
    else:
        [inputs[i].set_data_from_numpy(inputs_data[i]) for i in range(INPUT_NUM)]
        [outputs[i].unset_shared_memory() for i in range(OUTPUT_NUM)]

    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs)

    # Read the results, either from shared memory or directly from the response.
    for i in range(OUTPUT_NUM):
        output = results.get_output(f"OUTPUT__{i}")
        if output is not None:
            if use_shared_memory:
                if protocol == "grpc":
                    output_data = shm.get_contents_as_numpy(
                        shm_op_handles[i], utils.triton_to_np_dtype(output.datatype),
                        output.shape)
                else:
                    output_data = shm.get_contents_as_numpy(
                        shm_op_handles[i],
                        utils.triton_to_np_dtype(output['datatype']),
                        output['shape'])
            else:
                output_data = results.as_numpy(f'OUTPUT__{i}')
        else:
            print(f"OUTPUT__{i} is missing in the response.")
            sys.exit(1)
# Tests whether the same InferInput and InferRequestedOutput objects can be
# successfully used repeatedly for different inferences using/not-using
# shared memory.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-v',
                        '--verbose',
                        action="store_true",
                        required=False,
                        default=False,
                        help='Enable verbose output')
    parser.add_argument('-i',
                        '--protocol',
                        type=str,
                        required=False,
                        default='HTTP',
                        help='Protocol (HTTP/gRPC) used to communicate with ' +
                        'the inference service. Default is HTTP.')
    parser.add_argument('-u',
                        '--url',
                        type=str,
                        required=False,
                        default='localhost:8000',
                        help='Inference server URL. Default is localhost:8000.')

    FLAGS = parser.parse_args()
    protocol = FLAGS.protocol.lower()

    try:
        if protocol == "grpc":
            # Create a gRPC client for communicating with the server
            triton_client = grpcclient.InferenceServerClient(
                url=FLAGS.url, verbose=FLAGS.verbose)
        else:
            # Create an HTTP client for communicating with the server
            triton_client = httpclient.InferenceServerClient(
                url=FLAGS.url, verbose=FLAGS.verbose)
    except Exception as e:
        print("client creation failed: " + str(e))
        sys.exit(1)
    # Make sure no shared memory regions are still registered with the server.
    triton_client.unregister_system_shared_memory()
    triton_client.unregister_cuda_shared_memory()

    # The sbert full model takes 3 INT64 input tensors of shape [1, 256] and
    # returns 2 output tensors: the mean vector ([1, 768]) and the per-character
    # vectors ([1, 256, 768]), both as 4-byte floats.
    model_name = "shansou_sbert_full"
    model_version = "1"

    # Byte sizes of the shared memory regions (INT64 = 8 bytes, FP32 = 4 bytes).
    input_byte_size = 256 * 8
    outputs_byte_size = [768 * 4, 256 * 768 * 4]
    # Create the output regions in system shared memory and store the handles
    shm_op_handles = [shm.create_shared_memory_region(f"output{i}_data",
                                                      f"/output{i}_simple",
                                                      outputs_byte_size[i]) for i in range(OUTPUT_NUM)]
    # Register the output shared memory regions with the Triton server
    [triton_client.register_system_shared_memory(f"output{i}_data",
                                                 f"/output{i}_simple",
                                                 outputs_byte_size[i]) for i in range(OUTPUT_NUM)]

    # Create the input regions in system shared memory and store the handles
    shm_ip_handles = [shm.create_shared_memory_region(f"input{i}_data",
                                                      f"/input{i}_simple",
                                                      input_byte_size) for i in range(INPUT_NUM)]

    # Put the input data values into shared memory
    # TODO: is this step necessary?
    inputs_data = [np.full(shape=(1, 256), fill_value=value, dtype=np.int64) for value in (1, 1, 0)]
    [shm.set_shared_memory_region(shm_ip_handles[i], [inputs_data[i]]) for i in range(INPUT_NUM)]

    # Register the input shared memory regions with the Triton server
    [triton_client.register_system_shared_memory(f"input{i}_data", f"/input{i}_simple",
                                                 input_byte_size) for i in range(INPUT_NUM)]

    # Build the InferInput/InferRequestedOutput objects for the chosen protocol
    infer_input_f = grpcclient.InferInput if protocol == "grpc" else httpclient.InferInput
    inputs = [infer_input_f(f'INPUT__{i}', [1, 256], "INT64") for i in range(INPUT_NUM)]
    infer_output_f = grpcclient.InferRequestedOutput if protocol == "grpc" else httpclient.InferRequestedOutput
    outputs = [infer_output_f(f'OUTPUT__{i}') for i in range(OUTPUT_NUM)]
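    # Each timing loop below runs LOOP_NUM (= 100) inferences; the per-request
    # averages quoted earlier in this article are (end - start) / LOOP_NUM.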
    # Loop using shared memory
    start = time.perf_counter()
    for _ in range(LOOP_NUM):
        infer_and_validata(True, inputs_data)
    end = time.perf_counter()
    print("infer (shared memory): ", end - start)

    # Loop without shared memory
    start = time.perf_counter()
    for _ in range(LOOP_NUM):
        infer_and_validata(False, inputs_data)
    end = time.perf_counter()
    print("infer (no shared memory): ", end - start)

    # Clean up: unregister and destroy all shared memory regions
    triton_client.unregister_system_shared_memory()
    for handler in (shm_ip_handles + shm_op_handles):
        shm.destroy_shared_memory_region(handler)
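cudashm_sbert_full.py is not reproduced in full here. Relative to common_sbert_full.py above, the main change is that the regions are allocated in GPU memory and registered as CUDA shared memory through tritonclient.utils.cuda_shared_memory. A rough sketch, reusing triton_client and outputs_byte_size from the script above and assuming GPU device 0:

import tritonclient.utils.cuda_shared_memory as cudashm

# Allocate the first output region in GPU memory (device 0) instead of /dev/shm
shm_op0_handle = cudashm.create_shared_memory_region("output0_data", outputs_byte_size[0], 0)
# Register it with the server as a CUDA shared memory region
triton_client.register_cuda_shared_memory(
    "output0_data", cudashm.get_raw_handle(shm_op0_handle), 0, outputs_byte_size[0])
# Inputs are written with cudashm.set_shared_memory_region(handle, [array]),
# outputs are read back with cudashm.get_contents_as_numpy(handle, dtype, shape),
# and cleanup uses triton_client.unregister_cuda_shared_memory() plus
# cudashm.destroy_shared_memory_region(handle).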
http_async_sbert_full.py
import argparse
import sys
import time

import numpy as np

import tritonclient.http as httpclient
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-v',
                        '--verbose',
                        action="store_true",
                        required=False,
                        default=False,
                        help='Enable verbose output')
    parser.add_argument('-u',
                        '--url',
                        type=str,
                        required=False,
                        default='localhost:8000',
                        help='Inference server URL. Default is localhost:8000.')
    FLAGS = parser.parse_args()

    request_count = 50
    try:
        # Need to specify large enough concurrency to issue all the
        # inference requests to the server in parallel.
        triton_client = httpclient.InferenceServerClient(
            url=FLAGS.url, verbose=FLAGS.verbose, concurrency=request_count)
    except Exception as e:
        print("context creation failed: " + str(e))
        sys.exit()

    model_name = 'shansou_sbert_full'

    # prepare
    INPUT_NUM = 3
    OUTPUT_NUM = 2
    inputs = [httpclient.InferInput(f'INPUT__{i}', [1, 256], "INT64") for i in range(INPUT_NUM)]
    inputs_data = [np.full(shape=(1, 256), fill_value=value, dtype=np.int64) for value in (1, 1, 0)]
    [inputs[i].set_data_from_numpy(inputs_data[i], binary_data=True) for i in range(INPUT_NUM)]
    outputs = [httpclient.InferRequestedOutput(f'OUTPUT__{i}', binary_data=True) for i in range(OUTPUT_NUM)]

    # infer
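    # Two batches of request_count (= 50) concurrent requests, i.e. 100 inferences
    # in total; the per-request average quoted earlier is (end - start) / 100.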
    start = time.perf_counter()
    for _ in range(2):
        async_requests = [triton_client.async_infer(model_name=model_name, inputs=inputs, outputs=outputs)
                          for i in range(request_count)]
        results = [async_request.get_result().get_response() for async_request in async_requests]
    end = time.perf_counter()
    print("async infer: ", end - start)