reference:
| Component | Version | Note |
|---|---|---|
| Kubernetes | 1.19.9 | GPU enabled |
| Kubeflow Pipeline | 1.7.0 | |
triton inference server
查看gpu驱动版本:
$ nvidia-smi
Mon Jul 25 04:14:27 2022
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.45.01 Driver Version: 455.45.01 CUDA Version: 11.1 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 Tesla M60 On | 00000000:03:00.0 Off | Off |
| N/A 41C P8 14W / 150W | 0MiB / 8129MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 1 Tesla M60 On | 00000000:0B:00.0 Off | Off |
| N/A 37C P8 13W / 150W | 0MiB / 8129MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
根据Nvidia optimized frameworks containers support matrix,使用triton镜像 nvcr.io/nvidia/tritonserver:20.10-py3.
安装python依赖包:
$ tee requirements.txt << EOF
kfp==1.7.0
kubernetes==19.15.0
EOF
$ pip3 install -r requirements.txt
使用pipeline python sdk 生成pipeline yaml: triton_ops.py
# triton_ops.py
#!/usr/bin/env python3
import kfp.dsl as dsl
from kubernetes import client as k8s_client
import yaml
__TRITON_CONTAINER_VERSION__ = 'nvcr.io/nvidia/tritonserver:20.10-py3'
__TRITON_POD_LABEL__ = 'triton-kubeflow'
__TRITON_SERVICE_MANIFEST___ = '''
apiVersion: v1
kind: Service
metadata:
name: {}
spec:
selector:
app: {}
ports:
- name: http
protocol: TCP
port: 8000
targetPort: 8000
nodePort: 30800
- name: grpc
port: 8001
targetPort: 8001
nodePort: 30801
- name: metrics
port: 8002
targetPort: 8002
nodePort: 30802
type: NodePort
'''.format(__TRITON_POD_LABEL__, __TRITON_POD_LABEL__)
class ObjectDict(dict):
    """Dict whose entries are also readable as attributes (d.key == d['key'])."""

    def __getattr__(self, name):
        # Python only calls __getattr__ after normal attribute lookup fails,
        # so this never shadows real dict attributes/methods.
        if name not in self:
            raise AttributeError("No such attribute: " + name)
        return self[name]
class TritonVolume(dsl.ResourceOp):
    """Pipeline step that creates a ReadWriteMany PersistentVolumeClaim.

    Uses ``action='apply'`` so a claim that already exists with the same
    name is left in place instead of failing the step.

    Args:
        name: Display name of the pipeline step.
        pv_name: Name of the PersistentVolumeClaim to create/apply.
    """

    def __init__(self, name, pv_name):
        # Removed the original trailing `name = name` — a no-op local
        # rebinding with no effect.
        super(TritonVolume, self).__init__(
            k8s_resource=k8s_client.V1PersistentVolumeClaim(
                api_version="v1", kind="PersistentVolumeClaim",
                metadata=k8s_client.V1ObjectMeta(name=pv_name),
                spec=k8s_client.V1PersistentVolumeClaimSpec(
                    access_modes=['ReadWriteMany'],
                    resources=k8s_client.V1ResourceRequirements(
                        requests={'storage': '2000Gi'}),
                    # NOTE(review): hard-coded storage class; assumes an
                    # "nfs-client" provisioner exists in the cluster — confirm.
                    storage_class_name="nfs-client")),
            action='apply',
            name=name
        )
class TritonDownload(dsl.ContainerOp):
    """Pipeline step that fetches the example Triton models into the PV.

    Clones the triton-inference-server/server repository, runs its
    ``fetch_models.sh`` helper, and copies the resulting model_repository
    contents into ``models`` (a path on the mounted results volume).

    Args:
        name: Display name of the pipeline step.
        models: Destination directory for the example models.
    """

    def __init__(self, name, models):
        # Removed the original trailing `name = name` — a no-op local
        # rebinding with no effect.
        cmd = ["/bin/bash", "-cx"]
        arguments = ["cd /tmp; git clone https://github.com/triton-inference-server/server.git; " \
            "cd server/docs/examples; ./fetch_models.sh; cd model_repository; cp -a . " + str(models)]
        super(TritonDownload, self).__init__(
            name=name,
            image=__TRITON_CONTAINER_VERSION__,
            command=cmd,
            arguments=arguments,
            file_outputs={}
        )
        # Label the pod so the Triton NodePort Service selector matches it.
        self.pod_labels['app'] = __TRITON_POD_LABEL__
class TritonDeploy(dsl.ContainerOp):
    """Pipeline step that launches the Triton inference server.

    Lists the mounted /data, /results and /checkpoints directories (for
    debugging in the step log) and then starts ``tritonserver`` against the
    given model store.

    Args:
        name: Display name of the pipeline step.
        models: Path of the model store passed to ``tritonserver``.
    """

    def __init__(self, name, models):
        # Removed the original trailing `name = name` — a no-op local
        # rebinding with no effect.
        cmd = ["/bin/bash", "-cx"]
        arguments = ["echo Deploying: " + str(
            models) + ";ls /data; ls /results; ls /checkpoints; tritonserver --model-store=" + models]
        super(TritonDeploy, self).__init__(
            name=name,
            image=__TRITON_CONTAINER_VERSION__,
            command=cmd,
            arguments=arguments,
            file_outputs={}
        )
        # Label the pod so the Triton NodePort Service selector matches it.
        self.pod_labels['app'] = __TRITON_POD_LABEL__
class TritonService(dsl.ResourceOp):
    """Pipeline step that creates the NodePort Service exposing Triton.

    Args:
        name: Display name of the pipeline step.
    """

    def __init__(self, name):
        super(TritonService, self).__init__(
            name=name,
            # safe_load replaces yaml.load(..., Loader=None), which triggers
            # PyYAML's unsafe-loader warning; the manifest is plain data, so
            # the safe loader is sufficient.
            k8s_resource=yaml.safe_load(__TRITON_SERVICE_MANIFEST___),
            action='create'
        )
triton.py
# triton.py
#!/usr/bin/env python3
'''
Kubeflow documentation: https://kubeflow-pipelines.readthedocs.io/en/latest/_modules/kfp/dsl/_container_op.html
K8S documentation: https://github.com/kubernetes-client/python/blob/02ef5be4ecead787961037b236ae498944040b43/kubernetes/docs/V1Container.md
Example Triton Inference Server Models: https://docs.nvidia.com/deeplearning/sdk/tensorrt-inference-server-master-branch-guide/docs/run.html#example-model-repository
Example Triton Inference Server Client: https://docs.nvidia.com/deeplearning/sdk/tensorrt-inference-server-master-branch-guide/docs/client_example.html#section-getting-the-client-examples
Bugs:
Cannot dynamically assign GPU counts: https://github.com/kubeflow/pipelines/issues/1956
# Manual run example:
nvidia-docker run --rm --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 -p8000:8000 -p8001:8001 -p8002:8002 -v/raid/shared/results/model_repository/:/model_repository nvcr.io/nvidia/tensorrtserver:20.02-py3 trtserver --model-repository=/model_repository
docker run -it --rm --net=host tensorrtserver_client /workspace/install/bin/image_client -m resnet50_netdef images/mug.jpg
'''
import triton_ops
import kfp.dsl as dsl
from kubernetes import client as k8s_client
@dsl.pipeline(
    name='tritonPipeline',
    description='Deploy a Triton server'
)
def triton_pipeline(skip_examples):
    """Define the Triton deployment pipeline.

    Creates three ReadWriteMany PVCs, optionally downloads the example
    models, creates the NodePort Service, and deploys the Triton server
    container with one GPU and its three ports exposed.

    Args:
        skip_examples: When equal to the empty string, the example models
            are downloaded; any other value skips the download step.
    """
    op_dict = {}

    # Hardcoded paths mounted in the Triton container.
    results_dir = "/results/"
    data_dir = "/data/"
    checkpoints_dir = "/checkpoints/"
    models = "/results/model_repository"

    # Default volume (PVC) names.
    pv_data = "triton-data"
    pv_results = "triton-results"
    pv_checkpoints = "triton-checkpoints"

    # Create the K8s PVCs.
    op_dict['triton_volume_results'] = triton_ops.TritonVolume('triton_volume_results', pv_results)
    op_dict['triton_volume_data'] = triton_ops.TritonVolume('triton_volume_data', pv_data)
    op_dict['triton_volume_checkpoints'] = triton_ops.TritonVolume('triton_volume_checkpoints', pv_checkpoints)

    # Download the example models only when skip_examples is empty.
    with dsl.Condition(skip_examples == '', name='skip-examples-download'):
        op_dict['triton_download'] = triton_ops.TritonDownload('triton_download', models)

    # Common operations.
    op_dict['triton_service'] = triton_ops.TritonService('triton_service')
    op_dict['triton_deploy'] = triton_ops.TritonDeploy('triton_deploy', models)

    # Use one GPU. The count cannot be assigned dynamically at run time:
    # https://github.com/kubeflow/pipelines/issues/1956
    op_dict['triton_deploy'].set_gpu_limit(1, vendor="nvidia")

    # Expose the Triton ports (keyword arguments used consistently; the
    # original mixed positional and keyword forms).
    op_dict['triton_deploy'].add_port(k8s_client.V1ContainerPort(container_port=8000, host_port=8000))  # HTTP
    op_dict['triton_deploy'].add_port(k8s_client.V1ContainerPort(container_port=8001, host_port=8001))  # gRPC
    op_dict['triton_deploy'].add_port(k8s_client.V1ContainerPort(container_port=8002, host_port=8002))  # Metrics

    # Set order so that volumes are created, then examples downloaded, then the service started.
    op_dict['triton_download'].after(op_dict['triton_volume_results'])
    op_dict['triton_download'].after(op_dict['triton_volume_data'])
    op_dict['triton_download'].after(op_dict['triton_volume_checkpoints'])
    op_dict['triton_deploy'].after(op_dict['triton_download'])

    # Mount the three volumes on every container op; the Service and the
    # PVC-creation steps need no mounts.
    for name, container_op in op_dict.items():
        if name == 'triton_service' or isinstance(container_op, triton_ops.TritonVolume):
            continue
        container_op.add_volume(
            k8s_client.V1Volume(persistent_volume_claim=k8s_client.V1PersistentVolumeClaimVolumeSource(
                claim_name=pv_results, read_only=False), name=pv_results))
        container_op.add_volume_mount(k8s_client.V1VolumeMount(
            mount_path=results_dir, name=pv_results, read_only=False))
        container_op.add_volume(
            k8s_client.V1Volume(persistent_volume_claim=k8s_client.V1PersistentVolumeClaimVolumeSource(
                claim_name=pv_data, read_only=False), name=pv_data))
        container_op.add_volume_mount(k8s_client.V1VolumeMount(
            mount_path=data_dir, name=pv_data, read_only=True))
        container_op.add_volume(
            k8s_client.V1Volume(persistent_volume_claim=k8s_client.V1PersistentVolumeClaimVolumeSource(
                claim_name=pv_checkpoints, read_only=False), name=pv_checkpoints))
        container_op.add_volume_mount(k8s_client.V1VolumeMount(
            mount_path=checkpoints_dir, name=pv_checkpoints, read_only=True))

    # TODO: add health probes via k8s V1Probe
    # (https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1Probe.md):
    #   livenessProbe:  httpGet /api/health/live  on the http port
    #   readinessProbe: httpGet /api/health/ready on the http port
    #                   (initialDelaySeconds: 5, periodSeconds: 5)
if __name__ == '__main__':
    # Compile the pipeline function into an uploadable Kubeflow Pipelines
    # package; produces "triton.py.tar.gz" next to this script.
    import kfp.compiler as compiler
    compiler.Compiler().compile(triton_pipeline, __file__ + '.tar.gz')
生成打包后的pipeline yaml文件:
python3 triton.py
# triton.py.tar.gz
上传打包后的文件到pipeline UI:
使用默认的Experiment创建一个Run:
查看pipeline运行状态:
检查Triton inference server 是否正确的运行:
$ kubectl get svc -n kubeflow | grep triton
triton-kubeflow NodePort 10.233.46.114 <none> 8000:30800/TCP,8001:30801/TCP,8002:30802/TCP 18m
# The HTTP request returns status 200 if Triton is ready and non-200 if it is not ready.
$ curl -v 192.168.1.146:30800/v2/health/ready
* Trying 192.168.1.146:30800...
* TCP_NODELAY set
* Connected to 192.168.1.146 (192.168.1.146) port 30800 (#0)
> GET /v2/health/ready HTTP/1.1
> Host: 192.168.1.146:30800
> User-Agent: curl/7.68.0
> Accept: */*
>
* Mark bundle as not supporting multiuse
< HTTP/1.1 200 OK
< Content-Length: 0
< Content-Type: text/plain
<
* Connection #0 to host 192.168.1.146 left intact
运行图像分类客户端示例:
# 使用与triton inference server对应的版本
$ docker pull nvcr.io/nvidia/tritonserver:20.10-py3-clientsdk
$ docker run -it --rm nvcr.io/nvidia/tritonserver:20.10-py3-clientsdk \
/workspace/install/bin/image_client \
-u 192.168.1.146:30800 \
-m densenet_onnx \
-c 3 -s INCEPTION /workspace/images/mug.jpg
# 输出类似的日志说明成功连接triton服务
Request 0, batch size 1
Image '/workspace/images/mug.jpg':
15.346228 (504) = COFFEE MUG
13.224324 (968) = CUP
10.422966 (505) = COFFEEPOT