"docker19.03版本以下需要安装nvidia-docker"
"NVIDIA 驱动版本 > 361.93"
Install the NVIDIA Container Runtime
curl -s -L https://nvidia.github.io/nvidia-container-runtime/gpgkey | \
sudo apt-key add -
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
curl -s -L https://nvidia.github.io/nvidia-container-runtime/$distribution/nvidia-container-runtime.list | \
sudo tee /etc/apt/sources.list.d/nvidia-container-runtime.list
sudo apt-get update
sudo apt-get install -y nvidia-container-runtime
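A quick check that the package installed and the runtime binary is on the PATH:
dpkg -l | grep nvidia-container-runtime
which nvidia-container-runtime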
Set Docker's default runtime to nvidia-container-runtime in /etc/docker/daemon.json:
{
    "default-runtime": "nvidia",
    "runtimes": {
        "nvidia": {
            "path": "nvidia-container-runtime",
            "runtimeArgs": []
        }
    }
}
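Restart Docker so the new default runtime takes effect, then confirm it is picked up; the CUDA image tag in the smoke test is only an example:
sudo systemctl restart docker
docker info | grep -i runtime
# Optional smoke test (the image tag is an example; any CUDA base image will do)
docker run --rm nvidia/cuda:10.0-base nvidia-smi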
Deploy the NVIDIA device plugin from the master
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
spec:
  template:
    metadata:
      # Mark this pod as a critical add-on; when enabled, the critical add-on scheduler
      # reserves resources for critical add-on pods so that they can be rescheduled after
      # a failure. This annotation works in tandem with the toleration below.
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ""
      labels:
        name: nvidia-device-plugin-ds
    spec:
      tolerations:
      # Allow this pod to be rescheduled while the node is in "critical add-ons only" mode.
      # This, along with the annotation above marks this pod as a critical add-on.
      - key: CriticalAddonsOnly
        operator: Exists
      containers:
      - image: nvidia/k8s-device-plugin:1.11
        name: nvidia-device-plugin-ctr
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            drop: ["ALL"]
        volumeMounts:
          - name: device-plugin
            mountPath: /var/lib/kubelet/device-plugins
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
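Save the manifest (for example as nvidia-device-plugin.yml; the filename is arbitrary), apply it, and confirm the plugin pods are running on the GPU nodes:
kubectl create -f nvidia-device-plugin.yml
kubectl get pods -n kube-system -o wide | grep nvidia-device-plugin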
Verify that the node now advertises the GPUs:
kubectl describe node pro-spider-router-223-2
Capacity:
cpu: 64
ephemeral-storage: 2299763876Ki
hugepages-1Gi: 0
hugepages-2Mi: 0
memory: 263770796Ki
nvidia.com/gpu: 2
pods: 110
Allocatable:
cpu: 64
ephemeral-storage: 2119462384613
hugepages-1Gi: 0
hugepages-2Mi: 0
memory: 263668396Ki
nvidia.com/gpu: 2
pods: 110
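With nvidia.com/gpu advertised, a pod requests GPUs through resource limits. A minimal sketch (the pod name and CUDA image tag are illustrative, not from the original setup):
apiVersion: v1
kind: Pod
metadata:
  name: gpu-test                      # example name
spec:
  restartPolicy: Never
  containers:
  - name: cuda
    image: nvidia/cuda:10.0-base      # example image tag
    command: ["nvidia-smi"]
    resources:
      limits:
        nvidia.com/gpu: 1             # request one GPU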
Ubuntu NFS server package: nfs-kernel-server
Ubuntu NFS client package: nfs-common
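A minimal setup sketch; the export path /data/nfs, subnet 192.168.0.0/24, and server address 192.168.0.10 are placeholders:
# Server side
sudo apt-get install -y nfs-kernel-server
sudo mkdir -p /data/nfs
echo "/data/nfs 192.168.0.0/24(rw,sync,no_subtree_check)" | sudo tee -a /etc/exports
sudo exportfs -ra
# Client side
sudo apt-get install -y nfs-common
sudo mount -t nfs 192.168.0.10:/data/nfs /mnt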
Zabbix monitoring items (UserParameter entries for the Zabbix agent configuration):
UserParameter=gpu0.mem.used,nvidia-smi -q -g 0 2>&1| grep -A 3 "FB Memory Usage"|grep -i "used"|awk '{print $3/1024}'
UserParameter=gpu1.mem.used,nvidia-smi -q -g 1 2>&1| grep -A 3 "FB Memory Usage"|grep -i "used"|awk '{print $3/1024}'
UserParameter=gpu2.mem.used,nvidia-smi -q -g 2 2>&1| grep -A 3 "FB Memory Usage"|grep -i "used"|awk '{print $3/1024}'
UserParameter=gpu0.gpu.used,nvidia-smi -q -g 0 2>&1| grep -w "Gpu"|awk '{print $3}'
UserParameter=gpu1.gpu.used,nvidia-smi -q -g 1 2>&1| grep -w "Gpu"|awk '{print $3}'
UserParameter=gpu2.gpu.used,nvidia-smi -q -g 2 2>&1| grep -w "Gpu"|awk '{print $3}'
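After adding the UserParameter entries, restart the agent and test a key from the Zabbix server; the service name and agent address below are typical defaults and may differ:
sudo systemctl restart zabbix-agent
zabbix_get -s 192.168.0.10 -k gpu0.mem.used   # example agent address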