I originally assumed kubespray could repair this automatically, but after trying it, it clearly could not. Since etcd itself appeared to be healthy, simply recreating the static pods was enough.
The symptoms: every other pod is fine, but the kube-* control-plane pods fail their health checks. The output below is from kgp (presumably an alias for kubectl get pods -A -o wide):
root@k8s-ctrl:~/kubespray1-31# kgp
NAMESPACE NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
ingress-nginx ingress-nginx-controller-ntxl8 1/1 Running 0 29m 172.16.189.100 k8s-ctrl <none> <none>
ingress-nginx ingress-nginx-controller-q8rv6 1/1 Running 0 29m 172.16.189.101 k8s-work1 <none> <none>
ingress-nginx ingress-nginx-controller-xs4h7 1/1 Running 0 29m 172.16.189.102 k8s-work2 <none> <none>
kube-system calico-kube-controllers-55d498b656-qm764 0/1 Running 0 29m 10.199.84.136 k8s-work2 <none> <none>
kube-system calico-node-cqvxd 1/1 Running 0 29m 172.16.189.101 k8s-work1 <none> <none>
kube-system calico-node-ttj6t 1/1 Running 0 29m 172.16.189.102 k8s-work2 <none> <none>
kube-system calico-node-wctnp 1/1 Running 0 29m 172.16.189.100 k8s-ctrl <none> <none>
kube-system coredns-69d6675447-8g6c6 1/1 Running 0 29m 10.199.182.204 k8s-work1 <none> <none>
kube-system coredns-69d6675447-bwblj 1/1 Running 0 29m 10.199.89.131 k8s-ctrl <none> <none>
kube-system dns-autoscaler-5cb4578f5f-jqbjf 1/1 Running 0 29m 10.199.182.202 k8s-work1 <none> <none>
kube-system kube-apiserver-k8s-ctrl 1/1 Running 1 29m 172.16.189.100 k8s-ctrl <none> <none>
kube-system kube-apiserver-k8s-work1 0/1 Running 1 29m 172.16.189.101 k8s-work1 <none> <none>
kube-system kube-apiserver-k8s-work2 0/1 Running 1 29m 172.16.189.102 k8s-work2 <none> <none>
kube-system kube-controller-manager-k8s-ctrl 1/1 Running 2 29m 172.16.189.100 k8s-ctrl <none> <none>
kube-system kube-controller-manager-k8s-work1 0/1 Running 3 29m 172.16.189.101 k8s-work1 <none> <none>
kube-system kube-controller-manager-k8s-work2 0/1 Running 2 29m 172.16.189.102 k8s-work2 <none> <none>
kube-system kube-proxy-6pvvl 1/1 Running 0 29m 172.16.189.100 k8s-ctrl <none> <none>
kube-system kube-proxy-cp9qj 1/1 Running 0 29m 172.16.189.102 k8s-work2 <none> <none>
kube-system kube-proxy-xxkqr 1/1 Running 0 29m 172.16.189.101 k8s-work1 <none> <none>
kube-system kube-scheduler-k8s-ctrl 1/1 Running 1 29m 172.16.189.100 k8s-ctrl <none> <none>
kube-system kube-scheduler-k8s-work1 0/1 Running 1 29m 172.16.189.101 k8s-work1 <none> <none>
kube-system kube-scheduler-k8s-work2 0/1 Running 1 29m 172.16.189.102 k8s-work2 <none> <none>
kube-system metrics-server-6c8bff4c-pzhzg 0/1 Running 0 29m 10.199.84.137 k8s-work2 <none> <none>
kube-system nodelocaldns-9wllx 1/1 Running 0 29m 172.16.189.101 k8s-work1 <none> <none>
kube-system nodelocaldns-b79zv 1/1 Running 0 29m 172.16.189.100 k8s-ctrl <none> <none>
kube-system nodelocaldns-ldq85 1/1 Running 0 29m 172.16.189.102 k8s-work2 <none> <none>
kube-system registry-n9tw8 1/1 Running 0 29m 10.199.182.203 k8s-work1 <none> <none>
root@k8s-ctrl:~/kubespray1-31#
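Before rebuilding anything, it is worth seeing what the failing health check actually reports. A minimal sketch, using one of the pods from the output above (the exact pod name and the grep pattern are only illustrative):

kubectl -n kube-system describe pod kube-apiserver-k8s-work1 | grep -A 10 'Conditions\|Events'
# On the affected node itself, inspect the container directly:
crictl ps -a | grep kube-apiserver
crictl logs --tail 50 <container-id>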
Kubespray's own commands (tried first, without success):
ansible-playbook -i inventory/mycluster/hosts.yaml cluster.yml --tags etcd
ansible-playbook -i inventory/mycluster/hosts.yaml cluster.yml --limit k8s-ctrl,k8s-work1,k8s-work2
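Recreating only the static pods is safe only if etcd really is healthy, so that is worth double-checking on a control-plane node first. A sketch assuming kubespray's usual certificate layout under /etc/ssl/etcd/ssl/ (the file names include the node name, so adjust them to your host):

etcdctl --endpoints=https://127.0.0.1:2379 \
  --cacert=/etc/ssl/etcd/ssl/ca.pem \
  --cert=/etc/ssl/etcd/ssl/admin-k8s-ctrl.pem \
  --key=/etc/ssl/etcd/ssl/admin-k8s-ctrl-key.pem \
  endpoint health --cluster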
Solution
Force-remove all kube-* containers on each affected node; kubelet will recreate the control-plane containers from their static pod manifests:
crictl ps -a | grep kube- | awk '{print $1}' | xargs -r crictl rm -f
Restart containerd and kubelet on every node (run from the ansible control host):
ansible all -m shell -a "systemctl restart containerd; systemctl restart kubelet"
Finally, delete all pods so their controllers recreate them cleanly:
kubectl delete po --all --all-namespaces
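To confirm the control plane is healthy again, a quick verification (not part of the fix itself):

kubectl -n kube-system get pods -o wide | grep -E 'kube-(apiserver|controller-manager|scheduler)'
kubectl get nodes

All of the control-plane pods should report 1/1 READY and every node should be Ready.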