Fault symptoms
[root@master01 ~]# kubectl get pod -A -o wide
NAMESPACE NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
default test-hostpath 2/2 Running 17 (119m ago) 22d 10.244.166.129 node1 <none> <none>
kube-system calico-kube-controllers-558d465845-wckg4 0/1 CrashLoopBackOff 36 (2m4s ago) 3h53m 10.244.104.2 node2 <none> <none>
kube-system calico-node-frpb9 0/1 Running 1 (118m ago) 3h36m 192.168.95.12 node2 <none> <none>
kube-system calico-node-l5j54 0/1 CrashLoopBackOff 39 (70s ago) 3h36m 192.168.95.11 node1 <none> <none>
kube-system calico-node-tqvb5 0/1 CrashLoopBackOff 34 (69s ago) 3h36m 192.168.95.10 master01 <none> <none>
kube-system calico-typha-5b56944f9b-h5cps 1/1 Running 1 (118m ago) 3h53m 192.168.95.12 node2 <none> <none>
kube-system coredns-66f779496c-4qzzf 1/1 Running 4 (119m ago) 22d 10.244.241.65 master01 <none> <none>
kube-system coredns-66f779496c-gbqqj 0/1 Running 2 (119m ago) 5h13m 10.244.166.130 node1 <none> <none>
kube-system etcd-master01 1/1 Running 17 (119m ago) 23d 192.168.95.10 master01 <none> <none>
kube-system kube-apiserver-master01 1/1 Running 31 (119m ago) 23d 192.168.95.10 master01 <none> <none>
kube-system kube-controller-manager-master01 1/1 Running 20 (119m ago) 23d 192.168.95.10 master01 <none> <none>
kube-system kube-proxy-4qpbt 1/1 Running 14 (118m ago) 23d 192.168.95.12 node2 <none> <none>
kube-system kube-proxy-9mh2v 1/1 Running 17 (119m ago) 23d 192.168.95.10 master01 <none> <none>
kube-system kube-proxy-z2xfp 1/1 Running 15 (119m ago) 23d 192.168.95.11 node1 <none> <none>
kube-system kube-scheduler-master01 1/1 Running 21 (119m ago) 23d 192.168.95.10 master01 <none> <none>
[root@master01 ~]# kubectl logs calico-node-l5j54 -n kube-system
Defaulted container "calico-node" out of: calico-node, upgrade-ipam (init), install-cni (init), mount-bpffs (init)
Error from server: Get "https://192.168.95.11:10250/containerLogs/kube-system/calico-node-l5j54/calico-node": dial tcp 192.168.95.11:10250: connect: no route to host
[root@master01 ~]# kubectl logs coredns-66f779496c-gbqqj -n kube-system
Error from server: Get "https://192.168.95.11:10250/containerLogs/kube-system/coredns-66f779496c-gbqqj/coredns": dial tcp 192.168.95.11:10250: connect: no route to host
[root@master01 ~]# telnet 192.168.95.11 10250
Trying 192.168.95.11...
telnet: connect to address 192.168.95.11: No route to host
[root@master01 ~]# telnet 192.168.95.12 10250
Trying 192.168.95.12...
telnet: connect to address 192.168.95.12: No route to host
This shows that connections from master01 to node1 or node2 on port 10250 all fail with "no route to host".
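Despite the wording, "No route to host" does not necessarily mean a routing-table problem: the underlying error (EHOSTUNREACH) is also raised when the peer answers with an ICMP host-prohibited message, which is exactly what an iptables REJECT rule produces. A quick check to rule out an actual routing problem (a minimal sketch; adjust the address to your topology):
# Confirm the kernel actually has a route to the peer
ip route get 192.168.95.11
# If a route is printed (e.g. "192.168.95.11 dev ens33 src 192.168.95.10"),
# the routing table is fine and the error likely comes from an ICMP reject.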
Fault tree analysis
graph TD
A[No Route to Host] --> B{Network-layer issues}
A --> C[Transport-layer issues]
A --> D{Application layer}
B --> B1[Kernel routing table]
B --> B2[Reverse path filtering]
B --> B3[ICMP restrictions]
C --> C1[Connection tracking]
C --> C2[Firewall REJECT]
C --> C3[Conntrack state]
C --> C4[TCP parameters]
D --> D1[Is the listening address correct?]
style A stroke:#ff0000,stroke-width:2px
style B1 fill:#ffee90
style B2 fill:#ff9999
Step-by-step troubleshooting:
# Network layer
# Already known: every pair of hosts can ping each other directly
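Since ICMP gets through in both directions, basic routing works; the remaining network-layer suspects from the fault tree (reverse-path filtering, ICMP restrictions) can be eliminated with direct probes — a quick sketch:
sysctl net.ipv4.conf.all.rp_filter      # 0 = off, 1 = strict, 2 = loose
sysctl net.ipv4.conf.ens33.rp_filter
sysctl net.ipv4.icmp_ratelimit          # kernel rate limiting of ICMP errors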
# Application layer
[root@node1 ~]# systemctl status kubelet
● kubelet.service - kubelet: The Kubernetes Node Agent
Loaded: loaded (/usr/lib/systemd/system/kubelet.service; enabled; preset: disabled)
Drop-In: /usr/lib/systemd/system/kubelet.service.d
└─10-kubeadm.conf
Active: active (running) since Sat 2025-06-07 14:54:27 CST; 25min ago
Docs: https://kubernetes.io/docs/
Main PID: 4976 (kubelet)
Tasks: 12 (limit: 10852)
Memory: 49.3M
CPU: 15.959s
CGroup: /system.slice/kubelet.service
└─4976 /usr/bin/kubelet --bootstrap-kubeconfig=/etc/kubernetes/bootstrap-kubelet.conf --kubeconfig=/etc/kubernetes/kubelet.conf --config=/var/lib/kubelet/config.yaml --container-ru>
Jun 07 15:17:11 node1 kubelet[4976]: I0607 15:17:11.588024 4976 scope.go:117] "RemoveContainer" containerID="2a8abc30b8e9492661fd7ff3c589287502e8c4fe29cb2117771c48ed4825954b"
Jun 07 15:17:11 node1 kubelet[4976]: E0607 15:17:11.588323 4976 pod_workers.go:1300] "Error syncing pod, skipping" err="failed to \"StartContainer\" for \"calico-node\" with CrashLoopBackOff>
Jun 07 15:17:24 node1 kubelet[4976]: I0607 15:17:24.587072 4976 scope.go:117] "RemoveContainer" containerID="2a8abc30b8e9492661fd7ff3c589287502e8c4fe29cb2117771c48ed4825954b"
Jun 07 15:17:24 node1 kubelet[4976]: E0607 15:17:24.587341 4976 pod_workers.go:1300] "Error syncing pod, skipping" err="failed to \"StartContainer\" for \"calico-node\" with CrashLoopBackOff>
Jun 07 15:17:37 node1 kubelet[4976]: I0607 15:17:37.587080 4976 scope.go:117] "RemoveContainer" containerID="2a8abc30b8e9492661fd7ff3c589287502e8c4fe29cb2117771c48ed4825954b"
[root@node1 ~]# ss -anpt | grep 10250
LISTEN 0 30000 *:10250 *:* users:(("kubelet",pid=4976,fd=13))
The kubelet process is running normally and listening on port 10250 on all addresses (*:10250), not only on 127.0.0.1.
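To confirm the listener itself is healthy, test it locally on node1 before blaming the network path; if a local connection succeeds while the remote one fails, the blocking happens somewhere between the hosts. A minimal check:
# On node1 itself: should connect immediately
telnet 127.0.0.1 10250
# kubelet serves HTTPS on 10250, so an unauthenticated request getting an
# HTTP error (rather than a connection failure) also proves the listener works:
curl -k https://127.0.0.1:10250/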
# Firewall
iptables -L -n | grep 10250
This grep is inconclusive: rules that use multiport matches, ipsets, or jumps to custom chains will not mention 10250 literally, so the absence of a match does not prove the port is allowed.
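A more reliable way is to dump the full ruleset and look at chain order and the catch-all rules — a sketch:
# Dump INPUT with rule numbers and packet counters
iptables -L INPUT -n -v --line-numbers
# Or dump everything in savefile syntax, which is easier to grep
iptables-save | grep -E '10250|REJECT|DROP'
# On RHEL/CentOS-family hosts the stock ruleset typically ends INPUT with
# "-j REJECT --reject-with icmp-host-prohibited", which produces exactly
# the "No route to host" error seen above.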
Packet capture
[root@node1 ~]# tcpdump -i ens33 host 192.168.95.10 and port 10250
[root@master01 ~]# telnet 192.168.95.11 10250
Trying 192.168.95.11...
telnet: connect to address 192.168.95.11: No route to host
# node1 receives the SYN packet but never replies
[root@node1 ~]# tcpdump -i ens33 host 192.168.95.10 and port 10250
dropped privs to tcpdump
tcpdump: verbose output suppressed, use -v[v]... for full protocol decode
listening on ens33, link-type EN10MB (Ethernet), snapshot length 262144 bytes
15:27:58.878819 IP master01.42830 > node1.10250: Flags [S], seq 3745119612, win 64240, options [mss 1460,sackOK,TS val 907510165 ecr 0,nop,wscale 7], length 0
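One subtlety: a REJECT rule does answer, but with an ICMP host-prohibited packet, which the capture filter above (port 10250) silently excludes — that is why the trace appears to show no reply at all while telnet fails instantly instead of timing out. Widening the filter makes the reject visible (a sketch):
# Capture the ICMP error alongside the TCP handshake
tcpdump -i ens33 'host 192.168.95.10 and (port 10250 or icmp)'
# Expect something like:
#   node1 > master01: ICMP host 192.168.95.11 unreachable - admin prohibited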
This points at either the kubelet service itself or the firewall.
# Stop kubelet to take it out of the picture
systemctl stop kubelet
# Open port 10250 with a plain TCP listener (socat) instead of kubelet
socat TCP4-LISTEN:10250,reuseaddr,fork -
# Capture packets on node1 again while retrying from master01
# The SYN still arrives with no reply even with socat listening, so kubelet is cleared — this points to the iptables rules
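To pinpoint the exact rule, watch the per-rule packet counters while repeating the telnet; the counter of the matching REJECT/DROP rule will increment (a sketch):
# Re-run telnet from master01 while watching counters on node1
watch -n1 'iptables -L INPUT -n -v --line-numbers'
# The "pkts" column of the offending rule grows with every attempt.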
[root@node1 ~]# iptables -I INPUT -p tcp --dport 10250 -s 192.168.95.0/24 -j ACCEPT
[root@node2 ~]# iptables -I INPUT -p tcp --dport 10250 -s 192.168.95.0/24 -j ACCEPT
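Note the use of -I rather than -A: -I inserts at the head of the INPUT chain, above the catch-all REJECT; appending with -A would place the ACCEPT after the REJECT, where it would never match. Placement can be verified like so (a sketch):
iptables -L INPUT -n --line-numbers | head -5
# The new ACCEPT for dpt:10250 should appear before the REJECT line.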
[root@master01 ~]# telnet node1 10250
Trying 192.168.95.11...
Connected to node1.
Escape character is '^]'.
[root@master01 ~]# telnet node2 10250
Trying 192.168.95.12...
Connected to node2.
Escape character is '^]'.
# The ports are now reachable
The problem was indeed the firewall. Checking the cluster again:
[root@master01 ~]# kubectl get pod -A -o wide
NAMESPACE NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
kube-system calico-kube-controllers-558d465845-h7mfw 1/1 Running 1 (13h ago) 13h 10.244.241.68 master01 <none> <none>
kube-system calico-node-frpb9 0/1 Running 2 (13h ago) 22h 192.168.95.12 node2 <none> <none>
kube-system calico-node-l5j54 0/1 Running 54 (3s ago) 22h 192.168.95.11 node1 <none> <none>
kube-system calico-node-tqvb5 0/1 CrashLoopBackOff 61 (3m48s ago) 22h 192.168.95.10 master01 <none> <none>
kube-system calico-typha-5b56944f9b-h5cps 1/1 Running 2 (15m ago) 22h 192.168.95.12 node2 <none> <none>
kube-system coredns-66f779496c-4qzzf 1/1 Running 5 (13h ago) 22d 10.244.241.70 master01 <none> <none>
kube-system coredns-66f779496c-qq7nd 1/1 Running 1 (13h ago) 13h 10.244.241.69 master01 <none> <none>
kube-system etcd-master01 1/1 Running 18 (13h ago) 23d 192.168.95.10 master01 <none> <none>
kube-system kube-apiserver-master01 1/1 Running 33 (15m ago) 23d 192.168.95.10 master01 <none> <none>
kube-system kube-controller-manager-master01 1/1 Running 22 (13h ago) 23d 192.168.95.10 master01 <none> <none>
kube-system kube-proxy-4qpbt 1/1 Running 15 (15m ago) 23d 192.168.95.12 node2 <none> <none>
kube-system kube-proxy-9mh2v 1/1 Running 18 (13h ago) 23d 192.168.95.10 master01 <none> <none>
kube-system kube-proxy-z2xfp 1/1 Running 16 (16m ago) 23d 192.168.95.11 node1 <none> <none>
kube-system kube-scheduler-master01 1/1 Running 23 (13h ago) 23d 192.168.95.10 master01 <none> <none>
CoreDNS is healthy again, but the calico-node pods are still not ready.
[root@master01 ~]# kubectl logs calico-node-l5j54 -n kube-system
... # earlier log lines omitted
2025-06-08 02:06:21.917 [WARNING][66] felix/sync_client.go 192: Failed to connect to typha endpoint 192.168.95.12:5473. Will try another if available... error=dial tcp 192.168.95.12:5473: connect: no route to host myID=0x1 type=""
[root@master01 ~]# telnet 192.168.95.11 5473
Trying 192.168.95.11...
telnet: connect to address 192.168.95.11: No route to host
# Same class of problem as above: another port blocked by the INPUT chain
[root@node1 ~]# ss -anpt | grep 5473
SYN-SENT 0 1 192.168.95.11:13512 192.168.95.12:5473 users:(("calico-node",pid=18330,fd=10))
SYN-SENT 0 1 192.168.95.11:63730 192.168.95.12:5473 users:(("calico-node",pid=18445,fd=10))
Applying the fix (typha listens on node2, so the rule must exist there too; we add it on both nodes):
[root@node1 ~]# iptables -I INPUT -p tcp --dport 5473 -s 192.168.95.0/24 -j ACCEPT
[root@master01 ~]# telnet 192.168.95.12 5473
Trying 192.168.95.12...
Connected to 192.168.95.12
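Rather than opening ports one at a time as each failure surfaces, it may be simpler to allow the whole set Calico needs between cluster nodes up front (per Calico's documented network requirements; adjust to your deployment — this sketch assumes IP-in-IP mode):
# BGP peering between calico-node instances
iptables -I INPUT -p tcp --dport 179 -s 192.168.95.0/24 -j ACCEPT
# Typha (only if calico-typha is deployed, as it is here)
iptables -I INPUT -p tcp --dport 5473 -s 192.168.95.0/24 -j ACCEPT
# IP-in-IP encapsulated pod traffic (protocol 4); for VXLAN mode use "-p udp --dport 4789" instead
iptables -I INPUT -p 4 -s 192.168.95.0/24 -j ACCEPT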
Finally, remember to persist the iptables rules:
# Run on both node1 and node2
[root@node1 ~]# service iptables save
iptables: Saving firewall rules to /etc/sysconfig/iptables: [ OK ]
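Note that service iptables save requires the iptables-services package (RHEL/CentOS family) and writes the live ruleset to /etc/sysconfig/iptables. On systems without it, the equivalent is roughly (a sketch):
# Write the current ruleset where the boot scripts will reload it
iptables-save > /etc/sysconfig/iptables   # RHEL/CentOS family
# or, on Debian/Ubuntu with the netfilter-persistent package:
netfilter-persistent save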
A new problem
The pod calico-kube-controllers-558d465845-xf4ww is still in a not-ready state. Its logs show:
[root@master01 ~]# kubectl logs calico-kube-controllers-558d465845-xf4ww -n kube-system
Failed to initialize datastore error=Get "https://10.0.0.1:443/apis/crd.projectcalico.org/v1/clusterinformations/default": dial tcp 10.0.0.1:443: connect: no route to host
Yet master01 and every node can telnet 10.0.0.1 443 successfully — so why does this error occur and leave calico-kube-controllers not ready?
The information gathered:
1. We cannot exec into the pod's container to investigate:
[root@master01 calico]# kubectl exec -ti calico-kube-controllers-558d465845-xf4ww -n kube-system -- /bin/sh
error: unable to upgrade connection: container not found ("calico-kube-controllers")
2. The Service is running normally:
[root@master01 calico]# kubectl get svc
NAME         TYPE        CLUSTER-IP   EXTERNAL-IP   PORT(S)   AGE
kubernetes   ClusterIP   10.0.0.1     <none>        443/TCP   23d
3. The Endpoints object exists:
[root@master01 calico]# kubectl get endpoints kubernetes -n default
NAME         ENDPOINTS            AGE
kubernetes   192.168.95.10:6443   23d
4. After running iptables -F on node1 and node2, calico-kube-controllers-558d465845-xf4ww runs normally and becomes ready.
So once again the culprit is an iptables rule — this time in the FORWARD chain. Host-level telnet to 10.0.0.1:443 succeeds because traffic originating on the node traverses the OUTPUT chain, while traffic from the pod (source 10.244.x.x) is forwarded by the node and must pass the FORWARD chain, where the catch-all REJECT drops it. That is why the host tests passed while the pod kept failing.
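On RHEL/CentOS-family hosts the stock ruleset typically contains a FORWARD catch-all of this shape, which is what rejected the pod traffic (a representative sketch, not taken from these exact hosts):
# iptables-save excerpt typical of a default RHEL/CentOS firewall
-A FORWARD -j REJECT --reject-with icmp-host-prohibited
# Any packet the node forwards on behalf of a pod hits this rule
# unless an ACCEPT is inserted above it.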
Run on both node1 and node2:
[root@node1 ~]# iptables -I FORWARD -s 10.244.0.0/16 -j ACCEPT
calico-kube-controllers-558d465845-xf4ww is now in the ready state.
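Two follow-ups worth noting: the earlier iptables -F flushed the runtime rules, so the INPUT ACCEPTs added before must be restored (or reloaded from the saved file), and reply traffic heading back into the pod network should be allowed as well. A hedged sketch:
# Also accept forwarded traffic destined for the pod CIDR (replies into pods)
iptables -I FORWARD -d 10.244.0.0/16 -j ACCEPT
# Persist everything again so the fix survives a reboot
service iptables save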
Done.