Calico failure: calico node not-ready, finally resolved


Fault symptoms

[root@master01 ~]# kubectl get pod -A -o wide
NAMESPACE     NAME                                       READY   STATUS             RESTARTS        AGE     IP               NODE       NOMINATED NODE   READINESS GATES
default       test-hostpath                              2/2     Running            17 (119m ago)   22d     10.244.166.129   node1      <none>           <none>
kube-system   calico-kube-controllers-558d465845-wckg4   0/1     CrashLoopBackOff   36 (2m4s ago)   3h53m   10.244.104.2     node2      <none>           <none>
kube-system   calico-node-frpb9                          0/1     Running            1 (118m ago)    3h36m   192.168.95.12    node2      <none>           <none>
kube-system   calico-node-l5j54                          0/1     CrashLoopBackOff   39 (70s ago)    3h36m   192.168.95.11    node1      <none>           <none>
kube-system   calico-node-tqvb5                          0/1     CrashLoopBackOff   34 (69s ago)    3h36m   192.168.95.10    master01   <none>           <none>
kube-system   calico-typha-5b56944f9b-h5cps              1/1     Running            1 (118m ago)    3h53m   192.168.95.12    node2      <none>           <none>
kube-system   coredns-66f779496c-4qzzf                   1/1     Running            4 (119m ago)    22d     10.244.241.65    master01   <none>           <none>
kube-system   coredns-66f779496c-gbqqj                   0/1     Running            2 (119m ago)    5h13m   10.244.166.130   node1      <none>           <none>
kube-system   etcd-master01                              1/1     Running            17 (119m ago)   23d     192.168.95.10    master01   <none>           <none>
kube-system   kube-apiserver-master01                    1/1     Running            31 (119m ago)   23d     192.168.95.10    master01   <none>           <none>
kube-system   kube-controller-manager-master01           1/1     Running            20 (119m ago)   23d     192.168.95.10    master01   <none>           <none>
kube-system   kube-proxy-4qpbt                           1/1     Running            14 (118m ago)   23d     192.168.95.12    node2      <none>           <none>
kube-system   kube-proxy-9mh2v                           1/1     Running            17 (119m ago)   23d     192.168.95.10    master01   <none>           <none>
kube-system   kube-proxy-z2xfp                           1/1     Running            15 (119m ago)   23d     192.168.95.11    node1      <none>           <none>
kube-system   kube-scheduler-master01                    1/1     Running            21 (119m ago)   23d     192.168.95.10    master01   <none>           <none>
[root@master01 ~]# kubectl logs calico-node-l5j54  -n kube-system
Defaulted container "calico-node" out of: calico-node, upgrade-ipam (init), install-cni (init), mount-bpffs (init)
Error from server: Get "https://192.168.95.11:10250/containerLogs/kube-system/calico-node-l5j54/calico-node": dial tcp 192.168.95.11:10250: connect: no route to host
[root@master01 ~]# kubectl logs coredns-66f779496c-gbqqj  -n kube-system
Error from server: Get "https://192.168.95.11:10250/containerLogs/kube-system/coredns-66f779496c-gbqqj/coredns": dial tcp 192.168.95.11:10250: connect: no route to host
[root@master01 ~]# telnet 192.168.95.11 10250
Trying 192.168.95.11...
telnet: connect to address 192.168.95.11: No route to host
[root@master01 ~]# telnet 192.168.95.12 10250
Trying 192.168.95.12...
telnet: connect to address 192.168.95.12: No route to host

So connections from master01 to both node1 and node2 fail with "no route to host".
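Since kubectl logs is served by the apiserver connecting to the node's kubelet on port 10250, a blocked 10250 also blocks log retrieval. As a workaround you can read the container logs straight from the container runtime on the affected node; a rough sketch, assuming containerd with crictl installed (the container ID is a placeholder taken from the first command):

# On node1: list calico-node containers (including exited ones) and read their logs locally
crictl ps -a | grep calico-node
crictl logs <container-id>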

Fault tree analysis

graph TD
    A[No Route to Host] --> B{Network layer}
    A --> C[Transport layer]
    A --> D{Application layer}
    B --> B1[Kernel routing table]
    B --> B2[Reverse-path filtering]
    B --> B3[ICMP restrictions]
    
    C --> C1[Connection tracking]
    C --> C2[Firewall REJECT]
    C --> C3[Conntrack state]
    C --> C4[TCP parameters]
    
    D --> D1[Is the listening address correct?]
    style A stroke:#ff0000,stroke-width:2px
    style B1 fill:#ffee90
    style B2 fill:#ff9999
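A few quick commands to rule the network-layer branches in or out (a rough sketch; adjust interface names and peer IPs to your environment):

# Kernel routing table: is there actually a route to the peer?
ip route get 192.168.95.11
# Reverse-path filtering (0 = off, 1 = strict, 2 = loose)
sysctl net.ipv4.conf.all.rp_filter
# Basic ICMP reachability
ping -c 3 192.168.95.11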

Step-by-step troubleshooting:

# Network layer
  # Already verified: the three hosts can ping each other directly
# Application layer
[root@node1 ~]# systemctl status kubelet
● kubelet.service - kubelet: The Kubernetes Node Agent
     Loaded: loaded (/usr/lib/systemd/system/kubelet.service; enabled; preset: disabled)
    Drop-In: /usr/lib/systemd/system/kubelet.service.d
             └─10-kubeadm.conf
     Active: active (running) since Sat 2025-06-07 14:54:27 CST; 25min ago
       Docs: https://kubernetes.io/docs/
   Main PID: 4976 (kubelet)
      Tasks: 12 (limit: 10852)
     Memory: 49.3M
        CPU: 15.959s
     CGroup: /system.slice/kubelet.service
             └─4976 /usr/bin/kubelet --bootstrap-kubeconfig=/etc/kubernetes/bootstrap-kubelet.conf --kubeconfig=/etc/kubernetes/kubelet.conf --config=/var/lib/kubelet/config.yaml --container-ru>

Jun 07 15:17:11 node1 kubelet[4976]: I0607 15:17:11.588024    4976 scope.go:117] "RemoveContainer" containerID="2a8abc30b8e9492661fd7ff3c589287502e8c4fe29cb2117771c48ed4825954b"
Jun 07 15:17:11 node1 kubelet[4976]: E0607 15:17:11.588323    4976 pod_workers.go:1300] "Error syncing pod, skipping" err="failed to \"StartContainer\" for \"calico-node\" with CrashLoopBackOff>
Jun 07 15:17:24 node1 kubelet[4976]: I0607 15:17:24.587072    4976 scope.go:117] "RemoveContainer" containerID="2a8abc30b8e9492661fd7ff3c589287502e8c4fe29cb2117771c48ed4825954b"
Jun 07 15:17:24 node1 kubelet[4976]: E0607 15:17:24.587341    4976 pod_workers.go:1300] "Error syncing pod, skipping" err="failed to \"StartContainer\" for \"calico-node\" with CrashLoopBackOff>
Jun 07 15:17:37 node1 kubelet[4976]: I0607 15:17:37.587080    4976 scope.go:117] "RemoveContainer" containerID="2a8abc30b8e9492661fd7ff3c589287502e8c4fe29cb2117771c48ed4825954b"

[root@node1 ~]# ss -anpt | grep 10250
LISTEN    0      30000              *:10250              *:*    users:(("kubelet",pid=4976,fd=13))   

The application is running normally, and kubelet is listening on port 10250 on all addresses (*), not just 127.0.0.1.
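To double-check that the listener answers locally, you can hit it from the node itself; any HTTP status code (even 401/403, since the kubelet's secure port requires authentication) proves the local TCP/TLS path works. A sketch:

# On node1: -k skips certificate verification, -o discards the body
curl -ks -o /dev/null -w '%{http_code}\n' https://127.0.0.1:10250/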

# Firewall
iptables -L -n | grep 10250
This alone does not confirm whether port 10250 is being allowed through.
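A more informative way to inspect the INPUT chain is to print it with rule numbers and packet counters, so any REJECT/DROP rules and their hit counts are visible (a sketch):

iptables -L INPUT -n -v --line-numbers
# Or dump the rules in iptables-save syntax
iptables -S INPUT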

Packet capture

[root@node1 ~]# tcpdump -i ens33  host 192.168.95.10 and port 10250



[root@master01 ~]# telnet 192.168.95.11 10250
Trying 192.168.95.11...
telnet: connect to address 192.168.95.11: No route to host

# node1 receives the SYN packet but never sends a reply
[root@node1 ~]# tcpdump -i ens33  host 192.168.95.10 and port 10250
dropped privs to tcpdump
tcpdump: verbose output suppressed, use -v[v]... for full protocol decode
listening on ens33, link-type EN10MB (Ethernet), snapshot length 262144 bytes
15:27:58.878819 IP master01.42830 > node1.10250: Flags [S], seq 3745119612, win 64240, options [mss 1460,sackOK,TS val 907510165 ecr 0,nop,wscale 7], length 0

Suspect either the kubelet service or the firewall.
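Getting "No route to host" while ping works is the classic signature of an iptables REJECT rule with --reject-with icmp-host-prohibited (the RHEL/CentOS default). To confirm, you could capture ICMP on the client side while repeating the telnet; a sketch:

# On master01: the reject shows up as "host 192.168.95.11 unreachable - admin prohibited"
tcpdump -i ens33 -n icmp and host 192.168.95.11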

# Stop kubelet
systemctl stop kubelet
# Open port 10250 by another means (a plain socat TCP listener) to rule out kubelet
socat TCP4-LISTEN:10250,reuseaddr,fork -
# Capture packets on node1 again
# SYN packets still arrive with no reply, so the problem lies in the iptables rules
[root@node1 ~]# iptables -I INPUT -p tcp --dport 10250 -s 192.168.95.0/24 -j ACCEPT
[root@node2 ~]# iptables -I INPUT -p tcp --dport 10250 -s 192.168.95.0/24 -j ACCEPT
[root@master01 ~]# telnet node1 10250
Trying 192.168.95.11...
Connected to node1.
Escape character is '^]'.

[root@master01 ~]# telnet node2 10250
Trying 192.168.95.12...
Connected to node2.
Escape character is '^]'.
# The port is now reachable

Sure enough, that was the problem.

[root@master01 ~]# kubectl get pod -A -o wide
NAMESPACE     NAME                                       READY   STATUS             RESTARTS         AGE   IP              NODE       NOMINATED NODE   READINESS GATES
kube-system   calico-kube-controllers-558d465845-h7mfw   1/1     Running            1 (13h ago)      13h   10.244.241.68   master01   <none>           <none>
kube-system   calico-node-frpb9                          0/1     Running            2 (13h ago)      22h   192.168.95.12   node2      <none>           <none>
kube-system   calico-node-l5j54                          0/1     Running            54 (3s ago)      22h   192.168.95.11   node1      <none>           <none>
kube-system   calico-node-tqvb5                          0/1     CrashLoopBackOff   61 (3m48s ago)   22h   192.168.95.10   master01   <none>           <none>
kube-system   calico-typha-5b56944f9b-h5cps              1/1     Running            2 (15m ago)      22h   192.168.95.12   node2      <none>           <none>
kube-system   coredns-66f779496c-4qzzf                   1/1     Running            5 (13h ago)      22d   10.244.241.70   master01   <none>           <none>
kube-system   coredns-66f779496c-qq7nd                   1/1     Running            1 (13h ago)      13h   10.244.241.69   master01   <none>           <none>
kube-system   etcd-master01                              1/1     Running            18 (13h ago)     23d   192.168.95.10   master01   <none>           <none>
kube-system   kube-apiserver-master01                    1/1     Running            33 (15m ago)     23d   192.168.95.10   master01   <none>           <none>
kube-system   kube-controller-manager-master01           1/1     Running            22 (13h ago)     23d   192.168.95.10   master01   <none>           <none>
kube-system   kube-proxy-4qpbt                           1/1     Running            15 (15m ago)     23d   192.168.95.12   node2      <none>           <none>
kube-system   kube-proxy-9mh2v                           1/1     Running            18 (13h ago)     23d   192.168.95.10   master01   <none>           <none>
kube-system   kube-proxy-z2xfp                           1/1     Running            16 (16m ago)     23d   192.168.95.11   node1      <none>           <none>
kube-system   kube-scheduler-master01                    1/1     Running            23 (13h ago)     23d   192.168.95.10   master01   <none>           <none>

CoreDNS is back to normal, but the calico-node pods are still not ready.


[root@master01 ~]# kubectl logs calico-node-l5j54 -n kube-system
... # output truncated
2025-06-08 02:06:21.917 [WARNING][66] felix/sync_client.go 192: Failed to connect to typha endpoint 192.168.95.12:5473.  Will try another if available... error=dial tcp 192.168.95.12:5473: connect: no route to host myID=0x1 type=""
[root@master01 ~]# telnet 192.168.95.11 5473
Trying 192.168.95.11...
telnet: connect to address 192.168.95.11: No route to host
# This confirms it is the same issue as above

[root@node1 ~]# ss -anpt | grep 5473
SYN-SENT  0      1      192.168.95.11:13512 192.168.95.12:5473 users:(("calico-node",pid=18330,fd=10))              
SYN-SENT  0      1      192.168.95.11:63730 192.168.95.12:5473 users:(("calico-node",pid=18445,fd=10))
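To confirm which node the typha endpoint actually runs on (so the ACCEPT rule lands on the right host), you can list the typha pods; a sketch, assuming the standard Calico manifest labels:

kubectl get pod -n kube-system -l k8s-app=calico-typha -o wide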

Applying the fix

[root@node1 ~]# iptables -I INPUT -p tcp --dport 5473 -s 192.168.95.0/24 -j ACCEPT

[root@master01 ~]# telnet 192.168.95.12 5473
Trying 192.168.95.12...
Connected to 192.168.95.12
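Port 5473 is only one of the ports Calico may need between nodes. Depending on whether the cluster uses BGP, IPIP, or VXLAN, rules like the following may also be required (a sketch; keep only what your dataplane actually uses):

# BGP peering (TCP 179)
iptables -I INPUT -p tcp --dport 179 -s 192.168.95.0/24 -j ACCEPT
# IPIP encapsulation (IP protocol 4)
iptables -I INPUT -p 4 -s 192.168.95.0/24 -j ACCEPT
# VXLAN encapsulation (UDP 4789)
iptables -I INPUT -p udp --dport 4789 -s 192.168.95.0/24 -j ACCEPT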

Finally, remember to persist the iptables rules.

# Run on both node1 and node2
[root@node1 ~]# service iptables save
iptables: Saving firewall rules to /etc/sysconfig/iptables: [  OK  ]
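If the iptables service unit is not installed, the same result can be achieved with iptables-save (a sketch for this RHEL-family file layout):

iptables-save > /etc/sysconfig/iptables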

A new problem

The pod calico-kube-controllers-558d465845-xf4ww is still in a not-ready state.

kubectl logs calico-kube-controllers-558d465845-xf4ww -n kube-system shows: Failed to initialize datastore error=Get "https://10.0.0.1:443/apis/crd.projectcalico.org/v1/clusterinformations/default": dial tcp 10.0.0.1:443: connect: no route to host

Yet from the master and from every node, telnet 10.0.0.1 443 succeeds. Why, then, does this error occur and leave calico-kube-controllers in a not-ready state?

The findings:

1. Cannot exec into the pod's container to investigate:
[root@master01 calico]# kubectl exec -ti calico-kube-controllers-558d465845-xf4ww -n kube-system -- /bin/sh
error: unable to upgrade connection: container not found ("calico-kube-controllers")

2. The Service is running normally:
[root@master01 calico]# kubectl get svc
NAME         TYPE        CLUSTER-IP   EXTERNAL-IP   PORT(S)   AGE
kubernetes   ClusterIP   10.0.0.1     <none>        443/TCP   23d

3. The Endpoints object exists:
[root@master01 calico]# kubectl get endpoints kubernetes -n default
NAME         ENDPOINTS            AGE
kubernetes   192.168.95.10:6443   23d

4. After running iptables -F on node1 and node2, calico-kube-controllers-558d465845-xf4ww runs normally and becomes Ready.
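The key difference is the path the traffic takes: packets from a pod (source in 10.244.0.0/16) are routed by the node and therefore traverse the FORWARD chain, while a telnet from the host itself only passes OUTPUT on the client and INPUT on the server, which is why the host-level tests succeeded. To see whether FORWARD is rejecting forwarded traffic, you can print it with counters (a sketch):

# Look for REJECT/DROP rules whose counters grow while the pod retries
iptables -L FORWARD -n -v --line-numbers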

So in the end it was still an iptables rule problem.

Run on both node1 and node2:

[root@node1 ~]# iptables -I FORWARD -s 10.244.0.0/16 -j ACCEPT

calico-kube-controllers-558d465845-xf4ww is now Ready.
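Depending on the FORWARD chain's default policy and existing rules, the return path toward the pod CIDR may also need an explicit ACCEPT (or a conntrack ESTABLISHED,RELATED rule); a minimal sketch, assuming the same 10.244.0.0/16 pod CIDR:

iptables -I FORWARD -d 10.244.0.0/16 -j ACCEPT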

done