节点情况
qingcloud(master) 139.198.18.18
qingcloud2(worker) 139.198.17.9
初始pod如下
[root@qingcloud ~]# kubectl get pods -A -owide
NAMESPACE NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
default k8sutils-1 1/1 Running 0 41m 10.244.1.2 qingcloud2 <none> <none>
kube-flannel kube-flannel-ds-bl6j8 1/1 Running 0 43m 139.198.18.18 qingcloud <none> <none>
kube-flannel kube-flannel-ds-xnnjf 1/1 Running 0 43m 139.198.17.9 qingcloud2 <none> <none>
kube-system coredns-7bdc4cb885-8k6wq 1/1 Running 0 55m 10.244.0.3 qingcloud <none> <none>
kube-system coredns-7bdc4cb885-smnkl 1/1 Running 0 55m 10.244.0.2 qingcloud <none> <none>
kube-system etcd-qingcloud 1/1 Running 0 56m 139.198.18.18 qingcloud <none> <none>
kube-system kube-apiserver-qingcloud 1/1 Running 0 56m 139.198.18.18 qingcloud <none> <none>
kube-system kube-controller-manager-qingcloud 1/1 Running 0 56m 139.198.18.18 qingcloud <none> <none>
kube-system kube-proxy-24nql 1/1 Running 0 55m 139.198.18.18 qingcloud <none> <none>
kube-system kube-proxy-6mj2m 1/1 Running 0 44m 139.198.17.9 qingcloud2 <none> <none>
kube-system kube-scheduler-qingcloud 1/1 Running 0 56m 139.198.18.18 qingcloud <none> <none>
连接k8sutils-1
kubectl exec -it k8sutils-1 -- bash
[root@k8sutils-1 /]# ip a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 127.0.0.1/8 scope host lo
valid_lft forever preferred_lft forever
inet6 ::1/128 scope host
valid_lft forever preferred_lft forever
2: tunl0@NONE: <NOARP> mtu 1480 qdisc noop state DOWN group default qlen 1000
link/ipip 0.0.0.0 brd 0.0.0.0
3: eth0@if7: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1450 qdisc noqueue state UP group default
link/ether c2:53:79:4c:cc:28 brd ff:ff:ff:ff:ff:ff link-netnsid 0
inet 10.244.1.2/24 brd 10.244.1.255 scope global eth0
valid_lft forever preferred_lft forever
inet6 fe80::c053:79ff:fe4c:cc28/64 scope link
valid_lft forever preferred_lft forever
可以看到该pod的ip是10.244.1.2。从该pod去ping其他节点上的pod,例如10.244.0.3,可以看到如下结果:ping不通,丢包率为100%
[root@k8sutils-1 /]# ping 10.244.0.3
PING 10.244.0.3 (10.244.0.3) 56(84) bytes of data.
^C
--- 10.244.0.3 ping statistics ---
178 packets transmitted, 0 received, 100% packet loss, time 181246ms
由于 k8sutils-1属于qingcloud2节点,所以在这个节点抓包
因为使用了flannel的vxlan模式,flannel创建的vxlan设备(flannel.1)会通过udp的8472端口收发vxlan数据,因此针对该端口抓包,结果如下
[root@qingcloud2 ~]# tcpdump -i eth0 -s0 -nnn port 8472
dropped privs to tcpdump
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on eth0, link-type EN10MB (Ethernet), capture size 262144 bytes
22:08:32.095199 IP 139.198.17.9.60090 > 139.198.18.18.8472: OTV, flags [I] (0x08), overlay 0, instance 1
IP 10.244.1.2 > 10.244.0.2: ICMP echo request, id 63, seq 309, length 64
22:08:33.119205 IP 139.198.17.9.60090 > 139.198.18.18.8472: OTV, flags [I] (0x08), overlay 0, instance 1
^C
本机的ip如下
[root@qingcloud2 ~]# ip a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 127.0.0.1/8 scope host lo
valid_lft forever preferred_lft forever
inet6 ::1/128 scope host
valid_lft forever preferred_lft forever
2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UP group default qlen 1000
link/ether 52:54:99:12:d4:d2 brd ff:ff:ff:ff:ff:ff
inet 139.198.17.9/32 scope global noprefixroute eth0:1
valid_lft forever preferred_lft forever
inet 10.120.109.23/24 brd 10.120.109.255 scope global dynamic noprefixroute eth0
valid_lft 82686sec preferred_lft 82686sec
inet6 fe80::5054:99ff:fe12:d4d2/64 scope link
valid_lft forever preferred_lft forever
3: tunl0@NONE: <NOARP> mtu 1480 qdisc noop state DOWN group default qlen 1000
link/ipip 0.0.0.0 brd 0.0.0.0
4: kube-ipvs0: <BROADCAST,NOARP> mtu 1500 qdisc noop state DOWN group default
link/ether da:e5:a6:34:65:c1 brd ff:ff:ff:ff:ff:ff
inet 10.96.0.10/32 scope global kube-ipvs0
valid_lft forever preferred_lft forever
inet 10.96.0.1/32 scope global kube-ipvs0
valid_lft forever preferred_lft forever
5: flannel.1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1450 qdisc noqueue state UNKNOWN group default
link/ether f6:35:d3:a4:e4:09 brd ff:ff:ff:ff:ff:ff
inet 10.244.1.0/32 scope global flannel.1
valid_lft forever preferred_lft forever
inet6 fe80::f435:d3ff:fea4:e409/64 scope link
valid_lft forever preferred_lft forever
6: cni0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1450 qdisc noqueue state UP group default qlen 1000
link/ether 76:a9:dc:27:0f:c2 brd ff:ff:ff:ff:ff:ff
inet 10.244.1.1/24 brd 10.244.1.255 scope global cni0
valid_lft forever preferred_lft forever
inet6 fe80::74a9:dcff:fe27:fc2/64 scope link
valid_lft forever preferred_lft forever
7: veth12ccda06@if3: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1450 qdisc noqueue master cni0 state UP group default
link/ether 7a:5a:eb:d3:e2:b9 brd ff:ff:ff:ff:ff:ff link-netns cni-595b02b7-f4f4-1dc8-6f91-e323322432a9
inet6 fe80::785a:ebff:fed3:e2b9/64 scope link
valid_lft forever preferred_lft forever
本机路由如下
[root@qingcloud2 ~]# route -n
Kernel IP routing table
Destination Gateway Genmask Flags Metric Ref Use Iface
0.0.0.0 10.120.109.1 0.0.0.0 UG 100 0 0 eth0
10.120.109.0 0.0.0.0 255.255.255.0 U 100 0 0 eth0
10.244.0.0 10.244.0.0 255.255.255.0 UG 0 0 0 flannel.1
10.244.1.0 0.0.0.0 255.255.255.0 U 0 0 0 cni0
通过上面的分析可以得知,flannel在封装vxlan数据包时使用了139.198.17.9作为源地址,封包后的数据包在转发出去的时候会走默认网关,所以该数据包会发送给10.120.109.1。但是从青云得知,他们会再进行一步nat转换,即将10.120.109.23转换为139.198.17.9后转发出去,对于其他无法转换的包则直接丢弃,所以上面flannel产生的vxlan数据包发不出去。
解决办法
添加一条iptables规则,将数据包的源地址变为10.120.109.23,这样数据包才能被转发出去。同理,其他节点也需要添加类似的规则(地址需替换为该节点自己的公网ip和内网ip)
iptables -t nat -I POSTROUTING -o eth0 -p udp -s 139.198.17.9 -j SNAT --to-source 10.120.109.23
查看效果
可以看到ping通了,即不同主机上的pod之间互通了
[root@k8sutils-1 /]# ping 10.244.0.3
PING 10.244.0.3 (10.244.0.3) 56(84) bytes of data.
64 bytes from 10.244.0.3: icmp_seq=1 ttl=62 time=0.432 ms
64 bytes from 10.244.0.3: icmp_seq=2 ttl=62 time=0.406 ms
64 bytes from 10.244.0.3: icmp_seq=3 ttl=62 time=0.440 ms
64 bytes from 10.244.0.3: icmp_seq=4 ttl=62 time=0.462 ms
64 bytes from 10.244.0.3: icmp_seq=5 ttl=62 time=0.466 ms
64 bytes from 10.244.0.3: icmp_seq=6 ttl=62 time=0.469 ms
64 bytes from 10.244.0.3: icmp_seq=7 ttl=62 time=0.472 ms
64 bytes from 10.244.0.3: icmp_seq=8 ttl=62 time=0.452 ms
^C
--- 10.244.0.3 ping statistics ---
8 packets transmitted, 8 received, 0% packet loss, time 7175ms
rtt min/avg/max/mdev = 0.406/0.449/0.472/0.035 ms
查看此时数据包,可以看到源ip地址变了
[root@qingcloud2 ~]# tcpdump -i eth0 -s0 -nnn port 8472
dropped privs to tcpdump
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on eth0, link-type EN10MB (Ethernet), capture size 262144 bytes
22:23:05.987058 IP 10.120.109.23.60090 > 139.198.18.18.8472: OTV, flags [I] (0x08), overlay 0, instance 1
IP 10.244.1.2 > 10.244.0.2: ICMP echo request, id 71, seq 1, length 64
22:23:05.987575 IP 139.198.18.18.47919 > 10.120.109.23.8472: OTV, flags [I] (0x08), overlay 0, instance 1
IP 10.244.0.2 > 10.244.1.2: ICMP echo reply, id 71, seq 1, length 64