Tracing the OVN FIP/SNAT/DNAT forwarding paths, with a brief look at how these kube-ovn resources are used along the way.
1. NAT traffic path with an OVN centralized gateway
[root@mst # k ko nbctl show
switch bb7cab91-5cc8-42ef-93af-486cab7c2291 (external)
port kube-ovn-worker
addresses: ["00:00:00:02:6A:FB 172.19.0.7"]
port external-ovn-cluster
type: router
router-port: ovn-cluster-external
port ln-external
type: localnet
addresses: ["unknown"]
port external-vpc-75989
type: router
router-port: vpc-75989-external
port kube-ovn-control-plane
addresses: ["00:00:00:5D:EF:19 172.19.0.6"]
switch 95f2aace-9665-4c6f-aefe-0c0a21b67c38 (ovn-default)
port coredns-787d4945fb-5fdk7.kube-system
addresses: ["00:00:00:0C:0F:D0 10.16.0.7"]
port kube-ovn-pinger-hlg6t.kube-system
addresses: ["00:00:00:8B:3E:A3 10.16.0.8"]
port ovn-default-ovn-cluster
type: router
router-port: ovn-cluster-ovn-default
port kube-ovn-pinger-5zrdb.kube-system
addresses: ["00:00:00:10:62:70 10.16.0.9"]
port coredns-787d4945fb-pqp2t.kube-system
addresses: ["00:00:00:D8:EB:5E 10.16.0.6"]
switch 3235b25a-815d-4870-8dd4-cfb85a49b41e (join)
port join-ovn-cluster
type: router
router-port: ovn-cluster-join
port node-kube-ovn-worker
addresses: ["00:00:00:93:93:4A 100.64.0.3"]
port node-kube-ovn-control-plane
addresses: ["00:00:00:A6:EB:BB 100.64.0.2"]
switch 32fe7ffb-709f-4a27-8f9b-f8f52a01e447 (overlay-subnet-73569)
port fip-busybox02.default
addresses: ["00:00:00:A2:70:8D 192.168.0.5"]
port fip-busybox01.default
addresses: ["00:00:00:48:38:46 192.168.0.4"]
port overlay-subnet-73569-vpc-75989
type: router
router-port: vpc-75989-overlay-subnet-73569
router 3cf06bbb-9db1-4ed0-949b-c4f13822c83a (ovn-cluster)
port ovn-cluster-external
mac: "00:00:00:BD:15:64"
networks: ["172.19.0.5/16"]
gateway chassis: [2e3175b0-2129-4504-a333-0c4ac1f680d7 2922af73-b67b-49ab-9a70-018c40af25a7]
port ovn-cluster-join
mac: "00:00:00:47:24:2F"
networks: ["100.64.0.1/16"]
port ovn-cluster-ovn-default
mac: "00:00:00:3F:6D:45"
networks: ["10.16.0.1/16"]
router 374d0e33-a2ff-413e-9dd1-cad108e82cc6 (vpc-75989)
port vpc-75989-overlay-subnet-73569
mac: "00:00:00:91:CC:AA"
networks: ["192.168.0.1/24"]
port vpc-75989-external
mac: "00:00:00:21:44:FD"
networks: ["172.19.0.4/16"]
gateway chassis: [2922af73-b67b-49ab-9a70-018c40af25a7 2e3175b0-2129-4504-a333-0c4ac1f680d7]
nat 07dc7f5a-96ef-4633-98f3-00aff46ac054
external ip: "172.19.0.8"
logical ip: "192.168.0.2"
type: "dnat_and_snat"
nat 1a307979-f0f0-4749-b39e-91a38f7895ce
external ip: "172.19.0.11"
logical ip: "192.168.0.4"
type: "dnat_and_snat" # fip
nat 2e66cfb6-1bfb-4263-a725-5d09f3c8982a
external ip: "172.19.0.12"
logical ip: "192.168.0.5"
type: "dnat_and_snat"
nat 513a2811-41ac-47a0-8d41-b7d3a2c08d3a
external ip: "172.19.0.9"
logical ip: "192.168.0.0/24"
type: "snat"
[root@mst #
[root@mst # k ko nbctl lr-lb-list vpc-75989
UUID LB PROTO VIP IPs
a11397e0-9cec-4627-a552-9c6877eceb8b dnat-74661 tcp 172.19.0.10:8080 192.168.0.3:80
# dnat
1.1 Using ovn-eip
An ovn-eip is used purely to allocate a public IP.
[root@mst # k get oeip
NAME V4IP V6IP MAC TYPE READY
dnat-eip-71780 172.19.0.10 00:00:00:D3:98:E7 dnat true
fip-eip-76463 172.19.0.8 00:00:00:EB:5D:E1 fip true
kube-ovn-control-plane 172.19.0.6 00:00:00:5D:EF:19 node-ext-gw true
kube-ovn-worker 172.19.0.7 00:00:00:02:6A:FB node-ext-gw true
oeip-busybox01 172.19.0.11 00:00:00:FC:05:21 fip true
oeip-busybox02 172.19.0.12 00:00:00:E8:A8:6D fip true
ovn-cluster-external 172.19.0.5 00:00:00:BD:15:64 lrp true
snat-eip-76840 172.19.0.9 00:00:00:3F:C0:77 snat true
vpc-75989-external 172.19.0.4 00:00:00:21:44:FD lrp true
# The ovn-eip resource allocates an IP from the external (public) subnet and records what kind of resource that public IP is used by. There are currently four such types: dnat, fip, snat, and node-ext-gw (the NIC of a BFD next-hop gateway node); the lrp entries above are the routers' own ports on the external network.
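For reference, allocating such an address just means creating an OvnEip object. A minimal sketch (the name is illustrative; the fields mirror the OvnEip templates shown later in this post, and the TYPE column above appears to reflect how the EIP ends up being consumed):

---
kind: OvnEip
apiVersion: kubeovn.io/v1
metadata:
  name: oeip-busybox01
spec:
  externalSubnet: external   # the external (underlay) subnet to allocate the public IP from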
1.2 Tracing FIP traffic
[root@mst # k get po -A -o wide | grep busy
default fip-busybox01 1/1 Running 0 54s 192.168.0.6 kube-ovn-control-plane
default fip-busybox02 1/1 Running 0 54s 192.168.0.7 kube-ovn-worker
[root@mst # k get ofip
NAME VPC V4EIP V4IP READY IPTYPE IPNAME
fip-71198 vpc-75989 172.19.0.8 192.168.0.2 true vip fip-vip-77485
ofip-busybox01 vpc-75989 172.19.0.11 192.168.0.6 true ip fip-busybox01.default
ofip-busybox02 vpc-75989 172.19.0.12 192.168.0.7 true ip fip-busybox02.default
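An OvnFip binds one of the OvnEips above to a pod IP. A sketch of what ofip-busybox01 might look like (the field names follow the same pattern as the OvnSnatRule/OvnDnatRule templates later in this post, so treat it as illustrative rather than authoritative):

---
kind: OvnFip
apiVersion: kubeovn.io/v1
metadata:
  name: ofip-busybox01
spec:
  ovnEip: oeip-busybox01           # the OvnEip holding 172.19.0.11
  ipName: fip-busybox01.default    # the pod ip CRD name shown in the IPNAME column above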
# The two pods sit on two different nodes; below we verify whether the FIP traffic leaves and enters locally on the node hosting each pod.
# From outside the cluster, ping the FIP of the pod running on node kube-ovn-control-plane
## Confirm the traffic comes in via kube-ovn-control-plane
root@kube-ovn-control-plane:/kube-ovn# tcpdump -i any host 192.168.0.6 or 172.19.0.11 -netvv
tcpdump: data link type LINUX_SLL2
tcpdump: listening on any, link-type LINUX_SLL2 (Linux cooked v2), snapshot length 262144 bytes
eth1 P ifindex 81 02:42:57:db:a9:c6 ethertype IPv4 (0x0800), length 104: (tos 0x0, ttl 64, id 28471, offset 0, flags [DF], proto ICMP (1), length 84)
172.19.0.1 > 172.19.0.11: ICMP echo request, id 3846, seq 1, length 64
dd2b931fc304_h Out ifindex 23 00:00:00:91:cc:aa ethertype IPv4 (0x0800), length 104: (tos 0x0, ttl 63, id 28471, offset 0, flags [DF], proto ICMP (1), length 84)
172.19.0.1 > 192.168.0.6: ICMP echo request, id 3846, seq 1, length 64
dd2b931fc304_h P ifindex 23 00:00:00:de:cb:07 ethertype IPv4 (0x0800), length 104: (tos 0x0, ttl 64, id 37678, offset 0, flags [none], proto ICMP (1), length 84)
192.168.0.6 > 172.19.0.1: ICMP echo reply, id 3846, seq 1, length 64
eth1 Out ifindex 81 00:00:00:21:44:fd ethertype IPv4 (0x0800), length 104: (tos 0x0, ttl 63, id 37678, offset 0, flags [none], proto ICMP (1), length 84)
172.19.0.11 > 172.19.0.1: ICMP echo reply, id 3846, seq 1, length 64
eth1 P ifindex 81 02:42:57:db:a9:c6 ethertype ARP (0x0806), length 48: Ethernet (len 6), IPv4 (len 4), Request who-has 172.19.0.11 tell 172.19.0.1, length 28
dd2b931fc304_h P ifindex 23 00:00:00:de:cb:07 ethertype ARP (0x0806), length 48: Ethernet (len 6), IPv4 (len 4), Request who-has 192.168.0.1 tell 192.168.0.6, length 28
dd2b931fc304_h Out ifindex 23 00:00:00:91:cc:aa ethertype ARP (0x0806), length 48: Ethernet (len 6), IPv4 (len 4), Reply 192.168.0.1 is-at 00:00:00:91:cc:aa, length 28
eth1 Out ifindex 81 00:00:00:21:44:fd ethertype ARP (0x0806), length 48: Ethernet (len 6), IPv4 (len 4), Reply 172.19.0.11 is-at 00:00:00:21:44:fd, length 28
# The DNAT step is visible in this capture taken inside the ovn-cni pod on kube-ovn-control-plane; the FIP traffic comes in over eth1 into br-external:
node outside the cluster --> switch --> eth1 --> cust-vpc-subnet-veth
The ARP reply also shows that during DNAT the destination MAC answering for the FIP 172.19.0.11 is 00:00:00:21:44:fd, i.e. the vpc-75989-external LRP MAC rather than the pod's MAC.
# Now the same test for the FIP egress direction
[root@mst # k exec -it -n default fip-busybox01 bash
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
fip-busybox01:~# ip a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 127.0.0.1/8 scope host lo
valid_lft forever preferred_lft forever
inet6 ::1/128 scope host
valid_lft forever preferred_lft forever
22: eth0@if23: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1400 qdisc noqueue state UP group default
link/ether 00:00:00:de:cb:07 brd ff:ff:ff:ff:ff:ff link-netnsid 0
inet 192.168.0.6/24 brd 192.168.0.255 scope global eth0
valid_lft forever preferred_lft forever
inet6 fe80::200:ff:fede:cb07/64 scope link
valid_lft forever preferred_lft forever
fip-busybox01:~# route -n
Kernel IP routing table
Destination Gateway Genmask Flags Metric Ref Use Iface
0.0.0.0 192.168.0.1 0.0.0.0 UG 0 0 0 eth0
192.168.0.0 0.0.0.0 255.255.255.0 U 0 0 0 eth0
fip-busybox01:~# ping -c1 172.19.0.1
PING 172.19.0.1 (172.19.0.1) 56(84) bytes of data.
64 bytes from 172.19.0.1: icmp_seq=1 ttl=63 time=1.30 ms
--- 172.19.0.1 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 1.299/1.299/1.299/0.000 ms
root@kube-ovn-control-plane:/kube-ovn# tcpdump -i any host 192.168.0.6 or 172.19.0.11 -netvv
tcpdump: data link type LINUX_SLL2
tcpdump: listening on any, link-type LINUX_SLL2 (Linux cooked v2), snapshot length 262144 bytes
dd2b931fc304_h P ifindex 23 00:00:00:de:cb:07 ethertype IPv4 (0x0800), length 104: (tos 0x0, ttl 64, id 14373, offset 0, flags [DF], proto ICMP (1), length 84)
192.168.0.6 > 172.19.0.1: ICMP echo request, id 20900, seq 1, length 64
eth1 Out ifindex 81 00:00:00:21:44:fd ethertype IPv4 (0x0800), length 104: (tos 0x0, ttl 63, id 14373, offset 0, flags [DF], proto ICMP (1), length 84)
172.19.0.11 > 172.19.0.1: ICMP echo request, id 20900, seq 1, length 64
eth1 P ifindex 81 02:42:57:db:a9:c6 ethertype IPv4 (0x0800), length 104: (tos 0x0, ttl 64, id 42193, offset 0, flags [none], proto ICMP (1), length 84)
172.19.0.1 > 172.19.0.11: ICMP echo reply, id 20900, seq 1, length 64
dd2b931fc304_h Out ifindex 23 00:00:00:91:cc:aa ethertype IPv4 (0x0800), length 104: (tos 0x0, ttl 63, id 42193, offset 0, flags [none], proto ICMP (1), length 84)
172.19.0.1 > 192.168.0.6: ICMP echo reply, id 20900, seq 1, length 64
dd2b931fc304_h P ifindex 23 00:00:00:de:cb:07 ethertype ARP (0x0806), length 48: Ethernet (len 6), IPv4 (len 4), Request who-has 192.168.0.1 tell 192.168.0.6, length 28
eth1 P ifindex 81 02:42:57:db:a9:c6 ethertype ARP (0x0806), length 48: Ethernet (len 6), IPv4 (len 4), Request who-has 172.19.0.11 tell 172.19.0.1, length 28
dd2b931fc304_h Out ifindex 23 00:00:00:91:cc:aa ethertype ARP (0x0806), length 48: Ethernet (len 6), IPv4 (len 4), Reply 192.168.0.1 is-at 00:00:00:91:cc:aa, length 28
eth1 Out ifindex 81 00:00:00:21:44:fd ethertype ARP (0x0806), length 48: Ethernet (len 6), IPv4 (len 4), Reply 172.19.0.11 is-at 00:00:00:21:44:fd, length 28
# The outgoing traffic leaves directly via eth1 on this node:
cust-vpc-subnet-lsp --> br-int --> localnet --> eth1
## Note that on the SNAT'd egress, the source MAC of the FIP traffic, 00:00:00:21:44:fd, is the LRP's MAC
[root@mst # k ko nbctl show | grep -C2 -i "00:00:00:21:44:fd"
networks: ["192.168.0.1/24"]
port vpc-75989-external
mac: "00:00:00:21:44:FD"
networks: ["172.19.0.4/16"]
gateway chassis: [2922af73-b67b-49ab-9a70-018c40af25a7 2e3175b0-2129-4504-a333-0c4ac1f680d7]
fip-busybox01:~# traceroute -n 172.19.0.1
traceroute to 172.19.0.1 (172.19.0.1), 30 hops max, 46 byte packets
1 192.168.0.1 1.981 ms 0.454 ms 0.354 ms
2 172.19.0.1 1.313 ms 0.180 ms 0.114 ms
Looking at the packets from the pod on the other node (not shown in detail), they do indeed leave from the node hosting that pod, and the MAC is again the LRP's.
So the result of these tests is that the VPC's FIP public-network function is centralized here; to test distributed FIP, the configuration needs to be adjusted via ovn-external-gw-config.
Reference: kubeovn.github.io/docs/v1.12.…
2. Distributed gateway
2.1 Tracing FIP traffic with a distributed gateway
# Environment
# k get cm -n kube-system ovn-external-gw-config -o yaml
apiVersion: v1
data:
  enable-external-gw: "true"
  external-gw-addr: 172.19.0.0/16
  external-gw-nic: eth1
  external-gw-nodes: kube-ovn-control-plane,kube-ovn-worker
  type: distributed  # distributed mode
kind: ConfigMap
metadata:
  name: ovn-external-gw-config
  namespace: kube-system
# Prepare two pods to observe that the FIP really is distributed
(vv) root@wk:~/dev/tests/kube-ovn/kind-dvr-fip# kgp | grep nginx
default nginx-netshoot-0 2/2 Running 0 7m30s 192.168.0.2 kube-ovn-worker
default nginx-netshoot-1 2/2 Running 0 7m10s 192.168.0.3 kube-ovn-control-plane
(vv) root@wk:~/dev/tests/kube-ovn/kind-dvr-fip# k get ofip
NAME VPC V4EIP V4IP READY IPTYPE IPNAME
fip1 vpc1 172.19.0.12 192.168.0.2 true nginx-netshoot-0.default
fip2 vpc1 172.19.0.13 192.168.0.3 true nginx-netshoot-1.default
(vv) root@wk:~/dev/tests/kube-ovn/kind-dvr-fip# k ko nbctl show
switch 615a8c68-4bcd-4e82-b880-f6abf9eb4ca2 (external)
port external-vpc-45847
type: router
router-port: vpc-45847-external
port localnet.external
type: localnet
addresses: ["unknown"]
port external-vpc1
type: router
router-port: vpc1-external
port external-ovn-cluster
type: router
router-port: ovn-cluster-external
switch 76af40bd-3253-4245-85c6-38d6de9d5358 (vpc1-subnet1)
port vpc1-subnet1-vpc1
type: router
router-port: vpc1-vpc1-subnet1
port nginx-netshoot-1.default
addresses: ["00:00:00:F2:0D:7B 192.168.0.3"]
port nginx-netshoot-0.default
addresses: ["00:00:00:C4:5B:CE 192.168.0.2"]
router 6fcbfa41-bac3-4f15-ad12-c9a75defd629 (vpc1)
port vpc1-vpc1-subnet1
mac: "00:00:00:A8:9F:B5"
networks: ["192.168.0.1/24"]
port vpc1-external
mac: "00:00:00:A6:95:CC"
networks: ["172.19.0.11/16"]
gateway chassis: [b177a84b-74e5-498a-9501-13e6628b796f 2c47abdb-6350-4373-a839-68c7181d964a]
nat cd5ba8d0-8156-4f89-b279-629be9f3bf94
external ip: "172.19.0.12"
logical ip: "192.168.0.2"
type: "dnat_and_snat"
nat dc8320a0-6071-4ac6-ace9-672ed4f5c67f
external ip: "172.19.0.13"
logical ip: "192.168.0.3"
type: "dnat_and_snat"
router 18e187fb-1b03-4f90-ab37-ec519488134b (vpc-45847)
port vpc-45847-external
mac: "00:00:00:AC:4C:17"
networks: ["172.19.0.4/16"]
gateway chassis: [2c47abdb-6350-4373-a839-68c7181d964a b177a84b-74e5-498a-9501-13e6628b796f]
...
(vv) root@wk:~/dev/tests/kube-ovn/kind-dvr-fip# k ko nbctl lr-route-list vpc1
IPv4 Routes
Route Table <main>:
0.0.0.0/0 172.19.0.1 dst-ip
This is a kind environment; ping the FIP from outside the cluster (on the host running the kind cluster) and capture packets to look at the DVR FIP forwarding path.
Conclusion: with DVR FIP, the reply comes straight out of the node hosting the pod, and the MAC used is the pod NIC's MAC (see external_mac in the NAT entries below).
(vv) root@wk:~# k ko nbctl find NAT type=dnat_and_snat
_uuid : dc8320a0-6071-4ac6-ace9-672ed4f5c67f
allowed_ext_ips : []
exempted_ext_ips : []
external_ids : {}
external_ip : "172.19.0.13"
external_mac : "00:00:00:F2:0D:7B"
external_port_range : ""
gateway_port : []
logical_ip : "192.168.0.3"
logical_port : nginx-netshoot-1.default
options : {stateless="true"}
type : dnat_and_snat
_uuid : cd5ba8d0-8156-4f89-b279-629be9f3bf94
allowed_ext_ips : []
exempted_ext_ips : []
external_ids : {}
external_ip : "172.19.0.12"
external_mac : "00:00:00:C4:5B:CE"
external_port_range : ""
gateway_port : []
logical_ip : "192.168.0.2"
logical_port : nginx-netshoot-0.default
options : {stateless="true"}
type : dnat_and_snat
(vv) root@wk:~#
2.2 Tracing SNAT traffic
# Create an SNAT rule bound to an entire subnet in the VPC
router bdee7174-a005-4172-95e5-9e5cd44508aa (vpc1)
port vpc1-vpc1-subnet1
mac: "00:00:00:BE:9A:F2"
networks: ["192.168.0.1/24"]
port vpc1-external
mac: "00:00:00:A8:69:64"
networks: ["172.19.0.11/16"]
gateway chassis: [e40bf1d2-a70a-4015-b867-b6c814b16f1d 08b7433a-fdc2-4c5a-9f7f-cd6eb87d08b7]
nat a1dbe648-dd12-4d1e-ae0b-3044ad29e583
external ip: "172.19.0.12"
logical ip: "192.168.0.0/24"
type: "snat"
(vv) root@wk:~/dev/tests/kube-ovn/kind/dvr-fip# k get oeip
NAME V4IP V6IP MAC TYPE READY
snat-for-subnet-in-vpc 172.19.0.12 00:00:00:36:CA:A6 snat true
vpc1-external 172.19.0.11 00:00:00:A8:69:64 lrp true
# SNAT CRD template
(vv) root@wk:~/dev/tests/kube-ovn/kind/dvr-fip# cat 05-snat.yaml
---
kind: OvnEip
apiVersion: kubeovn.io/v1
metadata:
  name: snat-for-subnet-in-vpc
spec:
  externalSubnet: external
  type: snat
---
kind: OvnSnatRule
apiVersion: kubeovn.io/v1
metadata:
  name: snat-for-subnet-in-vpc
spec:
  ovnEip: snat-for-subnet-in-vpc
  vpcSubnet: vpc1-subnet1  # the EIP applies to this whole subnet
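Besides binding a whole subnet via vpcSubnet, OvnSnatRule should also accept a single pod IP via ipName (same convention as the OvnDnatRule below); a sketch under that assumption:

---
kind: OvnSnatRule
apiVersion: kubeovn.io/v1
metadata:
  name: snat-for-pod-vpc-ip           # illustrative name
spec:
  ovnEip: snat-for-subnet-in-vpc      # or a dedicated snat OvnEip
  ipName: nginx-netshoot-0.default    # pod ip CRD name, used instead of vpcSubnet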
2.3 Tracing DNAT traffic
A note on kube-ovn's current DNAT design: OVN changed its DNAT design at one point, so per-port DNAT (port forwarding) can no longer be implemented through the NAT table and has to be built on router load balancers. As a result, kube-ovn's native OVN DNAT is actually backed by a router LB.
(vv) root@wk:~/dev/tests/kube-ovn/kind/dvr-fip# k ko nbctl lb-list
UUID LB PROTO VIP IPs
09d7bcfe-aadb-41bc-8642-9e9efff83455 dnat01 tcp 172.19.0.13:8000 192.168.0.2:80
dc5107c1-6f11-432d-bc5b-9367d459461a dnat02 tcp 172.19.0.14:8000 192.168.0.3:80
# Template
(vv) root@wk:~/dev/tests/kube-ovn/kind/dvr-fip# cat 06-dnat.yaml
---
kind: OvnEip
apiVersion: kubeovn.io/v1
metadata:
  name: dnat01
spec:
  externalSubnet: external
---
kind: OvnDnatRule
apiVersion: kubeovn.io/v1
metadata:
  name: dnat01
spec:
  ovnEip: dnat01
  ipName: nginx-netshoot-0.default  # note: the name of the pod's ip CRD, which is unique
  protocol: tcp
  internalPort: "80"
  externalPort: "8000"
---
kind: OvnEip
apiVersion: kubeovn.io/v1
metadata:
  name: dnat02
spec:
  externalSubnet: external
---
kind: OvnDnatRule
apiVersion: kubeovn.io/v1
metadata:
  name: dnat02
spec:
  ovnEip: dnat02
  ipName: nginx-netshoot-1.default  # note: the name of the pod's ip CRD, which is unique
  protocol: tcp
  internalPort: "80"
  externalPort: "8000"
Finally, test whether traffic reaching the DNAT port forwarding from outside the cluster is centralized.