OVN native FIP / SNAT / DNAT forwarding paths


Trace the OVN FIP / SNAT / DNAT forwarding paths, and along the way give a brief introduction to how these resources are used in kube-ovn.

1. NAT traffic path with an OVN centralized gateway


[root@mst #  k ko nbctl show
switch bb7cab91-5cc8-42ef-93af-486cab7c2291 (external)
    port kube-ovn-worker
        addresses: ["00:00:00:02:6A:FB 172.19.0.7"]
    port external-ovn-cluster
        type: router
        router-port: ovn-cluster-external
    port ln-external
        type: localnet
        addresses: ["unknown"]
    port external-vpc-75989
        type: router
        router-port: vpc-75989-external
    port kube-ovn-control-plane
        addresses: ["00:00:00:5D:EF:19 172.19.0.6"]
switch 95f2aace-9665-4c6f-aefe-0c0a21b67c38 (ovn-default)
    port coredns-787d4945fb-5fdk7.kube-system
        addresses: ["00:00:00:0C:0F:D0 10.16.0.7"]
    port kube-ovn-pinger-hlg6t.kube-system
        addresses: ["00:00:00:8B:3E:A3 10.16.0.8"]
    port ovn-default-ovn-cluster
        type: router
        router-port: ovn-cluster-ovn-default
    port kube-ovn-pinger-5zrdb.kube-system
        addresses: ["00:00:00:10:62:70 10.16.0.9"]
    port coredns-787d4945fb-pqp2t.kube-system
        addresses: ["00:00:00:D8:EB:5E 10.16.0.6"]
switch 3235b25a-815d-4870-8dd4-cfb85a49b41e (join)
    port join-ovn-cluster
        type: router
        router-port: ovn-cluster-join
    port node-kube-ovn-worker
        addresses: ["00:00:00:93:93:4A 100.64.0.3"]
    port node-kube-ovn-control-plane
        addresses: ["00:00:00:A6:EB:BB 100.64.0.2"]
switch 32fe7ffb-709f-4a27-8f9b-f8f52a01e447 (overlay-subnet-73569)
    port fip-busybox02.default
        addresses: ["00:00:00:A2:70:8D 192.168.0.5"]
    port fip-busybox01.default
        addresses: ["00:00:00:48:38:46 192.168.0.4"]
    port overlay-subnet-73569-vpc-75989
        type: router
        router-port: vpc-75989-overlay-subnet-73569
router 3cf06bbb-9db1-4ed0-949b-c4f13822c83a (ovn-cluster)
    port ovn-cluster-external
        mac: "00:00:00:BD:15:64"
        networks: ["172.19.0.5/16"]
        gateway chassis: [2e3175b0-2129-4504-a333-0c4ac1f680d7 2922af73-b67b-49ab-9a70-018c40af25a7]
    port ovn-cluster-join
        mac: "00:00:00:47:24:2F"
        networks: ["100.64.0.1/16"]
    port ovn-cluster-ovn-default
        mac: "00:00:00:3F:6D:45"
        networks: ["10.16.0.1/16"]
router 374d0e33-a2ff-413e-9dd1-cad108e82cc6 (vpc-75989)
    port vpc-75989-overlay-subnet-73569
        mac: "00:00:00:91:CC:AA"
        networks: ["192.168.0.1/24"]
    port vpc-75989-external
        mac: "00:00:00:21:44:FD"
        networks: ["172.19.0.4/16"]
        gateway chassis: [2922af73-b67b-49ab-9a70-018c40af25a7 2e3175b0-2129-4504-a333-0c4ac1f680d7]
    nat 07dc7f5a-96ef-4633-98f3-00aff46ac054
        external ip: "172.19.0.8"
        logical ip: "192.168.0.2"
        type: "dnat_and_snat"
    nat 1a307979-f0f0-4749-b39e-91a38f7895ce
        external ip: "172.19.0.11"
        logical ip: "192.168.0.4"
        type: "dnat_and_snat" # fip
    nat 2e66cfb6-1bfb-4263-a725-5d09f3c8982a
        external ip: "172.19.0.12"
        logical ip: "192.168.0.5"
        type: "dnat_and_snat"
    nat 513a2811-41ac-47a0-8d41-b7d3a2c08d3a
        external ip: "172.19.0.9"
        logical ip: "192.168.0.0/24"
        type: "snat"
[root@mst #

[root@mst #  k ko nbctl lr-lb-list vpc-75989
UUID                                    LB                  PROTO      VIP                 IPs
a11397e0-9cec-4627-a552-9c6877eceb8b    dnat-74661          tcp        172.19.0.10:8080    192.168.0.3:80
# dnat

1.1 Using ovn-eip

ovn-eip is only used to allocate public IPs.


[root@mst #  k get oeip
NAME                     V4IP          V6IP   MAC                 TYPE          READY
dnat-eip-71780           172.19.0.10          00:00:00:D3:98:E7   dnat          true
fip-eip-76463            172.19.0.8           00:00:00:EB:5D:E1   fip           true
kube-ovn-control-plane   172.19.0.6           00:00:00:5D:EF:19   node-ext-gw   true
kube-ovn-worker          172.19.0.7           00:00:00:02:6A:FB   node-ext-gw   true
oeip-busybox01           172.19.0.11          00:00:00:FC:05:21   fip           true
oeip-busybox02           172.19.0.12          00:00:00:E8:A8:6D   fip           true
ovn-cluster-external     172.19.0.5           00:00:00:BD:15:64   lrp           true
snat-eip-76840           172.19.0.9           00:00:00:3F:C0:77   snat          true
vpc-75989-external       172.19.0.4           00:00:00:21:44:FD   lrp           true


# The ovn-eip resource allocates an IP from the public subnet and records which kind of resource that public IP is used for. There are currently 4 types: dnat, fip, snat, and the NIC on the BFD next-hop gateway node (the node-ext-gw entries above).
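As a quick illustration, allocating a public IP is just a matter of creating an OvnEip; a minimal sketch that mirrors the templates shown later in this post (the name is made up, and only externalSubnet is set, as in the dnat template in section 2.3):

---
kind: OvnEip
apiVersion: kubeovn.io/v1
metadata:
  name: my-eip                  # illustrative name
spec:
  externalSubnet: external      # the external subnet the public IP is allocated from
  # the TYPE column in `k get oeip` above records how the EIP ends up being used
  # (fip / snat / dnat / node-ext-gw / lrp)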

1.2 Tracing FIP traffic


[root@mst #  k get po -A -o wide | grep busy
default       fip-busybox01 1/1     Running   0    54s     192.168.0.6   kube-ovn-control-plane  
default       fip-busybox02 1/1     Running   0    54s     192.168.0.7   kube-ovn-worker       


[root@mst #  k get ofip
NAME             VPC         V4EIP         V4IP          READY   IPTYPE   IPNAME
fip-71198        vpc-75989   172.19.0.8    192.168.0.2   true    vip      fip-vip-77485
ofip-busybox01   vpc-75989   172.19.0.11   192.168.0.6   true    ip       fip-busybox01.default
ofip-busybox02   vpc-75989   172.19.0.12   192.168.0.7   true    ip       fip-busybox02.default


# The two pods sit on two different nodes; below we verify that each FIP's traffic leaves from and enters on the node where its pod runs
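For context, each entry in the ofip listing above binds an OvnEip to a pod's IP CRD. The manifests are not shown in this post; a minimal sketch following the kube-ovn OvnFip CRD fields (ovnEip, ipName) would look like:

---
kind: OvnEip
apiVersion: kubeovn.io/v1
metadata:
  name: oeip-busybox01
spec:
  externalSubnet: external
---
kind: OvnFip
apiVersion: kubeovn.io/v1
metadata:
  name: ofip-busybox01
spec:
  ovnEip: oeip-busybox01           # the public IP allocated above (172.19.0.11)
  ipName: fip-busybox01.default    # name of the pod's IP CRD (IPNAME column above)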


# From outside the cluster, ping the FIP of the pod running on node kube-ovn-control-plane
## Confirm that the traffic enters via kube-ovn-control-plane



root@kube-ovn-control-plane:/kube-ovn# tcpdump -i any host 192.168.0.6 or 172.19.0.11 -netvv
tcpdump: data link type LINUX_SLL2
tcpdump: listening on any, link-type LINUX_SLL2 (Linux cooked v2), snapshot length 262144 bytes
eth1  P   ifindex 81 02:42:57:db:a9:c6 ethertype IPv4 (0x0800), length 104: (tos 0x0, ttl 64, id 28471, offset 0, flags [DF], proto ICMP (1), length 84)
    172.19.0.1 > 172.19.0.11: ICMP echo request, id 3846, seq 1, length 64
dd2b931fc304_h Out ifindex 23 00:00:00:91:cc:aa ethertype IPv4 (0x0800), length 104: (tos 0x0, ttl 63, id 28471, offset 0, flags [DF], proto ICMP (1), length 84)
    172.19.0.1 > 192.168.0.6: ICMP echo request, id 3846, seq 1, length 64
dd2b931fc304_h P   ifindex 23 00:00:00:de:cb:07 ethertype IPv4 (0x0800), length 104: (tos 0x0, ttl 64, id 37678, offset 0, flags [none], proto ICMP (1), length 84)
    192.168.0.6 > 172.19.0.1: ICMP echo reply, id 3846, seq 1, length 64
eth1  Out ifindex 81 00:00:00:21:44:fd ethertype IPv4 (0x0800), length 104: (tos 0x0, ttl 63, id 37678, offset 0, flags [none], proto ICMP (1), length 84)
    172.19.0.11 > 172.19.0.1: ICMP echo reply, id 3846, seq 1, length 64
eth1  P   ifindex 81 02:42:57:db:a9:c6 ethertype ARP (0x0806), length 48: Ethernet (len 6), IPv4 (len 4), Request who-has 172.19.0.11 tell 172.19.0.1, length 28
dd2b931fc304_h P   ifindex 23 00:00:00:de:cb:07 ethertype ARP (0x0806), length 48: Ethernet (len 6), IPv4 (len 4), Request who-has 192.168.0.1 tell 192.168.0.6, length 28
dd2b931fc304_h Out ifindex 23 00:00:00:91:cc:aa ethertype ARP (0x0806), length 48: Ethernet (len 6), IPv4 (len 4), Reply 192.168.0.1 is-at 00:00:00:91:cc:aa, length 28
eth1  Out ifindex 81 00:00:00:21:44:fd ethertype ARP (0x0806), length 48: Ethernet (len 6), IPv4 (len 4), Reply 172.19.0.11 is-at 00:00:00:21:44:fd, length 28


# The capture above, taken in the kube-ovn-cni container on kube-ovn-control-plane, shows the DNAT process: the FIP traffic comes in on eth1 (attached to br-external) and is handed to the pod's veth

node outside the cluster --> switch --> eth1 --> cust-vpc-subnet-veth

During the DNAT step, the MAC seen on the packet delivered to the pod's veth (dd2b931fc304_h) is 00:00:00:91:cc:aa, i.e. the MAC of the VPC router's subnet-facing LRP vpc-75989-overlay-subnet-73569.
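To map that MAC back to a logical port, the same grep trick used further down for the external LRP works here too (output omitted; per the nbctl show output at the top it matches vpc-75989-overlay-subnet-73569):

k ko nbctl show | grep -C2 -i "00:00:00:91:cc:aa"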

# Now the same test in the opposite direction: FIP egress

[root@mst #  k exec -it -n default       fip-busybox01 bash
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
fip-busybox01:~# ip a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
    inet 127.0.0.1/8 scope host lo
       valid_lft forever preferred_lft forever
    inet6 ::1/128 scope host
       valid_lft forever preferred_lft forever
22: eth0@if23: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1400 qdisc noqueue state UP group default
    link/ether 00:00:00:de:cb:07 brd ff:ff:ff:ff:ff:ff link-netnsid 0
    inet 192.168.0.6/24 brd 192.168.0.255 scope global eth0
       valid_lft forever preferred_lft forever
    inet6 fe80::200:ff:fede:cb07/64 scope link
       valid_lft forever preferred_lft forever
fip-busybox01:~# route -n
Kernel IP routing table
Destination     Gateway         Genmask         Flags Metric Ref    Use Iface
0.0.0.0         192.168.0.1     0.0.0.0         UG    0      0        0 eth0
192.168.0.0     0.0.0.0         255.255.255.0   U     0      0        0 eth0
fip-busybox01:~# ping -c1 172.19.0.1
PING 172.19.0.1 (172.19.0.1) 56(84) bytes of data.
64 bytes from 172.19.0.1: icmp_seq=1 ttl=63 time=1.30 ms

--- 172.19.0.1 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 1.299/1.299/1.299/0.000 ms


root@kube-ovn-control-plane:/kube-ovn# tcpdump -i any host 192.168.0.6 or 172.19.0.11 -netvv
tcpdump: data link type LINUX_SLL2
tcpdump: listening on any, link-type LINUX_SLL2 (Linux cooked v2), snapshot length 262144 bytes
dd2b931fc304_h P   ifindex 23 00:00:00:de:cb:07 ethertype IPv4 (0x0800), length 104: (tos 0x0, ttl 64, id 14373, offset 0, flags [DF], proto ICMP (1), length 84)
    192.168.0.6 > 172.19.0.1: ICMP echo request, id 20900, seq 1, length 64
eth1  Out ifindex 81 00:00:00:21:44:fd ethertype IPv4 (0x0800), length 104: (tos 0x0, ttl 63, id 14373, offset 0, flags [DF], proto ICMP (1), length 84)
    172.19.0.11 > 172.19.0.1: ICMP echo request, id 20900, seq 1, length 64
eth1  P   ifindex 81 02:42:57:db:a9:c6 ethertype IPv4 (0x0800), length 104: (tos 0x0, ttl 64, id 42193, offset 0, flags [none], proto ICMP (1), length 84)
    172.19.0.1 > 172.19.0.11: ICMP echo reply, id 20900, seq 1, length 64
dd2b931fc304_h Out ifindex 23 00:00:00:91:cc:aa ethertype IPv4 (0x0800), length 104: (tos 0x0, ttl 63, id 42193, offset 0, flags [none], proto ICMP (1), length 84)
    172.19.0.1 > 192.168.0.6: ICMP echo reply, id 20900, seq 1, length 64
dd2b931fc304_h P   ifindex 23 00:00:00:de:cb:07 ethertype ARP (0x0806), length 48: Ethernet (len 6), IPv4 (len 4), Request who-has 192.168.0.1 tell 192.168.0.6, length 28
eth1  P   ifindex 81 02:42:57:db:a9:c6 ethertype ARP (0x0806), length 48: Ethernet (len 6), IPv4 (len 4), Request who-has 172.19.0.11 tell 172.19.0.1, length 28
dd2b931fc304_h Out ifindex 23 00:00:00:91:cc:aa ethertype ARP (0x0806), length 48: Ethernet (len 6), IPv4 (len 4), Reply 192.168.0.1 is-at 00:00:00:91:cc:aa, length 28
eth1  Out ifindex 81 00:00:00:21:44:fd ethertype ARP (0x0806), length 48: Ethernet (len 6), IPv4 (len 4), Reply 172.19.0.11 is-at 00:00:00:21:44:fd, length 28

# You can see the egress traffic goes straight out via eth1:

cust-vpc-subnet-lsp --> br-int --> localnet --> eth1


## Note that on the way out, after SNAT to the FIP, the source MAC 00:00:00:21:44:fd is the MAC of an LRP

[root@mst #  k ko nbctl show | grep -C2 -i  "00:00:00:21:44:fd"
        networks: ["192.168.0.1/24"]
    port vpc-75989-external
        mac: "00:00:00:21:44:FD"
        networks: ["172.19.0.4/16"]
        gateway chassis: [2922af73-b67b-49ab-9a70-018c40af25a7 2e3175b0-2129-4504-a333-0c4ac1f680d7]


fip-busybox01:~# traceroute -n 172.19.0.1
traceroute to 172.19.0.1 (172.19.0.1), 30 hops max, 46 byte packets
 1  192.168.0.1  1.981 ms  0.454 ms  0.354 ms
 2  172.19.0.1  1.313 ms  0.180 ms  0.114 ms
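The same egress path can also be walked logically with the kubectl-ko trace helper instead of tcpdump (a sketch; the trace output is not shown here):

k ko trace default/fip-busybox01 172.19.0.1 icmp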




Checking the packets leaving the pod on the other node is not described in detail; the point to record is that they really do leave from the node hosting the pod, and the source MAC is again the LRP's.

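A sketch of that check for fip-busybox02 on kube-ovn-worker, with the same tcpdump flags as above:

# inside fip-busybox02
ping -c1 172.19.0.1

# on kube-ovn-worker
tcpdump -i any host 192.168.0.7 or 172.19.0.12 -netvv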

The result of these tests is that the VPC FIP public-network path here is centralized; to test distributed FIP, the configuration has to be adjusted via ovn-external-gw-config, as shown in the next section.


Reference: kubeovn.github.io/docs/v1.12.…

2. Distributed gateway

2.1 Tracing FIP traffic with a distributed gateway

# Environment
# k get cm -n kube-system          ovn-external-gw-config -o yaml
apiVersion: v1
data:
  enable-external-gw: "true"
  external-gw-addr: 172.19.0.0/16
  external-gw-nic: eth1
  external-gw-nodes: kube-ovn-control-plane,kube-ovn-worker
  type: distributed  # distributed gateway mode here
kind: ConfigMap
metadata:
  name: ovn-external-gw-config
  namespace: kube-system

# Prepare two pods to confirm that the FIP path really is distributed

(vv) root@wk:~/dev/tests/kube-ovn/kind-dvr-fip# kgp | grep nginx
default       nginx-netshoot-0 2/2     Running   0          7m30s   192.168.0.2   kube-ovn-worker 
default       nginx-netshoot-1 2/2     Running   0          7m10s   192.168.0.3   kube-ovn-control-plane 

(vv) root@wk:~/dev/tests/kube-ovn/kind-dvr-fip# k get ofip
NAME   VPC    V4EIP         V4IP          READY   IPTYPE   IPNAME
fip1   vpc1   172.19.0.12   192.168.0.2   true             nginx-netshoot-0.default
fip2   vpc1   172.19.0.13   192.168.0.3   true             nginx-netshoot-1.default

(vv) root@wk:~/dev/tests/kube-ovn/kind-dvr-fip# k ko nbctl show 
switch 615a8c68-4bcd-4e82-b880-f6abf9eb4ca2 (external)
    port external-vpc-45847
        type: router
        router-port: vpc-45847-external
    port localnet.external
        type: localnet
        addresses: ["unknown"]
    port external-vpc1
        type: router
        router-port: vpc1-external
    port external-ovn-cluster
        type: router
        router-port: ovn-cluster-external
switch 76af40bd-3253-4245-85c6-38d6de9d5358 (vpc1-subnet1)
    port vpc1-subnet1-vpc1
        type: router
        router-port: vpc1-vpc1-subnet1
    port nginx-netshoot-1.default
        addresses: ["00:00:00:F2:0D:7B 192.168.0.3"]
    port nginx-netshoot-0.default
        addresses: ["00:00:00:C4:5B:CE 192.168.0.2"]
router 6fcbfa41-bac3-4f15-ad12-c9a75defd629 (vpc1)
    port vpc1-vpc1-subnet1
        mac: "00:00:00:A8:9F:B5"
        networks: ["192.168.0.1/24"]
    port vpc1-external
        mac: "00:00:00:A6:95:CC"
        networks: ["172.19.0.11/16"]
        gateway chassis: [b177a84b-74e5-498a-9501-13e6628b796f 2c47abdb-6350-4373-a839-68c7181d964a]
    nat cd5ba8d0-8156-4f89-b279-629be9f3bf94
        external ip: "172.19.0.12"
        logical ip: "192.168.0.2"
        type: "dnat_and_snat"
    nat dc8320a0-6071-4ac6-ace9-672ed4f5c67f
        external ip: "172.19.0.13"
        logical ip: "192.168.0.3"
        type: "dnat_and_snat"
router 18e187fb-1b03-4f90-ab37-ec519488134b (vpc-45847)
    port vpc-45847-external
        mac: "00:00:00:AC:4C:17"
        networks: ["172.19.0.4/16"]
        gateway chassis: [2c47abdb-6350-4373-a839-68c7181d964a b177a84b-74e5-498a-9501-13e6628b796f]
        ...

(vv) root@wk:~/dev/tests/kube-ovn/kind-dvr-fip# k ko nbctl lr-route-list vpc1
IPv4 Routes
Route Table <main>:
                0.0.0.0/0                172.19.0.1 dst-ip
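One way to get that default route towards the physical gateway is a static route on the Vpc CR; the actual vpc1 manifest is not shown in this post, but a sketch using the kube-ovn Vpc staticRoutes fields would be:

---
kind: Vpc
apiVersion: kubeovn.io/v1
metadata:
  name: vpc1
spec:
  staticRoutes:
    - cidr: 0.0.0.0/0        # matches the lr-route-list output above
      nextHopIP: 172.19.0.1  # the physical gateway on the external network
      policy: policyDst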

This is a kind environment; from outside the cluster (on the host running the kind cluster), ping the FIPs and capture packets to see the DVR FIP forwarding path (a sketch of the commands follows).
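A sketch of the test, using the FIPs and pod placement listed above (the external NIC on each kind node is eth1, per ovn-external-gw-config):

# on the kind host, outside the cluster
ping -c1 172.19.0.12    # fip1 -> nginx-netshoot-0 on kube-ovn-worker
ping -c1 172.19.0.13    # fip2 -> nginx-netshoot-1 on kube-ovn-control-plane

# on each kind node, watch its external NIC
tcpdump -i eth1 -netvv host 172.19.0.12   # run on kube-ovn-worker
tcpdump -i eth1 -netvv host 172.19.0.13   # run on kube-ovn-control-plane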


Conclusion: with DVR, the FIP is answered directly from the node where the pod runs, and the MAC used is the pod NIC's MAC (compare external_mac in the NAT entries below with the pod MACs in the nbctl show output above).


(vv) root@wk:~# k ko nbctl find NAT type=dnat_and_snat
_uuid               : dc8320a0-6071-4ac6-ace9-672ed4f5c67f
allowed_ext_ips     : []
exempted_ext_ips    : []
external_ids        : {}
external_ip         : "172.19.0.13"
external_mac        : "00:00:00:F2:0D:7B"
external_port_range : ""
gateway_port        : []
logical_ip          : "192.168.0.3"
logical_port        : nginx-netshoot-1.default
options             : {stateless="true"}
type                : dnat_and_snat

_uuid               : cd5ba8d0-8156-4f89-b279-629be9f3bf94
allowed_ext_ips     : []
exempted_ext_ips    : []
external_ids        : {}
external_ip         : "172.19.0.12"
external_mac        : "00:00:00:C4:5B:CE"
external_port_range : ""
gateway_port        : []
logical_ip          : "192.168.0.2"
logical_port        : nginx-netshoot-0.default
options             : {stateless="true"}
type                : dnat_and_snat
(vv) root@wk:~# 

2.2 Tracing SNAT traffic


# Create an SNAT rule that covers the entire VPC subnet


router bdee7174-a005-4172-95e5-9e5cd44508aa (vpc1)
    port vpc1-vpc1-subnet1
        mac: "00:00:00:BE:9A:F2"
        networks: ["192.168.0.1/24"]
    port vpc1-external
        mac: "00:00:00:A8:69:64"
        networks: ["172.19.0.11/16"]
        gateway chassis: [e40bf1d2-a70a-4015-b867-b6c814b16f1d 08b7433a-fdc2-4c5a-9f7f-cd6eb87d08b7]
    nat a1dbe648-dd12-4d1e-ae0b-3044ad29e583
        external ip: "172.19.0.12"
        logical ip: "192.168.0.0/24"
        type: "snat"

(vv) root@wk:~/dev/tests/kube-ovn/kind/dvr-fip# k get oeip
NAME                     V4IP          V6IP   MAC                 TYPE   READY
snat-for-subnet-in-vpc   172.19.0.12          00:00:00:36:CA:A6   snat   true
vpc1-external            172.19.0.11          00:00:00:A8:69:64   lrp    true

# SNAT CRD manifest template
(vv) root@wk:~/dev/tests/kube-ovn/kind/dvr-fip# cat 05-snat.yaml
---
kind: OvnEip
apiVersion: kubeovn.io/v1
metadata:
  name: snat-for-subnet-in-vpc
spec:
  externalSubnet: external
  type: snat

---
kind: OvnSnatRule
apiVersion: kubeovn.io/v1
metadata:
  name: snat-for-subnet-in-vpc
spec:
  ovnEip: snat-for-subnet-in-vpc
  vpcSubnet: vpc1-subnet1 # the EIP applies to the whole subnet

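A sketch of how the SNAT path can be verified, reusing the nginx-netshoot pods from section 2.1 (assuming they are attached to vpc1-subnet1):

# inside nginx-netshoot-0 (192.168.0.2)
ping -c1 172.19.0.1

# on the gateway node(s) listed in the gateway chassis of vpc1-external:
# after SNAT the packet should leave eth1 with source 172.19.0.12 (the snat EIP)
tcpdump -i eth1 -netvv host 172.19.0.12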

2.3 Tracing DNAT traffic

A note on kube-ovn's current DNAT design: OVN changed its DNAT implementation at one point, so port-granular DNAT (port forwarding) can no longer be expressed through the NAT table and has to be implemented with a router load balancer. kube-ovn's OVN-native DNAT is therefore actually backed by a router LB.


(vv) root@wk:~/dev/tests/kube-ovn/kind/dvr-fip# k ko nbctl lb-list
UUID                                    LB                  PROTO      VIP                    IPs
09d7bcfe-aadb-41bc-8642-9e9efff83455    dnat01              tcp        172.19.0.13:8000       192.168.0.2:80
dc5107c1-6f11-432d-bc5b-9367d459461a    dnat02              tcp        172.19.0.14:8000       192.168.0.3:80
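Functionally, each of those entries is an OVN load balancer attached to the VPC router; a rough hand-rolled equivalent with plain ovn-nbctl (a sketch only, not what kube-ovn literally executes) would be:

# create an LB whose VIP is the EIP:port and whose backend is the pod IP:port
ovn-nbctl lb-add dnat01 172.19.0.13:8000 192.168.0.2:80 tcp
# attach the LB to the VPC router so the translation happens on vpc1
ovn-nbctl lr-lb-add vpc1 dnat01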

# Template

(vv) root@wk:~/dev/tests/kube-ovn/kind/dvr-fip# cat 06-dnat.yaml
---
kind: OvnEip
apiVersion: kubeovn.io/v1
metadata:
  name: dnat01
spec:
  externalSubnet: external
---
kind: OvnDnatRule
apiVersion: kubeovn.io/v1
metadata:
  name: dnat01
spec:
  ovnEip: dnat01
  ipName: nginx-netshoot-0.default  # note: this is the name of the pod's IP CRD, which is unique
  protocol: tcp
  internalPort: "80"
  externalPort: "8000"
---
kind: OvnEip
apiVersion: kubeovn.io/v1
metadata:
  name: dnat02
spec:
  externalSubnet: external
---
kind: OvnDnatRule
apiVersion: kubeovn.io/v1
metadata:
  name: dnat02
spec:
  ovnEip: dnat02
  ipName: nginx-netshoot-1.default  # note: this is the name of the pod's IP CRD, which is unique
  protocol: tcp
  internalPort: "80"
  externalPort: "8000"
  

Test whether the DNAT port-forwarding traffic coming from outside the cluster is centralized; a sketch of the test follows.

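A sketch of the test, assuming the external client can reach the 172.19.0.0/16 network:

# from outside the cluster
curl http://172.19.0.13:8000    # dnat01 -> nginx-netshoot-0 (192.168.0.2:80)
curl http://172.19.0.14:8000    # dnat02 -> nginx-netshoot-1 (192.168.0.3:80)

# capture on each gateway node's external NIC to see where the traffic actually enters
tcpdump -i eth1 -netvv tcp port 8000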