OVN switch LB not reachable


What I'm seeing right now: LBs created from a svc can be accessed normally, but LBs created from an slr (SwitchLBRule) cannot. The slr feature has been live for more than a year and worked fine the whole time, but now it has stopped working.

kubectl ko nbctl list Load_Balancer


_uuid               : 48a25e56-2c0f-4cbd-8b31-d8eb3b34e191
external_ids        : {}
health_check        : []
ip_port_mappings    : {}
name                : cluster-tcp-loadbalancer
options             : {}
protocol            : tcp
selection_fields    : []
vips                : {"10.233.0.100:80"="10.16.0.10:8000,10.16.0.11:8000", "10.233.0.1:443"="172.20.214.30:6443", "10.233.0.3:53"="10.16.0.4:53,10.16.0.5:53", "10.233.0.3:9153"="10.16.0.4:9153,10.16.0.5:9153", "10.233.17.99:10661"="172.20.214.30:10661", "10.233.43.231:6643"="172.20.214.30:6643", "10.233.52.51:10660"="172.20.214.30:10660", "10.233.53.147:80"="10.16.0.10:8000,10.16.0.11:8000", "10.233.56.93:6641"="172.20.214.30:6641", "10.233.60.191:10665"="172.20.214.30:10665", "10.233.63.181:8080"="10.16.0.6:8080", "10.233.63.185:6642"="172.20.214.30:6642"}


# Details of the TCP LB for the default VPC ovn-default

# Note these two vip entries:
"10.233.0.100:80"="10.16.0.10:8000,10.16.0.11:8000",
"10.233.53.147:80"="10.16.0.10:8000,10.16.0.11:8000",

10.233.0.100 is the VIP I created via the slr
10.233.53.147 is the ClusterIP created via the svc



# k get slr netshoot -o yaml
apiVersion: kubeovn.io/v1
kind: SwitchLBRule
metadata:
  name: netshoot
spec:
  namespace: default
  ports:
  - name: python
    port: 80
    protocol: TCP
    targetPort: 8000
  selector:
  - app:netshoot
  sessionAffinity: None
  vip: 10.233.0.100
status:
  ports: 80/TCP
  service: default/slr-netshoot
  
# the slr maintains its endpoints through a headless svc
# k get svc -n default       slr-netshoot -o yaml
apiVersion: v1
kind: Service
metadata:
  annotations:
    ovn.kubernetes.io/switch_lb_vip: 10.233.0.100
    ovn.kubernetes.io/vpc: ovn-cluster
  name: slr-netshoot
  namespace: default
spec:
  clusterIP: None
  clusterIPs:
  - None
  internalTrafficPolicy: Cluster
  ipFamilies:
  - IPv4
  ipFamilyPolicy: SingleStack
  ports:
  - name: python
    port: 80
    protocol: TCP
    targetPort: 8000
  selector:
    app: netshoot
  sessionAffinity: None
  type: ClusterIP
status:
  loadBalancer: {}
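

Since the slr maintains its backends through this headless service, the endpoints it produces can be sanity-checked directly; the addresses should match the 10.16.0.10:8000 and 10.16.0.11:8000 backends seen in the LB vips above (a quick check, nothing Kube-OVN specific):

kubectl -n default get endpoints slr-netshoot -o wide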







# Creating a ClusterIP Service directly also triggers creation of an LB entry

#  k get svc -n default       netshoot -o yaml
apiVersion: v1
kind: Service
metadata:
  name: netshoot
  namespace: default
spec:
  clusterIP: 10.233.53.147
  clusterIPs:
  - 10.233.53.147
  internalTrafficPolicy: Cluster
  ipFamilies:
  - IPv4
  ipFamilyPolicy: SingleStack
  ports:
  - name: python
    port: 80
    protocol: TCP
    targetPort: 8000
  selector:
    app: netshoot
  sessionAffinity: None
  type: ClusterIP
status:
  loadBalancer: {}



# k exec -it -n default       lb-client -- bash
lb-client:~# curl 10.233.53.147
netshoot-1
lb-client:~# curl 10.233.53.147
netshoot-0
lb-client:~# curl 10.233.0.100





^C
lb-client:~# curl 10.233.0.100
^C
lb-client:~# tracepath -n 10.233.53.147
 1?: [LOCALHOST]                      pmtu 1400
 1:  10.16.0.1                                             0.880ms asymm  2
 1:  10.16.0.1                                             0.476ms asymm  2
 2:  10.233.53.147                                         0.528ms reached
     Resume: pmtu 1400 hops 2 back 2
lb-client:~# tracepath -n 10.233.0.100
 1?: [LOCALHOST]                      pmtu 1400
 1:  10.16.0.1                                             1.535ms asymm  2
 1:  10.16.0.1                                             0.507ms asymm  2
 2:  172.20.214.30                                         0.310ms
 3:  172.20.215.254                                        4.103ms
 4:  172.20.254.13                                         3.648ms
 5:  no reply
^C
lb-client:~#

From the tracepath results: for the svc VIP, the address is resolved (load-balanced) right after the gateway and the request reaches a backend; for the slr-created VIP no resolution ever happens, so the packet escapes through the node onto the physical network and eventually gets no reply.
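
To see where the LB DNAT should have kicked in, the packet path can also be simulated with the kubectl-ko trace helper. A minimal sketch, assuming the plugin's usual trace {namespace/pod} {dst-ip} {protocol} {port} form (argument order may vary slightly across Kube-OVN versions):

kubectl ko trace default/lb-client 10.233.53.147 tcp 80
kubectl ko trace default/lb-client 10.233.0.100 tcp 80
# in the working case the trace should show a ct_lb action rewriting the destination
# to one of the backends; in the broken case that action is expected to be missing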

2. Testing a svc in a custom VPC: it is still unreachable

Summary of the services so far:


root@empty:~/test/kube-ovn/lb-to-subnet/vip-cust-vpc/01-pod# k get svc -A -o wide | grep netshoot
default       netshoot              ClusterIP   10.233.53.147   <none>        80/TCP                   40m    app=netshoot
default       slr-netshoot          ClusterIP   None            <none>        80/TCP                   23m    app=netshoot
ns1           netshoot              ClusterIP   10.233.59.75    <none>        80/TCP                   107s   app=netshoot

Of these three services, only the netshoot svc in the default namespace can be reached. The netshoot svc in ns1 under the custom VPC uses the same manifest with only the namespace changed, yet it cannot be accessed.


root@empty:~/test/kube-ovn/lb-to-subnet/vip-cust-vpc/01-pod# cat 01-lb-client.yaml
---
apiVersion: v1
kind: Pod
metadata:
  name: lb-client
  namespace: ns1
spec:
  containers:
    - name: lb-client
      image: docker.io/nicolaka/netshoot:latest
      imagePullPolicy: Never
      command:
        - sh
        - -c
        - "sleep infinity"
      securityContext:
        capabilities:
          add:
            - NET_ADMIN
root@empty:~/test/kube-ovn/lb-to-subnet/vip-cust-vpc/01-pod# cat 02-sts.yaml
apiVersion: v1
kind: Service
metadata:
  name: netshoot
  namespace: ns1
spec:
  internalTrafficPolicy: Cluster
  ipFamilies:
  - IPv4
  ipFamilyPolicy: SingleStack
  ports:
  - name: python
    port: 80
    protocol: TCP
    targetPort: 8000
  selector:
    app: netshoot
  sessionAffinity: None
  type: ClusterIP
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
  labels:
    app: netshoot
  name: netshoot
  namespace: ns1
spec:
  replicas: 2
  selector:
    matchLabels:
      app: netshoot
  serviceName: netshoot
  template:
    metadata:
      labels:
        app: netshoot
    spec:
      containers:
      - name: netshoot
        image: nicolaka/netshoot
        imagePullPolicy: Never
        ports:
        - name: python
          containerPort: 8000
        command:
        - bash
        - -c
        - echo `hostname` > index.html; python -m http.server
        

With the manifests above, simply moving the namespace back to the default network makes the svc reachable.

3. Manual testing


# Quickly create an LB by hand

kubectl ko nbctl create load_balancer name=test-vpc1-subnet1-tcp-lb0 protocol=tcp
kubectl ko nbctl ls-lb-add vpc1-subnet1 test-vpc1-subnet1-tcp-lb0
kubectl ko nbctl lb-add test-vpc1-subnet1-tcp-lb0 10.106.51.35:80 10.1.0.3:8000,10.1.0.4:8000 tcp
kubectl ko nbctl list load_balancer test-vpc1-subnet1-tcp-lb0



# Test from inside the pod

root@empty:~/test/kube-ovn/lb-to-subnet/vip-cust-vpc/01-pod# k exec -it -n ns1       lb-client -- bash
lb-client:~# curl 10.106.51.35:80
netshoot-0
lb-client:~# curl 10.106.51.35:80
netshoot-1
lb-client:~# curl 10.106.51.35:80
netshoot-1
lb-client:~# curl 10.106.51.35:80
netshoot-1
lb-client:~# ip a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
    inet 127.0.0.1/8 scope host lo
       valid_lft forever preferred_lft forever
27: eth0@if28: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1400 qdisc noqueue state UP group default
    link/ether 00:00:00:ca:d8:ba brd ff:ff:ff:ff:ff:ff link-netnsid 0
    inet 10.1.0.2/24 brd 10.1.0.255 scope global eth0
       valid_lft forever preferred_lft forever
lb-client:~#

4. Next, test the health checks


After I shut down one of the pods, the health check consistently steered traffic to the remaining healthy backend, so that part works as expected.
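
For reference, the health check on the manually created LB was attached with ovn-nbctl along these lines (a sketch only: the ip_port_mappings values come from the LB record shown in the next section, while the interval/timeout/failure_count options and the exact quoting of the ':' characters through the kubectl ko wrapper are assumptions):

# create a health check for the VIP and attach it to the manual LB
kubectl ko nbctl -- --id=@hc create Load_Balancer_Health_Check vip='"10.106.51.35:80"' \
    options:interval=5 options:timeout=3 options:failure_count=3 options:success_count=2 \
    -- add Load_Balancer test-vpc1-subnet1-tcp-lb0 health_check @hc
# map each backend IP to its logical switch port and the source IP used for the probes
kubectl ko nbctl set Load_Balancer test-vpc1-subnet1-tcp-lb0 \
    ip_port_mappings:10.1.0.3='"netshoot-0.ns1:10.1.0.254"' \
    ip_port_mappings:10.1.0.4='"netshoot-1.ns1:10.1.0.254"'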

5. Compare the svc LB in the custom VPC (unreachable) with the manually created LB (reachable)

_uuid               : efb2c91e-3c90-4c82-997d-296f491e7335
external_ids        : {}
health_check        : []
ip_port_mappings    : {}
name                : vpc-vpc1-tcp-load
options             : {}
protocol            : tcp
selection_fields    : []
vips                : {"10.233.59.75:80"="10.1.0.3:8000"}


_uuid               : 5f9ed1d5-873e-4343-92b8-2ae39e9a5ef5
external_ids        : {}
health_check        : [0d1d8980-e067-4870-a418-c46b0f35265c]
ip_port_mappings    : {"10.1.0.3"="netshoot-0.ns1:10.1.0.254", "10.1.0.4"="netshoot-1.ns1:10.1.0.254"}
name                : xxx-vpc1-subnet1-tcp-lb0
options             : {}
protocol            : tcp
selection_fields    : []
vips                : {"10.106.51.35:80"="10.1.0.3:8000,10.1.0.4:8000"}

My impression is that, before the health check was added, there was no difference at all. Since the manual LB already has a health check attached, I recreated the resources to compare again; below is the comparison from before any health check was added:

root@empty:~/test/kube-ovn/lb-to-subnet/vip-cust-vpc# kubectl ko nbctl ls-lb-list vpc1-subnet1
UUID                                    LB                  PROTO      VIP                 IPs
0afa900d-741e-4c9d-9333-9c65a6976037    vpc-vpc1-tcp-loa    tcp        10.233.47.183:80    10.1.0.4:8000,10.1.0.7:8000
b2689101-f403-4de1-96cf-51a18e53683a    xxx-vpc1-subnet1    tcp        10.106.51.35:80     10.1.0.4:8000,10.1.0.7:8000

root@empty:~/test/kube-ovn/lb-to-subnet/vip-cust-vpc# kubectl ko nbctl list Load_Balancer  | grep -B 8 10.233.47.183
_uuid               : 0afa900d-741e-4c9d-9333-9c65a6976037
external_ids        : {}
health_check        : []
ip_port_mappings    : {}
name                : vpc-vpc1-tcp-load
options             : {}
protocol            : tcp
selection_fields    : []
vips                : {"10.233.47.183:80"="10.1.0.4:8000,10.1.0.7:8000"}
root@empty:~/test/kube-ovn/lb-to-subnet/vip-cust-vpc#

root@empty:~/test/kube-ovn/lb-to-subnet/vip-cust-vpc# kubectl ko nbctl list Load_Balancer  | grep -B 8 10.106.51.35
_uuid               : b2689101-f403-4de1-96cf-51a18e53683a
external_ids        : {}
health_check        : []
ip_port_mappings    : {}
name                : xxx-vpc1-subnet1-tcp-lb0
options             : {}
protocol            : tcp
selection_fields    : []
vips                : {"10.106.51.35:80"="10.1.0.4:8000,10.1.0.7:8000"}

root@empty:~/test/kube-ovn/lb-to-subnet/vip-cust-vpc# k exec -it -n ns1       lb-client -- bash
lb-client:~#
lb-client:~# curl 10.106.51.35
netshoot-1
lb-client:~# curl 10.106.51.35
netshoot-0
lb-client:~# curl 10.233.47.183
^C
lb-client:~# curl 10.233.47.183
^C
lb-client:~#
# As you can see, the LB records themselves show no difference at all, yet one of them is still unreachable

For reference, the Load_Balancer table definition in the NB schema:


# /g/m/ovn/ovn-nb.ovsschema
...

        "Load_Balancer": {
            "columns": {
                "name": {"type": "string"},
                "vips": {
                    "type": {"key": "string", "value": "string",
                             "min": 0, "max": "unlimited"}},
                "protocol": {
                    "type": {"key": {"type": "string",
                             "enum": ["set", ["tcp", "udp", "sctp"]]},
                             "min": 0, "max": 1}},
                "health_check": {"type": {
                    "key": {"type": "uuid",
                            "refTable": "Load_Balancer_Health_Check",
                            "refType": "strong"},
                    "min": 0,
                    "max": "unlimited"}},
                "ip_port_mappings": {
                    "type": {"key": "string", "value": "string",
                             "min": 0, "max": "unlimited"}},
                "selection_fields": {
                    "type": {"key": {"type": "string",
                             "enum": ["set",
                                ["eth_src", "eth_dst", "ip_src", "ip_dst",
                                 "tp_src", "tp_dst"]]},
                             "min": 0, "max": "unlimited"}},
                "options": {
                     "type": {"key": "string",
                              "value": "string",
                              "min": 0,
                              "max": "unlimited"}},
                "external_ids": {
                    "type": {"key": "string", "value": "string",
                             "min": 0, "max": "unlimited"}}},
            "isRoot": true},
...

Now look at the southbound database:
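
The records below were pulled from the SB DB; with the kubectl-ko plugin that is roughly the following (assuming the plugin's sbctl passthrough, analogous to the nbctl one used above):

kubectl ko sbctl list Load_Balancer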




_uuid               : d7b7bb4c-96ca-4b54-a12f-06862285330e
datapath_group      : b0340dbc-65fd-4b44-a7d7-8f59d57683c1
datapaths           : []
external_ids        : {lb_id="0afa900d-741e-4c9d-9333-9c65a6976037"}
name                : vpc-vpc1-tcp-load
options             : {hairpin_orig_tuple="true"}
protocol            : tcp
vips                : {"10.233.47.183:80"="10.1.0.4:8000,10.1.0.7:8000"}


_uuid               : 7a3a68af-a79b-427f-980c-15ad0adc07df
datapath_group      : b0340dbc-65fd-4b44-a7d7-8f59d57683c1
datapaths           : []
external_ids        : {lb_id="b2689101-f403-4de1-96cf-51a18e53683a"}
name                : xxx-vpc1-subnet1-tcp-lb0
options             : {hairpin_orig_tuple="true"}
protocol            : tcp
vips                : {"10.106.51.35:80"="10.1.0.4:8000,10.1.0.7:8000"}


6. Keep narrowing down: rule out any inherent difference between the default VPC's LB, the slr-created LB, and the manually created LB


For now my conclusion is that, compared with the manually created LB, the LB in the default VPC has no special settings; after adding an LB rule by hand, access from inside the pod works fine (see the sketch below).
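
A sketch of the kind of check behind the screenshots that used to sit here, i.e. manually adding an extra vip rule to the default VPC LB and hitting it from a pod (the VIP 10.233.0.200 is invented for illustration; the backends are the netshoot pod IPs already present in cluster-tcp-loadbalancer):

kubectl ko nbctl lb-add cluster-tcp-loadbalancer 10.233.0.200:80 10.16.0.10:8000,10.16.0.11:8000 tcp
kubectl exec -it -n default lb-client -- curl 10.233.0.200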


Confirmed in the code that the svc handling for default VPC subnets and custom VPC subnets goes through the same code path.

The flow tables also look normal.
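
The check was essentially dumping the logical flows of the subnet's datapath and looking for the LB stages, roughly like this (a sketch; the grep pattern and stage names are the usual OVN ones, not taken from the original screenshot):

kubectl ko sbctl lflow-list vpc1-subnet1 | grep -iE 'pre_lb|ls_in_lb|10.233.47.183|10.106.51.35'
# both VIPs are expected to show up with ct_lb/ct_lb_mark backend actions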
