What we are seeing: the LB created from a Service (svc) is reachable, but the LB created from an SLR (SwitchLBRule) is not. The SLR feature has been in use for over a year and worked fine the whole time, but it has now stopped working.
kubectl ko nbctl list Load_Balancer
_uuid : 48a25e56-2c0f-4cbd-8b31-d8eb3b34e191
external_ids : {}
health_check : []
ip_port_mappings : {}
name : cluster-tcp-loadbalancer
options : {}
protocol : tcp
selection_fields : []
vips : {"10.233.0.100:80"="10.16.0.10:8000,10.16.0.11:8000", "10.233.0.1:443"="172.20.214.30:6443", "10.233.0.3:53"="10.16.0.4:53,10.16.0.5:53", "10.233.0.3:9153"="10.16.0.4:9153,10.16.0.5:9153", "10.233.17.99:10661"="172.20.214.30:10661", "10.233.43.231:6643"="172.20.214.30:6643", "10.233.52.51:10660"="172.20.214.30:10660", "10.233.53.147:80"="10.16.0.10:8000,10.16.0.11:8000", "10.233.56.93:6641"="172.20.214.30:6641", "10.233.60.191:10665"="172.20.214.30:10665", "10.233.63.181:8080"="10.16.0.6:8080", "10.233.63.185:6642"="172.20.214.30:6642"}
# Details of the TCP LB of the default VPC (ovn-default)
# Note these two VIP entries:
"10.233.0.100:80"="10.16.0.10:8000,10.16.0.11:8000",
"10.233.53.147:80"="10.16.0.10:8000,10.16.0.11:8000",
# 10.233.0.100 was created via the SLR
# 10.233.53.147 was created via the Service
# k get slr netshoot -o yaml
apiVersion: kubeovn.io/v1
kind: SwitchLBRule
metadata:
name: netshoot
spec:
namespace: default
ports:
- name: python
port: 80
protocol: TCP
targetPort: 8000
selector:
- app:netshoot
sessionAffinity: None
vip: 10.233.0.100
status:
ports: 80/TCP
service: default/slr-netshoot
# The SLR maintains its endpoints through a headless Service
# k get svc -n default slr-netshoot -o yaml
apiVersion: v1
kind: Service
metadata:
annotations:
ovn.kubernetes.io/switch_lb_vip: 10.233.0.100
ovn.kubernetes.io/vpc: ovn-cluster
name: slr-netshoot
namespace: default
spec:
clusterIP: None
clusterIPs:
- None
internalTrafficPolicy: Cluster
ipFamilies:
- IPv4
ipFamilyPolicy: SingleStack
ports:
- name: python
port: 80
protocol: TCP
targetPort: 8000
selector:
app: netshoot
sessionAffinity: None
type: ClusterIP
status:
loadBalancer: {}
# Creating a plain ClusterIP Service also triggers creation of an LB entry
# k get svc -n default netshoot -o yaml
apiVersion: v1
kind: Service
metadata:
name: netshoot
namespace: default
spec:
clusterIP: 10.233.53.147
clusterIPs:
- 10.233.53.147
internalTrafficPolicy: Cluster
ipFamilies:
- IPv4
ipFamilyPolicy: SingleStack
ports:
- name: python
port: 80
protocol: TCP
targetPort: 8000
selector:
app: netshoot
sessionAffinity: None
type: ClusterIP
status:
loadBalancer: {}
# k exec -it -n default lb-client -- bash
lb-client:~# curl 10.233.53.147
netshoot-1
lb-client:~# curl 10.233.53.147
netshoot-0
lb-client:~# curl 10.233.0.100
^C
lb-client:~# curl 10.233.0.100
^C
lb-client:~# tracepath -n 10.233.53.147
1?: [LOCALHOST] pmtu 1400
1: 10.16.0.1 0.880ms asymm 2
1: 10.16.0.1 0.476ms asymm 2
2: 10.233.53.147 0.528ms reached
Resume: pmtu 1400 hops 2 back 2
lb-client:~# tracepath -n 10.233.0.100
1?: [LOCALHOST] pmtu 1400
1: 10.16.0.1 1.535ms asymm 2
1: 10.16.0.1 0.507ms asymm 2
2: 172.20.214.30 0.310ms
3: 172.20.215.254 4.103ms
4: 172.20.254.13 3.648ms
5: no reply
^C
lb-client:~#
From the tracepath output, in the working case the VIP is resolved (DNATed to a backend) as soon as the packet reaches the gateway, whereas the VIP of the SLR-created LB never triggers this resolution and the packet simply keeps getting routed onward.
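At this point it is worth checking which logical switches and routers cluster-tcp-loadbalancer is attached to, since a VIP is only resolved on datapaths its LB is bound to. A minimal check, assuming Kube-OVN's default names ovn-default (logical switch) and ovn-cluster (logical router):
# list the LBs attached to the default subnet's logical switch and to the default VPC router
kubectl ko nbctl ls-lb-list ovn-default
kubectl ko nbctl lr-lb-list ovn-cluster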
2. Testing a Service in a custom VPC: it is also unreachable
Summary of the Services under test:
root@empty:~/test/kube-ovn/lb-to-subnet/vip-cust-vpc/01-pod# k get svc -A -o wide | grep netshoot
default netshoot ClusterIP 10.233.53.147 <none> 80/TCP 40m app=netshoot
default slr-netshoot ClusterIP None <none> 80/TCP 23m app=netshoot
ns1 netshoot ClusterIP 10.233.59.75 <none> 80/TCP 107s app=netshoot
Of these three Services, only netshoot in the default namespace is reachable. The netshoot Service in ns1 (on the custom VPC) differs only in its namespace, yet it cannot be reached.
root@empty:~/test/kube-ovn/lb-to-subnet/vip-cust-vpc/01-pod# cat 01-lb-client.yaml
---
apiVersion: v1
kind: Pod
metadata:
name: lb-client
namespace: ns1
spec:
containers:
- name: lb-client
image: docker.io/nicolaka/netshoot:latest
imagePullPolicy: Never
command:
- sh
- -c
- "sleep infinity"
securityContext:
capabilities:
add:
- NET_ADMIN
root@empty:~/test/kube-ovn/lb-to-subnet/vip-cust-vpc/01-pod# cat 02-sts.yaml
apiVersion: v1
kind: Service
metadata:
name: netshoot
namespace: ns1
spec:
internalTrafficPolicy: Cluster
ipFamilies:
- IPv4
ipFamilyPolicy: SingleStack
ports:
- name: python
port: 80
protocol: TCP
targetPort: 8000
selector:
app: netshoot
sessionAffinity: None
type: ClusterIP
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
labels:
app: netshoot
name: netshoot
namespace: ns1
spec:
replicas: 2
selector:
matchLabels:
app: netshoot
serviceName: netshoot
template:
metadata:
labels:
app: netshoot
spec:
containers:
- name: netshoot
image: nicolaka/netshoot
imagePullPolicy: Never
ports:
- name: python
containerPort: 8000
command:
- bash
- -c
- echo `hostname` > index.html; python -m http.server
With the manifests above, simply moving the namespace back to the default network makes the Service reachable again.
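It is also worth confirming that ns1 really is bound to the custom VPC subnet, so that the network is the only variable. A minimal check, assuming the subnet is the vpc1-subnet1 used in the manual test of step 3 (the bound namespaces should show up under spec.namespaces of the Subnet CR):
# confirm ns1 is listed in the subnet's bound namespaces
kubectl get subnet vpc1-subnet1 -o yaml
# confirm the pods in ns1 actually got addresses from that subnet (10.1.0.0/24 here)
kubectl get pod -n ns1 -o wide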
3. Manual test
# quickly create an LB by hand
kubectl ko nbctl create load_balancer name=test-vpc1-subnet1-tcp-lb0 protocol=tcp
kubectl ko nbctl ls-lb-add vpc1-subnet1 test-vpc1-subnet1-tcp-lb0
kubectl ko nbctl lb-add test-vpc1-subnet1-tcp-lb0 10.106.51.35:80 10.1.0.3:8000,10.1.0.4:8000 tcp
kubectl ko nbctl list load_balancer test-vpc1-subnet1-tcp-lb0
# test from inside the pod
root@empty:~/test/kube-ovn/lb-to-subnet/vip-cust-vpc/01-pod# k exec -it -n ns1 lb-client -- bash
lb-client:~# curl 10.106.51.35:80
netshoot-0
lb-client:~# curl 10.106.51.35:80
netshoot-1
lb-client:~# curl 10.106.51.35:80
netshoot-1
lb-client:~# curl 10.106.51.35:80
netshoot-1
lb-client:~# ip a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 127.0.0.1/8 scope host lo
valid_lft forever preferred_lft forever
27: eth0@if28: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1400 qdisc noqueue state UP group default
link/ether 00:00:00:ca:d8:ba brd ff:ff:ff:ff:ff:ff link-netnsid 0
inet 10.1.0.2/24 brd 10.1.0.255 scope global eth0
valid_lft forever preferred_lft forever
lb-client:~#
4. Further test: health checks
After I stopped one of the pods, the health check reliably steered traffic to the remaining healthy backend, so this test passed.
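For reference, the health check on the manually created LB (test-vpc1-subnet1-tcp-lb0 from step 3; it shows up as xxx-vpc1-subnet1-tcp-lb0 in the step 5 output) was configured roughly as follows. This is only a sketch: the VIP, the backend logical port names (netshoot-0.ns1, netshoot-1.ns1) and the monitor source address 10.1.0.254 are taken from the ip_port_mappings shown in the next step, and standard ovn-nbctl syntax is assumed.
# map each backend IP to its logical switch port and a source address for the health monitor
kubectl ko nbctl set load_balancer test-vpc1-subnet1-tcp-lb0 ip_port_mappings:10.1.0.3=netshoot-0.ns1:10.1.0.254
kubectl ko nbctl set load_balancer test-vpc1-subnet1-tcp-lb0 ip_port_mappings:10.1.0.4=netshoot-1.ns1:10.1.0.254
# create a health check for the VIP and attach it to the LB
kubectl ko nbctl -- --id=@hc create load_balancer_health_check vip="10.106.51.35\:80" -- add load_balancer test-vpc1-subnet1-tcp-lb0 health_check @hc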
5. Comparing the svc LB in the custom VPC (unreachable) with the manually created LB (reachable)
_uuid : efb2c91e-3c90-4c82-997d-296f491e7335
external_ids : {}
health_check : []
ip_port_mappings : {}
name : vpc-vpc1-tcp-load
options : {}
protocol : tcp
selection_fields : []
vips : {"10.233.59.75:80"="10.1.0.3:8000"}
_uuid : 5f9ed1d5-873e-4343-92b8-2ae39e9a5ef5
external_ids : {}
health_check : [0d1d8980-e067-4870-a418-c46b0f35265c]
ip_port_mappings : {"10.1.0.3"="netshoot-0.ns1:10.1.0.254", "10.1.0.4"="netshoot-1.ns1:10.1.0.254"}
name : xxx-vpc1-subnet1-tcp-lb0
options : {}
protocol : tcp
selection_fields : []
vips : {"10.106.51.35:80"="10.1.0.3:8000,10.1.0.4:8000"}
My impression is that before the health check was added there was no difference at all. Since the manual LB now carries a health check, I rebuilt the resources to compare again; below is the comparison from before any health check was added:
root@empty:~/test/kube-ovn/lb-to-subnet/vip-cust-vpc# kubectl ko nbctl ls-lb-list vpc1-subnet1
UUID LB PROTO VIP IPs
0afa900d-741e-4c9d-9333-9c65a6976037 vpc-vpc1-tcp-loa tcp 10.233.47.183:80 10.1.0.4:8000,10.1.0.7:8000
b2689101-f403-4de1-96cf-51a18e53683a xxx-vpc1-subnet1 tcp 10.106.51.35:80 10.1.0.4:8000,10.1.0.7:8000
root@empty:~/test/kube-ovn/lb-to-subnet/vip-cust-vpc# kubectl ko nbctl list Load_Balancer | grep -B 8 10.233.47.183
_uuid : 0afa900d-741e-4c9d-9333-9c65a6976037
external_ids : {}
health_check : []
ip_port_mappings : {}
name : vpc-vpc1-tcp-load
options : {}
protocol : tcp
selection_fields : []
vips : {"10.233.47.183:80"="10.1.0.4:8000,10.1.0.7:8000"}
root@empty:~/test/kube-ovn/lb-to-subnet/vip-cust-vpc#
root@empty:~/test/kube-ovn/lb-to-subnet/vip-cust-vpc# kubectl ko nbctl list Load_Balancer | grep -B 8 10.106.51.35
_uuid : b2689101-f403-4de1-96cf-51a18e53683a
external_ids : {}
health_check : []
ip_port_mappings : {}
name : xxx-vpc1-subnet1-tcp-lb0
options : {}
protocol : tcp
selection_fields : []
vips : {"10.106.51.35:80"="10.1.0.4:8000,10.1.0.7:8000"}
root@empty:~/test/kube-ovn/lb-to-subnet/vip-cust-vpc# k exec -it -n ns1 lb-client -- bash
lb-client:~#
lb-client:~# curl 10.106.51.35
netshoot-1
lb-client:~# curl 10.106.51.35
netshoot-0
lb-client:~# curl 10.233.47.183
^C
lb-client:~# curl 10.233.47.183
^C
lb-client:~#
# As shown above, the LB configurations themselves show no difference at all, yet one VIP is still unreachable
# /g/m/ovn/ovn-nb.ovsschema
...
"Load_Balancer": {
"columns": {
"name": {"type": "string"},
"vips": {
"type": {"key": "string", "value": "string",
"min": 0, "max": "unlimited"}},
"protocol": {
"type": {"key": {"type": "string",
"enum": ["set", ["tcp", "udp", "sctp"]]},
"min": 0, "max": 1}},
"health_check": {"type": {
"key": {"type": "uuid",
"refTable": "Load_Balancer_Health_Check",
"refType": "strong"},
"min": 0,
"max": "unlimited"}},
"ip_port_mappings": {
"type": {"key": "string", "value": "string",
"min": 0, "max": "unlimited"}},
"selection_fields": {
"type": {"key": {"type": "string",
"enum": ["set",
["eth_src", "eth_dst", "ip_src", "ip_dst",
"tp_src", "tp_dst"]]},
"min": 0, "max": "unlimited"}},
"options": {
"type": {"key": "string",
"value": "string",
"min": 0,
"max": "unlimited"}},
"external_ids": {
"type": {"key": "string", "value": "string",
"min": 0, "max": "unlimited"}}},
"isRoot": true},
...
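With the schema above in mind, the columns that could conceivably differ between two LBs can be dumped side by side; a small sketch using the standard --columns option of the list command:
kubectl ko nbctl --columns=name,vips,protocol,options,selection_fields,external_ids list Load_Balancer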
Now look at the southbound database:
_uuid : d7b7bb4c-96ca-4b54-a12f-06862285330e
datapath_group : b0340dbc-65fd-4b44-a7d7-8f59d57683c1
datapaths : []
external_ids : {lb_id="0afa900d-741e-4c9d-9333-9c65a6976037"}
name : vpc-vpc1-tcp-load
options : {hairpin_orig_tuple="true"}
protocol : tcp
vips : {"10.233.47.183:80"="10.1.0.4:8000,10.1.0.7:8000"}
_uuid : 7a3a68af-a79b-427f-980c-15ad0adc07df
datapath_group : b0340dbc-65fd-4b44-a7d7-8f59d57683c1
datapaths : []
external_ids : {lb_id="b2689101-f403-4de1-96cf-51a18e53683a"}
name : xxx-vpc1-subnet1-tcp-lb0
options : {hairpin_orig_tuple="true"}
protocol : tcp
vips : {"10.106.51.35:80"="10.1.0.4:8000,10.1.0.7:8000"}
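Note that both SB load balancers reference the same datapath_group, so they should be instantiated on exactly the same datapaths. A small sketch to confirm this, using the group UUID from the output above:
kubectl ko sbctl list Logical_DP_Group b0340dbc-65fd-4b44-a7d7-8f59d57683c1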
6. Further narrowing down: rule out differences in the LBs themselves (default VPC LB vs. the SLR LB vs. the manually created LB)
So far the LB in the default VPC has no special settings compared with the manually created one; after adding an LB rule manually, access from inside the pod works without any problem.
Confirmed in the code that Services on default-VPC subnets and on custom-VPC subnets go through the same code path.
The flow tables also look normal.
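How that flow check can be reproduced, as a sketch: grep the logical flows of the subnet's datapath for the two VIPs, and trace a TCP packet from the client pod to the unreachable VIP with Kube-OVN's trace helper (datapath name, pod, VIPs and port are the ones used above):
# look for LB-related logical flows that mention each VIP on the vpc1-subnet1 datapath
kubectl ko sbctl lflow-list vpc1-subnet1 | grep 10.106.51.35
kubectl ko sbctl lflow-list vpc1-subnet1 | grep 10.233.47.183
# follow a TCP packet from the client pod to the unreachable VIP
kubectl ko trace ns1/lb-client 10.233.47.183 tcp 80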