Pod log retrieval fails after the machine's LAN IP changes

Symptom

root@server-01:~# kubectl logs -n kube-flannel kube-flannel-ds-g84gc -c kube-flannel -f
Error from server: Get "https://192.168.174.128:10250/containerLogs/kube-flannel/kube-flannel-ds-g84gc/kube-flannel?follow=true": proxyconnect tcp: dial tcp 192.168.43.196:1023: connect: connection refused
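
The 192.168.43.196:1023 in the error is a stale HTTP proxy address. One quick way to find where it is still configured (the grep below is just one possible check):

root@server-01:~# grep -rn '192.168.43.196' /etc/kubernetes/manifests/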

Explanation

When the control-plane VM was initialized, kubeadm wrote the http_proxy and https_proxy environment variables that were set in the shell at the time into the apiserver manifest, so the apiserver automatically sends its outbound requests through that proxy. When the machine's LAN IP later changes, the proxy address in the manifest still points at the old IP, and those requests fail with a network error. The baked-in env block is visible in the manifest:

root@server-01:/etc/kubernetes# cat /etc/kubernetes/manifests/kube-apiserver.yaml
apiVersion: v1
kind: Pod
metadata:
  annotations:
    kubeadm.kubernetes.io/kube-apiserver.advertise-address.endpoint: 192.168.174.128:6443
  labels:
    component: kube-apiserver
    tier: control-plane
  name: kube-apiserver
  namespace: kube-system
spec:
  containers:
  - command:
    - kube-apiserver
    - --advertise-address=192.168.174.128
    - --allow-privileged=true
    - --authorization-mode=Node,RBAC
    - --client-ca-file=/etc/kubernetes/pki/ca.crt
    - --enable-admission-plugins=NodeRestriction
    - --enable-bootstrap-token-auth=true
    - --etcd-cafile=/etc/kubernetes/pki/etcd/ca.crt
    - --etcd-certfile=/etc/kubernetes/pki/apiserver-etcd-client.crt
    - --etcd-keyfile=/etc/kubernetes/pki/apiserver-etcd-client.key
    - --etcd-servers=https://127.0.0.1:2379
    - --kubelet-client-certificate=/etc/kubernetes/pki/apiserver-kubelet-client.crt
    - --kubelet-client-key=/etc/kubernetes/pki/apiserver-kubelet-client.key
    - --kubelet-preferred-address-types=InternalIP,ExternalIP,Hostname
    - --proxy-client-cert-file=/etc/kubernetes/pki/front-proxy-client.crt
    - --proxy-client-key-file=/etc/kubernetes/pki/front-proxy-client.key
    - --requestheader-allowed-names=front-proxy-client
    - --requestheader-client-ca-file=/etc/kubernetes/pki/front-proxy-ca.crt
    - --requestheader-extra-headers-prefix=X-Remote-Extra-
    - --requestheader-group-headers=X-Remote-Group
    - --requestheader-username-headers=X-Remote-User
    - --secure-port=6443
    - --service-account-issuer=https://kubernetes.default.svc.cluster.local
    - --service-account-key-file=/etc/kubernetes/pki/sa.pub
    - --service-account-signing-key-file=/etc/kubernetes/pki/sa.key
    - --service-cluster-ip-range=10.96.0.0/12
    - --tls-cert-file=/etc/kubernetes/pki/apiserver.crt
    - --tls-private-key-file=/etc/kubernetes/pki/apiserver.key
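    # NOTE: the proxy variables below were captured from the shell environment at "kubeadm init" time and still point at the old proxy IP (192.168.43.196)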
    env:
    - name: http_proxy
      value: http://192.168.43.196:1023
    - name: https_proxy
      value: http://192.168.43.196:1023
    image: registry.k8s.io/kube-apiserver:v1.34.1
    imagePullPolicy: IfNotPresent
    livenessProbe:
      failureThreshold: 8
      httpGet:
        host: 192.168.174.128
        path: /livez
        port: probe-port
        scheme: HTTPS
      initialDelaySeconds: 10
      periodSeconds: 10
      timeoutSeconds: 15
    name: kube-apiserver
    ports:
    - containerPort: 6443
      name: probe-port
      protocol: TCP
    readinessProbe:
      failureThreshold: 3
      httpGet:
        host: 192.168.174.128
        path: /readyz
        port: probe-port
        scheme: HTTPS
      periodSeconds: 1
      timeoutSeconds: 15
    resources:
      requests:
        cpu: 250m
    startupProbe:
      failureThreshold: 24
      httpGet:
        host: 192.168.174.128
        path: /livez
        port: probe-port
        scheme: HTTPS
      initialDelaySeconds: 10
      periodSeconds: 10
      timeoutSeconds: 15
    volumeMounts:
    - mountPath: /etc/ssl/certs
      name: ca-certs
      readOnly: true
    - mountPath: /etc/ca-certificates
      name: etc-ca-certificates
      readOnly: true
    - mountPath: /etc/kubernetes/pki
      name: k8s-certs
      readOnly: true
    - mountPath: /usr/local/share/ca-certificates
      name: usr-local-share-ca-certificates
      readOnly: true
    - mountPath: /usr/share/ca-certificates
      name: usr-share-ca-certificates
      readOnly: true
  hostNetwork: true
  priority: 2000001000
  priorityClassName: system-node-critical
  securityContext:
    seccompProfile:
      type: RuntimeDefault
  volumes:
  - hostPath:
      path: /etc/ssl/certs
      type: DirectoryOrCreate
    name: ca-certs
  - hostPath:
      path: /etc/ca-certificates
      type: DirectoryOrCreate
    name: etc-ca-certificates
  - hostPath:
      path: /etc/kubernetes/pki
      type: DirectoryOrCreate
    name: k8s-certs
  - hostPath:
      path: /usr/local/share/ca-certificates
      type: DirectoryOrCreate
    name: usr-local-share-ca-certificates
  - hostPath:
      path: /usr/share/ca-certificates
      type: DirectoryOrCreate
    name: usr-share-ca-certificates
status: {}

Fix

The apiserver should not actually need a proxy here, so it is enough to remove the http_proxy and https_proxy environment variables from the configuration file. After /etc/kubernetes/manifests/kube-apiserver.yaml is modified, the kubelet detects the change to the static pod manifest and restarts the API server pod automatically; no manual restart is required.
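
A minimal sketch of the edit (the backup path is only an example; keep a copy before touching static pod manifests):

root@server-01:~# cp /etc/kubernetes/manifests/kube-apiserver.yaml /root/kube-apiserver.yaml.bak
root@server-01:~# vim /etc/kubernetes/manifests/kube-apiserver.yaml   # delete the env: block containing http_proxy/https_proxy
root@server-01:~# crictl ps | grep kube-apiserver                     # confirm kubelet has restarted the pod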

The same fix applies to the static pod manifests of the following components (a quick check follows the list):

  • kube-controller-manager.yaml
  • kube-scheduler.yaml
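
A quick check to find every static pod manifest that still carries the proxy variables (one possible approach):

root@server-01:~# grep -l 'http_proxy' /etc/kubernetes/manifests/*.yaml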

The kube-proxy pods also inherited the environment variables that were set on the server at kubeadm init time, so the stale proxy configuration is present in the kube-proxy pods as well.

  • Error messages
root@server-01:/etc/kubernetes# kubectl logs -n kube-system -l k8s-app=kube-proxy --tail=50
I1120 03:24:14.014466       1 server_linux.go:53] "Using iptables proxy"
I1120 03:24:14.152447       1 shared_informer.go:349] "Waiting for caches to sync" controller="node informer cache"
E1120 03:24:35.244357       1 reflector.go:205] "Failed to watch" err="failed to list *v1.Node: Get \"https://192.168.174.128:6443/api/v1/nodes?fieldSelector=metadata.name%3Dserver-02&limit=500&resourceVersion=0\": proxyconnect tcp: dial tcp 192.168.43.196:1023: connect: connection refused" logger="UnhandledError" reflector="k8s.io/client-go/informers/factory.go:160" type="*v1.Node"
E1120 03:24:57.721330       1 reflector.go:205] "Failed to watch" err="failed to list *v1.Node: Get \"https://192.168.174.128:6443/api/v1/nodes?fieldSelector=metadata.name%3Dserver-02&limit=500&resourceVersion=0\": proxyconnect tcp: dial tcp 192.168.43.196:1023: connect: connection refused" logger="UnhandledError" reflector="k8s.io/client-go/informers/factory.go:160" type="*v1.Node"
E1120 03:25:20.501926       1 reflector.go:205] "Failed to watch" err="failed to list *v1.Node: Get \"https://192.168.174.128:6443/api/v1/nodes?fieldSelector=metadata.name%3Dserver-02&limit=500&resourceVersion=0\": proxyconnect tcp: dial tcp 192.168.43.196:1023: connect: connection refused" logger="UnhandledError" reflector="k8s.io/client-go/informers/factory.go:160" type="*v1.Node"
  • Config change. The old configuration looked like this:
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: kube-proxy
  namespace: kube-system
  uid: a88cf688-e5bf-48ee-898a-6496454a1224
  resourceVersion: '18030'
  generation: 1
  creationTimestamp: '2025-10-30T08:45:43Z'
  labels:
    k8s-app: kube-proxy
  annotations:
    deprecated.daemonset.template.generation: '1'
  selfLink: /apis/apps/v1/namespaces/kube-system/daemonsets/kube-proxy
status:
  currentNumberScheduled: 4
  numberMisscheduled: 0
  desiredNumberScheduled: 4
  numberReady: 4
  observedGeneration: 1
  updatedNumberScheduled: 4
  numberAvailable: 4
spec:
  selector:
    matchLabels:
      k8s-app: kube-proxy
  template:
    metadata:
      labels:
        k8s-app: kube-proxy
    spec:
      volumes:
        - name: kube-proxy
          configMap:
            name: kube-proxy
            defaultMode: 420
        - name: xtables-lock
          hostPath:
            path: /run/xtables.lock
            type: FileOrCreate
        - name: lib-modules
          hostPath:
            path: /lib/modules
            type: ''
      containers:
        - name: kube-proxy
          image: registry.k8s.io/kube-proxy:v1.34.1
          command:
            - /usr/local/bin/kube-proxy
            - '--config=/var/lib/kube-proxy/config.conf'
            - '--hostname-override=$(NODE_NAME)'
          env:
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: spec.nodeName
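            # NOTE: stale proxy settings inherited from the "kubeadm init" environment; these two entries are what need to be deleted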
            - name: http_proxy
              value: http://192.168.43.196:1023
            - name: https_proxy
              value: http://192.168.43.196:1023
          resources: {}
          volumeMounts:
            - name: kube-proxy
              mountPath: /var/lib/kube-proxy
            - name: xtables-lock
              mountPath: /run/xtables.lock
            - name: lib-modules
              readOnly: true
              mountPath: /lib/modules
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
          imagePullPolicy: IfNotPresent
          securityContext:
            privileged: true
      restartPolicy: Always
      terminationGracePeriodSeconds: 30
      dnsPolicy: ClusterFirst
      nodeSelector:
        kubernetes.io/os: linux
      serviceAccountName: kube-proxy
      serviceAccount: kube-proxy
      hostNetwork: true
      securityContext: {}
      schedulerName: default-scheduler
      tolerations:
        - operator: Exists
      priorityClassName: system-node-critical
  updateStrategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 1
      maxSurge: 0
  revisionHistoryLimit: 10

Edit the DaemonSet and delete the old proxy env entries:

kubectl edit ds -n kube-system kube-proxy
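
If you prefer a non-interactive edit, a JSON patch works as well. This is a sketch that assumes the env order shown above (NODE_NAME at index 0, http_proxy at 1, https_proxy at 2); the higher index is removed first so the remaining index stays valid:

kubectl -n kube-system patch ds kube-proxy --type=json -p='[{"op":"remove","path":"/spec/template/spec/containers/0/env/2"},{"op":"remove","path":"/spec/template/spec/containers/0/env/1"}]'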

The edit triggers a rolling update of all kube-proxy pods, which takes a little while. Once kube-proxy is back to normal, kube-flannel also recovers after a restart.
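
To watch the rolling update finish (standard kubectl usage):

kubectl -n kube-system rollout status ds/kube-proxy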