etcd备份与崩溃恢复测试

235 阅读1分钟
  1. 参考命令:

在使用 API 3 时需要使用环境变量 ETCDCTL_API 明确指定。

在命令行设置:

# export ETCDCTL_API=3

备份数据:

# etcdctl --endpoints localhost:2379 snapshot save snapshot.db

恢复:

# etcdctl snapshot restore snapshot.db --name m3 --data-dir=/home/etcd_data

详细备份命令:

# Take a TLS-authenticated snapshot of the etcd member.
# FIX: the original multi-line command was missing the trailing
# backslashes, so every option line after the first was executed as a
# separate (failing) command. Also replaced backticks with $( ).
$ ETCDCTL_API=3 etcdctl \
  --cacert=/opt/kubernetes/ssl/ca.pem \
  --cert=/opt/kubernetes/ssl/server.pem \
  --key=/opt/kubernetes/ssl/server-key.pem \
  --endpoints=https://192.168.1.36:2379 \
  snapshot save /data/etcd_backup_dir/etcd-snapshot-$(date +%Y%m%d).db
  1. 创建pv:(使用存储类创建pvc可省略)
# PersistentVolume backing the etcd backup PVC (can be omitted when a
# dynamic-provisioning StorageClass is used).
# FIX: PersistentVolumes are cluster-scoped; a metadata.namespace field is
# invalid/ignored by the API server, so it has been removed.
apiVersion: v1
kind: PersistentVolume
metadata:
  name: etcd-bak
  labels:
    app: etcd-bak
spec:
  capacity:
    storage: 50Gi
  volumeMode: Filesystem
  accessModes: ["ReadWriteMany"]
  persistentVolumeReclaimPolicy: Retain
  storageClassName: nfs  # referenced by the PVC — must match the class the PVC requests
  1. 创建pvc和cronjob:
# PersistentVolumeClaim used by the backup CronJob to store etcd snapshots.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: bak-etcd
  namespace: ops
  annotations:
    # Legacy (beta) way to request a storage class; spec.storageClassName is
    # the modern field. NOTE(review): "nfs-storage" does not match the sample
    # PV's storageClassName "nfs" — confirm which class actually exists in
    # the cluster, otherwise the claim will not bind.
    volume.beta.kubernetes.io/storage-class: "nfs-storage"
spec:
  accessModes:
  - ReadWriteMany
  resources:
    requests:
      storage: 10Gi
---
# Daily etcd backup CronJob: snapshots etcd at 23:00 and prunes backups
# older than 30 days from the backup volume.
# FIXES vs. original: missing space after "snapshot save"; duplicated
# --endpoints flag; find pruned the wrong path (/snapshot/data instead of
# the actual mount point /snapshot); unquoted find glob; missing "\;"
# terminator on find -exec. Indentation normalized to 2 spaces.
# apiVersion: batch/v2alpha1   # for clusters older than v1.21
apiVersion: batch/v1
kind: CronJob
metadata:
  name: etcd-bak-recovery
  namespace: ops
spec:
  schedule: "0 23 * * *"
  # schedule: "*/5 * * * *"    # every 5 minutes — for testing
  jobTemplate:
    spec:
      template:
        metadata:
          labels:
            app: etcd-bak-recovery
        spec:
          # Pin the job to the master node where etcd and its certs live.
          affinity:
            nodeAffinity:
              requiredDuringSchedulingIgnoredDuringExecution:
                nodeSelectorTerms:
                - matchExpressions:
                  - key: kubernetes.io/hostname
                    operator: In
                    values:
                    - k8s-master01
          containers:
          - name: etcd
            image: quay.io/coreos/etcd:v3.4.13
            command:
            - sh
            - -c
            - "export ETCDCTL_API=3; \
               etcdctl \
               --cacert=/etc/kubernetes/pki/etcd/ca.crt \
               --cert=/etc/kubernetes/pki/etcd/server.crt \
               --key=/etc/kubernetes/pki/etcd/server.key \
               --endpoints=$ENDPOINT snapshot save /snapshot/$(date +%Y%m%d_%H%M%S)_snapshot.db; \
               find /snapshot -name '*snapshot.db' -type f -mtime +30 -exec rm -f {} \\;"
            env:
            - name: ENDPOINT
              value: "127.0.0.1:2379"
            volumeMounts:
            - mountPath: "/snapshot"
              name: snapshot
              subPath: data            # PVC's "data" dir appears at /snapshot in the container
            - mountPath: /etc/localtime
              name: lt-config
            - mountPath: /etc/kubernetes/pki/etcd
              name: pki
          restartPolicy: OnFailure
          volumes:
          - name: snapshot
            persistentVolumeClaim:
              claimName: bak-etcd
          - name: lt-config
            hostPath:
              path: /etc/localtime
          - name: pki
            hostPath:
              path: /etc/kubernetes/pki/etcd
          hostNetwork: true
          # Allow scheduling onto the (tainted) master node.
          tolerations:
          - key: "node-role.kubernetes.io/master"
            operator: "Exists"
            effect: "NoSchedule"

崩溃测试:

删除所有namespace:

[root@bogon etcd-snapshot]# kubectl delete namespace --all
namespace "busybox" deleted
namespace "kube-node-lease" deleted
namespace "nfs-storage" deleted
namespace "ops" deleted
Error from server (Forbidden): namespaces "default" is forbidden: this namespace may not be deleted
Error from server (Forbidden): namespaces "kube-public" is forbidden: this namespace may not be deleted
Error from server (Forbidden): namespaces "kube-system" is forbidden: this namespace may not be deleted

(原文此处为截图 image.png:删除命名空间后的集群状态)

恢复测试:

1. 备份并关闭etcd与apiserver
    mv /etc/kubernetes/manifests/etcd.yaml /etc/kubernetes/manifests/kube-apiserver.yaml /home

2. 查询服务状态:
    docker ps -a | grep -E ".*(etcd|kube-api).*kube-system.*"
如未停止,可手动docker stop

3. 重启kubelet:
    systemctl restart kubelet

4. 备份空的etcd目录:
    mv /var/lib/etcd /home

5. 用备份文件恢复etcd:(变量值可从etcd.yaml中查询)
# Rebuild the etcd data directory from the snapshot. The values of --name,
# --initial-cluster, --initial-cluster-token and
# --initial-advertise-peer-urls must match the flags in etcd.yaml.
ETCDCTL_API=3 ./etcdctl snapshot restore /home/20230921_150000_snapshot.db \
--name  etcd-master --initial-cluster etcd-master=http://127.0.0.1:2380 \
--initial-cluster-token etcd-cluster  \
--initial-advertise-peer-urls  http://127.0.0.1:2380 \
--data-dir /var/lib/etcd

6. 恢复manifest文件,重新拉起etcd与apiserver服务:
mv /home/etcd.yaml /etc/kubernetes/manifests/etcd.yaml
mv /home/kube-apiserver.yaml /etc/kubernetes/manifests/kube-apiserver.yaml

7. 执行2查看服务恢复情况