- 参考命令:
在使用 API 3 时需要使用环境变量 ETCDCTL_API 明确指定。
在命令行设置:
# export ETCDCTL_API=3
备份数据:
# etcdctl --endpoints localhost:2379 snapshot save snapshot.db
恢复:
# etcdctl snapshot restore snapshot.db --name m3 --data-dir=/home/etcd_data
详细备份命令:
# Save a date-stamped snapshot from a TLS-secured etcd endpoint.
# The trailing backslashes join all lines into ONE etcdctl invocation.
$ ETCDCTL_API=3 etcdctl \
    --cacert=/opt/kubernetes/ssl/ca.pem \
    --cert=/opt/kubernetes/ssl/server.pem \
    --key=/opt/kubernetes/ssl/server-key.pem \
    --endpoints=https://192.168.1.36:2379 \
    snapshot save "/data/etcd_backup_dir/etcd-snapshot-$(date +%Y%m%d).db"
- 创建pv:(使用存储类创建pvc可省略)
# PersistentVolume intended to back the etcd snapshot claim (not needed when
# the PVC is provisioned dynamically through a StorageClass).
# NOTE(review): no volume source (nfs:, hostPath:, csi:, ...) is shown in this
# snippet; a PersistentVolume needs one under spec before it can be applied.
apiVersion: v1
kind: PersistentVolume
metadata:
  # PersistentVolumes are cluster-scoped, so no namespace is set here.
  name: etcd-bak
  labels:
    app: etcd-bak
spec:
  capacity:
    storage: 50Gi
  volumeMode: Filesystem
  accessModes: ["ReadWriteMany"]
  # Retain keeps the backup data when the bound claim is deleted.
  persistentVolumeReclaimPolicy: Retain
  storageClassName: nfs  # class name a PVC must request to bind to this PV
- 创建pvc和cronjob:
# Claim that provides the storage for etcd snapshots in the ops namespace.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: bak-etcd
  namespace: ops
spec:
  # Set via the spec field instead of the deprecated
  # volume.beta.kubernetes.io/storage-class annotation.
  storageClassName: nfs-storage
  accessModes:
  - ReadWriteMany
  resources:
    requests:
      storage: 10Gi
---
# Nightly etcd snapshot job. It runs on the control-plane node with host
# networking so etcdctl can reach the local etcd member on 127.0.0.1:2379,
# writes a timestamped snapshot to the bak-etcd claim, then prunes snapshots
# older than 30 days.
# apiVersion: batch/v2alpha1   (alternative for very old clusters)
apiVersion: batch/v1
kind: CronJob
metadata:
  name: etcd-bak-recovery
  namespace: ops
spec:
  schedule: "0 23 * * *"        # every day at 23:00
  # schedule: "*/5 * * * *"     # every 5 minutes, handy while testing
  jobTemplate:
    spec:
      template:
        metadata:
          labels:
            app: etcd-bak-recovery
        spec:
          # Pin the pod to the node that hosts etcd and its certificates.
          affinity:
            nodeAffinity:
              requiredDuringSchedulingIgnoredDuringExecution:
                nodeSelectorTerms:
                - matchExpressions:
                  - key: kubernetes.io/hostname
                    operator: In
                    values:
                    - k8s-master01
          containers:
          - name: etcd
            image: quay.io/coreos/etcd:v3.4.13
            command:
            - sh
            - -c
            # set -e makes a failed snapshot fail the Job (and trigger the
            # OnFailure restart) instead of being hidden by the cleanup step.
            # /snapshot is already the claim's "data" sub-directory (subPath),
            # so both the snapshot and the cleanup operate on /snapshot itself.
            - |
              set -e
              export ETCDCTL_API=3
              etcdctl --endpoints="$ENDPOINT" \
                --cacert=/etc/kubernetes/pki/etcd/ca.crt \
                --cert=/etc/kubernetes/pki/etcd/server.crt \
                --key=/etc/kubernetes/pki/etcd/server.key \
                snapshot save "/snapshot/$(date +%Y%m%d_%H%M%S)_snapshot.db"
              find /snapshot -name '*snapshot.db' -type f -mtime +30 -exec rm -f {} +
            env:
            - name: ENDPOINT
              value: "127.0.0.1:2379"
            volumeMounts:
            - mountPath: "/snapshot"
              name: snapshot
              subPath: data
            # Host timezone, so the timestamp in the file name is local time.
            - mountPath: /etc/localtime
              name: lt-config
              readOnly: true
            # etcd client certificates from the host; never written to.
            - mountPath: /etc/kubernetes/pki/etcd
              name: pki
              readOnly: true
          restartPolicy: OnFailure
          volumes:
          - name: snapshot
            persistentVolumeClaim:
              claimName: bak-etcd
          - name: lt-config
            hostPath:
              path: /etc/localtime
          - name: pki
            hostPath:
              path: /etc/kubernetes/pki/etcd
          hostNetwork: true
          # Tolerate both the legacy and the current control-plane taint.
          tolerations:
          - key: "node-role.kubernetes.io/master"
            operator: "Exists"
            effect: "NoSchedule"
          - key: "node-role.kubernetes.io/control-plane"
            operator: "Exists"
            effect: "NoSchedule"
崩溃测试:
删除所有namespace:
[root@bogon etcd-snapshot]# kubectl delete namespace --all
namespace "busybox" deleted
namespace "kube-node-lease" deleted
namespace "nfs-storage" deleted
namespace "ops" deleted
Error from server (Forbidden): namespaces "default" is forbidden: this namespace may not be deleted
Error from server (Forbidden): namespaces "kube-public" is forbidden: this namespace may not be deleted
Error from server (Forbidden): namespaces "kube-system" is forbidden: this namespace may not be deleted
恢复测试:
1. 备份并关闭etcd与apiserver
mv /etc/kubernetes/manifests/etcd.yaml /etc/kubernetes/manifests/kube-apiserver.yaml /home
2. 查询服务状态:
docker ps -a | grep -E ".*(etcd|kube-api).*kube-system.*"
如未停止,可手动docker stop
3. 重启kubelet:
systemctl restart kubelet
4. 备份并移走现有的etcd数据目录(恢复时 --data-dir 指向的目录必须不存在):
mv /var/lib/etcd /home
5. 用备份文件恢复etcd:(变量值可从etcd.yaml中查询)
# Rebuild /var/lib/etcd from the snapshot file. The --name, --initial-cluster,
# --initial-cluster-token and peer-URL values should be taken from etcd.yaml.
# NOTE(review): the peer URLs below use http:// — confirm the scheme and the
# member name against etcd.yaml before running.
ETCDCTL_API=3 ./etcdctl snapshot restore /home/20230921_150000_snapshot.db \
--name etcd-master --initial-cluster etcd-master=http://127.0.0.1:2380 \
--initial-cluster-token etcd-cluster \
--initial-advertise-peer-urls http://127.0.0.1:2380 \
--data-dir /var/lib/etcd
6. 恢复文件,etcd与apiserver服务:
mv /home/etcd.yaml /etc/kubernetes/manifests/etcd.yaml
mv /home/kube-apiserver.yaml /etc/kubernetes/manifests/kube-apiserver.yaml
7. 重复步骤2,确认etcd与apiserver服务已恢复运行