rook部署ceph集群出现的问题

885 阅读9分钟

由于以前初始化过 rook,删除后再重新初始化时会出现如下问题

查看日志

先查看 mon Pod 的日志,可以看到 mon 一直处于 probing 状态,并不断报 handle_auth_request failed to assign global_id:
[root@master ~]# kubectl logs rook-ceph-mon-a-56db84c87f-8f9n9 -n rook-ceph
debug 2023-05-02T22:52:52.338+0000 7f5add2f3700  0 log_channel(audit) log [DBG] : from='admin socket' entity='admin socket' cmd='mon_status' args=[]: dispatch
debug 2023-05-02T22:52:52.338+0000 7f5add2f3700  0 log_channel(audit) log [DBG] : from='admin socket' entity='admin socket' cmd=mon_status args=[]: finished
debug 2023-05-02T22:52:54.481+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:52:54.834+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:52:54.849+0000 7f5ad98f0700 -1 mon.a@0(probing) e3 get_health_metrics reporting 392 slow ops, oldest is log(1 entries from seq 1 at 2023-05-02T22:19:52.339629+0000)
debug 2023-05-02T22:52:55.375+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:52:56.328+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:52:57.481+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:52:57.835+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:52:58.386+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:52:59.338+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:52:59.849+0000 7f5ad98f0700 -1 mon.a@0(probing) e3 get_health_metrics reporting 392 slow ops, oldest is log(1 entries from seq 1 at 2023-05-02T22:19:52.339629+0000)
debug 2023-05-02T22:53:00.488+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:53:00.840+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:53:01.394+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:53:02.345+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:53:02.346+0000 7f5add2f3700  0 log_channel(audit) log [DBG] : from='admin socket' entity='admin socket' cmd='mon_status' args=[]: dispatch
debug 2023-05-02T22:53:02.346+0000 7f5add2f3700  0 log_channel(audit) log [DBG] : from='admin socket' entity='admin socket' cmd=mon_status args=[]: finished
debug 2023-05-02T22:53:03.482+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:53:03.836+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:53:04.389+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:53:04.850+0000 7f5ad98f0700 -1 mon.a@0(probing) e3 get_health_metrics reporting 394 slow ops, oldest is log(1 entries from seq 1 at 2023-05-02T22:19:52.339629+0000)
debug 2023-05-02T22:53:05.344+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:53:06.475+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:53:06.832+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:53:07.377+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:53:08.330+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:53:09.850+0000 7f5ad98f0700 -1 mon.a@0(probing) e3 get_health_metrics reporting 394 slow ops, oldest is log(1 entries from seq 1 at 2023-05-02T22:19:52.339629+0000)
debug 2023-05-02T22:53:12.341+0000 7f5add2f3700  0 log_channel(audit) log [DBG] : from='admin socket' entity='admin socket' cmd='mon_status' args=[]: dispatch
debug 2023-05-02T22:53:12.341+0000 7f5add2f3700  0 log_channel(audit) log [DBG] : from='admin socket' entity='admin socket' cmd=mon_status args=[]: finished
debug 2023-05-02T22:53:14.850+0000 7f5ad98f0700 -1 mon.a@0(probing) e3 get_health_metrics reporting 396 slow ops, oldest is log(1 entries from seq 1 at 2023-05-02T22:19:52.339629+0000)
debug 2023-05-02T22:53:15.072+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:53:15.429+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:53:15.970+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:53:16.922+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:53:18.064+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:53:18.417+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:53:18.973+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:53:19.850+0000 7f5ad98f0700 -1 mon.a@0(probing) e3 get_health_metrics reporting 396 slow ops, oldest is log(1 entries from seq 1 at 2023-05-02T22:19:52.339629+0000)
debug 2023-05-02T22:53:19.929+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:53:21.062+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:53:21.417+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:53:21.970+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:53:22.337+0000 7f5add2f3700  0 log_channel(audit) log [DBG] : from='admin socket' entity='admin socket' cmd='mon_status' args=[]: dispatch
debug 2023-05-02T22:53:22.337+0000 7f5add2f3700  0 log_channel(audit) log [DBG] : from='admin socket' entity='admin socket' cmd=mon_status args=[]: finished
debug 2023-05-02T22:53:22.924+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:53:24.075+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:53:24.427+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:53:24.850+0000 7f5ad98f0700 -1 mon.a@0(probing) e3 get_health_metrics reporting 398 slow ops, oldest is log(1 entries from seq 1 at 2023-05-02T22:19:52.339629+0000)
debug 2023-05-02T22:53:24.980+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:53:25.936+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:53:27.071+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:53:27.427+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
debug 2023-05-02T22:53:27.979+0000 7f5adb0f3700  1 mon.a@0(probing) e3 handle_auth_request failed to assign global_id
[root@master ~]# kubectl logs rook-ceph-operator-6d4df78d9b-2pmnp -n rook-ceph
2023-05-02 23:23:01.177164 I | cephclient: writing config file /var/lib/rook/rook-ceph/rook-ceph.config
2023-05-02 23:23:01.177325 I | cephclient: generated admin config in /var/lib/rook/rook-ceph
2023-05-02 23:23:02.395719 I | op-mon: targeting the mon count 3
2023-05-02 23:23:02.499765 I | op-config: applying ceph settings:
[global]
mon cluster log file    = 
mon allow pool size one = true
mon allow pool delete   = true
2023-05-02 23:23:17.501083 I | exec: exec timeout waiting for process ceph to return. Sending interrupt signal to the process
2023-05-02 23:23:17.503307 E | op-config: failed to run command ceph [config assimilate-conf -i /tmp/4265486729 -o /tmp/4265486729.out]
2023-05-02 23:23:17.503363 E | op-config: failed to open assimilate output file /tmp/4265486729.out. open /tmp/4265486729.out: no such file or directory
2023-05-02 23:23:17.503371 E | op-config: failed to apply ceph settings:
2023-05-02 23:23:17.503430 W | op-mon: failed to set Rook and/or user-defined Ceph config options before starting mons; will retry after starting mons. failed to apply default Ceph configurations: failed to set ceph config in the centralized mon configuration database; output: Cluster connection aborted: open /tmp/4265486729.out: no such file or directory
2023-05-02 23:23:17.503437 I | op-mon: checking for basic quorum with existing mons
2023-05-02 23:23:17.723801 I | op-mon: mon "a" ip is 10.98.36.162
2023-05-02 23:23:17.944696 I | op-mon: mon "b" ip is 10.100.201.130
2023-05-02 23:23:18.186950 I | op-mon: mon "c" ip is 10.103.49.58
2023-05-02 23:23:18.572718 I | op-mon: saved mon endpoints to config map map[csi-cluster-config-json:[{"clusterID":"rook-ceph","monitors":["10.98.36.162:6789","10.100.201.130:6789","10.103.49.58:6789"],"namespace":""}] data:a=10.98.36.162:6789,b=10.100.201.130:6789,c=10.103.49.58:6789 mapping:{"node":{"a":{"Name":"node1","Hostname":"node1","Address":"13.19.17.11"},"b":{"Name":"node2","Hostname":"node2","Address":"11.9.20.16"},"c":{"Name":"master","Hostname":"master","Address":"13.19.16.10"}}} maxMonId:2 outOfQuorum:]
2023-05-02 23:23:19.154818 I | cephclient: writing config file /var/lib/rook/rook-ceph/rook-ceph.config
2023-05-02 23:23:19.154988 I | cephclient: generated admin config in /var/lib/rook/rook-ceph
2023-05-02 23:23:20.359989 I | op-mon: deployment for mon rook-ceph-mon-a already exists. updating if needed
2023-05-02 23:23:20.589833 I | op-k8sutil: deployment "rook-ceph-mon-a" did not change, nothing to update
2023-05-02 23:23:20.589858 I | op-mon: waiting for mon quorum with [a b c]
2023-05-02 23:23:21.234924 I | op-mon: mons running: [a b c]
2023-05-02 23:23:41.988269 I | op-mon: mons running: [a b c]
2023-05-02 23:24:02.761942 I | op-mon: mons running: [a b c]
2023-05-02 23:24:23.521001 I | op-mon: mons running: [a b c]
2023-05-02 23:24:44.240099 I | op-mon: mons running: [a b c]
2023-05-02 23:25:05.006403 I | op-mon: mons running: [a b c]
2023-05-02 23:25:25.722755 I | op-mon: mons running: [a b c]
2023-05-02 23:25:46.483070 I | op-mon: mons running: [a b c]
2023-05-02 23:26:07.238395 I | op-mon: mons running: [a b c]
2023-05-02 23:26:27.992617 I | op-mon: mons running: [a b c]
2023-05-02 23:26:48.742360 I | op-mon: mons running: [a b c]
2023-05-02 23:27:09.496986 I | op-mon: mons running: [a b c]
2023-05-02 23:27:30.252042 I | op-mon: mons running: [a b c]
2023-05-02 23:27:51.009655 I | op-mon: mons running: [a b c]
2023-05-02 23:28:11.837820 I | op-mon: mons running: [a b c]
2023-05-02 23:28:32.641846 I | op-mon: mons running: [a b c]

解决办法

原因是上一次部署残留在宿主机 /var/lib/rook 下的 mon 数据(store.db、keyring 等)仍保存着旧集群的 fsid 和密钥,新建的 mon 用旧数据启动后无法完成认证,因此一直停在 probing 状态并报 failed to assign global_id。解决办法是删除 rook 生成的配置文件和数据目录(注意:集群中所有运行过 mon/OSD 的节点都要删除),然后重新初始化。

# 查看目录结构
[root@master ~]# tree /var/lib/rook
/var/lib/rook
├── mon-a
│   └── data
│       ├── external_log_to
│       ├── keyring
│       ├── kv_backend
│       ├── min_mon_release
│       └── store.db
│           ├── 000072.sst
│           ├── 000075.sst
│           ├── 000077.log
│           ├── CURRENT
│           ├── IDENTITY
│           ├── LOCK
│           ├── MANIFEST-000076
│           ├── OPTIONS-000012
│           └── OPTIONS-000079
├── mon-c
│   └── data
│       ├── external_log_to
│       ├── keyring
│       ├── kv_backend
│       └── store.db
│           ├── 000114.sst
│           ├── 000117.sst
│           ├── 000134.log
│           ├── CURRENT
│           ├── IDENTITY
│           ├── LOCK
│           ├── MANIFEST-000133
│           ├── OPTIONS-000131
│           └── OPTIONS-000136
├── mon-d
│   └── data
│       ├── external_log_to
│       ├── keyring
│       ├── kv_backend
│       ├── min_mon_release
│       └── store.db
│           ├── 000037.log
│           ├── 000039.sst
│           ├── CURRENT
│           ├── IDENTITY
│           ├── LOCK
│           ├── MANIFEST-000009
│           ├── OPTIONS-000006
│           └── OPTIONS-000012
└── rook-ceph
    ├── a6790216-24e1-47f1-9228-8887dbf073a0_ac6df81a-6b70-4eeb-ab8c-668c0f0d5661
    │   ├── block -> /dev/vdc
    │   ├── ceph_fsid
    │   ├── fsid
    │   ├── keyring
    │   ├── ready
    │   ├── require_osd_release
    │   ├── type
    │   └── whoami
    ├── client.admin.keyring
    ├── crash
    │   └── posted
    ├── log
    │   ├── ceph-mon.a.log
    │   ├── ceph-mon.c.log
    │   ├── ceph-mon.d.log
    │   ├── ceph-osd.1.log
    │   └── ceph-volume.log
    └── rook-ceph.config

14 directories, 52 files
# 删除残留数据目录(所有部署过 mon/OSD 的节点都需要执行)
[root@master ~]# rm -rf /var/lib/rook

重新部署rook ceph

参考链接