Hi, I have a worst case:
the OSDs in a 3-node cluster with 4 NVMes each won't start.
We had an IP configuration change in the public network and the mons died; we managed to bring the mons back up with the new IPs, roughly as sketched below.
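This is from memory, so not an exact transcript, but what we did per node was the standard monmap re-injection (names, IPs and paths are from our setup):
Code:
# stop the mon, pull the current monmap, rewrite the addresses, inject it back
systemctl stop ceph-mon@pve01
ceph-mon -i pve01 --extract-monmap /tmp/monmap
monmaptool --rm pve01 --rm pve02 --rm pve03 /tmp/monmap
monmaptool --add pve01 10.100.200.141:6789 \
           --add pve02 10.100.200.142:6789 \
           --add pve03 10.100.200.143:6789 /tmp/monmap
ceph-mon -i pve01 --inject-monmap /tmp/monmap
# re-fix ownership to be safe, since we ran the inject as root
chown -R ceph:ceph /var/lib/ceph/mon/ceph-pve01
systemctl start ceph-mon@pve01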
Corosync on both rings is fine,
all 3 mons are up,
but the OSDs won't start.
How do we get the pool back? 3 VMs are already configured on it and valuable data would be lost...
Any help reconstructing this storage is highly appreciated!
Gerhard
Code:
root@pve01:/var/log# systemctl status ceph-osd@0.service.service
● ceph-osd@0.service.service - Ceph object storage daemon osd.0.service
   Loaded: loaded (/lib/systemd/system/ceph-osd@.service; disabled; vendor preset: enabled)
  Drop-In: /usr/lib/systemd/system/ceph-osd@.service.d
           └─ceph-after-pve-cluster.conf
   Active: failed (Result: exit-code) since Thu 2020-10-22 00:30:09 CEST; 37min ago
  Process: 31402 ExecStartPre=/usr/lib/ceph/ceph-osd-prestart.sh --cluster ${CLUSTER} --id 0.service (code=exited, status=1/FAILURE)

Oct 22 00:30:09 pve01 systemd[1]: ceph-osd@0.service.service: Service RestartSec=100ms expired, scheduling restart.
Oct 22 00:30:09 pve01 systemd[1]: ceph-osd@0.service.service: Scheduled restart job, restart counter is at 3.
Oct 22 00:30:09 pve01 systemd[1]: Stopped Ceph object storage daemon osd.0.service.
Oct 22 00:30:09 pve01 systemd[1]: ceph-osd@0.service.service: Start request repeated too quickly.
Oct 22 00:30:09 pve01 systemd[1]: ceph-osd@0.service.service: Failed with result 'exit-code'.
Oct 22 00:30:09 pve01 systemd[1]: Failed to start Ceph object storage daemon osd.0.service.
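One thing I only noticed while pasting: the status above was taken on ceph-osd@0.service.service, so the prestart script ran with --id 0.service, which can't be right. To see the actual error for the correct unit, this is how I'd pull the journal (a sketch, output not included here):
Code:
systemctl reset-failed ceph-osd@0.service   # clear the start-rate limit
systemctl start ceph-osd@0.service
journalctl -u ceph-osd@0.service -n 50 --no-pager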
Code:
ceph mon dump
dumped monmap epoch 3
epoch 3
fsid 92d063d7-647c-44b8-95d7-86057ee0ab22
last_changed 2020-10-21 23:31:50.584796
created 2020-10-21 21:00:54.077449
min_mon_release 14 (nautilus)
0: [v2:10.100.200.141:3300/0,v1:10.100.200.141:6789/0] mon.pve01
1: [v2:10.100.200.142:3300/0,v1:10.100.200.142:6789/0] mon.pve02
2: [v2:10.100.200.143:3300/0,v1:10.100.200.143:6789/0] mon.pve03
Networks (/etc/network/interfaces):
Code:
auto lo
iface lo inet loopback

auto eno1np0
iface eno1np0 inet static
        address 10.110.200.131/24
        mtu 9000
#corosync1 10GB

auto eno2np1
iface eno2np1 inet static
        address 10.111.200.131/24
        mtu 9000
#Corosync2 10GB

iface enp69s0f0 inet manual
        mtu 9000

auto enp69s0f1
iface enp69s0f1 inet static
        address 10.112.200.131/24
        mtu 9000
#Cluster private 100GB

auto vmbr0
iface vmbr0 inet static
        address 10.100.200.141/24
        gateway 10.100.200.1
        bridge-ports enp69s0f0
        bridge-stp off
        bridge-fd 0
        mtu 9000
#Cluster public 100GB
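Since the mons have quorum, the new public addresses clearly work; for completeness, this is how I double-checked reachability of the mon ports from each node (sketch):
Code:
ip -br addr show vmbr0          # confirm the new public IP on the bridge
nc -zv 10.100.200.141 6789      # v1 mon port
nc -zv 10.100.200.141 3300      # v2 mon port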
===================================================================================================
ceph.conf
[global]
auth_client_required = cephx
auth_cluster_required = cephx
auth_service_required = cephx
cluster_network = 10.112.200.0/24
fsid = 92d063d7-647c-44b8-95d7-86057ee0ab22
mon_allow_pool_delete = true
mon_host = 10.100.200.141 10.100.200.142 10.100.200.143
osd_pool_default_min_size = 2
osd_pool_default_size = 3
public_network = 10.100.200.0/24
[client]
keyring = /etc/pve/priv/$cluster.$name.keyring
[mon.pve01]
public_addr = 10.100.200.141
[mon.pve02]
public_addr = 10.100.200.142
[mon.pve03]
public_addr = 10.100.200.143
===================================================================================================
ceph -s
  cluster:
    id:     92d063d7-647c-44b8-95d7-86057ee0ab22
    health: HEALTH_WARN
            1 daemons have recently crashed
            OSD count 0 < osd_pool_default_size 3

  services:
    mon: 3 daemons, quorum pve01,pve02,pve03 (age 63m)
    mgr: pve01(active, since 64m)
    osd: 0 osds: 0 up, 0 in

  data:
    pools:   0 pools, 0 pgs
    objects: 0 objects, 0 B
    usage:   0 B used, 0 B / 0 B avail
    pgs:
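What worries me most: the cluster map shows 0 OSDs and 0 pools even though the OSD LVs are still on the disks (see lsblk below), so the mon DB apparently lost, or never regained, the OSD entries. How I'd verify that from the mon side (sketch):
Code:
ceph osd tree                 # should list the 12 OSDs, currently shows none
ceph auth ls | grep -A1 osd   # check whether the osd.* keys still exist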
===================================================================================================
df -h
Filesystem        Size  Used Avail Use% Mounted on
udev              252G     0  252G   0% /dev
tmpfs              51G   11M   51G   1% /run
rpool/ROOT/pve-1  229G   16G  214G   7% /
tmpfs             252G   63M  252G   1% /dev/shm
tmpfs             5.0M     0  5.0M   0% /run/lock
tmpfs             252G     0  252G   0% /sys/fs/cgroup
rpool             214G  128K  214G   1% /rpool
rpool/data        214G  128K  214G   1% /rpool/data
rpool/ROOT        214G  128K  214G   1% /rpool/ROOT
tmpfs             252G   24K  252G   1% /var/lib/ceph/osd/ceph-3
tmpfs             252G   24K  252G   1% /var/lib/ceph/osd/ceph-2
tmpfs             252G   24K  252G   1% /var/lib/ceph/osd/ceph-0
tmpfs             252G   24K  252G   1% /var/lib/ceph/osd/ceph-1
/dev/fuse          30M   32K   30M   1% /etc/pve
tmpfs              51G     0   51G   0% /run/user/0
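The tmpfs mounts under /var/lib/ceph/osd/ceph-* are still there from the last ceph-volume activation. Assuming the standard bluestore layout, this is how I'd check what they still contain (sketch):
Code:
ls -l /var/lib/ceph/osd/ceph-0/
cat /var/lib/ceph/osd/ceph-0/whoami      # OSD id
cat /var/lib/ceph/osd/ceph-0/ceph_fsid   # should match cluster fsid 92d063d7-...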
lsblk
NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT
nvme4n1 259:0 0 238.5G 0 disk
├─nvme4n1p1 259:5 0 1007K 0 part
├─nvme4n1p2 259:6 0 512M 0 part
└─nvme4n1p3 259:7 0 238G 0 part
nvme5n1 259:1 0 238.5G 0 disk
├─nvme5n1p1 259:2 0 1007K 0 part
├─nvme5n1p2 259:3 0 512M 0 part
└─nvme5n1p3 259:4 0 238G 0 part
nvme0n1 259:12 0 2.9T 0 disk
└─ceph--cc77fe1b--c8d4--48be--a7c4--36109439c85c-osd--block--80e0127e--836e--44b8--882d--ac49bfc85866 253:3 0 2.9T 0 lvm
nvme1n1 259:13 0 2.9T 0 disk
└─ceph--eb8b2fc7--775e--4b94--8070--784e7bbf861e-osd--block--4d433222--e1e8--43ac--8dc7--2e6e998ff122 253:2 0 2.9T 0 lvm
nvme3n1 259:14 0 2.9T 0 disk
└─ceph--5724bdf7--5124--4244--91d6--e254210c2174-osd--block--2d6fe149--f330--415a--a762--44d037c900b1 253:1 0 2.9T 0 lvm
nvme2n1 259:15 0 2.9T 0 disk
└─ceph--cb5762e9--40fa--4148--98f4--5b5ddef4c1de-osd--block--793d52e6--4cc7--42aa--8326--df25b21c1237 253:0 0 2.9T 0 lvm
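And finally, my plan (not yet executed) to get the OSDs re-registered once the above is understood: let ceph-volume rediscover the LVs by their tags and re-activate them. A sketch based on the ceph-volume docs; I'd appreciate confirmation before running it on valuable data:
Code:
ceph-volume lvm list            # map each LV to its osd id / osd fsid
ceph-volume lvm activate --all  # re-mount the tmpfs dirs and start the ceph-osd units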