Hi, I have a 3-node PVE cluster running Ceph with 3 OSDs (one per server) and 2 monitors (pve1, pve3).
My 3rd server (pve3) died, and now Ceph is down. The OSDs on pve1 and pve2 are up, but nothing shows up in the GUI.
I would like to understand why Ceph failed when just one node went down, and also whether it is possible to restore Ceph and/or recover the VM disks that were stored on it.
"ceph -s" hangs.
root@pve1:/var/lib/ceph/mon# pvecm status
Cluster information
-------------------
Name: cluster-pve
Config Version: 3
Transport: knet
Secure auth: on
Quorum information
------------------
Date: Sun Aug 25 13:33:27 2024
Quorum provider: corosync_votequorum
Nodes: 2
Node ID: 0x00000001
Ring ID: 1.aa
Quorate: Yes
Votequorum information
----------------------
Expected votes: 2
Highest expected: 2
Total votes: 2
Quorum: 2
Flags: Quorate
Membership information
----------------------
Nodeid Votes Name
0x00000001 1 192.168.30.11 (local)
0x00000003 1 192.168.30.12
root@pve1:/var/lib/ceph/mon# systemctl status ceph-mon.target
● ceph-mon.target - ceph target allowing to start/stop all ceph-mon@.service instances at once
Loaded: loaded (/lib/systemd/system/ceph-mon.target; enabled; preset: enabled)
Active: active since Fri 2024-08-23 16:53:23 CDT; 1 day 20h ago
root@pve1:/var/lib/ceph/mon# systemctl status ceph-osd@0.service
● ceph-osd@0.service - Ceph object storage daemon osd.0
Loaded: loaded (/lib/systemd/system/ceph-osd@.service; enabled-runtime; preset: enabled)
Drop-In: /usr/lib/systemd/system/ceph-osd@.service.d
└─ceph-after-pve-cluster.conf
Active: active (running) since Sun 2024-08-25 13:28:13 CDT; 1s ago
Process: 629343 ExecStartPre=/usr/libexec/ceph/ceph-osd-prestart.sh --cluster ${CLUSTER} --id 0 (code=exited, status=0/SUCCESS)
Main PID: 629347 (ceph-osd)
Tasks: 9
Memory: 11.0M
CPU: 33ms
CGroup: /system.slice/system-ceph\x2dosd.slice/ceph-osd@0.service
└─629347 /usr/bin/ceph-osd -f --cluster ceph --id 0 --setuser ceph --setgroup ceph
root@pve1:/var/lib/ceph/mon# systemctl status ceph-mgr.target
● ceph-mgr.target - ceph target allowing to start/stop all ceph-mgr@.service instances at once
Loaded: loaded (/lib/systemd/system/ceph-mgr.target; enabled; preset: enabled)
Active: active since Fri 2024-08-23 16:53:23 CDT; 1 day 20h ago
root@pve1:/var/lib/ceph/mon# systemctl status ceph-volume@lvm-0-1d18f9e4-119e-43f3-b8a1-e4bc78ae9966.service
○ ceph-volume@lvm-0-1d18f9e4-119e-43f3-b8a1-e4bc78ae9966.service - Ceph Volume activation: lvm-0-1d18f9e4-119e-43f3-b8a1-e4bc78ae9966
Loaded: loaded (/lib/systemd/system/ceph-volume@.service; enabled; preset: enabled)
Drop-In: /usr/lib/systemd/system/ceph-volume@.service.d
└─ceph-after-pve-cluster.conf
Active: inactive (dead) since Fri 2024-08-23 16:53:24 CDT; 1 day 20h ago
Main PID: 1339 (code=exited, status=0/SUCCESS)
CPU: 217ms
root@pve1:/var/lib/ceph/mon# systemctl status ceph-mgr@pve1.service
× ceph-mgr@pve1.service - Ceph cluster manager daemon
Loaded: loaded (/lib/systemd/system/ceph-mgr@.service; enabled; preset: enabled)
Drop-In: /usr/lib/systemd/system/ceph-mgr@.service.d
└─ceph-after-pve-cluster.conf
Active: failed (Result: exit-code) since Sun 2024-08-25 13:12:50 CDT; 17min ago
Duration: 29ms
Process: 624899 ExecStart=/usr/bin/ceph-mgr -f --cluster ${CLUSTER} --id pve1 --setuser ceph --setgroup ceph (code=exited, status=1/FAILURE)
Main PID: 624899 (code=exited, status=1/FAILURE)
CPU: 29ms
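I have not yet found out why the mgr exits right away. If the journal would help, I can post it; I would grab it with something along these lines:

journalctl -u ceph-mgr@pve1.service --since "2024-08-25 13:00" --no-pager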
root@pve1:/var/lib/ceph/mon# ceph-volume lvm activate --all
--> OSD ID 0 FSID 1d18f9e4-119e-43f3-b8a1-e4bc78ae9966 process is active. Skipping activation
root@pve2:~# ceph-volume lvm activate --all
--> OSD ID 1 FSID 848fb3c5-8121-409f-9972-2df69f171074 process is active. Skipping activation
root@pve2:~# systemctl status ceph-osd@1
● ceph-osd@1.service - Ceph object storage daemon osd.1
Loaded: loaded (/lib/systemd/system/ceph-osd@.service; enabled-runtime; preset: enabled)
Drop-In: /usr/lib/systemd/system/ceph-osd@.service.d
└─ceph-after-pve-cluster.conf
Active: active (running) since Sun 2024-08-25 13:31:15 CDT; 24s ago
Process: 603630 ExecStartPre=/usr/libexec/ceph/ceph-osd-prestart.sh --cluster ${CLUSTER} --id 1 (code=exited, status=0/SUCCESS)
Main PID: 603634 (ceph-osd)
Tasks: 9
Memory: 10.9M
CPU: 44ms
CGroup: /system.slice/system-ceph\x2dosd.slice/ceph-osd@1.service
└─603634 /usr/bin/ceph-osd -f --cluster ceph --id 1 --setuser ceph --setgroup ceph
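So both OSD processes are running, but nothing shows up in the GUI and "ceph -s" still hangs. If it would help, I believe each OSD's own view can also be queried through its admin socket (run on the node that hosts the OSD):

ceph daemon osd.0 status   # on pve1
ceph daemon osd.1 status   # on pve2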
Any help will be highly appreciated!