Hello,
I have a 4+1 node Proxmox setup: 4 full nodes plus one node that is only there for quorum purposes (no OSDs and no VMs on it). Two of the full nodes are in one location, two in another, and the quorum node in a third.
The reasoning is that I wanted this setup to withstand the failure of an entire location.
As the storage backend I set up a Ceph cluster. Each full node has 6 SSDs that are configured as Ceph OSDs.
I figured the default Ceph setting of 3 replicas would make it resilient to any 2-node failure, since at least 1 copy of the data would still be available.
I've now had one location (nodes 3 and 4), so 2 of the full nodes, shut down. This caused the Ceph cluster to lock up, so it is apparently not as resilient as I thought, and now I'm trying to figure out why and what to do better.
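For reference, these are the standard Ceph CLI commands for inspecting the cluster state in a situation like this (nothing cluster-specific in them):
Code:
# overall status and detailed health (shows blocked/inactive PGs)
ceph -s
ceph health detail
# which OSDs and hosts are currently down
ceph osd tree
# PGs that went inactive (inactive PGs block client I/O)
ceph pg dump_stuck inactive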
Here is my ceph config:
Code:
[global]
auth_client_required = cephx
auth_cluster_required = cephx
auth_service_required = cephx
cluster_network = 192.168.3.1/24
fsid = b7c21f77-5a2b-4a9a-8e50-dd07adaf5f69
mon_allow_pool_delete = true
mon_host = 192.168.3.2 192.168.3.201 192.168.3.4
ms_bind_ipv4 = true
ms_bind_ipv6 = false
osd_pool_default_min_size = 2
osd_pool_default_size = 3
public_network = 192.168.3.1/24
[client]
keyring = /etc/pve/priv/$cluster.$name.keyring
[mds]
keyring = /var/lib/ceph/mds/ceph-$id/keyring
[mds.proxmox-1]
host = dmz-proxmox-1
mds_standby_for_name = pve
[mds.proxmox-2]
host = dmz-proxmox-2
mds_standby_for_name = pve
[mds.proxmox-3]
host = dmz-proxmox-3
mds_standby_for_name = pve
[mon.proxmox-2]
public_addr = 192.168.3.2
[mon.proxmox-4]
public_addr = 192.168.3.4
[mon.proxmox-q]
public_addr = 192.168.3.201
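Note that osd_pool_default_size / osd_pool_default_min_size are only defaults applied when a pool is created; what an existing pool actually uses can be checked per pool, for example (the pool name vm-pool is only a placeholder):
Code:
# replication settings of all pools
ceph osd dump | grep 'replicated size'
# or per pool (placeholder pool name)
ceph osd pool get vm-pool size
ceph osd pool get vm-pool min_size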
And here is the crush map:
Code:
# begin crush map
tunable choose_local_tries 0
tunable choose_local_fallback_tries 0
tunable choose_total_tries 50
tunable chooseleaf_descend_once 1
tunable chooseleaf_vary_r 1
tunable chooseleaf_stable 1
tunable straw_calc_version 1
tunable allowed_bucket_algs 54
# devices
device 0 osd.0 class ssd
device 1 osd.1 class ssd
device 2 osd.2 class ssd
device 3 osd.3 class ssd
device 4 osd.4 class ssd
device 5 osd.5 class ssd
device 6 osd.6 class ssd
device 7 osd.7 class ssd
device 8 osd.8 class ssd
device 9 osd.9 class ssd
device 10 osd.10 class ssd
device 11 osd.11 class ssd
device 12 osd.12 class ssd
device 13 osd.13 class ssd
device 14 osd.14 class ssd
device 15 osd.15 class ssd
device 16 osd.16 class ssd
device 17 osd.17 class ssd
device 18 osd.18 class ssd
device 19 osd.19 class ssd
device 20 osd.20 class ssd
device 21 osd.21 class ssd
device 22 osd.22 class ssd
device 23 osd.23 class ssd
# types
type 0 osd
type 1 host
type 2 chassis
type 3 rack
type 4 row
type 5 pdu
type 6 pod
type 7 room
type 8 datacenter
type 9 zone
type 10 region
type 11 root
# buckets
host proxmox-4 {
id -3 # do not change unnecessarily
id -4 class ssd # do not change unnecessarily
# weight 10.480
alg straw2
hash 0 # rjenkins1
item osd.0 weight 1.747
item osd.4 weight 1.747
item osd.8 weight 1.747
item osd.12 weight 1.747
item osd.16 weight 1.747
item osd.20 weight 1.747
}
host proxmox-3 {
id -5 # do not change unnecessarily
id -6 class ssd # do not change unnecessarily
# weight 10.480
alg straw2
hash 0 # rjenkins1
item osd.1 weight 1.747
item osd.5 weight 1.747
item osd.9 weight 1.747
item osd.13 weight 1.747
item osd.17 weight 1.747
item osd.21 weight 1.747
}
host proxmox-2 {
id -7 # do not change unnecessarily
id -8 class ssd # do not change unnecessarily
# weight 10.480
alg straw2
hash 0 # rjenkins1
item osd.2 weight 1.747
item osd.6 weight 1.747
item osd.10 weight 1.747
item osd.14 weight 1.747
item osd.18 weight 1.747
item osd.22 weight 1.747
}
host proxmox-1 {
id -9 # do not change unnecessarily
id -10 class ssd # do not change unnecessarily
# weight 10.480
alg straw2
hash 0 # rjenkins1
item osd.3 weight 1.747
item osd.7 weight 1.747
item osd.11 weight 1.747
item osd.15 weight 1.747
item osd.19 weight 1.747
item osd.23 weight 1.747
}
root default {
id -1 # do not change unnecessarily
id -2 class ssd # do not change unnecessarily
# weight 41.918
alg straw2
hash 0 # rjenkins1
item proxmox-4 weight 10.480
item proxmox-3 weight 10.480
item proxmox-2 weight 10.480
item proxmox-1 weight 10.480
}
# rules
rule replicated_rule {
id 0
type replicated
min_size 1
max_size 10
step take default
step chooseleaf firstn 0 type host
step emit
}
# end crush map
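As far as I understand it, the replicated_rule above only separates the copies across hosts, not across locations. Below is the kind of location-aware layout I was wondering about, purely as a sketch: the datacenter buckets dc1/dc2 and the rule are made up by me and do not exist in my map, and the hosts would have to be moved under those buckets (with root default then containing dc1/dc2 instead of the hosts directly):
Code:
# hypothetical sketch, not part of my current map
datacenter dc1 {
    id -20    # example id (any unused negative id)
    alg straw2
    hash 0    # rjenkins1
    item proxmox-1 weight 10.480
    item proxmox-2 weight 10.480
}
datacenter dc2 {
    id -21    # example id (any unused negative id)
    alg straw2
    hash 0    # rjenkins1
    item proxmox-3 weight 10.480
    item proxmox-4 weight 10.480
}
rule replicated_dc_rule {
    id 1
    type replicated
    min_size 1
    max_size 10
    step take default
    # first pick the datacenters, then up to 2 hosts in each
    step choose firstn 0 type datacenter
    step chooseleaf firstn 2 type host
    step emit
}
I'm not sure whether a rule like that alone would even be enough here, or whether the pool's size/min_size would also have to change, which is part of what I'm asking.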
Any advice?