Hi.
We have a three-node cluster running Proxmox 7 with Ceph 16.
Each node has 4 SSD OSDs and 8 HDD OSDs.
We are doing a lot of testing right now to see how we can mitigate problems that arise, and one thing we noticed is that whenever we perform maintenance on one node we get write issues that cripple the VMs.
Pre-maintenance we run
ceph osd add-noout osd.X
for every OSD in that node.
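For reference, here is roughly how that step can be scripted per node. This is only a sketch: the host name pve-11 is just an example, and it assumes the ceph osd ls-tree subcommand is available (it lists the OSD ids under a CRUSH bucket); adjust to your own node names.
Code:
# node about to go into maintenance (example value)
HOST=pve-11

# set the per-OSD noout flag on every OSD that CRUSH places under that host
for OSD in $(ceph osd ls-tree "$HOST"); do
    ceph osd add-noout "osd.${OSD}"
done

# after maintenance, remove the flags again
for OSD in $(ceph osd ls-tree "$HOST"); do
    ceph osd rm-noout "osd.${OSD}"
done
Recent Ceph releases should also accept ceph osd set-group noout pve-11 / ceph osd unset-group noout pve-11 to flag a whole host in one go. With the OSDs flagged, one error message that we noticed was: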
Code:
pvestatd[3953]: unable to activate storage 'cephfs' - directory '/mnt/pve/cephfs' does not exist or is unreachable
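If anyone else runs into that message, the usual way to see whether CephFS itself is blocked or only the mount on that hypervisor is something along these lines (plain Ceph CLI plus standard tools, nothing specific to our setup):
Code:
# overall cluster health: undersized/degraded PGs, slow or blocked requests
ceph -s

# did an MDS standby take over, or is the filesystem without an active MDS?
ceph fs status

# is the kernel mount behind /mnt/pve/cephfs still present and responding?
findmnt /mnt/pve/cephfs
df -h /mnt/pve/cephfs
If df or findmnt hangs, the mount is still present but I/O to it is blocked, which would explain pvestatd reporting the directory as unreachable.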
This is how it looks in the Proxmox GUI a few minutes after a node has been brought down.
Here is the Ceph configuration and CRUSH map:
Code:
[global]
auth_client_required = cephx
auth_cluster_required = cephx
auth_service_required = cephx
cluster_network = 10.5.33.0/24
fsid = 776bf0f9-8aba-42c5-b1e4-e94824ba0b7a
mon_allow_pool_delete = true
mon_host = 10.5.33.113 10.5.33.112 10.5.33.111
ms_bind_ipv4 = true
ms_bind_ipv6 = false
osd_pool_default_min_size = 2
osd_pool_default_size = 2
public_network = 10.5.33.0/24
[client]
keyring = /etc/pve/priv/$cluster.$name.keyring
[mds]
keyring = /var/lib/ceph/mds/ceph-$id/keyring
[mds.pve-11]
host = pve-11
mds_standby_for_name = pve
[mds.pve-12]
host = pve-12
mds_standby_for_name = pve
[mds.pve-13]
host = pve-13
mds_standby_for_name = pve
[mon.pve-11]
public_addr = 10.5.33.111
[mon.pve-12]
public_addr = 10.5.33.112
[mon.pve-13]
public_addr = 10.5.33.113
# begin crush map
tunable choose_local_tries 0
tunable choose_local_fallback_tries 0
tunable choose_total_tries 50
tunable chooseleaf_descend_once 1
tunable chooseleaf_vary_r 1
tunable chooseleaf_stable 1
tunable straw_calc_version 1
tunable allowed_bucket_algs 54
# devices
device 0 osd.0 class hdd
device 1 osd.1 class hdd
device 2 osd.2 class hdd
device 3 osd.3 class hdd
device 4 osd.4 class hdd
device 5 osd.5 class hdd
device 6 osd.6 class hdd
device 7 osd.7 class hdd
device 8 osd.8 class hdd
device 9 osd.9 class hdd
device 10 osd.10 class hdd
device 11 osd.11 class hdd
device 12 osd.12 class hdd
device 13 osd.13 class ssd
device 14 osd.14 class ssd
device 15 osd.15 class ssd
device 16 osd.16 class ssd
device 17 osd.17 class ssd
device 18 osd.18 class ssd
device 19 osd.19 class ssd
device 20 osd.20 class ssd
device 21 osd.21 class hdd
device 22 osd.22 class hdd
device 23 osd.23 class hdd
device 24 osd.24 class ssd
device 25 osd.25 class ssd
device 26 osd.26 class ssd
device 27 osd.27 class ssd
device 28 osd.28 class hdd
device 29 osd.29 class hdd
device 30 osd.30 class hdd
device 31 osd.31 class hdd
device 32 osd.32 class hdd
device 33 osd.33 class hdd
device 34 osd.34 class hdd
device 35 osd.35 class hdd
# types
type 0 osd
type 1 host
type 2 chassis
type 3 rack
type 4 row
type 5 pdu
type 6 pod
type 7 room
type 8 datacenter
type 9 zone
type 10 region
type 11 root
# buckets
host pve-13 {
id -3 # do not change unnecessarily
id -4 class hdd # do not change unnecessarily
id -7 class ssd # do not change unnecessarily
# weight 11.644
alg straw2
hash 0 # rjenkins1
item osd.0 weight 1.092
item osd.1 weight 1.092
item osd.2 weight 1.092
item osd.3 weight 1.092
item osd.4 weight 1.092
item osd.5 weight 1.092
item osd.6 weight 1.092
item osd.7 weight 1.092
item osd.13 weight 0.728
item osd.18 weight 0.728
item osd.19 weight 0.728
item osd.20 weight 0.728
}
host pve-12 {
id -5 # do not change unnecessarily
id -6 class hdd # do not change unnecessarily
id -8 class ssd # do not change unnecessarily
# weight 11.644
alg straw2
hash 0 # rjenkins1
item osd.8 weight 1.092
item osd.9 weight 1.092
item osd.10 weight 1.092
item osd.11 weight 1.092
item osd.12 weight 1.092
item osd.14 weight 0.728
item osd.15 weight 0.728
item osd.16 weight 0.728
item osd.17 weight 0.728
item osd.21 weight 1.092
item osd.22 weight 1.092
item osd.23 weight 1.092
}
host pve-11 {
id -10 # do not change unnecessarily
id -11 class hdd # do not change unnecessarily
id -12 class ssd # do not change unnecessarily
# weight 11.644
alg straw2
hash 0 # rjenkins1
item osd.24 weight 0.728
item osd.25 weight 0.728
item osd.26 weight 0.728
item osd.27 weight 0.728
item osd.28 weight 1.092
item osd.29 weight 1.092
item osd.30 weight 1.092
item osd.31 weight 1.092
item osd.32 weight 1.092
item osd.33 weight 1.092
item osd.34 weight 1.092
item osd.35 weight 1.092
}
root default {
id -1 # do not change unnecessarily
id -2 class hdd # do not change unnecessarily
id -9 class ssd # do not change unnecessarily
# weight 34.931
alg straw2
hash 0 # rjenkins1
item pve-13 weight 11.644
item pve-12 weight 11.644
item pve-11 weight 11.644
}
# rules
rule replicated_rule {
id 0
type replicated
min_size 1
max_size 10
step take default
step chooseleaf firstn 0 type host
step emit
}
rule replicated_ssd {
id 1
type replicated
min_size 1
max_size 10
step take default class ssd
step chooseleaf firstn 0 type host
step emit
}
rule replicated_hdd {
id 2
type replicated
min_size 1
max_size 10
step take default class hdd
step chooseleaf firstn 0 type host
step emit
}
# end crush map
EDIT: Noticed that the Ceph pool size was set to 2 instead of the default 3. After changing it back to 3 we seem to be able to take a node down for a longer period of time without the guests having any issues.
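For anyone wanting to do the same, this is roughly how the pool settings can be checked and put back; <pool> is a placeholder for the real pool name(s) shown by the first command:
Code:
# list all pools with their current size / min_size
ceph osd pool ls detail

# check and change a single pool (replace <pool> with the real name)
ceph osd pool get <pool> size
ceph osd pool set <pool> size 3
ceph osd pool set <pool> min_size 2
With size 3 and min_size 2, each PG still has two replicas available while one of the three hosts is down, so writes can continue, which lines up with what we are seeing after the change. Note that osd_pool_default_size = 2 in the ceph.conf above only affects newly created pools, so existing pools have to be changed explicitly.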