3 node cluster.
I just upgraded to the newest Proxmox 7.2 and upgraded Ceph from Octopus 15.2.16 to Pacific 16.2.7. After I upgraded and rebooted the second node, I am getting stuck PGs.
At this time I have one last node to upgrade, but I don't want to touch it until this is resolved.
Code:
[global]
auth_client_required = none
auth_cluster_required = none
auth_service_required = none
auth_supported = none
cluster_network = 10.10.1.0/24
debug_asok = 0/0
debug_auth = 0/0
debug_buffer = 0/0
debug_client = 0/0
debug_context = 0/0
debug_crush = 0/0
debug_filer = 0/0
debug_filestore = 0/0
debug_finisher = 0/0
debug_heartbeatmap = 0/0
debug_journal = 0/0
debug_journaler = 0/0
debug_lockdep = 0/0
debug_mds = 0/0
debug_mds_balancer = 0/0
debug_mds_locker = 0/0
debug_mds_log = 0/0
debug_mds_log_expire = 0/0
debug_mds_migrator = 0/0
debug_mon = 0/0
debug_monc = 0/0
debug_ms = 0/0
debug_objclass = 0/0
debug_objectcacher = 0/0
debug_objecter = 0/0
debug_optracker = 0/0
debug_osd = 0/0
debug_paxos = 0/0
debug_perfcounter = 0/0
debug_rados = 0/0
debug_rbd = 0/0
debug_rgw = 0/0
debug_throttle = 0/0
debug_timer = 0/0
debug_tp = 0/0
fsid = 54da8900-a9db-4a57-923c-a62dbec8c82a
mon_allow_pool_delete = true
mon_host = 10.10.1.14 10.10.1.16 10.10.1.13
ms_bind_ipv4 = true
osd_journal_size = 5120
osd_pool_default_min_size = 2
osd_pool_default_size = 3
public_network = 10.10.1.0/24
[mgr]
mgr_modules = zabbix
[mon.vmhost3]
public_addr = 10.10.1.13
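That is the ceph.conf (on Proxmox it lives at /etc/pve/ceph.conf). I assume the runtime values the daemons actually picked up can be cross-checked against it with something like this, if the mons are reachable:
Code:
ceph config dump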
Code:
# begin crush map
tunable choose_local_tries 0
tunable choose_local_fallback_tries 0
tunable choose_total_tries 50
tunable chooseleaf_descend_once 1
tunable chooseleaf_vary_r 1
tunable chooseleaf_stable 1
tunable straw_calc_version 1
tunable allowed_bucket_algs 54
# devices
device 0 osd.0 class ssd
device 1 osd.1 class ssd
device 2 osd.2 class ssd
device 3 osd.3 class ssd
device 4 osd.4 class ssd
device 8 osd.8 class ssd
# types
type 0 osd
type 1 host
type 2 chassis
type 3 rack
type 4 row
type 5 pdu
type 6 pod
type 7 room
type 8 datacenter
type 9 region
type 10 root
# buckets
host VMHost2 {
id -3 # do not change unnecessarily
id -4 class hdd # do not change unnecessarily
id -9 class ssd # do not change unnecessarily
# weight 3.492
alg straw2
hash 0 # rjenkins1
item osd.3 weight 1.746
item osd.4 weight 1.746
}
host VMHost4 {
id -7 # do not change unnecessarily
id -8 class hdd # do not change unnecessarily
id -11 class ssd # do not change unnecessarily
# weight 3.492
alg straw2
hash 0 # rjenkins1
item osd.8 weight 1.746
item osd.1 weight 1.746
}
host vmhost3 {
id -5 # do not change unnecessarily
id -6 class hdd # do not change unnecessarily
id -10 class ssd # do not change unnecessarily
# weight 3.493
alg straw2
hash 0 # rjenkins1
item osd.2 weight 1.747
item osd.0 weight 1.747
}
root default {
id -1 # do not change unnecessarily
id -2 class hdd # do not change unnecessarily
id -12 class ssd # do not change unnecessarily
# weight 10.477
alg straw2
hash 0 # rjenkins1
item VMHost2 weight 3.492
item VMHost4 weight 3.492
item vmhost3 weight 3.493
}
# rules
rule replicated_hdd {
id 1
type replicated
min_size 1
max_size 10
step take default class hdd
step chooseleaf firstn 0 type host
step emit
}
rule replicated_ssd {
id 2
type replicated
min_size 1
max_size 10
step take default class ssd
step chooseleaf firstn 0 type host
step emit
}
# end crush map
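If it helps anyone reproduce this, the decompiled crush map above can be pulled along these lines (crush.bin and crush.txt are just placeholder filenames):
Code:
ceph osd getcrushmap -o crush.bin
crushtool -d crush.bin -o crush.txt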
Code:
root@vmhost3:~# pveversion -v
proxmox-ve: 7.2-1 (running kernel: 5.15.30-2-pve)
pve-manager: 7.2-3 (running version: 7.2-3/c743d6c1)
pve-kernel-helper: 7.2-2
pve-kernel-5.15: 7.2-1
pve-kernel-5.13: 7.1-9
pve-kernel-5.11: 7.0-10
pve-kernel-5.15.30-2-pve: 5.15.30-3
pve-kernel-5.13.19-6-pve: 5.13.19-15
pve-kernel-5.13.19-2-pve: 5.13.19-4
pve-kernel-5.11.22-7-pve: 5.11.22-12
pve-kernel-5.11.22-1-pve: 5.11.22-2
ceph: 16.2.7
ceph-fuse: 16.2.7
corosync: 3.1.5-pve2
criu: 3.15-1+pve-1
glusterfs-client: 9.2-1
ifupdown2: 3.1.0-1+pmx3
ksm-control-daemon: 1.4-1
libjs-extjs: 7.0.0-1
libknet1: 1.22-pve2
libproxmox-acme-perl: 1.4.2
libproxmox-backup-qemu0: 1.2.0-1
libpve-access-control: 7.1-8
libpve-apiclient-perl: 3.2-1
libpve-common-perl: 7.1-6
libpve-guest-common-perl: 4.1-2
libpve-http-server-perl: 4.1-1
libpve-storage-perl: 7.2-2
libspice-server1: 0.14.3-2.1
lvm2: 2.03.11-2.1
lxc-pve: 4.0.12-1
lxcfs: 4.0.12-pve1
novnc-pve: 1.3.0-3
proxmox-backup-client: 2.1.8-1
proxmox-backup-file-restore: 2.1.8-1
proxmox-mini-journalreader: 1.3-1
proxmox-widget-toolkit: 3.4-10
pve-cluster: 7.2-1
pve-container: 4.2-1
pve-docs: 7.2-2
pve-edk2-firmware: 3.20210831-2
pve-firewall: 4.2-5
pve-firmware: 3.4-1
pve-ha-manager: 3.3-4
pve-i18n: 2.7-1
pve-qemu-kvm: 6.2.0-5
pve-xtermjs: 4.16.0-1
qemu-server: 7.2-2
smartmontools: 7.2-pve3
spiceterm: 3.2-2
swtpm: 0.7.1~bpo11+1
vncterm: 1.7-1
zfsutils-linux: 2.1.4-pve1
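Since the third node is still on the old release, I assume the per-daemon versions across the cluster can be confirmed with the command below, which should show a mix of 15.2.16 and 16.2.7 until the last node is done:
Code:
ceph versions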
Code:
root@vmhost3:~# ceph health detail
HEALTH_WARN Reduced data availability: 65 pgs inactive
[WRN] PG_AVAILABILITY: Reduced data availability: 65 pgs inactive
pg 14.0 is stuck inactive for 28m, current state unknown, last acting []
pg 14.1 is stuck inactive for 28m, current state unknown, last acting []
pg 14.2 is stuck inactive for 28m, current state unknown, last acting []
pg 14.3 is stuck inactive for 28m, current state unknown, last acting []
pg 14.4 is stuck inactive for 28m, current state unknown, last acting []
pg 14.5 is stuck inactive for 28m, current state unknown, last acting []
pg 14.6 is stuck inactive for 28m, current state unknown, last acting []
pg 14.7 is stuck inactive for 28m, current state unknown, last acting []
pg 14.8 is stuck inactive for 28m, current state unknown, last acting []
pg 14.9 is stuck inactive for 28m, current state unknown, last acting []
pg 14.a is stuck inactive for 28m, current state unknown, last acting []
pg 14.b is stuck inactive for 28m, current state unknown, last acting []
pg 14.c is stuck inactive for 28m, current state unknown, last acting []
pg 14.d is stuck inactive for 28m, current state unknown, last acting []
pg 14.e is stuck inactive for 28m, current state unknown, last acting []
pg 14.f is stuck inactive for 28m, current state unknown, last acting []
pg 14.10 is stuck inactive for 28m, current state unknown, last acting []
pg 14.11 is stuck inactive for 28m, current state unknown, last acting []
pg 14.12 is stuck inactive for 28m, current state unknown, last acting []
pg 14.13 is stuck inactive for 28m, current state unknown, last acting []
pg 14.14 is stuck inactive for 28m, current state unknown, last acting []
pg 14.15 is stuck inactive for 28m, current state unknown, last acting []
pg 14.16 is stuck inactive for 28m, current state unknown, last acting []
pg 14.17 is stuck inactive for 28m, current state unknown, last acting []
pg 14.18 is stuck inactive for 28m, current state unknown, last acting []
pg 14.19 is stuck inactive for 28m, current state unknown, last acting []
pg 14.1a is stuck inactive for 28m, current state unknown, last acting []
pg 14.1b is stuck inactive for 28m, current state unknown, last acting []
pg 14.1c is stuck inactive for 28m, current state unknown, last acting []
pg 14.1d is stuck inactive for 28m, current state unknown, last acting []
pg 14.1e is stuck inactive for 28m, current state unknown, last acting []
pg 14.1f is stuck inactive for 28m, current state unknown, last acting []
pg 14.20 is stuck inactive for 28m, current state unknown, last acting []
pg 14.21 is stuck inactive for 28m, current state unknown, last acting []
pg 14.22 is stuck inactive for 28m, current state unknown, last acting []
pg 14.23 is stuck inactive for 28m, current state unknown, last acting []
pg 14.24 is stuck inactive for 28m, current state unknown, last acting []
pg 14.25 is stuck inactive for 28m, current state unknown, last acting []
pg 14.26 is stuck inactive for 28m, current state unknown, last acting []
pg 14.27 is stuck inactive for 28m, current state unknown, last acting []
pg 14.30 is stuck inactive for 28m, current state unknown, last acting []
pg 14.31 is stuck inactive for 28m, current state unknown, last acting []
pg 14.32 is stuck inactive for 28m, current state unknown, last acting []
pg 14.33 is stuck inactive for 28m, current state unknown, last acting []
pg 14.35 is stuck inactive for 28m, current state unknown, last acting []
pg 14.3b is stuck inactive for 28m, current state unknown, last acting []
pg 14.3c is stuck inactive for 28m, current state unknown, last acting []
pg 14.3d is stuck inactive for 28m, current state unknown, last acting []
pg 14.3e is stuck inactive for 28m, current state unknown, last acting []
pg 14.3f is stuck inactive for 28m, current state unknown, last acting []
pg 16.0 is stuck inactive for 28m, current state unknown, last acting []
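If more detail on the stuck PGs is useful, I believe they can also be listed and mapped individually, e.g.:
Code:
ceph pg dump_stuck inactive
ceph pg map 14.0
(ceph pg 14.0 query may just hang while the state is unknown and the acting set is empty.)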
I did set this to true and rebooted the upgraded nodes:
ceph config set osd bluestore_fsck_quick_fix_on_mount true
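To double-check that it took effect, I assume the value can be read back with:
Code:
ceph config get osd bluestore_fsck_quick_fix_on_mount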