Ceph has gone near full and I can't start any VMs now

Here they are (ceph osd df tree, ceph.conf, and the CRUSH map):


root@pve2:~# ceph osd df tree
ID CLASS WEIGHT  REWEIGHT SIZE  USE   AVAIL  %USE  VAR  PGS TYPE NAME
-1       6.90857        - 7075G 4867G  2207G 68.80 1.00   - root default
-3       1.63614        - 1675G 1183G   492G 70.62 1.03   -     host pve1
 0   hdd 0.27269  0.85004  279G  196G 85154M 70.22 1.02 100         osd.0
 1   hdd 0.27269  0.80005  279G  225G 55040M 80.75 1.17 101         osd.1
 2   hdd 0.27269  1.00000  279G  214G 65967M 76.93 1.12 108         osd.2
 3   hdd 0.27269  1.00000  279G  170G   108G 61.09 0.89  88         osd.3
 4   hdd 0.27269  1.00000  279G  210G 69948M 75.54 1.10 111         osd.4
 5   hdd 0.27269  1.00000  279G  165G   114G 59.18 0.86  95         osd.5
-5       1.63614        - 1675G 1144G   530G 68.33 0.99   -     host pve2
 6   hdd 0.27269  1.00000  279G  183G 98417M 65.58 0.95  93         osd.6
 7   hdd 0.27269  1.00000  279G  214G 66421M 76.77 1.12 108         osd.7
 8   hdd 0.27269  1.00000  279G  160G   119G 57.36 0.83  82         osd.8
 9   hdd 0.27269  1.00000  279G  177G   102G 63.40 0.92 101         osd.9
10   hdd 0.27269  1.00000  279G  201G 79273M 72.28 1.05  97         osd.10
12   hdd 0.27269  1.00000  279G  208G 72712M 74.57 1.08  96         osd.12
-7       1.63614        - 1675G 1255G   420G 74.90 1.09   -     host pve3
11   hdd 0.27269  1.00000  279G  211G 69583M 75.67 1.10 104         osd.11
13   hdd 0.27269  1.00000  279G  225G 54751M 80.85 1.18 116         osd.13
14   hdd 0.27269  1.00000  279G  183G 98053M 65.71 0.96 103         osd.14
15   hdd 0.27269  1.00000  279G  182G 99123M 65.34 0.95 103         osd.15
16   hdd 0.27269  1.00000  279G  213G 67362M 76.44 1.11 105         osd.16
17   hdd 0.27269  0.95001  279G  238G 41768M 85.39 1.24 114         osd.17
-9       2.00015        - 2048G 1284G   764G 62.70 0.91   -     host pve4
18   hdd 0.27269  1.00000  279G  159G   119G 57.22 0.83  87         osd.18
19   hdd 0.27269  1.00000  279G  168G   110G 60.33 0.88  88         osd.19
20   hdd 0.27269  1.00000  279G  187G 94007M 67.13 0.98  89         osd.20
21   hdd 0.27269  1.00000  279G  174G   104G 62.64 0.91  85         osd.21
22   hdd 0.45470  1.00000  465G  281G   184G 60.43 0.88 141         osd.22
23   hdd 0.45470  1.00000  465G  312G   153G 67.04 0.97 164         osd.23
                    TOTAL 7075G 4867G  2207G 68.80
MIN/MAX VAR: 0.83/1.24  STDDEV: 8.01
root@pve2:~#
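For anyone hitting the same problem: which OSDs are flagged nearfull and what the configured thresholds are can be checked with standard Ceph commands. A minimal sketch (the 0.85/0.90/0.95 values in the comment are the usual Luminous-era defaults, not something read from this cluster):

Code:
# List current health warnings, including any nearfull/full OSDs
ceph health detail

# Cluster-wide and per-pool usage
ceph df

# Configured thresholds (defaults: nearfull 0.85, backfillfull 0.90, full 0.95)
ceph osd dump | grep ratio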



ceph.conf:




[global]
     auth client required = cephx
     auth cluster required = cephx
     auth service required = cephx
     cluster network = 10.10.10.0/24
     fsid = a9926f78-4366-4be5-xxxxxxxxxxxx
     keyring = /etc/pve/priv/$cluster.$name.keyring
     mon allow pool delete = true
     osd journal size = 5120
     osd pool default min size = 2
     osd pool default size = 3
     public network = 10.10.10.0/24

[osd]
     keyring = /var/lib/ceph/osd/ceph-$id/keyring

[mon.pve3]
     host = pve3
     mon addr = 10.10.10.13:6789

[mon.pve2]
     host = pve2
     mon addr = 10.10.10.12:6789

[mon.pve1]
     host = pve1
     mon addr = 10.10.10.11:6789

[mon.pve4]
     host = pve4
     mon addr = 10.10.10.14:6789



CRUSH map:


# begin crush map
tunable choose_local_tries 0
tunable choose_local_fallback_tries 0
tunable choose_total_tries 50
tunable chooseleaf_descend_once 1
tunable chooseleaf_vary_r 1
tunable chooseleaf_stable 1
tunable straw_calc_version 1
tunable allowed_bucket_algs 54

# devices
device 0 osd.0 class hdd
device 1 osd.1 class hdd
device 2 osd.2 class hdd
device 3 osd.3 class hdd
device 4 osd.4 class hdd
device 5 osd.5 class hdd
device 6 osd.6 class hdd
device 7 osd.7 class hdd
device 8 osd.8 class hdd
device 9 osd.9 class hdd
device 10 osd.10 class hdd
device 11 osd.11 class hdd
device 12 osd.12 class hdd
device 13 osd.13 class hdd
device 14 osd.14 class hdd
device 15 osd.15 class hdd
device 16 osd.16 class hdd
device 17 osd.17 class hdd
device 18 osd.18 class hdd
device 19 osd.19 class hdd
device 20 osd.20 class hdd
device 21 osd.21 class hdd
device 22 osd.22 class hdd
device 23 osd.23 class hdd

# types
type 0 osd
type 1 host
type 2 chassis
type 3 rack
type 4 row
type 5 pdu
type 6 pod
type 7 room
type 8 datacenter
type 9 region
type 10 root

# buckets
host pve1 {
id -3 # do not change unnecessarily
id -4 class hdd # do not change unnecessarily
# weight 1.636
alg straw2
hash 0 # rjenkins1
item osd.0 weight 0.273
item osd.1 weight 0.273
item osd.2 weight 0.273
item osd.4 weight 0.273
item osd.5 weight 0.273
item osd.3 weight 0.273
}
host pve2 {
id -5 # do not change unnecessarily
id -6 class hdd # do not change unnecessarily
# weight 1.636
alg straw2
hash 0 # rjenkins1
item osd.6 weight 0.273
item osd.7 weight 0.273
item osd.8 weight 0.273
item osd.9 weight 0.273
item osd.10 weight 0.273
item osd.12 weight 0.273
}
host pve3 {
id -7 # do not change unnecessarily
id -8 class hdd # do not change unnecessarily
# weight 1.636
alg straw2
hash 0 # rjenkins1
item osd.11 weight 0.273
item osd.13 weight 0.273
item osd.14 weight 0.273
item osd.15 weight 0.273
item osd.16 weight 0.273
item osd.17 weight 0.273
}
host pve4 {
id -9 # do not change unnecessarily
id -10 class hdd # do not change unnecessarily
# weight 2.000
alg straw2
hash 0 # rjenkins1
item osd.18 weight 0.273
item osd.19 weight 0.273
item osd.20 weight 0.273
item osd.21 weight 0.273
item osd.22 weight 0.455
item osd.23 weight 0.455
}
root default {
id -1 # do not change unnecessarily
id -2 class hdd # do not change unnecessarily
# weight 6.909
alg straw2
hash 0 # rjenkins1
item pve1 weight 1.636
item pve2 weight 1.636
item pve3 weight 1.636
item pve4 weight 2.000
}

# rules
rule replicated_rule {
id 0
type replicated
min_size 1
max_size 10
step take default
step chooseleaf firstn 0 type host
step emit
}

# end crush map
 
Can you please post in CODE tags, it will keep to formatting. You can find it under the three dots in the editor.
 
Code:
# begin crush map
tunable choose_local_tries 0
tunable choose_local_fallback_tries 0
tunable choose_total_tries 50
tunable chooseleaf_descend_once 1
tunable chooseleaf_vary_r 1
tunable chooseleaf_stable 1
tunable straw_calc_version 1
tunable allowed_bucket_algs 54

# devices
device 0 osd.0 class hdd
device 1 osd.1 class hdd
device 2 osd.2 class hdd
device 3 osd.3 class hdd
device 4 osd.4 class hdd
device 5 osd.5 class hdd
device 6 osd.6 class hdd
device 7 osd.7 class hdd
device 8 osd.8 class hdd
device 9 osd.9 class hdd
device 10 osd.10 class hdd
device 11 osd.11 class hdd
device 12 osd.12 class hdd
device 13 osd.13 class hdd
device 14 osd.14 class hdd
device 15 osd.15 class hdd
device 16 osd.16 class hdd
device 17 osd.17 class hdd
device 18 osd.18 class hdd
device 19 osd.19 class hdd
device 20 osd.20 class hdd
device 21 osd.21 class hdd
device 22 osd.22 class hdd
device 23 osd.23 class hdd

# types
type 0 osd
type 1 host
type 2 chassis
type 3 rack
type 4 row
type 5 pdu
type 6 pod
type 7 room
type 8 datacenter
type 9 region
type 10 root

# buckets
host pve1 {
    id -3        # do not change unnecessarily
    id -4 class hdd        # do not change unnecessarily
    # weight 1.636
    alg straw2
    hash 0    # rjenkins1
    item osd.0 weight 0.273
    item osd.1 weight 0.273
    item osd.2 weight 0.273
    item osd.4 weight 0.273
    item osd.5 weight 0.273
    item osd.3 weight 0.273
}
host pve2 {
    id -5        # do not change unnecessarily
    id -6 class hdd        # do not change unnecessarily
    # weight 1.636
    alg straw2
    hash 0    # rjenkins1
    item osd.6 weight 0.273
    item osd.7 weight 0.273
    item osd.8 weight 0.273
    item osd.9 weight 0.273
    item osd.10 weight 0.273
    item osd.12 weight 0.273
}
host pve3 {
    id -7        # do not change unnecessarily
    id -8 class hdd        # do not change unnecessarily
    # weight 1.636
    alg straw2
    hash 0    # rjenkins1
    item osd.11 weight 0.273
    item osd.13 weight 0.273
    item osd.14 weight 0.273
    item osd.15 weight 0.273
    item osd.16 weight 0.273
    item osd.17 weight 0.273
}
host pve4 {
    id -9        # do not change unnecessarily
    id -10 class hdd        # do not change unnecessarily
    # weight 2.000
    alg straw2
    hash 0    # rjenkins1
    item osd.18 weight 0.273
    item osd.19 weight 0.273
    item osd.20 weight 0.273
    item osd.21 weight 0.273
    item osd.22 weight 0.455
    item osd.23 weight 0.455
}
root default {
    id -1        # do not change unnecessarily
    id -2 class hdd        # do not change unnecessarily
    # weight 6.909
    alg straw2
    hash 0    # rjenkins1
    item pve1 weight 1.636
    item pve2 weight 1.636
    item pve3 weight 1.636
    item pve4 weight 2.000
}

# rules
rule replicated_rule {
    id 0
    type replicated
    min_size 1
    max_size 10
    step take default
    step chooseleaf firstn 0 type host
    step emit
}

# end crush map
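As a side note, a decompiled map like the one above can be exported, edited and re-injected with the standard tools if the layout ever needs changing. A sketch (the file names are arbitrary):

Code:
# Export the current compiled CRUSH map and decompile it to text
ceph osd getcrushmap -o crushmap.bin
crushtool -d crushmap.bin -o crushmap.txt

# After editing the text file, recompile and inject it back (with care)
crushtool -c crushmap.txt -o crushmap.new
ceph osd setcrushmap -i crushmap.new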
 
Code:
[global]
     auth client required = cephx
     auth cluster required = cephx
     auth service required = cephx
     cluster network = 10.10.10.0/24
     fsid = a9926f78-4366-4be5-a77c-7db26a419e86
     keyring = /etc/pve/priv/$cluster.$name.keyring
     mon allow pool delete = true
     osd journal size = 5120
     osd pool default min size = 2
     osd pool default size = 3
     public network = 10.10.10.0/24

[osd]
     keyring = /var/lib/ceph/osd/ceph-$id/keyring

[mon.pve3]
     host = pve3
     mon addr = 10.10.10.13:6789

[mon.pve2]
     host = pve2
     mon addr = 10.10.10.12:6789

[mon.pve1]
     host = pve1
     mon addr = 10.10.10.11:6789

[mon.pve4]
     host = pve4
     mon addr = 10.10.10.14:6789
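Since the config defines four monitors on 10.10.10.0/24, it is also worth confirming that all of them are in quorum and reachable over that network. A sketch with standard commands:

Code:
# Monitor map and quorum membership
ceph mon stat
ceph quorum_status --format json-pretty

# Basic reachability of each monitor address from this node
for ip in 10.10.10.11 10.10.10.12 10.10.10.13 10.10.10.14; do
    ping -c1 -W1 "$ip" >/dev/null && echo "$ip ok" || echo "$ip unreachable"
done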
 
Code:
root@pve2:~# ceph osd df tree
ID CLASS WEIGHT  REWEIGHT SIZE  USE   AVAIL  %USE  VAR  PGS TYPE NAME
-1       6.90857        - 7075G 4867G  2207G 68.80 1.00   - root default
-3       1.63614        - 1675G 1183G   492G 70.62 1.03   -     host pve1
 0   hdd 0.27269  0.85004  279G  196G 85154M 70.22 1.02 100         osd.0
 1   hdd 0.27269  0.80005  279G  225G 55040M 80.75 1.17 101         osd.1
 2   hdd 0.27269  1.00000  279G  214G 65967M 76.93 1.12 108         osd.2
 3   hdd 0.27269  1.00000  279G  170G   108G 61.09 0.89  88         osd.3
 4   hdd 0.27269  1.00000  279G  210G 69948M 75.54 1.10 111         osd.4
 5   hdd 0.27269  1.00000  279G  165G   114G 59.18 0.86  95         osd.5
-5       1.63614        - 1675G 1144G   530G 68.33 0.99   -     host pve2
 6   hdd 0.27269  1.00000  279G  183G 98417M 65.58 0.95  93         osd.6
 7   hdd 0.27269  1.00000  279G  214G 66421M 76.77 1.12 108         osd.7
 8   hdd 0.27269  1.00000  279G  160G   119G 57.36 0.83  82         osd.8
 9   hdd 0.27269  1.00000  279G  177G   102G 63.40 0.92 101         osd.9
10   hdd 0.27269  1.00000  279G  201G 79273M 72.28 1.05  97         osd.10
12   hdd 0.27269  1.00000  279G  208G 72712M 74.57 1.08  96         osd.12
-7       1.63614        - 1675G 1255G   420G 74.90 1.09   -     host pve3
11   hdd 0.27269  1.00000  279G  211G 69583M 75.67 1.10 104         osd.11
13   hdd 0.27269  1.00000  279G  225G 54751M 80.85 1.18 116         osd.13
14   hdd 0.27269  1.00000  279G  183G 98053M 65.71 0.96 103         osd.14
15   hdd 0.27269  1.00000  279G  182G 99123M 65.34 0.95 103         osd.15
16   hdd 0.27269  1.00000  279G  213G 67362M 76.44 1.11 105         osd.16
17   hdd 0.27269  0.95001  279G  238G 41768M 85.39 1.24 114         osd.17
-9       2.00015        - 2048G 1284G   764G 62.70 0.91   -     host pve4
18   hdd 0.27269  1.00000  279G  159G   119G 57.22 0.83  87         osd.18
19   hdd 0.27269  1.00000  279G  168G   110G 60.33 0.88  88         osd.19
20   hdd 0.27269  1.00000  279G  187G 94007M 67.13 0.98  89         osd.20
21   hdd 0.27269  1.00000  279G  174G   104G 62.64 0.91  85         osd.21
22   hdd 0.45470  1.00000  465G  281G   184G 60.43 0.88 141         osd.22
23   hdd 0.45470  1.00000  465G  312G   153G 67.04 0.97 164         osd.23
                    TOTAL 7075G 4867G  2207G 68.80
MIN/MAX VAR: 0.83/1.24  STDDEV: 8.01
root@pve2:~#
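The MIN/MAX VAR of 0.83/1.24 in that output shows quite an uneven distribution, which is why single OSDs hit nearfull while the cluster as a whole is only about 69% used. A sketch of how this could be evened out with reweight-by-utilization or the mgr balancer module; not something to run blindly while the cluster is this full:

Code:
# Dry run: show what reweight-by-utilization would change (threshold 110%)
ceph osd test-reweight-by-utilization 110

# Apply it only if the proposed reweights look sane
ceph osd reweight-by-utilization 110

# Alternative: the mgr balancer module
ceph balancer status
ceph balancer mode crush-compat
ceph balancer on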
 
Just a quick note: nothing in the setup has changed since day one, but I can't recall whether the firewall was on when it was set up. The current firewall state is on on each node. Should I turn it off? I'm also not sure how Proxmox behaves when things go wrong in Ceph: does it shut things down or turn things on as a precaution? It's been a few days since the drive failure, so I'm not sure what Proxmox would do in that case.
 
Well, maybe a simple INPUT -> ACCEPT might already make a difference, if the firewall rules weren't correct.
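If the firewall is a suspect, its state can be checked and temporarily stopped per node to rule it out; Ceph needs 6789/tcp for the monitors and 6800-7300/tcp for the OSDs on the 10.10.10.0/24 network. A sketch:

Code:
# Show the firewall status on this node
pve-firewall status

# Temporarily stop it for testing (remember to start it again)
pve-firewall stop

# Verify the monitor port is reachable from another node (needs netcat installed)
nc -zv 10.10.10.11 6789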
 
Here is my latest update. Since I'm connected remotely and not on site, it took me longer to find out that one of the Ethernet cables used for bonding was blinking orange, and I recall it used to be green. That gave me the insight that one port wasn't working as it used to, so I asked them to replace the cable for that port, and they did.
The server started going up and down for a few minutes, but Ceph started to readjust itself, and the current reading is:
root@pve1:~# ceph -s
  cluster:
    id:     a9926f78-4366-4be5-a77c-7db26a419e86
    health: HEALTH_OK

  services:
    mon: 4 daemons, quorum pve1,pve2,pve3,pve4
    mgr: pve3(active), standbys: pve1, pve4, pve2
    osd: 24 osds: 24 up, 24 in

  data:
    pools:   4 pools, 832 pgs
    objects: 419k objects, 1575 GB
    usage:   4797 GB used, 2278 GB / 7075 GB avail
    pgs:     832 active+clean

  io:
    client:   15407 kB/s rd, 171 kB/s wr, 135 op/s rd, 14 op/s wr

which is fine for now. I will need to find out the reason for this sudden change, but anyway, we are alive and surviving again.
Thanks a lot for the help, guys. We have learned a valuable lesson from this: it will always be helpful to look at the syslog to check what is going on in the background.
Keep in mind it was pinging normally; just the speed was low.
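For the record, this is roughly how a degraded bond member can be spotted from the shell without being on site. A sketch; bond0 and eno1 are example interface names and will differ per setup:

Code:
# Per-slave link state of the bond (interface name is an example)
cat /proc/net/bonding/bond0

# Negotiated speed/duplex of a single NIC
ethtool eno1 | grep -E 'Speed|Duplex|Link detected'

# Recent link/bond related messages in the logs
journalctl --since "2 hours ago" | grep -iE 'bond|link'
grep -iE 'bond|link down' /var/log/syslog | tail -n 50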
 
17 hdd 0.27269 0.95001 279G 238G 41768M 85.39 1.24 114 osd.17
This OSD is already near full, and all in all there is not much space left to redistribute. Delete data or add more OSDs; otherwise you will sooner or later get into a similar situation again.

health: HEALTH_OK
Good that it works. It might just have been an issue on the switch. You never know.
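To act on the "delete data or add more OSDs" advice above, the fullest OSD can also be nudged down a little while new capacity is being added. A sketch; the reweight value is only an example and /dev/sdX is a placeholder for the new disk:

Code:
# Shift some PGs away from the fullest OSD (value is an example, range 0.0-1.0)
ceph osd reweight 17 0.90

# Add a new OSD on a spare disk (/dev/sdX is a placeholder)
pveceph createosd /dev/sdX    # PVE 5.x; newer releases use: pveceph osd create /dev/sdX

# Watch the recovery/backfill progress
ceph -w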
 
