Hello Guys,
Single-node Ceph guy again.
I recently suffered a two-drive failure and lost some PGs, so I rebuilt my pools and am restoring from external backups. The restores are taking FOREVER. Testing against my SSD pool, which used to perform closer to 500 MB/s, it's now maxing out at 60 MB/s.
The node hosts ALL the storage. Two monitors. 10 Gbit fibre. Jumbo frames enabled. BlueStore.
Writing to the spinning OSDs (8 of them, all WD Red) also maxes out around 60 MB/s.
Any ideas?
Code:
rados -p SSDStorage bench -b 4194304 60 write -t 128 --no-cleanup
Total time run: 61.726422
Total writes made: 940
Write size: 4194304
Object size: 4194304
Bandwidth (MB/sec): 60.9139
Stddev Bandwidth: 56.7327
Max bandwidth (MB/sec): 276
Min bandwidth (MB/sec): 0
Average IOPS: 15
Stddev IOPS: 14
Max IOPS: 69
Min IOPS: 0
Average Latency(s): 8.10854
Stddev Latency(s): 1.80821
Max latency(s): 11.8255
Min latency(s): 2.68556
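In case it helps anyone following along: the min bandwidth of 0 and the stddev almost as large as the average suggest the pool is periodically stalling on one slow OSD rather than being uniformly slow. A quick, non-destructive way to look for the outlier (pool name matches mine, otherwise generic):
Code:
# Per-OSD commit/apply latency; a consistent outlier here usually
# points at the drive that is stalling the whole pool.
ceph osd perf

# Clean up the objects left behind by "rados bench ... --no-cleanup".
rados -p SSDStorage cleanup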
My CRUSH map:
Code:
# begin crush map
tunable choose_local_tries 0
tunable choose_local_fallback_tries 0
tunable choose_total_tries 50
tunable chooseleaf_descend_once 1
tunable chooseleaf_vary_r 1
tunable chooseleaf_stable 1
tunable straw_calc_version 1
tunable allowed_bucket_algs 54
# devices
device 0 osd.0 class hdd
device 1 osd.1 class hdd
device 2 osd.2 class hdd
device 3 osd.3 class hdd
device 4 osd.4 class hdd
device 5 osd.5 class ssd
device 6 osd.6 class hdd
device 7 osd.7 class hdd
device 8 osd.8 class ssd
device 9 osd.9 class ssd
device 10 osd.10 class hdd
device 11 osd.11 class ssd
# types
type 0 osd
type 1 host
type 2 chassis
type 3 rack
type 4 row
type 5 pdu
type 6 pod
type 7 room
type 8 datacenter
type 9 region
type 10 root
# buckets
host positronic {
id -10 # do not change unnecessarily
id -11 class hdd # do not change unnecessarily
id -12 class ssd # do not change unnecessarily
# weight 26.400
alg straw2
hash 0 # rjenkins1
item osd.2 weight 2.729
item osd.7 weight 2.729
item osd.10 weight 2.729
item osd.9 weight 0.465
item osd.8 weight 0.465
item osd.11 weight 1.819
item osd.5 weight 1.819
item osd.0 weight 2.729
item osd.1 weight 2.729
item osd.6 weight 2.729
item osd.3 weight 2.729
item osd.4 weight 2.729
}
root default {
id -1 # do not change unnecessarily
id -2 class hdd # do not change unnecessarily
id -6 class ssd # do not change unnecessarily
# weight 26.399
alg straw2
hash 0 # rjenkins1
item positronic weight 26.399
}
# rules
rule replicated_rule {
id 0
type replicated
min_size 1
max_size 10
step take positronic class hdd
step choose firstn 0 type osd
step emit
}
rule FastStoreage {
id 1
type replicated
min_size 1
max_size 10
step take default class ssd
step choose firstn 0 type osd
step emit
}
# end crush map
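For what it's worth, you can sanity-check offline that the FastStoreage rule only ever maps to the SSD OSDs (5, 8, 9, 11) by compiling the decompiled map and test-running the rule with crushtool. Just a sketch; the file names below are only examples from my working dir:
Code:
# Compile the edited text map back to binary form.
crushtool -c crushmap.txt -o crushmap.bin

# Show which OSDs rule id 1 (FastStoreage) would pick for 3 replicas.
crushtool -i crushmap.bin --test --rule 1 --num-rep 3 --show-mappings | head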
My ceph.conf:
Code:
[global]
auth client required = cephx
auth cluster required = cephx
auth service required = cephx
cluster network = 10.10.10.0/24
debug asok = 0/0
debug auth = 0/0
debug buffer = 0/0
debug client = 0/0
debug context = 0/0
debug crush = 0/0
debug filer = 0/0
debug filestore = 0/0
debug finisher = 0/0
debug heartbeatmap = 0/0
debug journal = 0/0
debug journaler = 0/0
debug lockdep = 0/0
debug mds = 0/0
debug mds balancer = 0/0
debug mds locker = 0/0
debug mds log = 0/0
debug mds log expire = 0/0
debug mds migrator = 0/0
debug mon = 0/0
debug monc = 0/0
debug ms = 0/0
debug objclass = 0/0
debug objectcacher = 0/0
debug objecter = 0/0
debug optracker = 0/0
debug osd = 0/0
debug paxos = 0/0
debug perfcounter = 0/0
debug rados = 0/0
debug rbd = 0/0
debug rgw = 0/0
debug throttle = 0/0
debug timer = 0/0
debug tp = 0/0
fsid = e791fb44-9c38-4e86-8edf-cdbc5a3f7d63
keyring = /etc/pve/priv/$cluster.$name.keyring
mon allow pool delete = true
osd crush chooseleaf type = 0
osd journal size = 5120
osd pool default min size = 2
osd pool default size = 3
osd_memory_target = 3221225472
osd recovery op priority = 2
osd max backfills = 1
osd recovery max active = 1
osd recovery threads = 1
public network = 10.10.10.0/24
[mds]
keyring = /var/lib/ceph/mds/ceph-$id/keyring
[osd]
keyring = /var/lib/ceph/osd/ceph-$id/keyring
[mon.Q]
host = Q
mon addr = 10.10.10.5:6789
[mon.positronic]
host = positronic
mon addr = 10.10.10.7:6789
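One thing worth noting in that conf: recovery/backfill is throttled to the minimum (osd max backfills = 1, osd recovery max active = 1). That shouldn't limit plain client writes by itself, but if the rebuilt pools were still backfilling during the restore, recovery and client I/O would be fighting over the same disks. A rough sketch for checking that and adjusting the throttles at runtime (the values are only examples; they revert to the ceph.conf settings when the OSDs restart):
Code:
# Is anything actually still recovering or backfilling?
ceph -s
ceph pg stat

# Adjust the recovery throttles at runtime on all OSDs (example values;
# runtime only, ceph.conf applies again after an OSD restart).
ceph tell osd.* injectargs '--osd-max-backfills 2 --osd-recovery-max-active 2'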
AHH POO! Looks like my Samsung EVO 850 has some SERIOUS performance issues:
Code:
root@positronic:~/crush20190705# ceph tell osd.9 bench
{
"bytes_written": 1073741824,
"blocksize": 4194304,
"elapsed_sec": 2.935917,
"bytes_per_sec": 365726185.092001,
"iops": 87.195917
}
root@positronic:~/crush20190705# ceph tell osd.8 bench
{
"bytes_written": 1073741824,
"blocksize": 4194304,
"elapsed_sec": 55.418091,
"bytes_per_sec": 19375294.320168,
"iops": 4.619430
}
root@positronic:~/crush20190705# ceph tell osd.5 bench
{
"bytes_written": 1073741824,
"blocksize": 4194304,
"elapsed_sec": 6.348679,
"bytes_per_sec": 169128386.978483,
"iops": 40.323350
}
root@positronic:~/crush20190705# ceph tell osd.11 bench
{
"bytes_written": 1073741824,
"blocksize": 4194304,
"elapsed_sec": 5.240622,
"bytes_per_sec": 204888238.978340,
"iops": 48.849163
}
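So osd.8 is roughly an order of magnitude slower than the other SSDs (~19 MB/s vs 170 to 365 MB/s), and with only four SSD OSDs behind the FastStoreage rule, most 3-replica writes will land on it and wait for it. A rough sketch of how I'd confirm which physical drive it is and check its health (the /dev path is a placeholder):
Code:
# Which physical device backs osd.8? Look at the "devices" /
# "bluestore_bdev_dev_node" fields in the output.
ceph osd metadata 8

# SMART health and wear on that drive (replace sdX with the real device).
smartctl -a /dev/sdX

# If the drive really is dying, take it out so the pool stops waiting on it.
ceph osd out 8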