Hello,
I am currently facing a performance issue with my Ceph SSD pool.
There are 4 nodes (connected at 10 Gbps) across two datacenters, each of them with 3 SSD OSDs.
An iperf benchmark between the two datacenters reports at least 5 Gbps on the network interfaces, with a 1500-byte MTU.
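For reference, that figure came from a plain iperf run between the two sites, roughly like the sketch below (the target address and options are illustrative, not the exact invocation):
Code:
# server side, on a node in datacenter1 (e.g. datacenter1-node1, 192.168.2.2)
iperf -s
# client side, on a node in datacenter2
iperf -c 192.168.2.2 -t 30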
SSD reference: Intel S4500 Series, 960 GB.
Please find below the results of the following command:
Code:
rados -p rbd-ssd bench -b 4194304 60 write -t 128 --no-cleanup
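# -b 4194304: 4 MiB objects, 60-second write test, -t 128: 128 concurrent operations
# --no-cleanup: keep the benchmark objects (they can be removed later with 'rados -p rbd-ssd cleanup')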
Code:
root@datacenter1-node1:~# rados -p rbd-ssd bench -b 4194304 60 write -t 128 --no-cleanup
Total time run: 60.951115
Total writes made: 5634
Write size: 4194304
Object size: 4194304
Bandwidth (MB/sec): 369.739
Stddev Bandwidth: 92.156
Max bandwidth (MB/sec): 548
Min bandwidth (MB/sec): 0
Average IOPS: 92
Stddev IOPS: 23
Max IOPS: 137
Min IOPS: 0
Average Latency(s): 1.37068
Stddev Latency(s): 0.317161
Max latency(s): 4.11179
Min latency(s): 0.598524
Code:
root@datacenter1-node2:~# rados -p rbd-ssd bench -b 4194304 60 write -t 128 --no-cleanup
Total time run: 60.629361
Total writes made: 5640
Write size: 4194304
Object size: 4194304
Bandwidth (MB/sec): 372.097
Stddev Bandwidth: 91.4801
Max bandwidth (MB/sec): 564
Min bandwidth (MB/sec): 0
Average IOPS: 93
Stddev IOPS: 22
Max IOPS: 141
Min IOPS: 0
Average Latency(s): 1.36226
Stddev Latency(s): 0.369185
Max latency(s): 4.56999
Min latency(s): 0.57865
Code:
root@datacenter2-node1:~# rados -p rbd-ssd bench -b 4194304 60 write -t 128 --no-cleanup
Total time run: 62.351253
Total writes made: 2328
Write size: 4194304
Object size: 4194304
Bandwidth (MB/sec): 149.347
Stddev Bandwidth: 49.9059
Max bandwidth (MB/sec): 240
Min bandwidth (MB/sec): 0
Average IOPS: 37
Stddev IOPS: 12
Max IOPS: 60
Min IOPS: 0
Average Latency(s): 3.38754
Stddev Latency(s): 0.508423
Max latency(s): 6.03627
Min latency(s): 2.06605
Code:
root@datacenter2-node2:~# rados -p rbd-ssd bench -b 4194304 60 write -t 128 --no-cleanup
Total time run: 61.383307
Total writes made: 3758
Write size: 4194304
Object size: 4194304
Bandwidth (MB/sec): 244.887
Stddev Bandwidth: 65.4416
Max bandwidth (MB/sec): 424
Min bandwidth (MB/sec): 0
Average IOPS: 61
Stddev IOPS: 16
Max IOPS: 106
Min IOPS: 0
Average Latency(s): 2.05124
Stddev Latency(s): 0.398137
Max latency(s): 4.74372
Min latency(s): 0.370294
This setup is running on PVE 5.1 with Ceph Luminous, installed following the Proxmox wiki and documentation. There is no specific tuning; I kept the default and recommended values.
Regarding the pool, it is named rbd-ssd and is configured with 2 replicas on the default replicated_rule.
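For reference, these settings can be double-checked on the pool itself (size, min_size and the CRUSH rule it uses):
Code:
ceph osd pool get rbd-ssd size
ceph osd pool get rbd-ssd min_size
ceph osd pool get rbd-ssd crush_rule
My ceph.conf follows: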
Code:
[global]
auth client required = cephx
auth cluster required = cephx
auth service required = cephx
cluster network = 192.168.2.0/24
fsid = 5b172e1e-d3a8-441b-95c2-d82074a87fd3
keyring = /etc/pve/priv/$cluster.$name.keyring
mon allow pool delete = true
osd journal size = 5120
osd pool default min size = 2
osd pool default size = 3
public network = 192.168.2.0/24
[osd]
keyring = /var/lib/ceph/osd/ceph-$id/keyring
[mon.datacenter2-node2]
host = datacenter2-node2
mon addr = 192.168.2.4:6789
[mon.datacenter1-node1]
host = datacenter1-node1
mon addr = 192.168.2.2:6789
[mon.datacenter1-node2]
host = datacenter1-node2
mon addr = 192.168.2.1:6789
[mon.datacenter2-node1]
host = datacenter2-node1
mon addr = 192.168.2.3:6789
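The CRUSH map below was obtained by exporting and decompiling it, roughly like this:
Code:
ceph osd getcrushmap -o crushmap.bin
crushtool -d crushmap.bin -o crushmap.txt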
Code:
# begin crush map
tunable choose_local_tries 0
tunable choose_local_fallback_tries 0
tunable choose_total_tries 50
tunable chooseleaf_descend_once 1
tunable chooseleaf_vary_r 1
tunable chooseleaf_stable 1
tunable straw_calc_version 1
tunable allowed_bucket_algs 54
# devices
device 0 osd.0 class ssd
device 1 osd.1 class ssd
device 2 osd.2 class ssd
device 3 osd.3 class ssd
device 4 osd.4 class ssd
device 5 osd.5 class ssd
device 6 osd.6 class ssd
device 7 osd.7 class ssd
device 8 osd.8 class ssd
device 9 osd.9 class ssd
device 10 osd.10 class ssd
device 11 osd.11 class ssd
# types
type 0 osd
type 1 host
type 2 chassis
type 3 rack
type 4 row
type 5 pdu
type 6 pod
type 7 room
type 8 datacenter
type 9 region
type 10 root
# buckets
host datacenter1-node2 {
id -3 # do not change unnecessarily
id -4 class ssd # do not change unnecessarily
# weight 2.620
alg straw2
hash 0 # rjenkins1
item osd.0 weight 0.873
item osd.3 weight 0.873
item osd.4 weight 0.873
}
host datacenter2-node1 {
id -5 # do not change unnecessarily
id -6 class ssd # do not change unnecessarily
# weight 2.620
alg straw2
hash 0 # rjenkins1
item osd.1 weight 0.873
item osd.5 weight 0.873
item osd.7 weight 0.873
}
host datacenter1-node1 {
id -7 # do not change unnecessarily
id -8 class ssd # do not change unnecessarily
# weight 2.620
alg straw2
hash 0 # rjenkins1
item osd.2 weight 0.873
item osd.6 weight 0.873
item osd.8 weight 0.873
}
host datacenter2-node2 {
id -9 # do not change unnecessarily
id -10 class ssd # do not change unnecessarily
# weight 2.620
alg straw2
hash 0 # rjenkins1
item osd.9 weight 0.873
item osd.10 weight 0.873
item osd.11 weight 0.873
}
root default {
id -1 # do not change unnecessarily
id -2 class ssd # do not change unnecessarily
# weight 10.478
alg straw2
hash 0 # rjenkins1
item datacenter1-node2 weight 2.620
item datacenter2-node1 weight 2.620
item datacenter1-node1 weight 2.620
item datacenter2-node2 weight 2.620
}
# rules
rule replicated_rule {
id 0
type replicated
min_size 1
max_size 10
step take default
step chooseleaf firstn 0 type host
step emit
}
# end crush map
I am trying to figure out where this performance issue comes from, but I am currently stuck.
My concern is about the node datacenter2-node1, which performs poorly compared to the three other nodes: it only reaches a write bandwidth of 149.347 MB/sec.
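If it helps, I can also post the output of per-OSD checks such as the following (standard status commands, osd.1 being one of the OSDs on datacenter2-node1):
Code:
ceph osd perf
ceph osd df tree
ceph tell osd.1 bench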
Help and hints are welcome!