Good morning all.
I have set up a 5-node Ceph cluster. I made sure to buy enterprise SSDs with power-loss protection (PLP), yet performance is still slow: about 1200 MB/s without load and about 600 MB/s under load. I have read online that my RAID controller may be the bottleneck, but I have not found any good direction on which diagnostic commands would actually prove that. My server specs, benchmark results, and config are below. Any help would be greatly appreciated.
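So far the only way I can think of to take Ceph out of the picture is a raw fio write test against a single SSD while watching iostat from another shell. This is just a sketch of what I had in mind; /dev/sdX is a placeholder for one of the PM1643 data disks, and since fio writes straight to the device it would destroy any OSD on it, so it would have to be a spare disk:

# Raw 4K random-write baseline on one SSD, bypassing Ceph entirely.
# WARNING: destructive - only run against a disk with no OSD/data on it.
fio --name=raw4k --filename=/dev/sdX --ioengine=libaio --direct=1 \
    --rw=randwrite --bs=4k --iodepth=16 --runtime=30 --time_based \
    --group_reporting

# In a second shell, watch per-device utilization and latency live:
iostat -x 1 /dev/sdX

My thinking: if the raw disk does far more 4K IOPS than it manages as an OSD, the bottleneck is above the disk; if the raw numbers are already low, the H740P path is suspect.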
5x
Dell PowerEdge R640
Intel(R) Xeon(R) Gold 6154 CPU @ 3.00GHz
128GB DDR4 2666 MT/s
Storage
PERC H740P Mini in Enhanced HBA mode
OS Disks
300GB 12Gb/s SAS 15K RPM
6x
SAMSUNG PM1643 2.5" SAS 12Gb/s SSDs
Network
Intel Gigabit 4P X710/I350
2x 10GbE (X710), both used for Ceph - iperf shows 11.5 GBytes transferred per test
2x 1GbE (I350), both used for LAN
Switch is a Juniper 4650
All connected via DAC cables.
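Before benchmarking I also wanted to confirm the H740P is really passing the disks through and see what the cache settings look like. I was planning to use Dell's PERC CLI plus smartctl/sdparm; the /c0 controller number and /dev/sdX below are assumptions for my layout, and the binary is typically named perccli64 on Linux:

# Controller status, personality (should show eHBA), and cache settings.
perccli64 /c0 show all

# Volatile write-cache state on a SAS SSD (affects benchmark numbers,
# though PLP drives should be safe with it either way).
smartctl -g wcache /dev/sdX
sdparm --get=WCE /dev/sdX

Here is my current rbd bench result: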
rbd bench --io-type write --io-size 4096 --io-threads 16 --io-total 1G ch3-ceph/vm-100-disk-3
bench type write io_size 4096 io_threads 16 bytes 1073741824 pattern sequential
  SEC       OPS   OPS/SEC   BYTES/SEC
    1    114048    114283   446 MiB/s
    2    231296    115762   452 MiB/s
elapsed: 2 ops: 262144 ops/sec: 115574 bytes/sec: 451 MiB/s
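At 4K, ~115k ops/s works out to ~451 MiB/s, so this bench is IOPS-bound rather than bandwidth-bound. For comparison I was also going to rerun with a larger block size and add a pool-level bench (the pool and image names are from my setup):

# Same RBD bench with 4M writes to measure bandwidth instead of IOPS.
rbd bench --io-type write --io-size 4M --io-threads 16 --io-total 10G ch3-ceph/vm-100-disk-3

# Pool-level write bench for 30 seconds with 4 MiB objects and 16 threads.
rados bench -p ch3-ceph 30 write -b 4194304 -t 16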
Ceph Config
[global]
auth_client_required = cephx
auth_cluster_required = cephx
auth_service_required = cephx
cluster_network = 192.168.1.3/24
fsid = dfc17811-34c2-4ca5-ba01-3a076963ec8d
mon_allow_pool_delete = true
mon_host = 192.168.0.3 192.168.0.4 192.168.0.5 192.168.0.6
ms_bind_ipv4 = true
ms_bind_ipv6 = false
osd_pool_default_min_size = 2
osd_pool_default_size = 3
public_network = 192.168.0.3/24
[client]
keyring = /etc/pve/priv/$cluster.$name.keyring
[client.crash]
keyring = /etc/pve/ceph/$cluster.$name.keyring
[mon.vh-ch-07]
public_addr = 192.168.0.3
[mon.vh-ch-08]
public_addr = 192.168.0.4
[mon.vh-ch-09]
public_addr = 192.168.0.5
[mon.vh-ch-10]
public_addr = 192.168.0.6
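Since the public and cluster networks are split across the two 10GbE links, I also wanted to verify the OSDs actually bound to the 192.168.1.x cluster network and that replication traffic really uses it:

# Each OSD line lists its public address first, then its cluster address;
# the second one should be in 192.168.1.0/24.
ceph osd dump | grep '^osd'

# Watch byte counters on the cluster-network NIC while a bench runs
# (eth1 is a placeholder for whichever interface carries 192.168.1.x).
ip -s link show eth1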
Crushmap
# begin crush map
tunable choose_local_tries 0
tunable choose_local_fallback_tries 0
tunable choose_total_tries 50
tunable chooseleaf_descend_once 1
tunable chooseleaf_vary_r 1
tunable chooseleaf_stable 1
tunable straw_calc_version 1
tunable allowed_bucket_algs 54
# devices
device 0 osd.0 class ssd
device 1 osd.1 class ssd
device 2 osd.2 class ssd
device 3 osd.3 class ssd
device 4 osd.4 class ssd
device 5 osd.5 class ssd
device 6 osd.6 class ssd
device 7 osd.7 class ssd
device 8 osd.8 class ssd
device 9 osd.9 class ssd
device 10 osd.10 class ssd
device 11 osd.11 class ssd
device 12 osd.12 class ssd
device 13 osd.13 class ssd
device 14 osd.14 class ssd
device 15 osd.15 class ssd
device 16 osd.16 class ssd
device 17 osd.17 class ssd
device 18 osd.18 class ssd
device 19 osd.19 class ssd
device 20 osd.20 class ssd
device 21 osd.21 class ssd
device 22 osd.22 class ssd
device 23 osd.23 class ssd
# types
type 0 osd
type 1 host
type 2 chassis
type 3 rack
type 4 row
type 5 pdu
type 6 pod
type 7 room
type 8 datacenter
type 9 zone
type 10 region
type 11 root
# buckets
host vh-ch-07 {
id -3 # do not change unnecessarily
id -4 class ssd # do not change unnecessarily
# weight 20.95863
alg straw2
hash 0 # rjenkins1
item osd.0 weight 3.49309
item osd.3 weight 3.49309
item osd.4 weight 3.49309
item osd.5 weight 3.49319
item osd.6 weight 3.49309
item osd.7 weight 3.49309
}
host vh-ch-08 {
id -5 # do not change unnecessarily
id -6 class ssd # do not change unnecessarily
# weight 20.95853
alg straw2
hash 0 # rjenkins1
item osd.1 weight 3.49309
item osd.11 weight 3.49309
item osd.12 weight 3.49309
item osd.13 weight 3.49309
item osd.14 weight 3.49309
item osd.15 weight 3.49309
}
host vh-ch-09 {
id -7 # do not change unnecessarily
id -8 class ssd # do not change unnecessarily
# weight 20.95853
alg straw2
hash 0 # rjenkins1
item osd.2 weight 3.49309
item osd.19 weight 3.49309
item osd.20 weight 3.49309
item osd.21 weight 3.49309
item osd.22 weight 3.49309
item osd.23 weight 3.49309
}
host vh-ch-10 {
id -9 # do not change unnecessarily
id -10 class ssd # do not change unnecessarily
# weight 20.95874
alg straw2
hash 0 # rjenkins1
item osd.10 weight 3.49309
item osd.8 weight 3.49309
item osd.9 weight 3.49319
item osd.16 weight 3.49309
item osd.17 weight 3.49309
item osd.18 weight 3.49319
}
host vh-ch-11 {
id -11 # do not change unnecessarily
id -12 class ssd # do not change unnecessarily
# weight 0.00000
alg straw2
hash 0 # rjenkins1
}
root default {
id -1 # do not change unnecessarily
id -2 class ssd # do not change unnecessarily
# weight 83.83443
alg straw2
hash 0 # rjenkins1
item vh-ch-07 weight 20.95863
item vh-ch-08 weight 20.95853
item vh-ch-09 weight 20.95853
item vh-ch-10 weight 20.95874
item vh-ch-11 weight 0.00000
}
# rules
rule replicated_rule {
id 0
type replicated
step take default
step chooseleaf firstn 0 type host
step emit
}
# end crush map
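One thing I noticed in the map above: vh-ch-11 has no OSDs yet (weight 0.00000), so all writes are replicated across only the other four hosts. I was going to sanity-check the layout and per-OSD latency with:

# Confirm which hosts hold OSDs and their weights.
ceph osd tree

# Per-OSD commit/apply latency in ms; a single slow OSD behind the
# controller should stand out here under load.
ceph osd perf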