I am getting extremely low write speeds on my minimal 3-node Ceph cluster. Each node has two 1TB Samsung 860 QVO SSDs (6 OSDs in total) and four 10G links in a LAG, split into 5 VLANs. iperf3 shows roughly 10 Gbit/s between every pair of nodes on all VLANs.
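(For reference, the network check was just plain iperf3 between node addresses, roughly as below; the address is an example from the 10.60.10.0/28 range and the exact flags are illustrative.)
Code:
# server on prox1
iperf3 -s
# client on prox2, one run per direction
iperf3 -c 10.60.10.1 -t 30
iperf3 -c 10.60.10.1 -t 30 -R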
Ceph config:
Code:
[global]
auth_client_required = cephx
auth_cluster_required = cephx
auth_service_required = cephx
cluster_network = 10.60.10.1/28
fsid = cb9aebb9-2aef-4797-872f-75a138c81ac0
mon_allow_pool_delete = true
mon_host = 10.60.10.2 10.60.10.3 10.60.10.1
osd_pool_default_min_size = 2
osd_pool_default_size = 2
public_network = 10.60.10.1/28
[client]
keyring = /etc/pve/priv/$cluster.$name.keyring
[mon.prox1]
public_addr = 10.60.10.1
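The pool the fio tests below run against is called Ceph1 and should be picking up the defaults above (size 2, min_size 2); if it is useful, that can be confirmed directly with:
Code:
ceph osd pool get Ceph1 size
ceph osd pool get Ceph1 min_size
ceph osd pool get Ceph1 pg_num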
crush map:
Code:
# begin crush map
tunable choose_local_tries 0
tunable choose_local_fallback_tries 0
tunable choose_total_tries 50
tunable chooseleaf_descend_once 1
tunable chooseleaf_vary_r 1
tunable chooseleaf_stable 1
tunable straw_calc_version 1
tunable allowed_bucket_algs 54
# devices
device 0 osd.0 class ssd
device 1 osd.1 class ssd
device 2 osd.2 class ssd
device 3 osd.3 class ssd
device 4 osd.4 class ssd
device 5 osd.5 class ssd
# types
type 0 osd
type 1 host
type 2 chassis
type 3 rack
type 4 row
type 5 pdu
type 6 pod
type 7 room
type 8 datacenter
type 9 zone
type 10 region
type 11 root
# buckets
host prox3 {
    id -3           # do not change unnecessarily
    id -4 class ssd # do not change unnecessarily
    # weight 1.818
    alg straw2
    hash 0          # rjenkins1
    item osd.0 weight 0.909
    item osd.1 weight 0.909
}
host prox2 {
    id -5           # do not change unnecessarily
    id -6 class ssd # do not change unnecessarily
    # weight 1.818
    alg straw2
    hash 0          # rjenkins1
    item osd.2 weight 0.909
    item osd.3 weight 0.909
}
host prox1 {
    id -7           # do not change unnecessarily
    id -8 class ssd # do not change unnecessarily
    # weight 1.818
    alg straw2
    hash 0          # rjenkins1
    item osd.4 weight 0.909
    item osd.5 weight 0.909
}
root default {
    id -1           # do not change unnecessarily
    id -2 class ssd # do not change unnecessarily
    # weight 5.455
    alg straw2
    hash 0          # rjenkins1
    item prox3 weight 1.818
    item prox2 weight 1.818
    item prox1 weight 1.818
}
# rules
rule replicated_rule {
    id 0
    type replicated
    min_size 1
    max_size 10
    step take default
    step chooseleaf firstn 0 type host
    step emit
}
# end crush map
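With the replicated_rule above and a pool size of 2, every write is placed on OSDs in two different hosts and is only acknowledged once both replicas have committed, so a single slow OSD or host drags down all write latency. The standard ceph CLI can sanity-check placement and per-OSD latency (nothing cluster-specific assumed here):
Code:
ceph osd df tree                    # weights and utilisation per host/OSD
ceph osd pool get Ceph1 crush_rule
ceph osd perf                       # per-OSD commit/apply latency in ms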
Here are the results of my fio test (I got similar results inside of a Windows VM):
Code:
root@prox1:~# fio -ioengine=rbd -name=test -direct=1 -rw=read -bs=4M -iodepth=16 -pool=Ceph1 -rbdname=vm-111-disk-0
test: (g=0): rw=read, bs=(R) 4096KiB-4096KiB, (W) 4096KiB-4096KiB, (T) 4096KiB-4096KiB, ioengine=rbd, iodepth=16
fio-3.12
Starting 1 process
Jobs: 1 (f=1): [R(1)][95.6%][r=2258MiB/s][r=564 IOPS][eta 00m:02s]
test: (groupid=0, jobs=1): err= 0: pid=3192071: Tue Feb 16 15:59:22 2021
read: IOPS=192, BW=771MiB/s (809MB/s)(32.0GiB/42477msec)
slat (nsec): min=1867, max=169154, avg=15109.84, stdev=8685.58
clat (msec): min=6, max=2865, avg=82.94, stdev=148.54
lat (msec): min=6, max=2865, avg=82.95, stdev=148.54
clat percentiles (msec):
| 1.00th=[ 13], 5.00th=[ 15], 10.00th=[ 17], 20.00th=[ 20],
| 30.00th=[ 23], 40.00th=[ 26], 50.00th=[ 27], 60.00th=[ 47],
| 70.00th=[ 82], 80.00th=[ 112], 90.00th=[ 165], 95.00th=[ 279],
| 99.00th=[ 735], 99.50th=[ 1083], 99.90th=[ 1703], 99.95th=[ 1921],
| 99.99th=[ 2869]
bw ( KiB/s): min=49152, max=2932736, per=98.53%, avg=778365.46, stdev=673798.19, samples=83
iops : min= 12, max= 716, avg=189.99, stdev=164.53, samples=83
lat (msec) : 10=0.56%, 20=21.19%, 50=39.25%, 100=15.05%, 250=18.21%
lat (msec) : 500=3.58%, 750=1.21%, 1000=0.38%
cpu : usr=0.92%, sys=0.39%, ctx=8209, majf=13, minf=134
IO depths : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=99.8%, 32=0.0%, >=64=0.0%
submit : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
complete : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.1%, 32=0.0%, 64=0.0%, >=64=0.0%
issued rwts: total=8192,0,0,0 short=0,0,0,0 dropped=0,0,0,0
latency : target=0, window=0, percentile=100.00%, depth=16
Run status group 0 (all jobs):
READ: bw=771MiB/s (809MB/s), 771MiB/s-771MiB/s (809MB/s-809MB/s), io=32.0GiB (34.4GB), run=42477-42477msec
root@prox1:~# fio -ioengine=rbd -name=test -direct=1 -rw=write -bs=4M -iodepth=16 -pool=Ceph1 -rbdname=vm-111-disk-0
test: (g=0): rw=write, bs=(R) 4096KiB-4096KiB, (W) 4096KiB-4096KiB, (T) 4096KiB-4096KiB, ioengine=rbd, iodepth=16
fio-3.12
Starting 1 process
Jobs: 1 (f=0): [f(1)][100.0%][w=60.0MiB/s][w=15 IOPS][eta 00m:00s]
test: (groupid=0, jobs=1): err= 0: pid=3195178: Tue Feb 16 16:34:25 2021
write: IOPS=3, BW=15.9MiB/s (16.7MB/s)(32.0GiB/2056095msec); 0 zone resets
slat (usec): min=707, max=15717, avg=2536.78, stdev=1700.41
clat (msec): min=246, max=14489, avg=4013.08, stdev=2070.79
lat (msec): min=248, max=14491, avg=4015.62, stdev=2071.01
clat percentiles (msec):
| 1.00th=[ 506], 5.00th=[ 894], 10.00th=[ 1401], 20.00th=[ 2165],
| 30.00th=[ 2802], 40.00th=[ 3306], 50.00th=[ 3842], 60.00th=[ 4396],
| 70.00th=[ 4933], 80.00th=[ 5738], 90.00th=[ 6812], 95.00th=[ 7617],
| 99.00th=[ 9597], 99.50th=[10537], 99.90th=[12013], 99.95th=[13489],
| 99.99th=[14429]
bw ( KiB/s): min= 8175, max=139264, per=100.00%, avg=30679.23, stdev=25862.97, samples=2183
iops : min= 1, max= 34, avg= 7.41, stdev= 6.31, samples=2183
lat (msec) : 250=0.01%, 500=0.96%, 750=2.04%, 1000=2.89%
cpu : usr=0.76%, sys=0.26%, ctx=4217, majf=1, minf=2430160
IO depths : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=99.8%, 32=0.0%, >=64=0.0%
submit : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
complete : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.1%, 32=0.0%, 64=0.0%, >=64=0.0%
issued rwts: total=0,8192,0,0 short=0,0,0,0 dropped=0,0,0,0
latency : target=0, window=0, percentile=100.00%, depth=16
Run status group 0 (all jobs):
WRITE: bw=15.9MiB/s (16.7MB/s), 15.9MiB/s-15.9MiB/s (16.7MB/s-16.7MB/s), io=32.0GiB (34.4GB), run=2056095-2056095msec
It does not seem to be the network: reads average ~770 MiB/s with bursts over 2 GiB/s, which is close to the 10 Gbit line rate, yet writes collapse to ~16 MiB/s. Any ideas why the write speeds are so crippled?
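If it helps narrow things down, I can also run fio against one of the SSDs directly, outside Ceph, to see how the drives handle small synchronous writes; a raw-device run would look roughly like this (the device name is a placeholder, and this destroys data on that disk):
Code:
# WARNING: writes straight to the device and destroys its contents
fio -name=ssd-sync-test -filename=/dev/sdX -ioengine=libaio -direct=1 -sync=1 \
    -rw=write -bs=4k -iodepth=1 -numjobs=1 -runtime=60 -time_based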
Thanks,
Stan