Install Proxmox-Ceph on Exadata

Discussion in 'Proxmox VE: Installation and configuration' started by afrugone, Nov 27, 2017.

  1. afrugone

    afrugone Member

    - Two weeks ago I received a second-hand Exadata X3-2 with:

    Two processing servers (Oracle Database Servers):
    - 2 x eight-core Intel® Xeon® E5-2690 processors (2.9 GHz)
    - 256GB memory
    - 2 x InfiniBand 40Gb/s
    - 4 x 10GbE

    Three storage servers (Exadata Storage Servers):
    - 2 x eight-core Intel® Xeon® E5-2690 processors (2.9 GHz)
    - 64GB memory
    - 16 x 100GB SSD
    - 12 x 600GB SAS (MegaRAID, RAID5)
    - 2 x InfiniBand 40Gb/s
    - 4 x 10GbE

    Two InfiniBand switches

    One Cisco 1GbE switch


    I've installed:
    - A Ceph cluster on the three storage servers
    - A PVE cluster on the two processing servers

    The installation process was easy and problem-free. Now I'd like to reinstall everything, this time taking care to get the best possible configuration.

    1. For Ceph on the storage servers I added all disks (16 SSDs and 1 HDD per node) as OSDs and used InfiniBand as the storage network. Is this the right configuration? Any recommendations? See the benchmarks below; are these numbers reasonable for this setup?

    2. I want to use the Ceph storage from the processing servers. Do I have to install Ceph on those servers, or only copy ceph_storage.keyring? I followed the instructions at https://pve.proxmox.com/wiki/Ceph_Server#Ceph_Client, but it didn't work. The InfiniBand communication is working; I can ping between the storage and processing servers.

    Thanks for your help

    Regards
    Alfredo

    rados bench -p CEPH 10 write --no-cleanup
    Total time run: 10.076562
    Total writes made: 1253
    Write size: 4194304
    Object size: 4194304
    Bandwidth (MB/sec): 497.392
    Stddev Bandwidth: 25.3719
    Max bandwidth (MB/sec): 520
    Min bandwidth (MB/sec): 448
    Average IOPS: 124
    Stddev IOPS: 6
    Max IOPS: 130
    Min IOPS: 112
    Average Latency(s): 0.128631
    Stddev Latency(s): 0.0738061
    Max latency(s): 0.771186
    Min latency(s): 0.0183194

    rados bench -p CEPH 10 seq
    Total time run: 5.351899
    Total reads made: 1262
    Read size: 4194304
    Object size: 4194304
    Bandwidth (MB/sec): 943.217
    Average IOPS: 235
    Stddev IOPS: 8
    Max IOPS: 247
    Min IOPS: 231
    Average Latency(s): 0.0667516
    Max latency(s): 0.268906
    Min latency(s): 0.0057336

    [Image: network diagram of the cluster]

    CONFIGURATION:
    Virtual Environment 5.1-35
    Node 'STO002'
    [global]
    auth client required = cephx
    auth cluster required = cephx
    auth service required = cephx
    cluster network = 192.168.111.0/24
    fsid = 08f4b78e-8201-4bdf-b8ee-cf87505a40dc
    keyring = /etc/pve/priv/$cluster.$name.keyring
    mon allow pool delete = true
    osd journal size = 5120
    osd pool default min size = 2
    osd pool default size = 3
    public network = 192.168.111.0/24

    [osd]
    keyring = /var/lib/ceph/osd/ceph-$id/keyring

    [mon.STO003]
    host = STO003
    mon addr = 192.168.111.113:6789

    [mon.STO002]
    host = STO002
    mon addr = 192.168.111.112:6789

    [mon.STO001]
    host = STO001
    mon addr = 192.168.111.111:6789

    # begin crush map
    tunable choose_local_tries 0
    tunable choose_local_fallback_tries 0
    tunable choose_total_tries 50
    tunable chooseleaf_descend_once 1
    tunable chooseleaf_vary_r 1
    tunable chooseleaf_stable 1
    tunable straw_calc_version 1
    tunable allowed_bucket_algs 54

    # devices
    device 0 osd.0 class ssd
    device 1 osd.1 class ssd
    device 2 osd.2 class ssd
    device 3 osd.3 class ssd
    device 4 osd.4 class ssd
    device 5 osd.5 class ssd
    device 6 osd.6 class ssd
    device 7 osd.7 class ssd
    device 8 osd.8 class ssd
    device 9 osd.9 class ssd
    device 10 osd.10 class ssd
    device 11 osd.11 class ssd
    device 12 osd.12 class ssd
    device 13 osd.13 class ssd
    device 14 osd.14 class ssd
    device 15 osd.15 class ssd
    device 16 osd.16 class ssd
    device 17 osd.17 class ssd
    device 18 osd.18 class ssd
    device 19 osd.19 class ssd
    device 20 osd.20 class ssd
    device 21 osd.21 class ssd
    device 22 osd.22 class ssd
    device 23 osd.23 class ssd
    device 24 osd.24 class ssd
    device 25 osd.25 class ssd
    device 26 osd.26 class ssd
    device 27 osd.27 class ssd
    device 28 osd.28 class ssd
    device 29 osd.29 class ssd
    device 30 osd.30 class ssd
    device 31 osd.31 class ssd
    device 32 osd.32 class ssd
    device 33 osd.33 class ssd
    device 34 osd.34 class ssd
    device 35 osd.35 class ssd
    device 36 osd.36 class ssd
    device 37 osd.37 class ssd
    device 38 osd.38 class ssd
    device 39 osd.39 class ssd
    device 40 osd.40 class ssd
    device 41 osd.41 class ssd
    device 42 osd.42 class ssd
    device 43 osd.43 class ssd
    device 44 osd.44 class ssd
    device 45 osd.45 class ssd
    device 46 osd.46 class ssd
    device 47 osd.47 class ssd
    device 48 osd.48 class hdd
    device 49 osd.49 class hdd
    device 50 osd.50 class hdd

    # types
    type 0 osd
    type 1 host
    type 2 chassis
    type 3 rack
    type 4 row
    type 5 pdu
    type 6 pod
    type 7 room
    type 8 datacenter
    type 9 region
    type 10 root

    # buckets
    host STO001 {
    id -3 # do not change unnecessarily
    id -4 class ssd # do not change unnecessarily
    id -9 class hdd # do not change unnecessarily
    # weight 6.902
    alg straw2
    hash 0 # rjenkins1
    item osd.0 weight 0.091
    item osd.1 weight 0.091
    item osd.2 weight 0.091
    item osd.3 weight 0.091
    item osd.4 weight 0.091
    item osd.5 weight 0.091
    item osd.6 weight 0.091
    item osd.7 weight 0.091
    item osd.8 weight 0.091
    item osd.9 weight 0.091
    item osd.10 weight 0.091
    item osd.11 weight 0.091
    item osd.12 weight 0.091
    item osd.13 weight 0.091
    item osd.14 weight 0.091
    item osd.15 weight 0.091
    item osd.48 weight 5.448
    }
    host STO002 {
    id -5 # do not change unnecessarily
    id -6 class ssd # do not change unnecessarily
    id -10 class hdd # do not change unnecessarily
    # weight 6.902
    alg straw2
    hash 0 # rjenkins1
    item osd.16 weight 0.091
    item osd.17 weight 0.091
    item osd.18 weight 0.091
    item osd.19 weight 0.091
    item osd.20 weight 0.091
    item osd.21 weight 0.091
    item osd.22 weight 0.091
    item osd.23 weight 0.091
    item osd.24 weight 0.091
    item osd.25 weight 0.091
    item osd.26 weight 0.091
    item osd.27 weight 0.091
    item osd.28 weight 0.091
    item osd.29 weight 0.091
    item osd.30 weight 0.091
    item osd.31 weight 0.091
    item osd.49 weight 5.448
    }
    host STO003 {
    id -7 # do not change unnecessarily
    id -8 class ssd # do not change unnecessarily
    id -11 class hdd # do not change unnecessarily
    # weight 6.902
    alg straw2
    hash 0 # rjenkins1
    item osd.32 weight 0.091
    item osd.33 weight 0.091
    item osd.34 weight 0.091
    item osd.35 weight 0.091
    item osd.36 weight 0.091
    item osd.37 weight 0.091
    item osd.38 weight 0.091
    item osd.39 weight 0.091
    item osd.40 weight 0.091
    item osd.41 weight 0.091
    item osd.42 weight 0.091
    item osd.43 weight 0.091
    item osd.44 weight 0.091
    item osd.45 weight 0.091
    item osd.46 weight 0.091
    item osd.47 weight 0.091
    item osd.50 weight 5.448
    }
    root default {
    id -1 # do not change unnecessarily
    id -2 class ssd # do not change unnecessarily
    id -12 class hdd # do not change unnecessarily
    # weight 20.706
    alg straw2
    hash 0 # rjenkins1
    item STO001 weight 6.902
    item STO002 weight 6.902
    item STO003 weight 6.902
    }

    # rules
    rule replicated_rule {
    id 0
    type replicated
    min_size 1
    max_size 10
    step take default
    step chooseleaf firstn 0 type host
    step emit
    }
    rule replicated-ssd {
    id 1
    type replicated
    min_size 1
    max_size 10
    step take default class ssd
    step chooseleaf firstn 0 type datacenter
    step emit
    }
    rule replicated-hdd {
    id 2
    type replicated
    min_size 1
    max_size 10
    step take default class hdd
    step chooseleaf firstn 0 type datacenter
    step emit
    }

    # end crush map
  2. Alwin

    Alwin Proxmox Staff Member

    Which rule does your pool use? You didn't say what model those SSDs are; depending on their write performance (assuming a replica count of 3), the numbers could be good or bad. How did you configure your SSDs, are they single disks with the RAID controller in IT/JBOD mode? In general it is wise not to use a RAID controller; use an HBA instead.

    You copy the keyring and name it the same as your storage; check out the docs (also available offline on your PVE installation).
    https://pve.proxmox.com/pve-docs/chapter-pveceph.html#_ceph_client
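
    A rough sketch of what that looks like on one of the PVE processing nodes. The storage ID "ceph-rbd" and the pool name "CEPH" are only examples here; adjust them and the MON addresses to your setup:

    Code:
    # copy the admin keyring from a storage node and name it after the storage ID
    mkdir -p /etc/pve/priv/ceph
    scp root@192.168.111.111:/etc/ceph/ceph.client.admin.keyring \
        /etc/pve/priv/ceph/ceph-rbd.keyring

    And the matching entry in /etc/pve/storage.cfg (can also be added via Datacenter -> Storage -> Add -> RBD in the GUI):

    Code:
    rbd: ceph-rbd
            monhost 192.168.111.111 192.168.111.112 192.168.111.113
            pool CEPH
            content images,rootdir
            username admin

    The keyring file name has to match the storage ID, otherwise PVE will not find it.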

    In your setup it might also be practical to separate the cluster and public networks, so that the OSD traffic (especially during recovery) runs on its own network.
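
    For illustration, the split in ceph.conf would look like this; the second subnet (192.168.112.0/24 here) is purely hypothetical and would need its own link or IPoIB partition:

    Code:
    [global]
            # MON and client (public) traffic stays on the existing IPoIB subnet
            public network = 192.168.111.0/24
            # hypothetical separate subnet for OSD replication/recovery traffic
            cluster network = 192.168.112.0/24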

    Too bad that you don't have a 10GbE switch, or does the Cisco have enough 10GbE ports? Depending on your VM/CT client traffic, you may need more than just multiple 1GbE links.

    That's my 2 cents; I may well have forgotten something, but I hope others can fill in. ;)
     
  3. afrugone

    afrugone Member

    Alwin,

    Many thanks for your comments. I received this complete system as-is; I thought it would not work with Proxmox, but surprisingly for me it is working.

    1. I've researched and found that the SSD PCIe flash cards are "SUN FLASH ACCELERATOR F40 PCIe CARD"s, i.e. LSI Nytro WarpDrives, with these specs:
    200GB Nytro WarpDrive WLP4-200
    • Sequential IOPS (4K) - 238,000 Read, 133,000 Write.
    • Sequential Read and Write IOPS (8K) - 189,000 Read, 137,000 Write.
    • Bandwidth (256K) - 2.0GB/s Read, 1.7GB/s Write.
    • Average Latency < 50 microseconds.
    All of them are attached directly via PCIe, not through RAID.

    The HDDs are 12 x 600GB 15K SAS, connected to a RAID controller, and I've configured them as RAID5. This controller has no JBOD option, but I could use other RAID layouts; I don't have an HBA.

    All storage traffic runs through the InfiniBand network, so it is not exposed to the public network.

    As shown in the figure, communication between the storage servers and the data processing servers (PVE) goes over InfiniBand; 192.168.111.0/24 only exists on the InfiniBand network, and public access is only through the 1GbE network, on a different subnet.

    So my questions could be:

    1. Initially I simply added all SSDs and HDDs as OSDs, as shown in the CRUSH map. Is this correct? Do you think the rados benchmark results are OK, or should they be better for this configuration? Is there a better way to configure the Ceph storage?

    2. I don't have a 10GbE switch. Do you have bad experience with InfiniBand (40Gb/s)? In theory it is better than 10GbE.

    3. On the PVE client I copied the keyring and named it the same as my storage. I can ping the InfiniBand address, but I can't contact the Ceph storage. How can I check whether everything in the configuration is OK?

    I'm sorry for this long message, but I'm trying to be clear. Thanks again, regards
    Alfredo
     
  4. Alwin

    Alwin Proxmox Staff Member

    AFAICT, the Nytro uses two SSDs with separate controllers on a PCIe card (but that's mostly guessing from pictures). Possibly they could be set up as separate devices, or maybe as a RAID-0, but that also needs some testing. Then it might be possible to get even more speed.

    I guess that is also an LSI controller; there is a tool called MegaCLI or StorCLI that might help in setting it to IT/JBOD mode. RAID controllers tend to starve OSDs and introduce latency.
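
    Purely as a sketch (the exact StorCLI/MegaCLI syntax depends on the controller model and firmware, so check the output of the 'show' command first; not verified on Exadata storage cells):

    Code:
    # StorCLI: inspect controller 0, enable JBOD mode, expose all drives as JBOD
    storcli64 /c0 show
    storcli64 /c0 set jbod=on
    storcli64 /c0/eall/sall set jbod

    # MegaCLI equivalent on older firmware
    MegaCli64 -AdpSetProp -EnableJBOD -1 -aALL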

    Not what I meant: on your IB you are running IP (IPoIB), which means your usable bandwidth will roughly halve, to ~20Gb/s. To keep OSD traffic during recovery and rebalancing away from the Ceph client traffic, it is advisable to separate the two networks. That's also why I suggested the 10GbE switch.

    I guess you used the wrong replication rule for your pool; with all those SSDs and a replication of 3 you should be able to achieve higher values. Change the replication rule on your pool and try again with rados bench, roughly as sketched below.
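
    Something along these lines, assuming the pool from the bench above is called CEPH. Note that the replicated-ssd/-hdd rules in your CRUSH map choose leaves of type 'datacenter', which does not exist in your bucket hierarchy, so they would likely need 'host' there instead:

    Code:
    # point the pool at the SSD-backed rule, then re-run the benchmark
    ceph osd pool set CEPH crush_rule replicated-ssd
    rados bench -p CEPH 10 write --no-cleanup
    rados bench -p CEPH 10 seq
    # remove the benchmark objects afterwards
    rados -p CEPH cleanup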

    Note that you will also have traffic to your VMs/CTs, and that will probably max out the 1GbE. A 10GbE switch would also be a good upgrade for separating the Ceph cluster & public networks.

    If you can access the storage through PVE, then it works. A 'ceph -s' is not working because there is no ceph.conf in '/etc/ceph/'. Also, the stock Ceph packages in Debian 9 are Jewel; to get Luminous, you may add our Ceph repository and do an upgrade:
    deb http://download.proxmox.com/debian/ceph-luminous stretch main
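
    A rough sketch of that on the client node; it assumes ceph.conf and the admin keyring can simply be copied from a storage node (the keyring may live under /etc/pve/priv/ there instead of /etc/ceph/):

    Code:
    # add the Proxmox Ceph Luminous repository and install the client tools
    echo "deb http://download.proxmox.com/debian/ceph-luminous stretch main" \
        > /etc/apt/sources.list.d/ceph.list
    apt update && apt install ceph-common

    # give the client a ceph.conf and keyring so 'ceph -s' can reach the MONs
    mkdir -p /etc/ceph
    scp root@192.168.111.111:/etc/ceph/ceph.conf /etc/ceph/ceph.conf
    scp root@192.168.111.111:/etc/ceph/ceph.client.admin.keyring /etc/ceph/
    ceph -s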
     
  5. alexskysilk

    alexskysilk Active Member

    Both Ceph and cluster IPC are latency-sensitive, which makes 1Gbit inadvisable. You definitely should make use of your 40Gbit links; I would bond the links and VLAN/partition them (depending on protocol) for your Ceph and cluster traffic. Leave all other traffic on 1Gbit (VLANs), including a second ring interface for corosync.
     
  6. afrugone

    afrugone Member

    Regarding networking, how can I ensure that the storage traffic is using InfiniBand and not the 1Gb LAN?

    This is my network configuration:

    # InfiniBand (storage network)
    auto bond0
    iface bond0 inet static
            address 192.168.111.111
            netmask 255.255.255.0
            slaves ib0 ib1
            bond_miimon 100
            bond_mode active-backup

    # 1GbE (public/management network)
    auto vmbr0
    iface vmbr0 inet static
            address 172.27.111.111
            netmask 255.255.252.0
            gateway 172.27.110.252
            bridge_ports eno3
            bridge_stp off
            bridge_fd 0
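
    Would checking something like this be enough to confirm it? Since the Ceph public and cluster networks are set to 192.168.111.0/24, which only exists on bond0, I assume the daemons can only bind there (the MON address below is taken from my ceph.conf):

    Code:
    # the route towards a MON address should go via the IPoIB bond, not vmbr0
    ip route get 192.168.111.112

    # bond members and the currently active slave
    cat /proc/net/bonding/bond0

    # byte counters on bond0 should climb while a rados bench is running
    ip -s link show bond0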
     
  7. alexskysilk

    alexskysilk Active Member

    This interface file has a few assumptions:
    1. I'm guessing you're using IPoIB.
    2. You have a subnet manager you can control and add pkeys to. For this configuration we need two pkeys assigned; let's call them 8011 and 8012.
    3. You have a switch you can assign VLANs to. We'll need two VLANs; let's call them 0 (untagged) and 1.
    4. You are using 2 Ethernet ports. If you have 4, redo the VLAN interface with physical NICs.
    5. Your hosts file should reflect addresses in the bond1 IP range BEFORE YOU CREATE THE CLUSTER. This is extremely important (see the hosts file sketch after the interfaces example below). You can add ring 2 manually after the cluster is created.

    Change the addresses as suitable; I recommend keeping the third octet the same as your VLAN/pkey ID for clarity, although this is probably not doable for your primary internet-capable subnet. Note that the order of the NICs is reversed for bond1 and bond2; this is on purpose: when all IB interfaces are available, the cluster and Ceph traffic will each use their own interface.

    Code:
    auto bond0
    # Ethernet
    iface bond0 inet manual
            slaves eno2 eno3
            bond_miimon 100
            bond_mode balance-alb
    
    auto bond1
    # IB Cluster traffic- corosync ring 1
    iface bond1 inet static
            address  10.0.11.1
            netmask  255.255.255.0
            slaves ib0.8011 ib1.8011
            pre-up modprobe ib_ipoib
            pre-up echo datagram > /sys/class/net/ib0/mode || true
            pre-up echo datagram > /sys/class/net/ib1/mode || true
            pre-up echo 0x8011 > /sys/class/net/ib0/create_child || true
            pre-up echo 0x8011 > /sys/class/net/ib1/create_child || true
            bond_miimon 100
            bond_mode active-backup
            # the MTU could be raised to 4092 if the IPoIB fabric supports it
            mtu 2044
    
    auto bond2
    # IB Ceph Cluster traffic
    iface bond2 inet static
            address  10.0.12.1
            netmask  255.255.255.0
            slaves ib1.8012 ib0.8012
            pre-up modprobe ib_ipoib
            pre-up echo datagram > /sys/class/net/ib0/mode || true
            pre-up echo datagram > /sys/class/net/ib1/mode || true
            pre-up echo 0x8012 > /sys/class/net/ib0/create_child || true
            pre-up echo 0x8012 > /sys/class/net/ib1/create_child || true
            bond_miimon 100
            bond_mode active-backup
            # the MTU could be raised to 4092 if the IPoIB fabric supports it
            mtu 2044
    
    auto vlan1
    # Corosync ring 2
    iface vlan1 inet static
            address 10.10.1.1
            netmask 255.255.255.0
            network 10.10.1.0
            vlan-raw-device bond0
    
    auto vmbr0
    # Service network (ethernet)
    iface vmbr0 inet static
            address  10.10.0.100
            netmask  255.255.255.0
            gateway  10.10.0.1
            bridge_ports bond0
            bridge_stp off
            bridge_fd 0
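
    To illustrate assumption 5 above: /etc/hosts on every node should resolve the node names to their bond1 (cluster) addresses before the PVE cluster is created. The hostnames and addresses below are placeholders only:

    Code:
    # /etc/hosts fragment on each node, before running pvecm create / pvecm add
    10.0.11.1   pve001.localdomain pve001
    10.0.11.2   pve002.localdomain pve002

    # keep the management addresses reachable under separate names
    10.10.0.100 pve001-mgmt
    10.10.0.101 pve002-mgmt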
    
     
  8. afrugone

    afrugone Member

    Thanks for your help. I'll reinstall everything and try your network configuration.
     