I have a cluster with 6 nodes and two separate networks (Ethernet via Cisco switches and Ethernet over InfiniBand (Mellanox)).
Following the wiki I've set up two totem rings.
root@storageB:/dev/disk# cat /etc/pve/corosync.conf
logging {
debug: off
to_syslog: yes
}
nodelist {
node {
name: pve-node1
nodeid: 1
quorum_votes: 1
ring0_addr: 172.16.253.101
ring1_addr: 172.16.252.101
}
node {
name: pve-node2
nodeid: 2
quorum_votes: 1
ring0_addr: 172.16.253.102
ring1_addr: 172.16.252.102
}
node {
name: pve-node3
nodeid: 3
quorum_votes: 1
ring0_addr: 172.16.253.103
ring1_addr: 172.16.252.103
}
node {
name: pve-node4
nodeid: 4
quorum_votes: 1
ring0_addr: 172.16.253.104
ring1_addr: 172.16.252.104
}
node {
name: storageA
nodeid: 5
quorum_votes: 1
ring0_addr: 172.16.253.251
ring1_addr: 172.16.252.251
}
node {
name: storageB
nodeid: 6
quorum_votes: 1
ring0_addr: 172.16.253.252
ring1_addr: 172.16.252.252
}
}
quorum {
provider: corosync_votequorum
}
totem {
cluster_name: amarao-cluster
config_version: 6
interface {
bindnetaddr: 172.16.253.101
ringnumber: 0
}
interface {
bindnetaddr: 172.16.252.101
ringnumber: 1
}
ip_version: ipv4
rrp_mode: passive
secauth: on
version: 2
}
Network interface config on the same node:
root@storageB:/dev/disk# cat /etc/network/interfaces
auto lo
iface lo inet loopback
iface eno1 inet manual
auto bond0
iface bond0 inet manual
slaves eno1 eno2
bond_miimon 100
bond_mode 0
mtu 9000
auto bond1
iface bond1 inet static
address 172.16.253.252
netmask 255.255.255.0
slaves ib0 ib1
bond_miimon 100
bond_mode active-backup
# pre-up modprobe ib_ipoib
pre-up echo connected > /sys/class/net/ib0/mode
pre-up echo connected > /sys/class/net/ib1/mode
pre-up modprobe bond1
mtu 65520
auto vmbr0
iface vmbr0 inet static
address 172.16.252.252
netmask 255.255.255.0
gateway 172.16.252.1
bridge_ports bond0
bridge_stp off
bridge_fd 0
The cluster works. All inter-cluster communication goes via InfiniBand (network 172.16.253.0/24). The web interface is accessed via Ethernet (network 172.16.252.0/24). Everything seems to be OK, but...
In syslog on all the nodes I see the following:
Dec 19 12:49:11 storageB corosync[12618]: error [TOTEM ] Marking ringid 1 interface 172.16.252.252 FAULTY
Dec 19 12:49:11 storageB corosync[12618]: [TOTEM ] Marking ringid 1 interface 172.16.252.252 FAULTY
Dec 19 12:49:12 storageB corosync[12618]: notice [TOTEM ] Automatically recovered ring 1
Dec 19 12:49:12 storageB corosync[12618]: [TOTEM ] Automatically recovered ring 1
Dec 19 12:50:00 storageB systemd[1]: Starting Proxmox VE replication runner...
Dec 19 12:50:00 storageB systemd[1]: Started Proxmox VE replication runner.
Dec 19 12:51:00 storageB systemd[1]: Starting Proxmox VE replication runner...
Dec 19 12:51:00 storageB systemd[1]: Started Proxmox VE replication runner.
Dec 19 12:52:00 storageB systemd[1]: Starting Proxmox VE replication runner...
Dec 19 12:52:00 storageB systemd[1]: Started Proxmox VE replication runner.
Dec 19 12:52:17 storageB pmxcfs[10529]: [status] notice: received log
Dec 19 12:53:00 storageB systemd[1]: Starting Proxmox VE replication runner...
Dec 19 12:53:00 storageB systemd[1]: Started Proxmox VE replication runner.
Dec 19 12:53:11 storageB corosync[12618]: error [TOTEM ] Marking ringid 1 interface 172.16.252.252 FAULTY
Dec 19 12:53:11 storageB corosync[12618]: [TOTEM ] Marking ringid 1 interface 172.16.252.252 FAULTY
Dec 19 12:53:12 storageB corosync[12618]: notice [TOTEM ] Automatically recovered ring 1
Dec 19 12:53:12 storageB corosync[12618]: [TOTEM ] Automatically recovered ring 1
Any ideas what could be wrong with my setup?
Following the wiki I've set up two totem rings.
root@storageB:/dev/disk# cat /etc/pve/corosync.conf
logging {
debug: off
to_syslog: yes
}
nodelist {
node {
name: pve-node1
nodeid: 1
quorum_votes: 1
ring0_addr: 172.16.253.101
ring1_addr: 172.16.252.101
}
node {
name: pve-node2
nodeid: 2
quorum_votes: 1
ring0_addr: 172.16.253.102
ring1_addr: 172.16.252.102
}
node {
name: pve-node3
nodeid: 3
quorum_votes: 1
ring0_addr: 172.16.253.103
ring1_addr: 172.16.252.103
}
node {
name: pve-node4
nodeid: 4
quorum_votes: 1
ring0_addr: 172.16.253.104
ring1_addr: 172.16.252.104
}
node {
name: storageA
nodeid: 5
quorum_votes: 1
ring0_addr: 172.16.253.251
ring1_addr: 172.16.252.251
}
node {
name: storageB
nodeid: 6
quorum_votes: 1
ring0_addr: 172.16.253.252
ring1_addr: 172.16.252.252
}
}
quorum {
provider: corosync_votequorum
}
totem {
cluster_name: amarao-cluster
config_version: 6
interface {
bindnetaddr: 172.16.253.101
ringnumber: 0
}
interface {
bindnetaddr: 172.16.252.101
ringnumber: 1
}
ip_version: ipv4
rrp_mode: passive
secauth: on
version: 2
}
Network interface config on the same node:
root@storageB:/dev/disk# cat /etc/network/interfaces
auto lo
iface lo inet loopback
iface eno1 inet manual
auto bond0
iface bond0 inet manual
slaves eno1 eno2
bond_miimon 100
bond_mode 0
mtu 9000
auto bond1
iface bond1 inet static
address 172.16.253.252
netmask 255.255.255.0
slaves ib0 ib1
bond_miimon 100
bond_mode active-backup
# pre-up modprobe ib_ipoib
pre-up echo connected > /sys/class/net/ib0/mode
pre-up echo connected > /sys/class/net/ib1/mode
pre-up modprobe bond1
mtu 65520
auto vmbr0
iface vmbr0 inet static
address 172.16.252.252
netmask 255.255.255.0
gateway 172.16.252.1
bridge_ports bond0
bridge_stp off
bridge_fd 0
The cluster works. All inter-cluster communication goes via InfiniBand (network 172.16.253.0/24). The web interface is accessed via Ethernet (network 172.16.252.0/24). Everything seems to be OK, but...
In syslog on all the nodes I see the following:
Dec 19 12:49:11 storageB corosync[12618]: error [TOTEM ] Marking ringid 1 interface 172.16.252.252 FAULTY
Dec 19 12:49:11 storageB corosync[12618]: [TOTEM ] Marking ringid 1 interface 172.16.252.252 FAULTY
Dec 19 12:49:12 storageB corosync[12618]: notice [TOTEM ] Automatically recovered ring 1
Dec 19 12:49:12 storageB corosync[12618]: [TOTEM ] Automatically recovered ring 1
Dec 19 12:50:00 storageB systemd[1]: Starting Proxmox VE replication runner...
Dec 19 12:50:00 storageB systemd[1]: Started Proxmox VE replication runner.
Dec 19 12:51:00 storageB systemd[1]: Starting Proxmox VE replication runner...
Dec 19 12:51:00 storageB systemd[1]: Started Proxmox VE replication runner.
Dec 19 12:52:00 storageB systemd[1]: Starting Proxmox VE replication runner...
Dec 19 12:52:00 storageB systemd[1]: Started Proxmox VE replication runner.
Dec 19 12:52:17 storageB pmxcfs[10529]: [status] notice: received log
Dec 19 12:53:00 storageB systemd[1]: Starting Proxmox VE replication runner...
Dec 19 12:53:00 storageB systemd[1]: Started Proxmox VE replication runner.
Dec 19 12:53:11 storageB corosync[12618]: error [TOTEM ] Marking ringid 1 interface 172.16.252.252 FAULTY
Dec 19 12:53:11 storageB corosync[12618]: [TOTEM ] Marking ringid 1 interface 172.16.252.252 FAULTY
Dec 19 12:53:12 storageB corosync[12618]: notice [TOTEM ] Automatically recovered ring 1
Dec 19 12:53:12 storageB corosync[12618]: [TOTEM ] Automatically recovered ring 1
Any ideas what could be wrong with my setup?