I have 7 Nodes in a cluster and currently when I access one of the nodes it shows all the others offline. I can confirm all devices are online, accessible by ping/SSH and I can SSH from one node to the other using the alias contained in the hosts file without authentication.
vm01 10.99.31.11
vm02 10.99.31.12
vm03 10.99.31.13
vm04 10.99.31.14
vm05 10.99.31.15
vm06 10.99.31.16
vm07 10.99.31.17
I had some issues with vm07, so I upgraded it to the latest version, but now none of my VMs appear when accessing the GUI — only "local" is shown. So I am now using vm05 for testing, as it has nothing important on it.
I believe the issue may either be to do with quorum or potentially multicast but I could be way off. Currently with the exception of VM07 all nodes are up and the vms running as expected.
root@vm07:~# cat /etc/hosts
127.0.0.1 localhost.localdomain localhost
10.99.31.11 vm01
10.99.31.12 vm02
10.99.31.13 vm03
10.99.31.14 vm04
10.99.31.15 vm05.blu.es vm05 pvelocalhost
10.99.31.16 vm06
10.99.31.17 vm07
10.99.31.18 vm08
10.99.31.19 vm09
root@vm05:~# service corosync status
● corosync.service - Corosync Cluster Engine
Loaded: loaded (/lib/systemd/system/corosync.service; enabled)
Active: active (running) since Thu 2018-04-05 13:10:24 CEST; 5min ago
Process: 2195 ExecStart=/usr/share/corosync/corosync start (code=exited, status=0/SUCCESS)
Main PID: 2208 (corosync)
CGroup: /system.slice/corosync.service
└─2208 corosync
Apr 05 13:14:41 vm05 corosync[2208]: [TOTEM ] A new membership (10.99.31.15:180864) was formed. Members
Apr 05 13:14:41 vm05 corosync[2208]: [QUORUM] Members[1]: 5
Apr 05 13:14:41 vm05 corosync[2208]: [MAIN ] Completed service synchronization, ready to provide service.
Apr 05 13:14:47 vm05 corosync[2208]: [TOTEM ] A new membership (10.99.31.15:180872) was formed. Members
Apr 05 13:14:47 vm05 corosync[2208]: [QUORUM] Members[1]: 5
Apr 05 13:14:47 vm05 corosync[2208]: [MAIN ] Completed service synchronization, ready to provide service.
Apr 05 13:14:53 vm05 corosync[2208]: [TOTEM ] A new membership (10.99.31.11:180876) was formed. Members joined: 1 2 3 7
Apr 05 13:14:53 vm05 corosync[2208]: [QUORUM] This node is within the primary component and will provide service.
Apr 05 13:14:53 vm05 corosync[2208]: [QUORUM] Members[5]: 1 2 3 5 7
Apr 05 13:14:53 vm05 corosync[2208]: [MAIN ] Completed service synchronization, ready to provide service.
root@vm05:~# pvecm status
Quorum information
------------------
Date: Thu Apr 5 13:15:50 2018
Quorum provider: corosync_votequorum
Nodes: 5
Node ID: 0x00000005
Ring ID: 180876
Quorate: Yes
Votequorum information
----------------------
Expected votes: 7
Highest expected: 7
Total votes: 5
Quorum: 4
Flags: Quorate
Membership information
----------------------
Nodeid Votes Name
0x00000001 1 10.99.31.11
0x00000002 1 10.99.31.12
0x00000003 1 10.99.31.13
0x00000005 1 10.99.31.15 (local)
0x00000007 1 10.99.31.17
root@vm05:~# service pve-cluster status
● pve-cluster.service - The Proxmox VE cluster filesystem
Loaded: loaded (/lib/systemd/system/pve-cluster.service; enabled)
Active: active (running) since Thu 2018-04-05 13:10:23 CEST; 5min ago
Process: 2158 ExecStartPost=/usr/bin/pvecm updatecerts --silent (code=exited, status=0/SUCCESS)
Process: 2034 ExecStart=/usr/bin/pmxcfs $DAEMON_OPTS (code=exited, status=0/SUCCESS)
Main PID: 2156 (pmxcfs)
CGroup: /system.slice/pve-cluster.service
└─2156 /usr/bin/pmxcfs
Apr 05 13:14:53 vm05 pmxcfs[2156]: [dcdb] notice: members: 1/1601, 2/2071, 3/1488, 5/2156
Apr 05 13:14:53 vm05 pmxcfs[2156]: [dcdb] notice: members: 1/1601, 2/2071, 3/1488, 5/2156, 7/1198
Apr 05 13:14:53 vm05 pmxcfs[2156]: [status] notice: members: 2/2071, 3/1488, 5/2156
Apr 05 13:14:53 vm05 pmxcfs[2156]: [status] notice: members: 1/1601, 2/2071, 3/1488, 5/2156
Apr 05 13:14:53 vm05 pmxcfs[2156]: [status] notice: members: 1/1601, 2/2071, 3/1488, 5/2156, 7/1198
Apr 05 13:14:53 vm05 pmxcfs[2156]: [status] notice: node has quorum
Apr 05 13:14:53 vm05 pmxcfs[2156]: [dcdb] notice: received sync request (epoch 1/1601/00000BC0)
Apr 05 13:14:53 vm05 pmxcfs[2156]: [dcdb] notice: received sync request (epoch 1/1601/00000BC1)
Apr 05 13:14:53 vm05 pmxcfs[2156]: [status] notice: received sync request (epoch 1/1601/0000052F)
Apr 05 13:14:53 vm05 pmxcfs[2156]: [status] notice: received sync request (epoch 1/1601/00000530)
root@vm05:~# pveversion -v
proxmox-ve: 4.2-48 (running kernel: 4.4.6-1-pve)
pve-manager: 4.2-2 (running version: 4.2-2/725d76f0)
pve-kernel-4.4.6-1-pve: 4.4.6-48
lvm2: 2.02.116-pve2
corosync-pve: 2.3.5-2
libqb0: 1.0-1
pve-cluster: 4.0-39
qemu-server: 4.0-72
pve-firmware: 1.1-8
libpve-common-perl: 4.0-59
libpve-access-control: 4.0-16
libpve-storage-perl: 4.0-50
pve-libspice-server1: 0.12.5-2
vncterm: 1.2-1
pve-qemu-kvm: 2.5-14
pve-container: 1.0-62
pve-firewall: 2.0-25
pve-ha-manager: 1.0-28
ksm-control-daemon: 1.2-1
glusterfs-client: 3.5.2-2+deb8u1
lxc-pve: 1.1.5-7
lxcfs: 2.0.0-pve2
cgmanager: 0.39-pve1
criu: 1.6.0-1
zfsutils: 0.6.5-pve9~jessie
root@vm05:~# cat /etc/pve/.members
{
"nodename": "vm05",
"version": 7,
"cluster": { "name": "dc-cluster", "version": 7, "nodes": 7, "quorate": 1 },
"nodelist": {
"vm07": { "id": 7, "online": 1},
"vm01": { "id": 1, "online": 1},
"vm03": { "id": 3, "online": 1},
"vm04": { "id": 4, "online": 0},
"vm02": { "id": 2, "online": 1},
"vm06": { "id": 6, "online": 0},
"vm05": { "id": 5, "online": 1, "ip": "10.99.31.15"}
}
}
root@vm01:~# omping 10.99.31.11 10.99.31.15
10.99.31.15 : waiting for response msg
10.99.31.15 : joined (S,G) = (*, 232.43.211.234), pinging
10.99.31.15 : unicast, seq=1, size=69 bytes, dist=0, time=0.129ms
10.99.31.15 : unicast, seq=2, size=69 bytes, dist=0, time=0.174ms
10.99.31.15 : multicast, seq=2, size=69 bytes, dist=0, time=0.190ms
10.99.31.15 : unicast, seq=3, size=69 bytes, dist=0, time=0.236ms
10.99.31.15 : multicast, seq=3, size=69 bytes, dist=0, time=0.251ms
10.99.31.15 : unicast, seq=4, size=69 bytes, dist=0, time=0.167ms
10.99.31.15 : multicast, seq=4, size=69 bytes, dist=0, time=0.180ms
10.99.31.15 : unicast, seq=5, size=69 bytes, dist=0, time=0.139ms
10.99.31.15 : multicast, seq=5, size=69 bytes, dist=0, time=0.161ms
10.99.31.15 : unicast, seq=6, size=69 bytes, dist=0, time=0.156ms
10.99.31.15 : multicast, seq=6, size=69 bytes, dist=0, time=0.177ms
10.99.31.15 : unicast, seq=7, size=69 bytes, dist=0, time=0.161ms
10.99.31.15 : multicast, seq=7, size=69 bytes, dist=0, time=0.186ms
root@vm05:~# omping 10.99.31.15 10.99.31.11
10.99.31.11 : joined (S,G) = (*, 232.43.211.234), pinging
10.99.31.11 : unicast, seq=1, size=69 bytes, dist=0, time=0.158ms
10.99.31.11 : multicast, seq=1, size=69 bytes, dist=0, time=0.142ms
10.99.31.11 : unicast, seq=2, size=69 bytes, dist=0, time=0.188ms
10.99.31.11 : multicast, seq=2, size=69 bytes, dist=0, time=0.173ms
10.99.31.11 : unicast, seq=3, size=69 bytes, dist=0, time=0.171ms
10.99.31.11 : multicast, seq=3, size=69 bytes, dist=0, time=0.156ms
10.99.31.11 : unicast, seq=4, size=69 bytes, dist=0, time=0.144ms
10.99.31.11 : multicast, seq=4, size=69 bytes, dist=0, time=0.165ms
10.99.31.11 : unicast, seq=5, size=69 bytes, dist=0, time=0.159ms
10.99.31.11 : multicast, seq=5, size=69 bytes, dist=0, time=0.181ms
root@vm05:~# service pvestatd status
● pvestatd.service - PVE Status Daemon
Loaded: loaded (/lib/systemd/system/pvestatd.service; enabled)
Active: active (running) since Thu 2018-04-05 13:10:24 CEST; 9min ago
Process: 2189 ExecStart=/usr/bin/pvestatd start (code=exited, status=0/SUCCESS)
Main PID: 2219 (pvestatd)
CGroup: /system.slice/pvestatd.service
└─2219 pvestatd
Apr 05 13:10:24 vm05 pvestatd[2219]: starting server
vm01 10.99.31.11
vm02 10.99.31.12
vm03 10.99.31.13
vm04 10.99.31.14
vm05 10.99.31.15
vm06 10.99.31.16
vm07 10.99.31.17
I had some issues with vm07, so I upgraded it to the latest version, but now none of my VMs appear when accessing the GUI — only "local" is shown. So I am now using vm05 for testing, as it has nothing important on it.
I believe the issue may either be to do with quorum or potentially multicast but I could be way off. Currently with the exception of VM07 all nodes are up and the vms running as expected.
root@vm07:~# cat /etc/hosts
127.0.0.1 localhost.localdomain localhost
10.99.31.11 vm01
10.99.31.12 vm02
10.99.31.13 vm03
10.99.31.14 vm04
10.99.31.15 vm05.blu.es vm05 pvelocalhost
10.99.31.16 vm06
10.99.31.17 vm07
10.99.31.18 vm08
10.99.31.19 vm09
root@vm05:~# service corosync status
● corosync.service - Corosync Cluster Engine
Loaded: loaded (/lib/systemd/system/corosync.service; enabled)
Active: active (running) since Thu 2018-04-05 13:10:24 CEST; 5min ago
Process: 2195 ExecStart=/usr/share/corosync/corosync start (code=exited, status=0/SUCCESS)
Main PID: 2208 (corosync)
CGroup: /system.slice/corosync.service
└─2208 corosync
Apr 05 13:14:41 vm05 corosync[2208]: [TOTEM ] A new membership (10.99.31.15:180864) was formed. Members
Apr 05 13:14:41 vm05 corosync[2208]: [QUORUM] Members[1]: 5
Apr 05 13:14:41 vm05 corosync[2208]: [MAIN ] Completed service synchronization, ready to provide service.
Apr 05 13:14:47 vm05 corosync[2208]: [TOTEM ] A new membership (10.99.31.15:180872) was formed. Members
Apr 05 13:14:47 vm05 corosync[2208]: [QUORUM] Members[1]: 5
Apr 05 13:14:47 vm05 corosync[2208]: [MAIN ] Completed service synchronization, ready to provide service.
Apr 05 13:14:53 vm05 corosync[2208]: [TOTEM ] A new membership (10.99.31.11:180876) was formed. Members joined: 1 2 3 7
Apr 05 13:14:53 vm05 corosync[2208]: [QUORUM] This node is within the primary component and will provide service.
Apr 05 13:14:53 vm05 corosync[2208]: [QUORUM] Members[5]: 1 2 3 5 7
Apr 05 13:14:53 vm05 corosync[2208]: [MAIN ] Completed service synchronization, ready to provide service.
root@vm05:~# pvecm status
Quorum information
------------------
Date: Thu Apr 5 13:15:50 2018
Quorum provider: corosync_votequorum
Nodes: 5
Node ID: 0x00000005
Ring ID: 180876
Quorate: Yes
Votequorum information
----------------------
Expected votes: 7
Highest expected: 7
Total votes: 5
Quorum: 4
Flags: Quorate
Membership information
----------------------
Nodeid Votes Name
0x00000001 1 10.99.31.11
0x00000002 1 10.99.31.12
0x00000003 1 10.99.31.13
0x00000005 1 10.99.31.15 (local)
0x00000007 1 10.99.31.17
root@vm05:~# service pve-cluster status
● pve-cluster.service - The Proxmox VE cluster filesystem
Loaded: loaded (/lib/systemd/system/pve-cluster.service; enabled)
Active: active (running) since Thu 2018-04-05 13:10:23 CEST; 5min ago
Process: 2158 ExecStartPost=/usr/bin/pvecm updatecerts --silent (code=exited, status=0/SUCCESS)
Process: 2034 ExecStart=/usr/bin/pmxcfs $DAEMON_OPTS (code=exited, status=0/SUCCESS)
Main PID: 2156 (pmxcfs)
CGroup: /system.slice/pve-cluster.service
└─2156 /usr/bin/pmxcfs
Apr 05 13:14:53 vm05 pmxcfs[2156]: [dcdb] notice: members: 1/1601, 2/2071, 3/1488, 5/2156
Apr 05 13:14:53 vm05 pmxcfs[2156]: [dcdb] notice: members: 1/1601, 2/2071, 3/1488, 5/2156, 7/1198
Apr 05 13:14:53 vm05 pmxcfs[2156]: [status] notice: members: 2/2071, 3/1488, 5/2156
Apr 05 13:14:53 vm05 pmxcfs[2156]: [status] notice: members: 1/1601, 2/2071, 3/1488, 5/2156
Apr 05 13:14:53 vm05 pmxcfs[2156]: [status] notice: members: 1/1601, 2/2071, 3/1488, 5/2156, 7/1198
Apr 05 13:14:53 vm05 pmxcfs[2156]: [status] notice: node has quorum
Apr 05 13:14:53 vm05 pmxcfs[2156]: [dcdb] notice: received sync request (epoch 1/1601/00000BC0)
Apr 05 13:14:53 vm05 pmxcfs[2156]: [dcdb] notice: received sync request (epoch 1/1601/00000BC1)
Apr 05 13:14:53 vm05 pmxcfs[2156]: [status] notice: received sync request (epoch 1/1601/0000052F)
Apr 05 13:14:53 vm05 pmxcfs[2156]: [status] notice: received sync request (epoch 1/1601/00000530)
root@vm05:~# pveversion -v
proxmox-ve: 4.2-48 (running kernel: 4.4.6-1-pve)
pve-manager: 4.2-2 (running version: 4.2-2/725d76f0)
pve-kernel-4.4.6-1-pve: 4.4.6-48
lvm2: 2.02.116-pve2
corosync-pve: 2.3.5-2
libqb0: 1.0-1
pve-cluster: 4.0-39
qemu-server: 4.0-72
pve-firmware: 1.1-8
libpve-common-perl: 4.0-59
libpve-access-control: 4.0-16
libpve-storage-perl: 4.0-50
pve-libspice-server1: 0.12.5-2
vncterm: 1.2-1
pve-qemu-kvm: 2.5-14
pve-container: 1.0-62
pve-firewall: 2.0-25
pve-ha-manager: 1.0-28
ksm-control-daemon: 1.2-1
glusterfs-client: 3.5.2-2+deb8u1
lxc-pve: 1.1.5-7
lxcfs: 2.0.0-pve2
cgmanager: 0.39-pve1
criu: 1.6.0-1
zfsutils: 0.6.5-pve9~jessie
root@vm05:~# cat /etc/pve/.members
{
"nodename": "vm05",
"version": 7,
"cluster": { "name": "dc-cluster", "version": 7, "nodes": 7, "quorate": 1 },
"nodelist": {
"vm07": { "id": 7, "online": 1},
"vm01": { "id": 1, "online": 1},
"vm03": { "id": 3, "online": 1},
"vm04": { "id": 4, "online": 0},
"vm02": { "id": 2, "online": 1},
"vm06": { "id": 6, "online": 0},
"vm05": { "id": 5, "online": 1, "ip": "10.99.31.15"}
}
}
root@vm01:~# omping 10.99.31.11 10.99.31.15
10.99.31.15 : waiting for response msg
10.99.31.15 : joined (S,G) = (*, 232.43.211.234), pinging
10.99.31.15 : unicast, seq=1, size=69 bytes, dist=0, time=0.129ms
10.99.31.15 : unicast, seq=2, size=69 bytes, dist=0, time=0.174ms
10.99.31.15 : multicast, seq=2, size=69 bytes, dist=0, time=0.190ms
10.99.31.15 : unicast, seq=3, size=69 bytes, dist=0, time=0.236ms
10.99.31.15 : multicast, seq=3, size=69 bytes, dist=0, time=0.251ms
10.99.31.15 : unicast, seq=4, size=69 bytes, dist=0, time=0.167ms
10.99.31.15 : multicast, seq=4, size=69 bytes, dist=0, time=0.180ms
10.99.31.15 : unicast, seq=5, size=69 bytes, dist=0, time=0.139ms
10.99.31.15 : multicast, seq=5, size=69 bytes, dist=0, time=0.161ms
10.99.31.15 : unicast, seq=6, size=69 bytes, dist=0, time=0.156ms
10.99.31.15 : multicast, seq=6, size=69 bytes, dist=0, time=0.177ms
10.99.31.15 : unicast, seq=7, size=69 bytes, dist=0, time=0.161ms
10.99.31.15 : multicast, seq=7, size=69 bytes, dist=0, time=0.186ms
root@vm05:~# omping 10.99.31.15 10.99.31.11
10.99.31.11 : joined (S,G) = (*, 232.43.211.234), pinging
10.99.31.11 : unicast, seq=1, size=69 bytes, dist=0, time=0.158ms
10.99.31.11 : multicast, seq=1, size=69 bytes, dist=0, time=0.142ms
10.99.31.11 : unicast, seq=2, size=69 bytes, dist=0, time=0.188ms
10.99.31.11 : multicast, seq=2, size=69 bytes, dist=0, time=0.173ms
10.99.31.11 : unicast, seq=3, size=69 bytes, dist=0, time=0.171ms
10.99.31.11 : multicast, seq=3, size=69 bytes, dist=0, time=0.156ms
10.99.31.11 : unicast, seq=4, size=69 bytes, dist=0, time=0.144ms
10.99.31.11 : multicast, seq=4, size=69 bytes, dist=0, time=0.165ms
10.99.31.11 : unicast, seq=5, size=69 bytes, dist=0, time=0.159ms
10.99.31.11 : multicast, seq=5, size=69 bytes, dist=0, time=0.181ms
root@vm05:~# service pvestatd status
● pvestatd.service - PVE Status Daemon
Loaded: loaded (/lib/systemd/system/pvestatd.service; enabled)
Active: active (running) since Thu 2018-04-05 13:10:24 CEST; 9min ago
Process: 2189 ExecStart=/usr/bin/pvestatd start (code=exited, status=0/SUCCESS)
Main PID: 2219 (pvestatd)
CGroup: /system.slice/pvestatd.service
└─2219 pvestatd
Apr 05 13:10:24 vm05 pvestatd[2219]: starting server