Hi folks,
I have a weird behaviour while setting up a brand new cluster.
In the 5-node pve 4.4-18 cluster, all nodes start after reboot in a green, available state in the GUI. After a minute or so, node by node turns from the green state to a red cross until all nodes seem to be offline. It takes another minute and node by node returns to the green state until all nodes are ok.
According to the GUI and pvecm status, the HA-Cluster never leaves a healthy condition, /etc/pve is still writeable, and all nodes are listed in /etc/pve/.members. I can't find any hint in the logs. Date is 100% in sync.
First thought was multicast, but all 5 nodes joined the 239.192.145.115 multicast group.
Some snippets from logs and cat from config files:
# dis igmp-snooping group vlan 21
Total 2 entries.
----------
VLAN 21: Total 2 entries.
(0.0.0.0, 232.43.211.234)
Host slots (0 in total):
Host ports (1 in total):
BAGG5 (00:03:08)
(0.0.0.0, 239.192.145.115)
Host slots (0 in total):
Host ports (5 in total):
BAGG1 (00:03:12)
BAGG2 (00:03:04)
BAGG3 (00:03:04)
BAGG4 (00:03:12)
BAGG5 (00:03:13)
----------
proxmox-ve: 4.4-95 (running kernel: 4.4.79-1-pve)
pve-manager: 4.4-18 (running version: 4.4-18/ef2610e8)
pve-kernel-4.4.35-1-pve: 4.4.35-77
pve-kernel-4.4.79-1-pve: 4.4.79-95
lvm2: 2.02.116-pve3
corosync-pve: 2.4.2-2~pve4+1
libqb0: 1.0.1-1
pve-cluster: 4.0-52
qemu-server: 4.0-112
pve-firmware: 1.1-11
libpve-common-perl: 4.0-96
libpve-access-control: 4.0-23
libpve-storage-perl: 4.0-76
pve-libspice-server1: 0.12.8-2
vncterm: 1.3-2
pve-docs: 4.4-4
pve-qemu-kvm: 2.7.1-4
pve-container: 1.0-101
pve-firewall: 2.0-33
pve-ha-manager: 1.0-41
ksm-control-daemon: 1.2-1
glusterfs-client: 3.5.2-2+deb8u3
lxc-pve: 2.0.7-4
lxcfs: 2.0.6-pve1
criu: 1.6.0-1
novnc-pve: 0.5-9
smartmontools: 6.5+svn4324-1~pve80
zfsutils: 0.6.5.9-pve15~bpo80
ceph: 10.2.9-1~bpo80+1
----------
# /etc/pve/.clusterlog (no other entries, then listed below)
, "msg": "successful auth for user 'root@pam'"},
{"uid": 101, "time": 1506171300, "pri": 6, "tag": "pvedaemon", "pid": 1085, "node": "node1708vm-5", "user": "root@pam", "msg": "successful auth for user 'root@pam'"}
----------
# /var/log/daemon.log
Sep 23 20:55:01 node1708vm-4 pmxcfs[2281]: [status] notice: received log
Sep 23 20:56:41 node1708vm-4 pvedaemon[14733]: worker exit
Sep 23 20:56:41 node1708vm-4 pvedaemon[2530]: worker 14733 finished
Sep 23 20:56:41 node1708vm-4 pvedaemon[2530]: starting 1 worker(s)
Sep 23 20:56:41 node1708vm-4 pvedaemon[2530]: worker 15883 started
Sep 23 20:58:05 node1708vm-4 pveproxy[3010]: worker exit
Sep 23 20:58:05 node1708vm-4 pveproxy[16101]: worker 3010 finished
Sep 23 20:58:05 node1708vm-4 pveproxy[16101]: starting 1 worker(s)
Sep 23 20:58:05 node1708vm-4 pveproxy[16101]: worker 16998 started
Sep 23 20:58:39 node1708vm-4 pvestatd[2479]: status update time (600.276 seconds)
----------
# cat /etc/pve/corosync.conf
logging {
debug: off
to_syslog: yes
}
nodelist {
node {
name: node1708vm-5
nodeid: 5
quorum_votes: 1
ring0_addr: node1708vm-5
}
node {
name: node1708vm-3
nodeid: 3
quorum_votes: 1
ring0_addr: node1708vm-3
}
node {
name: node1708vm-4
nodeid: 4
quorum_votes: 1
ring0_addr: node1708vm-4
}
node {
name: node1708vm-2
nodeid: 2
quorum_votes: 1
ring0_addr: node1708vm-2
}
node {
name: node1708vm-1
nodeid: 1
quorum_votes: 1
ring0_addr: node1708vm-1
}
}
quorum {
provider: corosync_votequorum
}
totem {
cluster_name: CLUSTER01
config_version: 7
ip_version: ipv4
secauth: on
version: 2
token: 4000 # <--- thought this would help, but no, it does not.
interface {
bindnetaddr: 10.0.21.10
ringnumber: 0
}
}
-------
# cat /etc/network/interfaces
auto lo
iface lo inet loopback
auto eth0 inet manual
auto eth1 inet manual
auto eth2 inet manual
auto eth3 inet manual
auto eth4 inet manual
auto eth5 inet manual
auto bond20
iface bond20 inet static
address 10.0.20.10
netmask 255.255.255.0
slaves eth0 eth1 eth4 eth5
bond-miimon 100
bond-mode 802.3ad
auto bond0
iface bond0 inet manual
slaves eth2 eth3
bond-miimon 100
bond-mode 802.3ad
auto bond0.21
iface bond0.21 inet manual
vlan-raw-device bond0
auto bond0.100
iface bond0.100 inet manual
vlan-raw-device bond0
auto bond0.101
iface bond0.101 inet manual
vlan-raw-device bond0
auto bond0.170
iface bond0.170 inet manual
vlan-raw-device bond0
auto bond0.180
iface bond0.180 inet manual
vlan-raw-device bond0
auto bond0.190
iface bond0.190 inet manual
vlan-raw-device bond0
auto bond0.200
iface bond0.200 inet manual
vlan-raw-device bond0
auto vmbr21
iface vmbr21 inet static
address 10.0.21.10
netmask 255.255.255.0
bridge_ports bond0.21
bridge_stp off
bridge_fd 0
auto vmbr99
iface vmbr99 inet static
bridge_ports bond0.99
bridge_stp off
bridge_fd 0
auto vmbr100
iface vmbr100 inet static
bridge_ports bond0.100
bridge_stp off
bridge_fd 0
auto vmbr101
iface vmbr101 inet static
bridge_ports bond0.101
bridge_stp off
bridge_fd 0
auto vmbr170
iface vmbr170 inet static
bridge_ports bond0.170
bridge_stp off
bridge_fd 0
auto vmbr180
iface vmbr180 inet static
bridge_ports bond0.180
bridge_stp off
bridge_fd 0
auto vmbr190
iface vmbr190 inet static
bridge_ports bond0.190
bridge_stp off
bridge_fd 0
auto vmbr200
iface vmbr200 inet static
bridge_ports bond0.200
bridge_stp off
bridge_fd 0
auto eth6
iface eth6 inet static
address 192.168.0.166
netmask 255.255.255.0
gateway 192.168.0.251
----
cat /etc/hosts
127.0.0.1 localhost.localdomain localhost
# Admin LAN
192.168.0.166 node1708-1.coro.company.de node1708-1 pvelocalhost
## Proxmox Cluster ##
10.0.21.10 node1708vm-1.vm.company.de node1708vm-1
10.0.21.20 node1708vm-2.vm.company.de node1708vm-2
10.0.21.30 node1708vm-3.vm.company.de node1708vm-3
10.0.21.40 node1708vm-4.vm.company.de node1708vm-4
10.0.21.50 node1708vm-5.vm.company.de node1708vm-5
# Ceph network is 10.0.20.0/24
# The following lines are desirable for IPv6 capable hosts
::1 ip6-localhost ip6-loopback
fe00::0 ip6-localnet
ff00::0 ip6-mcastprefix
ff02::1 ip6-allnodes
ff02::2 ip6-allrouters
ff02::3 ip6-allhosts
---------
# cat /etc/pve/.members (at the time, when all nodes have a red cross)
{
"nodename": "node1708vm-1",
"version": 18,
"cluster": { "name": "CLUSTER01", "version": 7, "nodes": 5, "quorate": 1 },
"nodelist": {
"node1708vm-1": { "id": 1, "online": 1, "ip": "10.0.21.10"},
"node1708vm-2": { "id": 2, "online": 1, "ip": "10.0.21.20"},
"node1708vm-3": { "id": 3, "online": 1, "ip": "10.0.21.30"},
"node1708vm-4": { "id": 4, "online": 1, "ip": "10.0.21.40"},
"node1708vm-5": { "id": 5, "online": 1, "ip": "10.0.21.50"}
}
}
-----------
#pvecm status (at this moment 10.0.21.10 is offline)
Quorum information
------------------
Date: Sat Sep 23 21:40:31 2017
Quorum provider: corosync_votequorum
Nodes: 5
Node ID: 0x00000001
Ring ID: 1/412
Quorate: Yes
Votequorum information
----------------------
Expected votes: 5
Highest expected: 5
Total votes: 5
Quorum: 3
Flags: Quorate
Membership information
----------------------
Nodeid Votes Name
0x00000001 1 10.0.21.10 (local)
0x00000002 1 10.0.21.20
0x00000003 1 10.0.21.30
0x00000004 1 10.0.21.40
0x00000005 1 10.0.21.50
--------
journalctl -xe
Sep 23 21:17:54 node1708vm-1 pvestatd[2553]: status update time (600.290 seconds)
Sep 23 21:25:02 node1708vm-1 pmxcfs[2324]: [status] notice: received log
Sep 23 21:27:29 node1708vm-1 pmxcfs[2324]: [status] notice: received log
Sep 23 21:27:55 node1708vm-1 pvestatd[2553]: status update time (600.309 seconds)
Sep 23 21:36:47 node1708vm-1 rrdcached[2195]: flushing old values
Sep 23 21:36:47 node1708vm-1 rrdcached[2195]: rotating journals
Sep 23 21:36:47 node1708vm-1 rrdcached[2195]: started new journal /var/lib/rrdcached/journal/rrd.journal.1506195407.632967
Sep 23 21:36:47 node1708vm-1 rrdcached[2195]: removing old journal /var/lib/rrdcached/journal/rrd.journal.1506188207.632967
Sep 23 21:36:47 node1708vm-1 pmxcfs[2324]: [dcdb] notice: data verification successful
Sep 23 21:37:55 node1708vm-1 pvestatd[2553]: status update time (600.279 seconds)
Sep 23 21:40:02 node1708vm-1 pmxcfs[2324]: [status] notice: received log
Sep 23 21:42:29 node1708vm-1 pmxcfs[2324]: [status] notice: received log
-------
lspci | grep -i ethernet
03:00.0 Ethernet controller: Broadcom Limited BCM57840 NetXtreme II 10 Gigabit Ethernet (rev 11)
03:00.1 Ethernet controller: Broadcom Limited BCM57840 NetXtreme II 10 Gigabit Ethernet (rev 11)
04:00.0 Ethernet controller: Intel Corporation Ethernet Controller 10-Gigabit X540-AT2 (rev 01)
04:00.1 Ethernet controller: Intel Corporation Ethernet Controller 10-Gigabit X540-AT2 (rev 01)
81:00.0 Ethernet controller: Broadcom Limited BCM57840 NetXtreme II 10 Gigabit Ethernet (rev 11)
81:00.1 Ethernet controller: Broadcom Limited BCM57840 NetXtreme II 10 Gigabit Ethernet (rev 11)
83:00.0 Ethernet controller: Broadcom Limited BCM57840 NetXtreme II 10 Gigabit Ethernet (rev 11)
83:00.1 Ethernet controller: Broadcom Limited BCM57840 NetXtreme II 10 Gigabit Ethernet (rev 11)
I have no clue where to search for the reason of this strange issue and would be grateful for any hint in this case.
Thanks in advance.
Juergen
I have a weird behaviour while setting up a brand new cluster.
In the 5-node pve 4.4-18 cluster, all nodes start after reboot in a green, available state in the GUI. After a minute or so, node by node turns from the green state to a red cross until all nodes seem to be offline. It takes another minute and node by node returns to the green state until all nodes are ok.
According to the GUI and pvecm status, the HA-Cluster never leaves a healthy condition, /etc/pve is still writeable, and all nodes are listed in /etc/pve/.members. I can't find any hint in the logs. Date is 100% in sync.
First thought was multicast, but all 5 nodes joined the 239.192.145.115 multicast group.
Some snippets from logs and cat from config files:
# dis igmp-snooping group vlan 21
Total 2 entries.
----------
VLAN 21: Total 2 entries.
(0.0.0.0, 232.43.211.234)
Host slots (0 in total):
Host ports (1 in total):
BAGG5 (00:03:08)
(0.0.0.0, 239.192.145.115)
Host slots (0 in total):
Host ports (5 in total):
BAGG1 (00:03:12)
BAGG2 (00:03:04)
BAGG3 (00:03:04)
BAGG4 (00:03:12)
BAGG5 (00:03:13)
----------
proxmox-ve: 4.4-95 (running kernel: 4.4.79-1-pve)
pve-manager: 4.4-18 (running version: 4.4-18/ef2610e8)
pve-kernel-4.4.35-1-pve: 4.4.35-77
pve-kernel-4.4.79-1-pve: 4.4.79-95
lvm2: 2.02.116-pve3
corosync-pve: 2.4.2-2~pve4+1
libqb0: 1.0.1-1
pve-cluster: 4.0-52
qemu-server: 4.0-112
pve-firmware: 1.1-11
libpve-common-perl: 4.0-96
libpve-access-control: 4.0-23
libpve-storage-perl: 4.0-76
pve-libspice-server1: 0.12.8-2
vncterm: 1.3-2
pve-docs: 4.4-4
pve-qemu-kvm: 2.7.1-4
pve-container: 1.0-101
pve-firewall: 2.0-33
pve-ha-manager: 1.0-41
ksm-control-daemon: 1.2-1
glusterfs-client: 3.5.2-2+deb8u3
lxc-pve: 2.0.7-4
lxcfs: 2.0.6-pve1
criu: 1.6.0-1
novnc-pve: 0.5-9
smartmontools: 6.5+svn4324-1~pve80
zfsutils: 0.6.5.9-pve15~bpo80
ceph: 10.2.9-1~bpo80+1
----------
# /etc/pve/.clusterlog (no other entries, then listed below)
, "msg": "successful auth for user 'root@pam'"},
{"uid": 101, "time": 1506171300, "pri": 6, "tag": "pvedaemon", "pid": 1085, "node": "node1708vm-5", "user": "root@pam", "msg": "successful auth for user 'root@pam'"}
----------
# /var/log/daemon.log
Sep 23 20:55:01 node1708vm-4 pmxcfs[2281]: [status] notice: received log
Sep 23 20:56:41 node1708vm-4 pvedaemon[14733]: worker exit
Sep 23 20:56:41 node1708vm-4 pvedaemon[2530]: worker 14733 finished
Sep 23 20:56:41 node1708vm-4 pvedaemon[2530]: starting 1 worker(s)
Sep 23 20:56:41 node1708vm-4 pvedaemon[2530]: worker 15883 started
Sep 23 20:58:05 node1708vm-4 pveproxy[3010]: worker exit
Sep 23 20:58:05 node1708vm-4 pveproxy[16101]: worker 3010 finished
Sep 23 20:58:05 node1708vm-4 pveproxy[16101]: starting 1 worker(s)
Sep 23 20:58:05 node1708vm-4 pveproxy[16101]: worker 16998 started
Sep 23 20:58:39 node1708vm-4 pvestatd[2479]: status update time (600.276 seconds)
----------
# cat /etc/pve/corosync.conf
logging {
debug: off
to_syslog: yes
}
nodelist {
node {
name: node1708vm-5
nodeid: 5
quorum_votes: 1
ring0_addr: node1708vm-5
}
node {
name: node1708vm-3
nodeid: 3
quorum_votes: 1
ring0_addr: node1708vm-3
}
node {
name: node1708vm-4
nodeid: 4
quorum_votes: 1
ring0_addr: node1708vm-4
}
node {
name: node1708vm-2
nodeid: 2
quorum_votes: 1
ring0_addr: node1708vm-2
}
node {
name: node1708vm-1
nodeid: 1
quorum_votes: 1
ring0_addr: node1708vm-1
}
}
quorum {
provider: corosync_votequorum
}
totem {
cluster_name: CLUSTER01
config_version: 7
ip_version: ipv4
secauth: on
version: 2
token: 4000 # <--- thought this would help, but no, it does not.
interface {
bindnetaddr: 10.0.21.10
ringnumber: 0
}
}
-------
# cat /etc/network/interfaces
auto lo
iface lo inet loopback
auto eth0 inet manual
auto eth1 inet manual
auto eth2 inet manual
auto eth3 inet manual
auto eth4 inet manual
auto eth5 inet manual
auto bond20
iface bond20 inet static
address 10.0.20.10
netmask 255.255.255.0
slaves eth0 eth1 eth4 eth5
bond-miimon 100
bond-mode 802.3ad
auto bond0
iface bond0 inet manual
slaves eth2 eth3
bond-miimon 100
bond-mode 802.3ad
auto bond0.21
iface bond0.21 inet manual
vlan-raw-device bond0
auto bond0.100
iface bond0.100 inet manual
vlan-raw-device bond0
auto bond0.101
iface bond0.101 inet manual
vlan-raw-device bond0
auto bond0.170
iface bond0.170 inet manual
vlan-raw-device bond0
auto bond0.180
iface bond0.180 inet manual
vlan-raw-device bond0
auto bond0.190
iface bond0.190 inet manual
vlan-raw-device bond0
auto bond0.200
iface bond0.200 inet manual
vlan-raw-device bond0
auto vmbr21
iface vmbr21 inet static
address 10.0.21.10
netmask 255.255.255.0
bridge_ports bond0.21
bridge_stp off
bridge_fd 0
auto vmbr99
iface vmbr99 inet static
bridge_ports bond0.99
bridge_stp off
bridge_fd 0
auto vmbr100
iface vmbr100 inet static
bridge_ports bond0.100
bridge_stp off
bridge_fd 0
auto vmbr101
iface vmbr101 inet static
bridge_ports bond0.101
bridge_stp off
bridge_fd 0
auto vmbr170
iface vmbr170 inet static
bridge_ports bond0.170
bridge_stp off
bridge_fd 0
auto vmbr180
iface vmbr180 inet static
bridge_ports bond0.180
bridge_stp off
bridge_fd 0
auto vmbr190
iface vmbr190 inet static
bridge_ports bond0.190
bridge_stp off
bridge_fd 0
auto vmbr200
iface vmbr200 inet static
bridge_ports bond0.200
bridge_stp off
bridge_fd 0
auto eth6
iface eth6 inet static
address 192.168.0.166
netmask 255.255.255.0
gateway 192.168.0.251
----
cat /etc/hosts
127.0.0.1 localhost.localdomain localhost
# Admin LAN
192.168.0.166 node1708-1.coro.company.de node1708-1 pvelocalhost
## Proxmox Cluster ##
10.0.21.10 node1708vm-1.vm.company.de node1708vm-1
10.0.21.20 node1708vm-2.vm.company.de node1708vm-2
10.0.21.30 node1708vm-3.vm.company.de node1708vm-3
10.0.21.40 node1708vm-4.vm.company.de node1708vm-4
10.0.21.50 node1708vm-5.vm.company.de node1708vm-5
# Ceph network is 10.0.20.0/24
# The following lines are desirable for IPv6 capable hosts
::1 ip6-localhost ip6-loopback
fe00::0 ip6-localnet
ff00::0 ip6-mcastprefix
ff02::1 ip6-allnodes
ff02::2 ip6-allrouters
ff02::3 ip6-allhosts
---------
# cat /etc/pve/.members (at the time, when all nodes have a red cross)
{
"nodename": "node1708vm-1",
"version": 18,
"cluster": { "name": "CLUSTER01", "version": 7, "nodes": 5, "quorate": 1 },
"nodelist": {
"node1708vm-1": { "id": 1, "online": 1, "ip": "10.0.21.10"},
"node1708vm-2": { "id": 2, "online": 1, "ip": "10.0.21.20"},
"node1708vm-3": { "id": 3, "online": 1, "ip": "10.0.21.30"},
"node1708vm-4": { "id": 4, "online": 1, "ip": "10.0.21.40"},
"node1708vm-5": { "id": 5, "online": 1, "ip": "10.0.21.50"}
}
}
-----------
#pvecm status (at this moment 10.0.21.10 is offline)
Quorum information
------------------
Date: Sat Sep 23 21:40:31 2017
Quorum provider: corosync_votequorum
Nodes: 5
Node ID: 0x00000001
Ring ID: 1/412
Quorate: Yes
Votequorum information
----------------------
Expected votes: 5
Highest expected: 5
Total votes: 5
Quorum: 3
Flags: Quorate
Membership information
----------------------
Nodeid Votes Name
0x00000001 1 10.0.21.10 (local)
0x00000002 1 10.0.21.20
0x00000003 1 10.0.21.30
0x00000004 1 10.0.21.40
0x00000005 1 10.0.21.50
--------
journalctl -xe
Sep 23 21:17:54 node1708vm-1 pvestatd[2553]: status update time (600.290 seconds)
Sep 23 21:25:02 node1708vm-1 pmxcfs[2324]: [status] notice: received log
Sep 23 21:27:29 node1708vm-1 pmxcfs[2324]: [status] notice: received log
Sep 23 21:27:55 node1708vm-1 pvestatd[2553]: status update time (600.309 seconds)
Sep 23 21:36:47 node1708vm-1 rrdcached[2195]: flushing old values
Sep 23 21:36:47 node1708vm-1 rrdcached[2195]: rotating journals
Sep 23 21:36:47 node1708vm-1 rrdcached[2195]: started new journal /var/lib/rrdcached/journal/rrd.journal.1506195407.632967
Sep 23 21:36:47 node1708vm-1 rrdcached[2195]: removing old journal /var/lib/rrdcached/journal/rrd.journal.1506188207.632967
Sep 23 21:36:47 node1708vm-1 pmxcfs[2324]: [dcdb] notice: data verification successful
Sep 23 21:37:55 node1708vm-1 pvestatd[2553]: status update time (600.279 seconds)
Sep 23 21:40:02 node1708vm-1 pmxcfs[2324]: [status] notice: received log
Sep 23 21:42:29 node1708vm-1 pmxcfs[2324]: [status] notice: received log
-------
lspci | grep -i ethernet
03:00.0 Ethernet controller: Broadcom Limited BCM57840 NetXtreme II 10 Gigabit Ethernet (rev 11)
03:00.1 Ethernet controller: Broadcom Limited BCM57840 NetXtreme II 10 Gigabit Ethernet (rev 11)
04:00.0 Ethernet controller: Intel Corporation Ethernet Controller 10-Gigabit X540-AT2 (rev 01)
04:00.1 Ethernet controller: Intel Corporation Ethernet Controller 10-Gigabit X540-AT2 (rev 01)
81:00.0 Ethernet controller: Broadcom Limited BCM57840 NetXtreme II 10 Gigabit Ethernet (rev 11)
81:00.1 Ethernet controller: Broadcom Limited BCM57840 NetXtreme II 10 Gigabit Ethernet (rev 11)
83:00.0 Ethernet controller: Broadcom Limited BCM57840 NetXtreme II 10 Gigabit Ethernet (rev 11)
83:00.1 Ethernet controller: Broadcom Limited BCM57840 NetXtreme II 10 Gigabit Ethernet (rev 11)
I have no clue where to search for the reason of this strange issue and would be grateful for any hint in this case.
Thanks in advance.
Juergen