I have had this problem for a while: corosync stops syncing and pvecm loses quorum.
I "fixed" this by running this cron job every few minutes. This allows corosync to resync for a while, but when I connect via the web UI only the current node seems to be online.
---------------------------------------------------------
#!/bin/bash
# Cron watchdog: restart corosync when the cluster reports exactly one vote.
#
# Reads the text printed by `pvecm status`. When its "Total votes" field is
# exactly 1, prints a timestamp and a notice, then restarts the corosync
# service. Positional arguments are accepted but ignored.

# Succeeds only when the status text in $1 contains a "Total votes:" field
# whose value is exactly 1. The value must be followed by whitespace or the
# end of the text, so 10, 11, ... are not mistaken for 1. Any amount of
# whitespace is tolerated between the colon and the number.
has_single_vote() {
  local status_text=$1
  local pattern='Total votes:[[:space:]]*1([[:space:]]|$)'
  [[ $status_text =~ $pattern ]]
}

# Entry point: query the cluster status and restart corosync if needed.
# Returns the status of the restart command, or 0 when no restart is needed.
main() {
  local stat
  # Only the printed text drives the decision; the exit status of
  # `pvecm status` is deliberately not treated as a failure signal.
  stat=$(pvecm status)
  #echo "$stat"
  if has_single_vote "$stat"; then
    date
    echo "only 1 total votes, restarting corosync"
    /usr/sbin/service corosync restart
  fi
}

main "$@"
---------------------------------------------------------
After that, things are OK for a few minutes:
root@proxa3:~# pvecm status
Quorum information
------------------
Date: Tue Feb 27 22:54:05 2018
Quorum provider: corosync_votequorum
Nodes: 4
Node ID: 0x00000003
Ring ID: 1/16475960
Quorate: Yes
Votequorum information
----------------------
Expected votes: 4
Highest expected: 4
Total votes: 4
Quorum: 3
Flags: Quorate
Membership information
----------------------
Nodeid Votes Name
0x00000001 1 10.10.117.75
0x00000002 1 10.10.117.76
0x00000003 1 10.10.117.77 (local)
0x00000004 1 10.10.117.78
--------------------------------------------------------------------
however I am getting many of these in syslog:
pmxcfs[8440]: [status] crit: cpg_send_message failed: 2
other forum posts suggest running omping and it seems I am having 50% packet loss with multicast.
Is something wrong with the network?
root@proxa1:~# omping -c 600 -i 1 -q proxa1 proxa2 proxa3 proxa4
proxa4 : joined (S,G) = (*, 232.43.211.234), pinging
proxa2 : joined (S,G) = (*, 232.43.211.234), pinging
proxa3 : joined (S,G) = (*, 232.43.211.234), pinging
proxa4 : given amount of query messages was sent
proxa2 : given amount of query messages was sent
proxa3 : given amount of query messages was sent
proxa2 : unicast, xmt/rcv/%loss = 600/600/0%, min/avg/max/std-dev = 0.084/0.176/0.303/0.030
proxa2 : multicast, xmt/rcv/%loss = 600/301/49%, min/avg/max/std-dev = 0.094/0.207/0.274/0.031
proxa3 : unicast, xmt/rcv/%loss = 600/600/0%, min/avg/max/std-dev = 0.075/0.323/1.118/0.188
proxa3 : multicast, xmt/rcv/%loss = 600/300/50%, min/avg/max/std-dev = 0.173/0.441/0.912/0.157
proxa4 : unicast, xmt/rcv/%loss = 600/600/0%, min/avg/max/std-dev = 0.084/0.228/0.496/0.069
proxa4 : multicast, xmt/rcv/%loss = 600/304/49% (seq>=2 49%), min/avg/max/std-dev = 0.129/0.293/0.531/0.059
root@proxa2:~# omping -c 600 -i 1 -q proxa1 proxa2 proxa3 proxa4
proxa4 : joined (S,G) = (*, 232.43.211.234), pinging
proxa1 : joined (S,G) = (*, 232.43.211.234), pinging
proxa3 : joined (S,G) = (*, 232.43.211.234), pinging
proxa1 : given amount of query messages was sent
proxa4 : given amount of query messages was sent
proxa3 : given amount of query messages was sent
proxa1 : unicast, xmt/rcv/%loss = 600/600/0%, min/avg/max/std-dev = 0.086/0.182/0.259/0.024
proxa1 : multicast, xmt/rcv/%loss = 600/302/49% (seq>=2 49%), min/avg/max/std-dev = 0.118/0.206/0.277/0.022
proxa3 : unicast, xmt/rcv/%loss = 600/600/0%, min/avg/max/std-dev = 0.097/0.409/1.127/0.193
proxa3 : multicast, xmt/rcv/%loss = 600/300/50%, min/avg/max/std-dev = 0.245/0.504/1.158/0.187
proxa4 : unicast, xmt/rcv/%loss = 600/600/0%, min/avg/max/std-dev = 0.075/0.234/0.494/0.076
proxa4 : multicast, xmt/rcv/%loss = 600/302/49% (seq>=2 49%), min/avg/max/std-dev = 0.123/0.284/0.520/0.075
root@proxa3:~# omping -c 600 -i 1 -q proxa1 proxa2 proxa3 proxa4
proxa1 : joined (S,G) = (*, 232.43.211.234), pinging
proxa4 : joined (S,G) = (*, 232.43.211.234), pinging
proxa2 : joined (S,G) = (*, 232.43.211.234), pinging
proxa1 : given amount of query messages was sent
proxa2 : given amount of query messages was sent
proxa4 : given amount of query messages was sent
proxa1 : unicast, xmt/rcv/%loss = 600/600/0%, min/avg/max/std-dev = 0.100/0.194/0.439/0.052
proxa1 : multicast, xmt/rcv/%loss = 600/300/50% (seq>=2 49%), min/avg/max/std-dev = 0.104/0.204/0.468/0.048
proxa2 : unicast, xmt/rcv/%loss = 600/600/0%, min/avg/max/std-dev = 0.100/0.197/1.221/0.068
proxa2 : multicast, xmt/rcv/%loss = 600/300/50% (seq>=2 49%), min/avg/max/std-dev = 0.109/0.214/0.399/0.045
proxa4 : unicast, xmt/rcv/%loss = 600/600/0%, min/avg/max/std-dev = 0.068/0.193/0.722/0.091
proxa4 : multicast, xmt/rcv/%loss = 600/300/50% (seq>=2 49%), min/avg/max/std-dev = 0.124/0.306/0.644/0.115
root@proxa4:~# omping -c 600 -i 1 -q proxa1 proxa2 proxa3 proxa4
proxa1 : joined (S,G) = (*, 232.43.211.234), pinging
proxa2 : joined (S,G) = (*, 232.43.211.234), pinging
proxa3 : joined (S,G) = (*, 232.43.211.234), pinging
proxa1 : given amount of query messages was sent
proxa2 : given amount of query messages was sent
proxa3 : given amount of query messages was sent
proxa1 : unicast, xmt/rcv/%loss = 600/600/0%, min/avg/max/std-dev = 0.083/0.190/0.259/0.027
proxa1 : multicast, xmt/rcv/%loss = 600/304/49%, min/avg/max/std-dev = 0.095/0.201/0.247/0.023
proxa2 : unicast, xmt/rcv/%loss = 600/600/0%, min/avg/max/std-dev = 0.066/0.160/0.231/0.020
proxa2 : multicast, xmt/rcv/%loss = 600/302/49%, min/avg/max/std-dev = 0.086/0.173/0.250/0.020
proxa3 : unicast, xmt/rcv/%loss = 600/600/0%, min/avg/max/std-dev = 0.082/0.298/1.148/0.179
proxa3 : multicast, xmt/rcv/%loss = 600/300/50%, min/avg/max/std-dev = 0.221/0.366/1.055/0.100
I "fixed" this by running this cron job every few minutes. This allows corosync to resync for a while, but when I connect via the web UI only the current node seems to be online.
---------------------------------------------------------
#!/bin/bash
# Cron watchdog: restart corosync when the cluster reports exactly one vote.
#
# Reads the text printed by `pvecm status`. When its "Total votes" field is
# exactly 1, prints a timestamp and a notice, then restarts the corosync
# service. Positional arguments are accepted but ignored.

# Succeeds only when the status text in $1 contains a "Total votes:" field
# whose value is exactly 1. The value must be followed by whitespace or the
# end of the text, so 10, 11, ... are not mistaken for 1. Any amount of
# whitespace is tolerated between the colon and the number.
has_single_vote() {
  local status_text=$1
  local pattern='Total votes:[[:space:]]*1([[:space:]]|$)'
  [[ $status_text =~ $pattern ]]
}

# Entry point: query the cluster status and restart corosync if needed.
# Returns the status of the restart command, or 0 when no restart is needed.
main() {
  local stat
  # Only the printed text drives the decision; the exit status of
  # `pvecm status` is deliberately not treated as a failure signal.
  stat=$(pvecm status)
  #echo "$stat"
  if has_single_vote "$stat"; then
    date
    echo "only 1 total votes, restarting corosync"
    /usr/sbin/service corosync restart
  fi
}

main "$@"
---------------------------------------------------------
After that, things are OK for a few minutes:
root@proxa3:~# pvecm status
Quorum information
------------------
Date: Tue Feb 27 22:54:05 2018
Quorum provider: corosync_votequorum
Nodes: 4
Node ID: 0x00000003
Ring ID: 1/16475960
Quorate: Yes
Votequorum information
----------------------
Expected votes: 4
Highest expected: 4
Total votes: 4
Quorum: 3
Flags: Quorate
Membership information
----------------------
Nodeid Votes Name
0x00000001 1 10.10.117.75
0x00000002 1 10.10.117.76
0x00000003 1 10.10.117.77 (local)
0x00000004 1 10.10.117.78
--------------------------------------------------------------------
however I am getting many of these in syslog:
pmxcfs[8440]: [status] crit: cpg_send_message failed: 2
other forum posts suggest running omping and it seems I am having 50% packet loss with multicast.
Is something wrong with the network?
root@proxa1:~# omping -c 600 -i 1 -q proxa1 proxa2 proxa3 proxa4
proxa4 : joined (S,G) = (*, 232.43.211.234), pinging
proxa2 : joined (S,G) = (*, 232.43.211.234), pinging
proxa3 : joined (S,G) = (*, 232.43.211.234), pinging
proxa4 : given amount of query messages was sent
proxa2 : given amount of query messages was sent
proxa3 : given amount of query messages was sent
proxa2 : unicast, xmt/rcv/%loss = 600/600/0%, min/avg/max/std-dev = 0.084/0.176/0.303/0.030
proxa2 : multicast, xmt/rcv/%loss = 600/301/49%, min/avg/max/std-dev = 0.094/0.207/0.274/0.031
proxa3 : unicast, xmt/rcv/%loss = 600/600/0%, min/avg/max/std-dev = 0.075/0.323/1.118/0.188
proxa3 : multicast, xmt/rcv/%loss = 600/300/50%, min/avg/max/std-dev = 0.173/0.441/0.912/0.157
proxa4 : unicast, xmt/rcv/%loss = 600/600/0%, min/avg/max/std-dev = 0.084/0.228/0.496/0.069
proxa4 : multicast, xmt/rcv/%loss = 600/304/49% (seq>=2 49%), min/avg/max/std-dev = 0.129/0.293/0.531/0.059
root@proxa2:~# omping -c 600 -i 1 -q proxa1 proxa2 proxa3 proxa4
proxa4 : joined (S,G) = (*, 232.43.211.234), pinging
proxa1 : joined (S,G) = (*, 232.43.211.234), pinging
proxa3 : joined (S,G) = (*, 232.43.211.234), pinging
proxa1 : given amount of query messages was sent
proxa4 : given amount of query messages was sent
proxa3 : given amount of query messages was sent
proxa1 : unicast, xmt/rcv/%loss = 600/600/0%, min/avg/max/std-dev = 0.086/0.182/0.259/0.024
proxa1 : multicast, xmt/rcv/%loss = 600/302/49% (seq>=2 49%), min/avg/max/std-dev = 0.118/0.206/0.277/0.022
proxa3 : unicast, xmt/rcv/%loss = 600/600/0%, min/avg/max/std-dev = 0.097/0.409/1.127/0.193
proxa3 : multicast, xmt/rcv/%loss = 600/300/50%, min/avg/max/std-dev = 0.245/0.504/1.158/0.187
proxa4 : unicast, xmt/rcv/%loss = 600/600/0%, min/avg/max/std-dev = 0.075/0.234/0.494/0.076
proxa4 : multicast, xmt/rcv/%loss = 600/302/49% (seq>=2 49%), min/avg/max/std-dev = 0.123/0.284/0.520/0.075
root@proxa3:~# omping -c 600 -i 1 -q proxa1 proxa2 proxa3 proxa4
proxa1 : joined (S,G) = (*, 232.43.211.234), pinging
proxa4 : joined (S,G) = (*, 232.43.211.234), pinging
proxa2 : joined (S,G) = (*, 232.43.211.234), pinging
proxa1 : given amount of query messages was sent
proxa2 : given amount of query messages was sent
proxa4 : given amount of query messages was sent
proxa1 : unicast, xmt/rcv/%loss = 600/600/0%, min/avg/max/std-dev = 0.100/0.194/0.439/0.052
proxa1 : multicast, xmt/rcv/%loss = 600/300/50% (seq>=2 49%), min/avg/max/std-dev = 0.104/0.204/0.468/0.048
proxa2 : unicast, xmt/rcv/%loss = 600/600/0%, min/avg/max/std-dev = 0.100/0.197/1.221/0.068
proxa2 : multicast, xmt/rcv/%loss = 600/300/50% (seq>=2 49%), min/avg/max/std-dev = 0.109/0.214/0.399/0.045
proxa4 : unicast, xmt/rcv/%loss = 600/600/0%, min/avg/max/std-dev = 0.068/0.193/0.722/0.091
proxa4 : multicast, xmt/rcv/%loss = 600/300/50% (seq>=2 49%), min/avg/max/std-dev = 0.124/0.306/0.644/0.115
root@proxa4:~# omping -c 600 -i 1 -q proxa1 proxa2 proxa3 proxa4
proxa1 : joined (S,G) = (*, 232.43.211.234), pinging
proxa2 : joined (S,G) = (*, 232.43.211.234), pinging
proxa3 : joined (S,G) = (*, 232.43.211.234), pinging
proxa1 : given amount of query messages was sent
proxa2 : given amount of query messages was sent
proxa3 : given amount of query messages was sent
proxa1 : unicast, xmt/rcv/%loss = 600/600/0%, min/avg/max/std-dev = 0.083/0.190/0.259/0.027
proxa1 : multicast, xmt/rcv/%loss = 600/304/49%, min/avg/max/std-dev = 0.095/0.201/0.247/0.023
proxa2 : unicast, xmt/rcv/%loss = 600/600/0%, min/avg/max/std-dev = 0.066/0.160/0.231/0.020
proxa2 : multicast, xmt/rcv/%loss = 600/302/49%, min/avg/max/std-dev = 0.086/0.173/0.250/0.020
proxa3 : unicast, xmt/rcv/%loss = 600/600/0%, min/avg/max/std-dev = 0.082/0.298/1.148/0.179
proxa3 : multicast, xmt/rcv/%loss = 600/300/50%, min/avg/max/std-dev = 0.221/0.366/1.055/0.100