Good morning,
Since setting up a cluster with 3 nodes (all running PVE 8.2.2), we've been experiencing an issue where the nodes stop communicating with each other within 24 hours at most. The hosts are connected to each other via a switch, with nothing else in between. There is no load on the switch ports or on the hosts, and we see no overflows, errors, or drops.
The network configuration is very typical:
Corosync is configured as follows:
After the cluster has been running for a few hours, the Corosync statistics (output of corosync-cmapctl) look like this:
Since setting up a cluster with 3 nodes (all running PVE 8.2.2), we've been experiencing an issue where the nodes stop communicating with each other within 24 hours at most. The hosts are connected to each other via a switch, with nothing else in between. There is no load on the switch ports or on the hosts, and we see no overflows, errors, or drops.
The network configuration is very typical:
Code:
iface eno3 inet manual
auto vmbr871
iface vmbr871 inet static
address 13x.x.x.x/26
gateway 13x.x.x.x
bridge-ports eno3
bridge-stp off
bridge-fd 0
Corosync is configured as follows:
Code:
logging {
debug: on
to_syslog: yes
}
nodelist {
node {
name: mordor
nodeid: 1
quorum_votes: 1
ring0_addr: 13x.x.x.x
}
node {
name: neo
nodeid: 3
quorum_votes: 1
ring0_addr: 13x.x.x.x
}
node {
name: valhalla
nodeid: 2
quorum_votes: 1
ring0_addr: 13x.x.x.x
}
}
quorum {
provider: corosync_votequorum
}
totem {
cluster_name: XXXX
config_version: 38
interface {
linknumber: 0
}
ip_version: ipv4-6
link_mode: passive
secauth: on
version: 2
}
After the cluster has been running for a few hours, the Corosync statistics (output of corosync-cmapctl) look like this:
Code:
stats.ipcs.global.active (u64) = 5
stats.ipcs.global.closed (u64) = 0
stats.ipcs.service0.2088.0x5b0d4a5334a0.dispatched (u64) = 0
stats.ipcs.service0.2088.0x5b0d4a5334a0.flow_control (u32) = 0
stats.ipcs.service0.2088.0x5b0d4a5334a0.flow_control_count (u64) = 4
stats.ipcs.service0.2088.0x5b0d4a5334a0.invalid_request (u64) = 0
stats.ipcs.service0.2088.0x5b0d4a5334a0.overload (u64) = 0
stats.ipcs.service0.2088.0x5b0d4a5334a0.procname (str) = pmxcfs
stats.ipcs.service0.2088.0x5b0d4a5334a0.queued (u32) = 0
stats.ipcs.service0.2088.0x5b0d4a5334a0.queueing (i32) = 0
stats.ipcs.service0.2088.0x5b0d4a5334a0.recv_retries (u64) = 0
stats.ipcs.service0.2088.0x5b0d4a5334a0.requests (u64) = 32
stats.ipcs.service0.2088.0x5b0d4a5334a0.responses (u64) = 32
stats.ipcs.service0.2088.0x5b0d4a5334a0.send_retries (u64) = 0
stats.ipcs.service0.2088.0x5b0d4a5334a0.sent (u32) = 0
stats.ipcs.service0.789201.0x5b0d4a531340.dispatched (u64) = 0
stats.ipcs.service0.789201.0x5b0d4a531340.flow_control (u32) = 0
stats.ipcs.service0.789201.0x5b0d4a531340.flow_control_count (u64) = 0
stats.ipcs.service0.789201.0x5b0d4a531340.invalid_request (u64) = 0
stats.ipcs.service0.789201.0x5b0d4a531340.overload (u64) = 0
stats.ipcs.service0.789201.0x5b0d4a531340.procname (str) = corosync-cmapct
stats.ipcs.service0.789201.0x5b0d4a531340.queued (u32) = 0
stats.ipcs.service0.789201.0x5b0d4a531340.queueing (i32) = 0
stats.ipcs.service0.789201.0x5b0d4a531340.recv_retries (u64) = 0
stats.ipcs.service0.789201.0x5b0d4a531340.requests (u64) = 54
stats.ipcs.service0.789201.0x5b0d4a531340.responses (u64) = 55
stats.ipcs.service0.789201.0x5b0d4a531340.send_retries (u64) = 0
stats.ipcs.service0.789201.0x5b0d4a531340.sent (u32) = 0
stats.ipcs.service2.2088.0x5b0d4a513c70.dispatched (u64) = 49994
stats.ipcs.service2.2088.0x5b0d4a513c70.flow_control (u32) = 0
stats.ipcs.service2.2088.0x5b0d4a513c70.flow_control_count (u64) = 4
stats.ipcs.service2.2088.0x5b0d4a513c70.invalid_request (u64) = 0
stats.ipcs.service2.2088.0x5b0d4a513c70.overload (u64) = 0
stats.ipcs.service2.2088.0x5b0d4a513c70.procname (str) = pmxcfs
stats.ipcs.service2.2088.0x5b0d4a513c70.queued (u32) = 0
stats.ipcs.service2.2088.0x5b0d4a513c70.queueing (i32) = 0
stats.ipcs.service2.2088.0x5b0d4a513c70.recv_retries (u64) = 0
stats.ipcs.service2.2088.0x5b0d4a513c70.requests (u64) = 16202
stats.ipcs.service2.2088.0x5b0d4a513c70.responses (u64) = 2
stats.ipcs.service2.2088.0x5b0d4a513c70.send_retries (u64) = 0
stats.ipcs.service2.2088.0x5b0d4a513c70.sent (u32) = 49994
stats.ipcs.service2.2088.0x5b0d4a52e740.dispatched (u64) = 29339
stats.ipcs.service2.2088.0x5b0d4a52e740.flow_control (u32) = 0
stats.ipcs.service2.2088.0x5b0d4a52e740.flow_control_count (u64) = 4
stats.ipcs.service2.2088.0x5b0d4a52e740.invalid_request (u64) = 0
stats.ipcs.service2.2088.0x5b0d4a52e740.overload (u64) = 0
stats.ipcs.service2.2088.0x5b0d4a52e740.procname (str) = pmxcfs
stats.ipcs.service2.2088.0x5b0d4a52e740.queued (u32) = 0
stats.ipcs.service2.2088.0x5b0d4a52e740.queueing (i32) = 0
stats.ipcs.service2.2088.0x5b0d4a52e740.recv_retries (u64) = 0
stats.ipcs.service2.2088.0x5b0d4a52e740.requests (u64) = 9841
stats.ipcs.service2.2088.0x5b0d4a52e740.responses (u64) = 2
stats.ipcs.service2.2088.0x5b0d4a52e740.send_retries (u64) = 0
stats.ipcs.service2.2088.0x5b0d4a52e740.sent (u32) = 29339
stats.ipcs.service3.2088.0x5b0d4a52b2c0.dispatched (u64) = 3
stats.ipcs.service3.2088.0x5b0d4a52b2c0.flow_control (u32) = 0
stats.ipcs.service3.2088.0x5b0d4a52b2c0.flow_control_count (u64) = 4
stats.ipcs.service3.2088.0x5b0d4a52b2c0.invalid_request (u64) = 0
stats.ipcs.service3.2088.0x5b0d4a52b2c0.overload (u64) = 0
stats.ipcs.service3.2088.0x5b0d4a52b2c0.procname (str) = pmxcfs
stats.ipcs.service3.2088.0x5b0d4a52b2c0.queued (u32) = 0
stats.ipcs.service3.2088.0x5b0d4a52b2c0.queueing (i32) = 0
stats.ipcs.service3.2088.0x5b0d4a52b2c0.recv_retries (u64) = 0
stats.ipcs.service3.2088.0x5b0d4a52b2c0.requests (u64) = 2
stats.ipcs.service3.2088.0x5b0d4a52b2c0.responses (u64) = 2
stats.ipcs.service3.2088.0x5b0d4a52b2c0.send_retries (u64) = 0
stats.ipcs.service3.2088.0x5b0d4a52b2c0.sent (u32) = 3
stats.knet.handle.rx_compress_time_ave (u64) = 0
stats.knet.handle.rx_compress_time_max (u64) = 0
stats.knet.handle.rx_compress_time_min (u64) = 18446744073709551615
stats.knet.handle.rx_compressed_original_bytes (u64) = 0
stats.knet.handle.rx_compressed_packets (u64) = 0
stats.knet.handle.rx_compressed_size_bytes (u64) = 0
stats.knet.handle.rx_crypt_packets (u64) = 972319
stats.knet.handle.rx_crypt_time_ave (u64) = 11280
stats.knet.handle.rx_crypt_time_max (u64) = 139752
stats.knet.handle.rx_crypt_time_min (u64) = 5729
stats.knet.handle.tx_compress_time_ave (u64) = 0
stats.knet.handle.tx_compress_time_max (u64) = 0
stats.knet.handle.tx_compress_time_min (u64) = 18446744073709551615
stats.knet.handle.tx_compressed_original_bytes (u64) = 0
stats.knet.handle.tx_compressed_packets (u64) = 0
stats.knet.handle.tx_compressed_size_bytes (u64) = 0
stats.knet.handle.tx_crypt_byte_overhead (u64) = 54978863
stats.knet.handle.tx_crypt_packets (u64) = 1027998
stats.knet.handle.tx_crypt_time_ave (u64) = 15113
stats.knet.handle.tx_crypt_time_max (u64) = 249407
stats.knet.handle.tx_crypt_time_min (u64) = 6286
stats.knet.handle.tx_uncompressed_packets (u64) = 0
stats.knet.node1.link0.connected (u8) = 1
stats.knet.node1.link0.down_count (u32) = 0
stats.knet.node1.link0.enabled (u8) = 1
stats.knet.node1.link0.latency_ave (u32) = 0
stats.knet.node1.link0.latency_max (u32) = 0
stats.knet.node1.link0.latency_min (u32) = 4294967295
stats.knet.node1.link0.latency_samples (u32) = 0
stats.knet.node1.link0.mtu (u32) = 65535
stats.knet.node1.link0.rx_data_bytes (u64) = 0
stats.knet.node1.link0.rx_data_packets (u64) = 0
stats.knet.node1.link0.rx_ping_bytes (u64) = 0
stats.knet.node1.link0.rx_ping_packets (u64) = 0
stats.knet.node1.link0.rx_pmtu_bytes (u64) = 0
stats.knet.node1.link0.rx_pmtu_packets (u64) = 0
stats.knet.node1.link0.rx_pong_bytes (u64) = 0
stats.knet.node1.link0.rx_pong_packets (u64) = 0
stats.knet.node1.link0.rx_total_bytes (u64) = 0
stats.knet.node1.link0.rx_total_packets (u64) = 0
stats.knet.node1.link0.rx_total_retries (u64) = 0
stats.knet.node1.link0.tx_data_bytes (u64) = 21566195
stats.knet.node1.link0.tx_data_errors (u32) = 0
stats.knet.node1.link0.tx_data_packets (u64) = 52124
stats.knet.node1.link0.tx_data_retries (u32) = 0
stats.knet.node1.link0.tx_ping_bytes (u64) = 0
stats.knet.node1.link0.tx_ping_errors (u32) = 0
stats.knet.node1.link0.tx_ping_packets (u64) = 0
stats.knet.node1.link0.tx_ping_retries (u32) = 0
stats.knet.node1.link0.tx_pmtu_bytes (u64) = 0
stats.knet.node1.link0.tx_pmtu_errors (u32) = 0
stats.knet.node1.link0.tx_pmtu_packets (u64) = 0
stats.knet.node1.link0.tx_pmtu_retries (u32) = 0
stats.knet.node1.link0.tx_pong_bytes (u64) = 0
stats.knet.node1.link0.tx_pong_errors (u32) = 0
stats.knet.node1.link0.tx_pong_packets (u64) = 0
stats.knet.node1.link0.tx_pong_retries (u32) = 0
stats.knet.node1.link0.tx_total_bytes (u64) = 21566195
stats.knet.node1.link0.tx_total_errors (u64) = 0
stats.knet.node1.link0.tx_total_packets (u64) = 52124
stats.knet.node1.link0.up_count (u32) = 1
stats.knet.node2.link0.connected (u8) = 1
stats.knet.node2.link0.down_count (u32) = 1
stats.knet.node2.link0.enabled (u8) = 1
stats.knet.node2.link0.latency_ave (u32) = 274
stats.knet.node2.link0.latency_max (u32) = 782
stats.knet.node2.link0.latency_min (u32) = 274
stats.knet.node2.link0.latency_samples (u32) = 2048
stats.knet.node2.link0.mtu (u32) = 1397
stats.knet.node2.link0.rx_data_bytes (u64) = 26579418
stats.knet.node2.link0.rx_data_packets (u64) = 45299
stats.knet.node2.link0.rx_ping_bytes (u64) = 383916
stats.knet.node2.link0.rx_ping_packets (u64) = 14766
stats.knet.node2.link0.rx_pmtu_bytes (u64) = 703103
stats.knet.node2.link0.rx_pmtu_packets (u64) = 981
stats.knet.node2.link0.rx_pong_bytes (u64) = 383708
stats.knet.node2.link0.rx_pong_packets (u64) = 14758
stats.knet.node2.link0.rx_total_bytes (u64) = 28050145
stats.knet.node2.link0.rx_total_packets (u64) = 75804
stats.knet.node2.link0.rx_total_retries (u64) = 0
stats.knet.node2.link0.tx_data_bytes (u64) = 141649552
stats.knet.node2.link0.tx_data_errors (u32) = 0
stats.knet.node2.link0.tx_data_packets (u64) = 966168
stats.knet.node2.link0.tx_data_retries (u32) = 0
stats.knet.node2.link0.tx_ping_bytes (u64) = 1180640
stats.knet.node2.link0.tx_ping_errors (u32) = 0
stats.knet.node2.link0.tx_ping_packets (u64) = 14758
stats.knet.node2.link0.tx_ping_retries (u32) = 0
stats.knet.node2.link0.tx_pmtu_bytes (u64) = 721280
stats.knet.node2.link0.tx_pmtu_errors (u32) = 0
stats.knet.node2.link0.tx_pmtu_packets (u64) = 490
stats.knet.node2.link0.tx_pmtu_retries (u32) = 0
stats.knet.node2.link0.tx_pong_bytes (u64) = 1181280
stats.knet.node2.link0.tx_pong_errors (u32) = 0
stats.knet.node2.link0.tx_pong_packets (u64) = 14766
stats.knet.node2.link0.tx_pong_retries (u32) = 0
stats.knet.node2.link0.tx_total_bytes (u64) = 144732752
stats.knet.node2.link0.tx_total_errors (u64) = 0
stats.knet.node2.link0.tx_total_packets (u64) = 996182
stats.knet.node2.link0.up_count (u32) = 1
stats.knet.node3.link0.connected (u8) = 1
stats.knet.node3.link0.down_count (u32) = 2
stats.knet.node3.link0.enabled (u8) = 1
stats.knet.node3.link0.latency_ave (u32) = 178
stats.knet.node3.link0.latency_max (u32) = 201581
stats.knet.node3.link0.latency_min (u32) = 178
stats.knet.node3.link0.latency_samples (u32) = 2048
stats.knet.node3.link0.mtu (u32) = 1397
stats.knet.node3.link0.rx_data_bytes (u64) = 80941116
stats.knet.node3.link0.rx_data_packets (u64) = 927020
stats.knet.node3.link0.rx_ping_bytes (u64) = 380042
stats.knet.node3.link0.rx_ping_packets (u64) = 14617
stats.knet.node3.link0.rx_pmtu_bytes (u64) = 698798
stats.knet.node3.link0.rx_pmtu_packets (u64) = 974
stats.knet.node3.link0.rx_pong_bytes (u64) = 380042
stats.knet.node3.link0.rx_pong_packets (u64) = 14617
stats.knet.node3.link0.rx_total_bytes (u64) = 82399998
stats.knet.node3.link0.rx_total_packets (u64) = 957228
stats.knet.node3.link0.rx_total_retries (u64) = 0
stats.knet.node3.link0.tx_data_bytes (u64) = 25527312
stats.knet.node3.link0.tx_data_errors (u32) = 0
stats.knet.node3.link0.tx_data_packets (u64) = 60574
stats.knet.node3.link0.tx_data_retries (u32) = 0
stats.knet.node3.link0.tx_ping_bytes (u64) = 1180640
stats.knet.node3.link0.tx_ping_errors (u32) = 0
stats.knet.node3.link0.tx_ping_packets (u64) = 14758
stats.knet.node3.link0.tx_ping_retries (u32) = 0
stats.knet.node3.link0.tx_pmtu_bytes (u64) = 715392
stats.knet.node3.link0.tx_pmtu_errors (u32) = 0
stats.knet.node3.link0.tx_pmtu_packets (u64) = 486
stats.knet.node3.link0.tx_pmtu_retries (u32) = 0
stats.knet.node3.link0.tx_pong_bytes (u64) = 1169360
stats.knet.node3.link0.tx_pong_errors (u32) = 0
stats.knet.node3.link0.tx_pong_packets (u64) = 14617
stats.knet.node3.link0.tx_pong_retries (u32) = 0
stats.knet.node3.link0.tx_total_bytes (u64) = 28592704
stats.knet.node3.link0.tx_total_errors (u64) = 0
stats.knet.node3.link0.tx_total_packets (u64) = 90435
stats.knet.node3.link0.up_count (u32) = 2
stats.pg.msg_queue_avail (u32) = 0
stats.pg.msg_reserved (u32) = 2
stats.srp.avg_backlog_calc (u32) = 0
stats.srp.avg_token_workload (u32) = 0
stats.srp.commit_entered (u64) = 4
stats.srp.commit_token_lost (u64) = 0
stats.srp.consensus_timeouts (u64) = 0
stats.srp.continuous_gather (u32) = 0
stats.srp.continuous_sendmsg_failures (u32) = 0
stats.srp.firewall_enabled_or_nic_failure (u8) = 0
stats.srp.gather_entered (u64) = 5
stats.srp.gather_token_lost (u64) = 0
stats.srp.mcast_retx (u64) = 0
stats.srp.mcast_rx (u64) = 51452
stats.srp.mcast_tx (u64) = 17245
stats.srp.memb_commit_token_rx (u64) = 8
stats.srp.memb_commit_token_tx (u64) = 8
stats.srp.memb_join_rx (u64) = 31
stats.srp.memb_join_tx (u64) = 5
stats.srp.memb_merge_detect_rx (u64) = 25877
stats.srp.memb_merge_detect_tx (u64) = 25876
stats.srp.mtt_rx_token (u32) = 10
stats.srp.operational_entered (u64) = 4
stats.srp.operational_token_lost (u64) = 0
stats.srp.orf_token_rx (u64) = 904961
stats.srp.orf_token_tx (u64) = 4
stats.srp.recovery_entered (u64) = 4
stats.srp.recovery_token_lost (u64) = 0
stats.srp.rx_msg_dropped (u64) = 0
stats.srp.time_since_token_last_received (u64) = 109
stats.srp.token_hold_cancel_rx (u64) = 23967
stats.srp.token_hold_cancel_tx (u64) = 8952
Last edited: