Hello,
we are facing a pretty curious problem in our Proxmox cluster.
When we restart a node, or restart corosync on some host (systemctl restart corosync), the Ethernet adapter on some of the other nodes crashes and does not recover by itself.
Yesterday we performed a test: we restarted the server "backup", and within a few seconds we lost the connection to the servers "pve", "prox3" and "prox2-brno". I went physically to the "prox3" server, thinking that I could do something about the situation via the local terminal. Unfortunately, the server was in an unusable state: it was not responding to keystrokes, and after 15 minutes of waiting only a few Enter keypresses came through. In this state a "hard restart" is always necessary.
I am not sure how it can happen that restarting one server affects the Ethernet adapters on different nodes of the cluster.
I have tried to collect some logs; just tell me if I forgot any necessary ones.
I am not entirely sure whether this is related to some corosync setting, but we have added knet_transport: sctp and token: 10000, because the cluster was falling apart all the time. After adding those two lines that problem was fixed and the cluster stays together.
The test took place at about 17:03.
Thanks for any help!
we are facing a pretty curious problem in our Proxmox cluster.
When we restart a node, or restart corosync on some host (systemctl restart corosync), the Ethernet adapter on some of the other nodes crashes and does not recover by itself.
Yesterday we performed a test: we restarted the server "backup", and within a few seconds we lost the connection to the servers "pve", "prox3" and "prox2-brno". I went physically to the "prox3" server, thinking that I could do something about the situation via the local terminal. Unfortunately, the server was in an unusable state: it was not responding to keystrokes, and after 15 minutes of waiting only a few Enter keypresses came through. In this state a "hard restart" is always necessary.
I am not sure how it can happen that restarting one server affects the Ethernet adapters on different nodes of the cluster.
I have tried to collect some logs; just tell me if I forgot any necessary ones.
I am not entirely sure whether this is related to some corosync setting, but we have added knet_transport: sctp and token: 10000, because the cluster was falling apart all the time. After adding those two lines that problem was fixed and the cluster stays together.
logging {
debug: off
to_syslog: yes
}
nodelist {
node {
name: backup
nodeid: 5
quorum_votes: 1
ring0_addr: 192.168.0.14
}
node {
name: havirov-prox1
nodeid: 8
quorum_votes: 1
ring0_addr: 192.168.6.2
}
node {
name: prox1
nodeid: 2
quorum_votes: 1
ring0_addr: 192.168.0.11
}
node {
name: prox1-brno
nodeid: 9
quorum_votes: 1
ring0_addr: 192.168.7.2
}
node {
name: prox2
nodeid: 3
quorum_votes: 1
ring0_addr: 192.168.0.12
}
node {
name: prox2-brno
nodeid: 7
quorum_votes: 1
ring0_addr: 192.168.7.10
}
node {
name: prox3
nodeid: 4
quorum_votes: 1
ring0_addr: 192.168.0.13
}
node {
name: prox4
nodeid: 6
quorum_votes: 1
ring0_addr: 192.168.0.15
}
node {
name: pve
nodeid: 1
quorum_votes: 1
ring0_addr: 192.168.0.19
}
}
quorum {
provider: corosync_votequorum
}
totem {
cluster_name: cutter-pv
config_version: 46
interface {
knet_transport: sctp
linknumber: 0
}
ip_version: ipv4
secauth: on
token: 10000
version: 2
}
logging {
debug: off
to_syslog: yes
}
nodelist {
node {
name: backup
nodeid: 5
quorum_votes: 1
ring0_addr: 192.168.0.14
}
node {
name: havirov-prox1
nodeid: 8
quorum_votes: 1
ring0_addr: 192.168.6.2
}
node {
name: prox1
nodeid: 2
quorum_votes: 1
ring0_addr: 192.168.0.11
}
node {
name: prox1-brno
nodeid: 9
quorum_votes: 1
ring0_addr: 192.168.7.2
}
node {
name: prox2
nodeid: 3
quorum_votes: 1
ring0_addr: 192.168.0.12
}
node {
name: prox2-brno
nodeid: 7
quorum_votes: 1
ring0_addr: 192.168.7.10
}
node {
name: prox3
nodeid: 4
quorum_votes: 1
ring0_addr: 192.168.0.13
}
node {
name: prox4
nodeid: 6
quorum_votes: 1
ring0_addr: 192.168.0.15
}
node {
name: pve
nodeid: 1
quorum_votes: 1
ring0_addr: 192.168.0.19
}
}
quorum {
provider: corosync_votequorum
}
totem {
cluster_name: cutter-pv
config_version: 46
interface {
knet_transport: sctp
linknumber: 0
}
ip_version: ipv4
secauth: on
token: 10000
version: 2
}
root@prox3:~$ pveversion -v
proxmox-ve: 6.1-2 (running kernel: 5.4.27-1-pve)
pve-manager: 6.1-8 (running version: 6.1-8/806edfe1)
pve-kernel-5.4: 6.1-8
pve-kernel-helper: 6.1-8
pve-kernel-5.3: 6.1-6
pve-kernel-5.4.27-1-pve: 5.4.27-1
pve-kernel-5.4.24-1-pve: 5.4.24-1
pve-kernel-5.3.18-3-pve: 5.3.18-3
pve-kernel-4.13.13-5-pve: 4.13.13-38
pve-kernel-4.13.13-2-pve: 4.13.13-33
ceph: 12.2.13-pve1
ceph-fuse: 12.2.13-pve1
corosync: 3.0.3-pve1
criu: 3.11-3
glusterfs-client: 5.5-3
ifupdown: 0.8.35+pve1
ksm-control-daemon: 1.3-1
libjs-extjs: 6.0.1-10
libknet1: 1.15-pve1
libpve-access-control: 6.0-6
libpve-apiclient-perl: 3.0-3
libpve-common-perl: 6.0-17
libpve-guest-common-perl: 3.0-5
libpve-http-server-perl: 3.0-5
libpve-storage-perl: 6.1-5
libqb0: 1.0.5-1
libspice-server1: 0.14.2-4~pve6+1
lvm2: 2.03.02-pve4
lxc-pve: 3.2.1-1
lxcfs: 4.0.1-pve1
novnc-pve: 1.1.0-1
proxmox-mini-journalreader: 1.1-1
proxmox-widget-toolkit: 2.1-3
pve-cluster: 6.1-4
pve-container: 3.0-23
pve-docs: 6.1-6
pve-edk2-firmware: 2.20200229-1
pve-firewall: 4.0-10
pve-firmware: 3.0-7
pve-ha-manager: 3.0-9
pve-i18n: 2.0-4
pve-qemu-kvm: 4.1.1-4
pve-xtermjs: 4.3.0-1
qemu-server: 6.1-7
smartmontools: 7.1-pve2
spiceterm: 3.1-1
vncterm: 1.6-1
zfsutils-linux: 0.8.3-pve1
proxmox-ve: 6.1-2 (running kernel: 5.4.27-1-pve)
pve-manager: 6.1-8 (running version: 6.1-8/806edfe1)
pve-kernel-5.4: 6.1-8
pve-kernel-helper: 6.1-8
pve-kernel-5.3: 6.1-6
pve-kernel-5.4.27-1-pve: 5.4.27-1
pve-kernel-5.4.24-1-pve: 5.4.24-1
pve-kernel-5.3.18-3-pve: 5.3.18-3
pve-kernel-4.13.13-5-pve: 4.13.13-38
pve-kernel-4.13.13-2-pve: 4.13.13-33
ceph: 12.2.13-pve1
ceph-fuse: 12.2.13-pve1
corosync: 3.0.3-pve1
criu: 3.11-3
glusterfs-client: 5.5-3
ifupdown: 0.8.35+pve1
ksm-control-daemon: 1.3-1
libjs-extjs: 6.0.1-10
libknet1: 1.15-pve1
libpve-access-control: 6.0-6
libpve-apiclient-perl: 3.0-3
libpve-common-perl: 6.0-17
libpve-guest-common-perl: 3.0-5
libpve-http-server-perl: 3.0-5
libpve-storage-perl: 6.1-5
libqb0: 1.0.5-1
libspice-server1: 0.14.2-4~pve6+1
lvm2: 2.03.02-pve4
lxc-pve: 3.2.1-1
lxcfs: 4.0.1-pve1
novnc-pve: 1.1.0-1
proxmox-mini-journalreader: 1.1-1
proxmox-widget-toolkit: 2.1-3
pve-cluster: 6.1-4
pve-container: 3.0-23
pve-docs: 6.1-6
pve-edk2-firmware: 2.20200229-1
pve-firewall: 4.0-10
pve-firmware: 3.0-7
pve-ha-manager: 3.0-9
pve-i18n: 2.0-4
pve-qemu-kvm: 4.1.1-4
pve-xtermjs: 4.3.0-1
qemu-server: 6.1-7
smartmontools: 7.1-pve2
spiceterm: 3.1-1
vncterm: 1.6-1
zfsutils-linux: 0.8.3-pve1
Thanks for any help!