Hello all,
I'm experimenting with HA clustering and I ran into a possible bug.
The VM is configured with an HA group so that it preferably runs on node 3:
Code:
root@srv-pve1:~# cat /etc/pve/ha/groups.cfg
group: PVE3_First
    comment PVE3 is prefered for this group
    nodes srv-pve3
    nofailback 0
    restricted 0
Code:
root@srv-pve1:~# cat /etc/pve/ha/resources.cfg
vm: 244
    comment Associates OMV4 on PVE3
    group PVE3_First
    state started
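For completeness, I believe the same group and resource could have been created from the CLI with something like the following (option names written from memory, so please treat this as a sketch rather than the exact commands I used):
Code:
# Hedged sketch: how I think the HA group and resource above could be created via ha-manager
root@srv-pve1:~# ha-manager groupadd PVE3_First --nodes srv-pve3 --nofailback 0 --restricted 0 --comment "PVE3 is preferred for this group"
root@srv-pve1:~# ha-manager add vm:244 --group PVE3_First --state started --comment "Associates OMV4 on PVE3"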
The cluster is healthy and has quorum:
Code:
root@srv-pve1:~# ha-manager status
quorum OK
master srv-pve2 (active, Wed Mar 20 10:10:49 2019)
lrm srv-pve1 (active, Wed Mar 20 10:10:48 2019)
lrm srv-pve2 (active, Wed Mar 20 10:10:51 2019)
lrm srv-pve3 (active, Wed Mar 20 10:10:54 2019)
service vm:244 (srv-pve3, started)
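(Side note: I assume the same HA status is available over the API; I sometimes cross-check it with pvesh, shown here purely as an example:)
Code:
# Hedged example: query the HA manager status via the API instead of the CLI
root@srv-pve1:~# pvesh get /cluster/ha/status/current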
I shut down node 3, and the VM is shut down along with it, as per the default HA policy.
HA detects that node PVE3 is unreachable and that the VM is stopped, then tries to start it on another node, and the start fails:
Code:
task started by HA resource agent
TASK ERROR: start failed: command '/usr/bin/kvm -id 244 -name SRV-OMV-VIDEO4 -chardev 'socket,id=qmp,path=/var/run/qemu-server/244.qmp,server,nowait' -mon 'chardev=qmp,mode=control' -chardev 'socket,id=qmp-event,path=/var/run/qmeventd.sock,reconnect=5' -mon 'chardev=qmp-event,mode=control' -pidfile /var/run/qemu-server/244.pid -daemonize -smbios 'type=1,uuid=8868a218-d321-4b38-afc4-dc56108658b9' -smp '8,sockets=2,cores=4,maxcpus=8' -nodefaults -boot 'menu=on,strict=on,reboot-timeout=1000,splash=/usr/share/qemu-server/bootsplash.jpg' -vnc unix:/var/run/qemu-server/244.vnc,x509,password -cpu kvm64,+lahf_lm,+sep,+kvm_pv_unhalt,+kvm_pv_eoi,enforce -m 8192 -device 'pci-bridge,id=pci.2,chassis_nr=2,bus=pci.0,addr=0x1f' -device 'pci-bridge,id=pci.1,chassis_nr=1,bus=pci.0,addr=0x1e' -device 'vmgenid,guid=449229ce-33c8-430e-b801-4132a9633ac7' -device 'piix3-usb-uhci,id=uhci,bus=pci.0,addr=0x1.0x2' -device 'usb-tablet,id=tablet,bus=uhci.0,port=1' -device 'VGA,id=vga,bus=pci.0,addr=0x2' -chardev 'socket,path=/var/run/qemu-server/244.qga,server,nowait,id=qga0' -device 'virtio-serial,id=qga0,bus=pci.0,addr=0x8' -device 'virtserialport,chardev=qga0,name=org.qemu.guest_agent.0' -iscsi 'initiator-name=iqn.1993-08.org.debian:01:9d7c74e48281' -drive 'if=none,id=drive-ide2,media=cdrom,aio=threads' -device 'ide-cd,bus=ide.1,unit=0,drive=drive-ide2,id=ide2,bootindex=200' -device 'virtio-scsi-pci,id=scsihw0,bus=pci.0,addr=0x5' -drive 'file=rbd:STD_POOL/vm-244-disk-0:conf=/etc/pve/ceph.conf:id=admin:keyring=/etc/pve/priv/ceph/STD_POOL_vm.keyring,if=none,id=drive-scsi0,format=raw,cache=none,aio=native,detect-zeroes=on' -device 'scsi-hd,bus=scsihw0.0,channel=0,scsi-id=0,lun=0,drive=drive-scsi0,id=scsi0,bootindex=100' -drive 'file=rbd:STD_POOL/vm-244-disk-1:conf=/etc/pve/ceph.conf:id=admin:keyring=/etc/pve/priv/ceph/STD_POOL_vm.keyring,if=none,id=drive-scsi1,format=raw,cache=none,aio=native,detect-zeroes=on' -device 'scsi-hd,bus=scsihw0.0,channel=0,scsi-id=0,lun=1,drive=drive-scsi1,id=scsi1' -netdev 'type=tap,id=net0,ifname=tap244i0,script=/var/lib/qemu-server/pve-bridge,downscript=/var/lib/qemu-server/pve-bridgedown,vhost=on' -device 'virtio-net-pci,mac=56:67:F6:99:BD:4C,netdev=net0,bus=pci.0,addr=0x12,id=net0,bootindex=300' -netdev 'type=tap,id=net1,ifname=tap244i1,script=/var/lib/qemu-server/pve-bridge,downscript=/var/lib/qemu-server/pve-bridgedown,vhost=on' -device 'virtio-net-pci,mac=6A:33:04:C1:03:F9,netdev=net1,bus=pci.0,addr=0x13,id=net1,bootindex=301' -machine 'type=pc'' failed: got timeout
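Since the disks live on RBD, my guess (and it is only a guess) is that the start timed out on the storage side. Next time I will capture something like the following during the failover (commands as I understand them, image names taken from the kvm command line above):
Code:
# Hedged diagnostics I plan to collect while srv-pve3 is down
root@srv-pve1:~# ceph -s                               # overall Ceph health during the outage
root@srv-pve1:~# rbd status STD_POOL/vm-244-disk-0     # any watcher still registered on the image?
root@srv-pve1:~# rbd lock ls STD_POOL/vm-244-disk-0    # any stale lock left behind by the dead node?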
Nevertheless, HA considers the VM started:
Code:
root@srv-pve1:~# ha-manager status
quorum OK
master srv-pve2 (active, Wed Mar 20 10:23:09 2019)
lrm srv-pve1 (active, Wed Mar 20 10:23:07 2019)
lrm srv-pve2 (active, Wed Mar 20 10:23:09 2019)
lrm srv-pve3 (old timestamp - dead?, Wed Mar 20 10:13:39 2019)
service vm:244 (srv-pve1, started)
The same status is reported on the QEMU side:
Code:
root@srv-pve1:~# qm status 244
status: running
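In hindsight, I should also have checked whether a KVM process really existed behind that "running" status; I assume something along these lines would show it (illustrative only):
Code:
# Hedged check: is there actually a kvm process and a pidfile for VMID 244 on this node?
root@srv-pve1:~# pgrep -af 'kvm -id 244'
root@srv-pve1:~# cat /var/run/qemu-server/244.pid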
All the while, the PVE Cluster is healthy:
Code:
root@srv-pve1:~# pvecm status
Quorum information
------------------
Date: Wed Mar 20 10:23:30 2019
Quorum provider: corosync_votequorum
Nodes: 2
Node ID: 0x00000001
Ring ID: 1/756
Quorate: Yes
Votequorum information
----------------------
Expected votes: 3
Highest expected: 3
Total votes: 2
Quorum: 2
Flags: Quorate
Membership information
----------------------
Nodeid Votes Name
0x00000001 1 10.0.0.101 (local)
0x00000002 1 10.0.0.102
The VM is unreachable: no console, no ping, no services, no guest agent…
I didn't try to power-cycle the VM while it was in this inconsistent state (reported as started/running by HA and qm, but actually dead).
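If I had needed to recover by hand, I assume (untested) that cycling the resource through HA would have been cleaner than power-cycling the VM directly, roughly:
Code:
# Hedged, untested recovery idea: cycle the HA resource state instead of touching the VM directly
root@srv-pve1:~# ha-manager set vm:244 --state stopped
root@srv-pve1:~# ha-manager set vm:244 --state started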
Everything works properly again as soon as I power node 3 back on:
Code:
root@srv-pve3:~# qm status 244
status: running
Code:
root@srv-pve1:~# ha-manager status
quorum OK
master srv-pve2 (active, Wed Mar 20 10:49:49 2019)
lrm srv-pve1 (active, Wed Mar 20 10:49:48 2019)
lrm srv-pve2 (active, Wed Mar 20 10:49:51 2019)
lrm srv-pve3 (active, Wed Mar 20 10:49:54 2019)
service vm:244 (srv-pve3, started)
Code:
root@srv-pve1:~# pvecm status
Quorum information
------------------
Date: Wed Mar 20 10:49:35 2019
Quorum provider: corosync_votequorum
Nodes: 3
Node ID: 0x00000001
Ring ID: 1/760
Quorate: Yes
Votequorum information
----------------------
Expected votes: 3
Highest expected: 3
Total votes: 3
Quorum: 2
Flags: Quorate
Membership information
----------------------
Nodeid Votes Name
0x00000001 1 10.0.0.101 (local)
0x00000002 1 10.0.0.102
0x00000003 1 10.0.0.103
The VM is back on node 3, and pings and services are operational.
Regarding the environment, the cluster is up to date and all nodes run the same versions:
Code:
root@srv-pve1:~# pveversion -v
proxmox-ve: 5.3-1 (running kernel: 4.15.18-12-pve)
pve-manager: 5.3-11 (running version: 5.3-11/d4907f84)
pve-kernel-4.15: 5.3-3
pve-kernel-4.15.18-12-pve: 4.15.18-35
pve-kernel-4.15.18-11-pve: 4.15.18-34
pve-kernel-4.15.18-10-pve: 4.15.18-32
pve-kernel-4.15.18-9-pve: 4.15.18-30
pve-kernel-4.15.18-8-pve: 4.15.18-28
pve-kernel-4.15.18-7-pve: 4.15.18-27
pve-kernel-4.15.17-1-pve: 4.15.17-9
ceph: 12.2.11-pve1
corosync: 2.4.4-pve1
criu: 2.11.1-1~bpo90
glusterfs-client: 3.8.8-1
ksm-control-daemon: 1.2-2
libjs-extjs: 6.0.1-2
libpve-access-control: 5.1-3
libpve-apiclient-perl: 2.0-5
libpve-common-perl: 5.0-47
libpve-guest-common-perl: 2.0-20
libpve-http-server-perl: 2.0-12
libpve-storage-perl: 5.0-39
libqb0: 1.0.3-1~bpo9
lvm2: 2.02.168-pve6
lxc-pve: 3.1.0-3
lxcfs: 3.0.3-pve1
novnc-pve: 1.0.0-3
proxmox-widget-toolkit: 1.0-23
pve-cluster: 5.0-33
pve-container: 2.0-35
pve-docs: 5.3-3
pve-edk2-firmware: 1.20181023-1
pve-firewall: 3.0-18
pve-firmware: 2.0-6
pve-ha-manager: 2.0-8
pve-i18n: 1.0-9
pve-libspice-server1: 0.14.1-2
pve-qemu-kvm: 2.12.1-2
pve-xtermjs: 3.10.1-2
qemu-server: 5.0-47
smartmontools: 6.5+svn4324-1
spiceterm: 3.0-5
vncterm: 1.5-3
zfsutils-linux: 0.7.13-pve1~bpo1
Storage is backed by a healthy Ceph cluster:
Code:
root@srv-pve1:~# ceph-brag
{
  "cluster_creation_date": "2018-10-11 17:51:26.259055",
  "uuid": "dd52bfc1-5409-4730-8f3a-72637478418a",
  "components_count": {
    "num_data_bytes": 7485079178309,
    "num_mons": 3,
    "num_pgs": 672,
    "num_mdss": 1,
    "num_pools": 3,
    "num_osds": 18,
    "num_bytes_total": 72002146295808,
    "num_objects": 1788813
  },
  "crush_types": [
    {
      "count": 6,
      "type": "host"
    },
    {
      "count": 2,
      "type": "root"
    },
    {
      "count": 18,
      "type": "devices"
    }
  ],
  "ownership": {},
  "pool_metadata": [
    {
      "type": 1,
      "id": 6,
      "size": 2
    },
    {
      "type": 1,
      "id": 9,
      "size": 3
    },
    {
      "type": 1,
      "id": 10,
      "size": 3
    }
  ],
  "sysinfo": {
    "kernel_types": [
      {
        "count": 18,
        "type": "#1 SMP PVE 4.15.18-35 (Wed, 13 Mar 2019 08:24:42 +0100)"
      }
    ],
    "cpu_archs": [
      {
        "count": 18,
        "arch": "x86_64"
      }
    ],
    "cpus": [
      {
        "count": 18,
        "cpu": "Intel(R) Xeon(R) CPU E5-2440 0 @ 2.40GHz"
      }
    ],
    "kernel_versions": [
      {
        "count": 18,
        "version": "4.15.18-12-pve"
      }
    ],
    "ceph_versions": [
      {
        "count": 18,
        "version": "12.2.11(c96e82ac735a75ae99d4847983711e1f2dbf12e5)"
      }
    ],
    "os_info": [
      {
        "count": 18,
        "os": "Linux"
      }
    ],
    "distros": []
  }
}
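One thing I notice in this output is that one of the pools has size 2, and I don't know offhand whether that one is STD_POOL; next time I will also record the pool details, for example:
Code:
# Hedged follow-up: map pool ids to names and replica settings (I'm not sure which id above is STD_POOL)
root@srv-pve1:~# ceph osd pool ls detail
root@srv-pve1:~# ceph df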
Could this be a bug that I should report, or is it a configuration problem on my side?
BR, PAH.