Can you post the output of 'systemctl status <vmid>.scope' when this happens?
root@captive023-79001-bl03:~# systemctl status 1200645.scope
● 1200645.scope
Loaded: loaded (/run/systemd/transient/1200645.scope; transient; vendor preset: enabled)
Transient: yes
Active: active (running) since Thu 2019-06-06 10:44:26 CEST; 2h 18min ago
Tasks: 43 (limit: 9830)
CPU: 1h 12min 51.005s
CGroup: /qemu.slice/1200645.scope
└─11121 /usr/bin/kvm -id 1200645 -name vps-zap386293-1.zap-srv.com -chardev socket,id=qmp,path=/var/run/qemu-server/1200645.qmp,server,nowait -mon chardev=qmp,mode=control -chardev socket,id=qmp-event,path=/var/run/qmeventd.so
Jun 06 10:44:26 captive023-79001-bl03 systemd[1]: Started 1200645.scope.
These are VMs used by customers as well as test VMs without any custom services running. All KVM VMs are affected, both Linux and Windows. We are facing this on all nodes in our cluster, whether they use Ceph RBD storage or directory storage. LXC containers on the same hosts are not affected.
Looks like the qemu process is hanging. What's running in the VM?
root@captive023-79001-bl03:~# top -b -n 1 | head -n 15
top - 18:00:08 up 1 day, 7:25, 2 users, load average: 20.87, 19.75, 18.87
Tasks: 1022 total, 4 running, 754 sleeping, 0 stopped, 0 zombie
%Cpu(s): 25.4 us, 10.8 sy, 0.2 ni, 56.6 id, 6.0 wa, 0.0 hi, 1.0 si, 0.0 st
KiB Mem : 26410505+total, 54241876 free, 20167544+used, 8187744 buff/cache
KiB Swap: 0 total, 0 free, 0 used. 66865996 avail Mem
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
3449224 root 20 0 9555708 8.052g 27288 S 400.0 3.2 254:55.22 kvm
3114381 root 20 0 5338756 4.059g 27436 S 330.0 1.6 891:25.72 kvm
3062631 root 20 0 17.275g 0.016t 27996 t 100.0 6.4 118:21.34 kvm
3665345 root 20 0 585036 139504 12728 R 100.0 0.1 8:06.85 pvedaemon worke
3102005 root 20 0 33.313g 0.029t 27424 S 90.0 11.6 365:40.19 kvm
3425985 root 20 0 5406636 4.059g 27412 S 60.0 1.6 122:09.18 kvm
2504 ceph 20 0 5708412 4.655g 29672 S 35.0 1.8 330:51.32 ceph-osd
3581158 1002 20 0 8229688 1.213g 19400 S 35.0 0.5 57:23.93 java
Any ideas?
2598573 be/4 root 0.00 B/s 478.63 M/s 0.00 % 5.02 % [kworker/u129:3]
root@captive022-79001-bl01:~# cat /etc/network/interfaces
auto lo
iface lo inet loopback
iface eno1 inet manual
auto vmbr0
iface vmbr0 inet static
address 185.239.XXXXXX
netmask 255.255.255.0
gateway 185.239.XXXXXX
bridge_ports eno1
bridge_stp off
bridge_fd 0
auto vmbr0:0
iface vmbr0:0 inet static
address 10.10.10.219
netmask 255.255.255.0
auto eno2
iface eno2 inet static
address 10.10.20.30
netmask 255.255.255.0
root@captive022-79001-bl01:~# qm config 1200920
agent: 1,fstrim_cloned_disks=1
balloon: 2048
bootdisk: scsi0
cipassword: **********
ciuser: Administrator
cores: 6
cpu: host
cpulimit: 6
ide2: captive022-lxcstor01-localLV:1200920/vm-1200920-cloudinit.qcow2,media=cdrom
ipconfig0: gw=92.42.47.1,ip=92.42.47.22/24
memory: 8192
name: vps-zap364572-1.zap-srv.com
onboot: 1
ostype: win10
scsi0: captive022-lxcstor01-localLV:1200920/vm-1200920-disk-0.qcow2,cache=directsync,discard=on,format=qcow2,iothread=1,mbps_rd=100,mbps_rd_max=100,mbps_wr=100,mbps_wr_max=100,size=50G,ssd=1
scsihw: virtio-scsi-single
serial0: socket
smbios1: uuid=1b2119a0-b6d9-471e-8df1-4646ebe6c658
vmgenid: 0de43e14-a16f-4954-90af-65f87695beb9
434: tap1098623i0: <BROADCAST,MULTICAST,PROMISC,UP,LOWER_UP> mtu 1500 qdisc htb master fwbr1098623i0 state UNKNOWN group default qlen 1000
link/ether 06:a4:02:54:c7:1f brd ff:ff:ff:ff:ff:ff promiscuity 2
tun
bridge_slave state forwarding priority 32 cost 100 hairpin off guard off root_block off fastleave off learning on flood on port_id 0x8002 port_no 0x2 designated_port 32770 designated_cost 0 designated_bridge 8000.7a:60:4:97:59:bd designated_root 8000.7a:60:4:97:59:bd hold_timer 0.00 message_age_timer 0.00 forward_delay_timer 0.00 topology_change_ack 0 config_pending 0 proxy_arp off proxy_arp_wifi off mcast_router 1 mcast_fast_leave off mcast_flood on neigh_suppress off group_fwd_mask 0x0 group_fwd_mask_str 0x0 vlan_tunnel off numtxqueues 1 numrxqueues 1 gso_max_size 65536 gso_max_segs 65535
435: fwbr1098623i0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether 7a:60:04:97:59:bd brd ff:ff:ff:ff:ff:ff promiscuity 0
bridge forward_delay 0 hello_time 200 max_age 2000 ageing_time 30000 stp_state 0 priority 32768 vlan_filtering 0 vlan_protocol 802.1Q bridge_id 8000.7a:60:4:97:59:bd designated_root 8000.7a:60:4:97:59:bd root_port 0 root_path_cost 0 topology_change 0 topology_change_detected 0 hello_timer 0.00 tcn_timer 0.00 topology_change_timer 0.00 gc_timer 241.70 vlan_default_pvid 1 vlan_stats_enabled 0 group_fwd_mask 0 group_address 01:80:c2:00:00:00 mcast_snooping 1 mcast_router 1 mcast_query_use_ifaddr 0 mcast_querier 0 mcast_hash_elasticity 4 mcast_hash_max 512 mcast_last_member_count 2 mcast_startup_query_count 2 mcast_last_member_interval 100 mcast_membership_interval 26000 mcast_querier_interval 25500 mcast_query_interval 12500 mcast_query_response_interval 1000 mcast_startup_query_interval 3124 mcast_stats_enabled 0 mcast_igmp_version 2 mcast_mld_version 1 nf_call_iptables 0 nf_call_ip6tables 0 nf_call_arptables 0 numtxqueues 1 numrxqueues 1 gso_max_size 65536 gso_max_segs 65535
436: fwpr1098623p0@fwln1098623i0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue master vmbr0 state UP group default qlen 1000
link/ether b2:12:86:a5:83:15 brd ff:ff:ff:ff:ff:ff promiscuity 1
veth
bridge_slave state forwarding priority 32 cost 2 hairpin off guard off root_block off fastleave off learning on flood on port_id 0x8007 port_no 0x7 designated_port 32775 designated_cost 0 designated_bridge 8000.d8:9d:67:6b:b3:b8 designated_root 8000.d8:9d:67:6b:b3:b8 hold_timer 0.00 message_age_timer 0.00 forward_delay_timer 0.00 topology_change_ack 0 config_pending 0 proxy_arp off proxy_arp_wifi off mcast_router 1 mcast_fast_leave off mcast_flood on neigh_suppress off group_fwd_mask 0x0 group_fwd_mask_str 0x0 vlan_tunnel off numtxqueues 1 numrxqueues 1 gso_max_size 65536 gso_max_segs 65535
437: fwln1098623i0@fwpr1098623p0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue master fwbr1098623i0 state UP group default qlen 1000
link/ether 7a:60:04:97:59:bd brd ff:ff:ff:ff:ff:ff promiscuity 1
veth
bridge_slave state forwarding priority 32 cost 2 hairpin off guard off root_block off fastleave off learning on flood on port_id 0x8001 port_no 0x1 designated_port 32769 designated_cost 0 designated_bridge 8000.7a:60:4:97:59:bd designated_root 8000.7a:60:4:97:59:bd hold_timer 0.00 message_age_timer 0.00 forward_delay_timer 0.00 topology_change_ack 0 config_pending 0 proxy_arp off proxy_arp_wifi off mcast_router 1 mcast_fast_leave off mcast_flood on neigh_suppress off group_fwd_mask 0x0 group_fwd_mask_str 0x0 vlan_tunnel off numtxqueues 1 numrxqueues 1 gso_max_size 65536 gso_max_segs 65535