Backup hangs at 100% for many hours

wlang

New Member
Dec 17, 2024
1
0
1
Since about three weeks ago backups of the VMs on our Proxmox end somewhere around the noon or in the afternoon whereas before they managed to finish by the morning. There was a reboot of the PVE host since then.

When we look at the logs of the backup tasks in the Proxmox Backup Server the time between these two entries was only a few seconds before:
2024-11-28T03:51:11+01:00: add blob "/mnt/pve/NAS1-NFS/vm/101/2024-11-28T01:18:09Z/index.json.blob" (379 bytes, comp: 379)
2024-11-28T03:51:12+01:00: syncing filesystem

Now however, this part takes many hours:
2024-12-16T03:02:27+01:00: add blob "/mnt/pve/NAS1-NFS/vm/101/2024-12-16T01:25:39Z/index.json.blob" (380 bytes, comp: 380)
2024-12-16T15:05:47+01:00: syncing filesystem

We tried to trace the system calls for the backup task (strace -p #) and every second we see a block like this:
Code:
openat(AT_FDCWD, "/var/run/qemu-server/101.pid", O_RDONLY|O_CLOEXEC) = 12
ioctl(12, TCGETS, 0x7fff7a6dc450)       = -1 ENOTTY (Niewłaściwy ioctl dla urządzenia)                               //inappropriate ioctl for device
lseek(12, 0, SEEK_CUR)                  = 0
newfstatat(12, "", {st_mode=S_IFREG|0600, st_size=5, ...}, AT_EMPTY_PATH) = 0
newfstatat(12, "", {st_mode=S_IFREG|0600, st_size=5, ...}, AT_EMPTY_PATH) = 0
read(12, "6063\n", 8192)                = 5
close(12)                               = 0
openat(AT_FDCWD, "/proc/6063/cmdline", O_RDONLY|O_CLOEXEC) = 12
ioctl(12, TCGETS, 0x7fff7a6dc450)       = -1 ENOTTY (Niewłaściwy ioctl dla urządzenia)
lseek(12, 0, SEEK_CUR)                  = 0
newfstatat(12, "", {st_mode=S_IFREG|0444, st_size=0, ...}, AT_EMPTY_PATH) = 0
read(12, "/usr/bin/kvm\0-id\000101\0-name\0SERWE"..., 8192) = 2817
read(12, "", 8192)                      = 0
close(12)                               = 0
openat(AT_FDCWD, "/proc/6063/stat", O_RDONLY|O_CLOEXEC) = 12
ioctl(12, TCGETS, 0x7fff7a6dc450)       = -1 ENOTTY (Niewłaściwy ioctl dla urządzenia)
lseek(12, 0, SEEK_CUR)                  = 0
newfstatat(12, "", {st_mode=S_IFREG|0444, st_size=0, ...}, AT_EMPTY_PATH) = 0
read(12, "6063 (kvm) S 1 6061 6061 0 -1 13"..., 8192) = 368
close(12)                               = 0
newfstatat(AT_FDCWD, "/var/run/qemu-server/101.qmp", {st_mode=S_IFSOCK|0750, st_size=0, ...}, 0) = 0
socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0) = 12
fcntl(12, F_SETFD, FD_CLOEXEC)          = 0
ioctl(12, TCGETS, 0x7fff7a6dc540)       = -1 ENOTTY (Niewłaściwy ioctl dla urządzenia)
lseek(12, 0, SEEK_CUR)                  = -1 ESPIPE (Błędne przesunięcie)                                           //invalid seek
fcntl(12, F_SETFD, FD_CLOEXEC)          = 0
ioctl(12, TCGETS, 0x7fff7a6dc540)       = -1 ENOTTY (Niewłaściwy ioctl dla urządzenia)
lseek(12, 0, SEEK_CUR)                  = -1 ESPIPE (Błędne przesunięcie)
fcntl(12, F_GETFL)                      = 0x2 (flags O_RDWR)
fcntl(12, F_SETFL, O_RDWR|O_NONBLOCK)   = 0
fcntl(12, F_GETFL)                      = 0x802 (flags O_RDWR|O_NONBLOCK)
connect(12, {sa_family=AF_UNIX, sun_path="/var/run/qemu-server/101.qmp"}, 110) = 0
fcntl(12, F_GETFL)                      = 0x802 (flags O_RDWR|O_NONBLOCK)
fcntl(12, F_SETFL, O_RDWR|O_NONBLOCK)   = 0
getsockopt(12, SOL_SOCKET, SO_TYPE, [1], [256 => 4]) = 0
getpid()                                = 492059
pselect6(16, [12], [12], NULL, {tv_sec=599, tv_nsec=999907000}, NULL) = 2 (in [12], out [12], left {tv_sec=599, tv_nsec=999905990})
read(12, "{\"QMP\": {\"version\": {\"qemu\": {\"m"..., 8192) = 125
write(12, "{\"arguments\":{},\"id\":\"492059:902"..., 65) = 65
pselect6(16, [12], [], NULL, {tv_sec=599, tv_nsec=999764000}, NULL) = 1 (in [12], left {tv_sec=599, tv_nsec=998637969})
read(12, "{\"return\": {}, \"id\": \"492059:902"..., 8192) = 38
getpid()                                = 492059
pselect6(16, [12], [12], NULL, {tv_sec=599, tv_nsec=998488000}, NULL) = 1 (out [12], left {tv_sec=599, tv_nsec=998486940})
write(12, "{\"arguments\":{},\"id\":\"492059:902"..., 61) = 61
pselect6(16, [12], [], NULL, {tv_sec=599, tv_nsec=998418000}, NULL) = 1 (in [12], left {tv_sec=599, tv_nsec=997704949})
read(12, "{\"return\": {\"total\": 19593135390"..., 8192) = 324
close(12)                               = 0
clock_nanosleep(CLOCK_REALTIME, 0, {tv_sec=1, tv_nsec=0}, 0x7fff7a6dc730) = 0

A quick check on the socket:
Code:
socat - UNIX-CONNECT:/var/run/qemu-server/101.qmp
{"QMP": {"version": {"qemu": {"micro": 2, "minor": 0, "major": 9}, "package": "pve-qemu-kvm_9.0.2-4"}, "capabilities": []}}

Verifications of a backup made before and a backup made after the issue began have passed successfully.


So it seems like the backup task tries to do something for hours that took only seconds before - do you have any ideas what this could be and how to get back to the previous state?
 

About

The Proxmox community has been around for many years and offers help and support for Proxmox VE, Proxmox Backup Server, and Proxmox Mail Gateway.
We think our community is one of the best thanks to people like you!

Get your subscription!

The Proxmox team works very hard to make sure you are running the best software and getting stable updates and security enhancements, as well as quick enterprise support. Tens of thousands of happy customers have a Proxmox subscription. Get yours easily in our online shop.

Buy now!