For about three weeks now, the backups of the VMs on our Proxmox host have been finishing around noon or in the afternoon, whereas before they were done by the morning. The PVE host has been rebooted since then.
When we look at the backup task logs in the Proxmox Backup Server, the time between these two entries used to be only a few seconds:
2024-11-28T03:51:11+01:00: add blob "/mnt/pve/NAS1-NFS/vm/101/2024-11-28T01:18:09Z/index.json.blob" (379 bytes, comp: 379)
2024-11-28T03:51:12+01:00: syncing filesystem
Now, however, this step takes many hours:
2024-12-16T03:02:27+01:00: add blob "/mnt/pve/NAS1-NFS/vm/101/2024-12-16T01:25:39Z/index.json.blob" (380 bytes, comp: 380)
2024-12-16T15:05:47+01:00: syncing filesystem
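The slow part is therefore the "syncing filesystem" step right after the last blob is written. Since the target path sits on the NFS mount /mnt/pve/NAS1-NFS, one thing we could still check (assuming that step boils down to a filesystem sync of that mount, which is our guess) is to time such a sync by hand and look at the mount options:

Code:
# Hedged check: time a sync of the filesystem backing the backup target.
# Assumes the slow "syncing filesystem" step corresponds to syncing this NFS mount.
time sync --file-system /mnt/pve/NAS1-NFS
# Show the NFS mount options actually in effect (needs nfs-common installed).
nfsstat -m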
We traced the system calls of the backup task (strace -p #), and every second we see a block like this:
Code:
openat(AT_FDCWD, "/var/run/qemu-server/101.pid", O_RDONLY|O_CLOEXEC) = 12
ioctl(12, TCGETS, 0x7fff7a6dc450) = -1 ENOTTY (Inappropriate ioctl for device)
lseek(12, 0, SEEK_CUR) = 0
newfstatat(12, "", {st_mode=S_IFREG|0600, st_size=5, ...}, AT_EMPTY_PATH) = 0
newfstatat(12, "", {st_mode=S_IFREG|0600, st_size=5, ...}, AT_EMPTY_PATH) = 0
read(12, "6063\n", 8192) = 5
close(12) = 0
openat(AT_FDCWD, "/proc/6063/cmdline", O_RDONLY|O_CLOEXEC) = 12
ioctl(12, TCGETS, 0x7fff7a6dc450) = -1 ENOTTY (Inappropriate ioctl for device)
lseek(12, 0, SEEK_CUR) = 0
newfstatat(12, "", {st_mode=S_IFREG|0444, st_size=0, ...}, AT_EMPTY_PATH) = 0
read(12, "/usr/bin/kvm\0-id\000101\0-name\0SERWE"..., 8192) = 2817
read(12, "", 8192) = 0
close(12) = 0
openat(AT_FDCWD, "/proc/6063/stat", O_RDONLY|O_CLOEXEC) = 12
ioctl(12, TCGETS, 0x7fff7a6dc450) = -1 ENOTTY (Inappropriate ioctl for device)
lseek(12, 0, SEEK_CUR) = 0
newfstatat(12, "", {st_mode=S_IFREG|0444, st_size=0, ...}, AT_EMPTY_PATH) = 0
read(12, "6063 (kvm) S 1 6061 6061 0 -1 13"..., 8192) = 368
close(12) = 0
newfstatat(AT_FDCWD, "/var/run/qemu-server/101.qmp", {st_mode=S_IFSOCK|0750, st_size=0, ...}, 0) = 0
socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0) = 12
fcntl(12, F_SETFD, FD_CLOEXEC) = 0
ioctl(12, TCGETS, 0x7fff7a6dc540) = -1 ENOTTY (Inappropriate ioctl for device)
lseek(12, 0, SEEK_CUR) = -1 ESPIPE (Illegal seek)
fcntl(12, F_SETFD, FD_CLOEXEC) = 0
ioctl(12, TCGETS, 0x7fff7a6dc540) = -1 ENOTTY (Inappropriate ioctl for device)
lseek(12, 0, SEEK_CUR) = -1 ESPIPE (Illegal seek)
fcntl(12, F_GETFL) = 0x2 (flags O_RDWR)
fcntl(12, F_SETFL, O_RDWR|O_NONBLOCK) = 0
fcntl(12, F_GETFL) = 0x802 (flags O_RDWR|O_NONBLOCK)
connect(12, {sa_family=AF_UNIX, sun_path="/var/run/qemu-server/101.qmp"}, 110) = 0
fcntl(12, F_GETFL) = 0x802 (flags O_RDWR|O_NONBLOCK)
fcntl(12, F_SETFL, O_RDWR|O_NONBLOCK) = 0
getsockopt(12, SOL_SOCKET, SO_TYPE, [1], [256 => 4]) = 0
getpid() = 492059
pselect6(16, [12], [12], NULL, {tv_sec=599, tv_nsec=999907000}, NULL) = 2 (in [12], out [12], left {tv_sec=599, tv_nsec=999905990})
read(12, "{\"QMP\": {\"version\": {\"qemu\": {\"m"..., 8192) = 125
write(12, "{\"arguments\":{},\"id\":\"492059:902"..., 65) = 65
pselect6(16, [12], [], NULL, {tv_sec=599, tv_nsec=999764000}, NULL) = 1 (in [12], left {tv_sec=599, tv_nsec=998637969})
read(12, "{\"return\": {}, \"id\": \"492059:902"..., 8192) = 38
getpid() = 492059
pselect6(16, [12], [12], NULL, {tv_sec=599, tv_nsec=998488000}, NULL) = 1 (out [12], left {tv_sec=599, tv_nsec=998486940})
write(12, "{\"arguments\":{},\"id\":\"492059:902"..., 61) = 61
pselect6(16, [12], [], NULL, {tv_sec=599, tv_nsec=998418000}, NULL) = 1 (in [12], left {tv_sec=599, tv_nsec=997704949})
read(12, "{\"return\": {\"total\": 19593135390"..., 8192) = 324
close(12) = 0
clock_nanosleep(CLOCK_REALTIME, 0, {tv_sec=1, tv_nsec=0}, 0x7fff7a6dc730) = 0
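So within each second the worker reads the PID file, re-reads /proc/6063/cmdline and /proc/6063/stat, opens the QMP socket, exchanges two request/reply pairs, closes the socket and sleeps for one second (the clock_nanosleep at the end). In other words it keeps looping on a status poll rather than being stuck in a single blocking call. A hedged way to confirm it just keeps cycling like this (with <PID> standing in for the backup worker's PID, as with the # above):

Code:
# Watch only the loop boundaries for 30 seconds: one connect + one 1-second sleep per iteration.
timeout 30 strace -f -tt -e trace=connect,clock_nanosleep -p <PID>
# Or summarize syscall counts over the same window.
timeout 30 strace -c -p <PID>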
A quick check on the socket:
Code:
socat - UNIX-CONNECT:/var/run/qemu-server/101.qmp
{"QMP": {"version": {"qemu": {"micro": 2, "minor": 0, "major": 9}, "package": "pve-qemu-kvm_9.0.2-4"}, "capabilities": []}}
Verification of a backup made before the issue began and of one made after it completed successfully in both cases.
So the backup task now seems to spend hours on something that used to take only seconds. Do you have any idea what this could be and how to get back to the previous state?