INFO: Starting Backup of VM 102 (lxc)
INFO: status = running
INFO: found old vzdump snapshot (force removal)
Use of uninitialized value in string eq at /usr/share/perl5/PVE/LXC.pm line 1872.
ERROR: Backup of VM 102 failed - Can't delete snapshot: 102 vzdump zfs error: could not find any snapshots to destroy; check snapshot names.
INFO: starting new backup job: vzdump 100 102 103 105 --mode snapshot --storage store --quiet 1 --compress lzo --mailnotification failure
INFO: Starting Backup of VM 100 (lxc)
INFO: status = running
INFO: mode failure - some volumes does not support snapshots
INFO: trying 'suspend' mode instead
INFO: backup mode: suspend
INFO: ionice priority: 7
INFO: starting first sync /proc/11330/root// to /media/store/dump/vzdump-lxc-100-2016_02_04-02_00_01.tmp
INFO: Number of files: 49,591 (reg: 39,672, dir: 3,717, link: 6,170, dev: 2, special: 30)
INFO: Number of created files: 49,590 (reg: 39,672, dir: 3,716, link: 6,170, dev: 2, special: 30)
INFO: Number of deleted files: 0
INFO: Number of regular files transferred: 39,660
INFO: Total file size: 3,757,495,903 bytes
INFO: Total transferred file size: 3,753,246,782 bytes
INFO: Literal data: 3,753,247,621 bytes
INFO: Matched data: 0 bytes
INFO: File list size: 1,769,338
INFO: File list generation time: 0.001 seconds
INFO: File list transfer time: 0.000 seconds
INFO: Total bytes sent: 3,757,266,045
INFO: Total bytes received: 793,627
INFO: sent 3,757,266,045 bytes received 793,627 bytes 97,611,939.53 bytes/sec
INFO: total size is 3,757,495,903 speedup is 1.00
INFO: first sync finished (38 seconds)
INFO: suspend vm
wait4(19264, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], 0, NULL) = 19264
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=19264, si_uid=0, si_status=0, si_utime=0, si_stime=0} ---
rt_sigprocmask(SIG_BLOCK, [CHLD], [], 8) = 0
rt_sigaction(SIGCHLD, NULL, {SIG_DFL, [], 0}, 8) = 0
rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
nanosleep({1, 0}, 0x7fffda1ec980) = 0
pipe([3, 4]) = 0
clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7f72afe9ca50) = 19265
close(4) = 0
read(3, "\10\0\0\0", 4) = 4
read(3, "FREEZING", 8) = 8
close(3) = 0
wait4(19265, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], 0, NULL) = 19265
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=19265, si_uid=0, si_status=0, si_utime=0, si_stime=0} ---
rt_sigprocmask(SIG_BLOCK, [CHLD], [], 8) = 0
rt_sigaction(SIGCHLD, NULL, {SIG_DFL, [], 0}, 8) = 0
rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
nanosleep({1, 0}, 0x7fffda1ec980) = 0
pipe([3, 4]) = 0
clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7f72afe9ca50) = 19266
close(4) = 0
read(3, "\10\0\0\0", 4) = 4
read(3, "FREEZING", 8) = 8
close(3) = 0
wait4(19266, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], 0, NULL) = 19266
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=19266, si_uid=0, si_status=0, si_utime=0, si_stime=0} ---
rt_sigprocmask(SIG_BLOCK, [CHLD], [], 8) = 0
rt_sigaction(SIGCHLD, NULL, {SIG_DFL, [], 0}, 8) = 0
rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
nanosleep({1, 0}, ^CProcess 32565 detached
<detached ...>
4 0 11330 11290 20 0 33524 4064 refrig Ds ? 0:43 /sbin/init
1 0 11821 11330 20 0 19480 180 refrig D ? 0:10 upstart-udev-bridge --daemon
5 0 11902 11330 20 0 49276 3208 refrig Ds ? 0:00 /lib/systemd/systemd-udevd --daemon
5 102 11986 11330 20 0 39120 2308 refrig Ds ? 0:09 dbus-daemon --system --fork
4 0 12127 11330 20 0 35024 2932 refrig Ds ? 0:00 /lib/systemd/systemd-logind
5 101 12306 11330 20 0 255848 8624 refrig Dsl ? 0:01 rsyslogd
1 0 12402 11330 20 0 15548 2112 refrig D ? 0:11 upstart-socket-bridge --daemon
1 0 12406 11330 20 0 15412 1896 refrig D ? 0:10 upstart-file-bridge --daemon
4 0 12539 11330 20 0 15000 2076 refrig Ds ? 0:00 /usr/sbin/xinetd -dontfork -pidfile /var/run/xinetd.pid -stayalive -inetd_compat -inetd_ipv6
4 0 12551 11330 20 0 61376 5424 refrig Ds ? 0:00 /usr/sbin/sshd -D
1 0 12552 11330 20 0 23660 2404 refrig Ds ? 0:00 cron
1 1 12558 11330 20 0 19144 164 refrig Ds ? 0:00 atd
5 0 12595 11330 20 0 19328 2188 refrig Ds ? 0:10 /usr/sbin/irqbalance
4 105 12747 11330 20 0 1128952 237384 refrig Dsl ? 17:11 /usr/sbin/mysqld
5 0 13197 11330 20 0 26916 4864 refrig Ds ? 0:08 /usr/sbin/openvpn --writepid /run/openvpn/corp-client.pid --daemon ovpn-corp-client --status /run/openvpn/corp-client.status 10 --cd /etc/openvpn --config /etc/openvpn/corp-client.conf --script-security 2
1 0 13244 11330 20 0 181732 32244 reques Ds ? 0:32 linode-longview
4 0 13369 11330 20 0 25348 2952 refrig Ds ? 0:00 /usr/lib/postfix/master
4 106 13378 13369 20 0 27464 2896 refrig D ? 0:00 qmgr -l -t unix -u
1 0 13422 11330 20 0 83612 2400 refrig Ds ? 0:00 /usr/sbin/saslauthd -a pam -c -m /var/run/saslauthd -n 2
1 0 13425 13422 20 0 83612 756 refrig D ? 0:00 /usr/sbin/saslauthd -a pam -c -m /var/run/saslauthd -n 2
5 0 13545 11330 20 0 310904 22880 refrig Ds ? 0:02 /usr/sbin/apache2 -k start
5 33 13553 13545 20 0 310976 7848 refrig D ? 0:00 /usr/sbin/apache2 -k start
5 33 13554 13545 20 0 310976 7848 refrig D ? 0:00 /usr/sbin/apache2 -k start
5 33 13555 13545 20 0 310976 7848 refrig D ? 0:00 /usr/sbin/apache2 -k start
5 33 13556 13545 20 0 310976 7848 refrig D ? 0:00 /usr/sbin/apache2 -k start
5 33 13557 13545 20 0 310976 7848 refrig D ? 0:00 /usr/sbin/apache2 -k start
5 0 13766 11330 20 0 265564 12456 refrig Dl ? 0:19 /usr/bin/python /usr/bin/fail2ban-server -b -s /var/run/fail2ban/fail2ban.sock -p /var/run/fail2ban/fail2ban.pid
4 0 13830 11330 20 0 12792 1864 refrig Ds+ pts/3 0:00 /sbin/getty -8 38400 console
4 0 13833 11330 20 0 12792 1976 refrig Ds+ pts/1 0:00 /sbin/getty -8 38400 tty2
4 0 13834 11330 20 0 12792 1940 refrig Ds+ pts/0 0:00 /sbin/getty -8 38400 tty1
5 0 17446 11330 20 0 25116 4220 refrig Ds ? 0:04 /usr/sbin/ntpd -p /var/run/ntpd.pid -g -u 103:109
4 106 29497 13369 20 0 27412 2848 refrig D ? 0:00 pickup -l -t unix -u -c
4 0 32518 11330 20 0 12792 1912 refrig Ds ? 0:00 /sbin/getty -8 38400 ttyS0
They should not suspend the container, but they briefly (if everything goes right, not noticable) freeze the container to create the snapshot volumeI am having the same issue on a brand new Proxmox setup with containers. I am using suspend backup mode and the backing filesystem is local ext4. Next time it happens I will post a full gist. I need automated backups on my containers; one of them is running our VPN service so if that hangs indefinitely, it's pretty hard to remote manage.
I could migrate to ZFS to get snapshots but it sounds like even people using snapshots are having this problem. Do snapshot backups also suspend the container?
I could migrate to ZFS to get snapshots but it sounds like even people using snapshots are having this problem. Do snapshot backups also suspend the container?
They are no container suspend if snasphot is used
(you can check the code in /usr/share/perl5/PVE/VZDump/LXC.pm)
From what I gather it's spinning in a loop waiting for processes to finishCan you describe what this ps process / cron job is doing?
I.. do not know I use FUSE? It's a standard debian containerWhat are you using FUSE for in this container?
I'll try patching the code so it doesnt constantly call ps - that should do the trick, correct?If possible, can you try disabling the cron job and the FUSE mount and see if the issue persists?
From what I gather it's spinning in a loop waiting for processes to finish
There's a isProcessStillRunning function.. which gets all pids via some ps/awk trickery and then searches for the pid in there - probably the best way that could feasably be solved *sigh*
an exists(/proc/$PID) call would be too difficult - it's even being checked if proc is available for other stuff in the code
I.. do not know I use FUSE? It's a standard debian container
I just looked up what that even is and it seems to be an abstraction to emulate file systems for containers? Is that correct?
What info do you need/where do I look it up?
I'll try patching the code so it doesnt constantly call ps - that should do the trick, correct?
Could you provide the output of "mount" and "ls -la /dev" ?
# mount
local/subvol-102-disk-1 on / type zfs (rw,relatime,xattr,posixacl)
none on /dev type tmpfs (rw,relatime,size=100k,mode=755)
proc on /proc type proc (rw,nosuid,nodev,noexec,relatime)
proc on /proc/sys/net type proc (rw,nosuid,nodev,noexec,relatime)
proc on /proc/sys type proc (ro,nosuid,nodev,noexec,relatime)
proc on /proc/sysrq-trigger type proc (ro,nosuid,nodev,noexec,relatime)
sysfs on /sys type sysfs (rw,nosuid,nodev,noexec,relatime)
sysfs on /sys type sysfs (ro,nosuid,nodev,noexec,relatime)
sysfs on /sys/devices/virtual/net type sysfs (rw,relatime)
sysfs on /sys/devices/virtual/net type sysfs (rw,nosuid,nodev,noexec,relatime)
fusectl on /sys/fs/fuse/connections type fusectl (rw,relatime)
cgroup on /sys/fs/cgroup type tmpfs (rw,relatime,size=12k,mode=755)
tmpfs on /sys/fs/cgroup/cgmanager type tmpfs (rw,mode=755)
lxcfs on /proc/cpuinfo type fuse.lxcfs (rw,nosuid,nodev,relatime,user_id=0,group_id=0,allow_other)
lxcfs on /proc/diskstats type fuse.lxcfs (rw,nosuid,nodev,relatime,user_id=0,group_id=0,allow_other)
lxcfs on /proc/meminfo type fuse.lxcfs (rw,nosuid,nodev,relatime,user_id=0,group_id=0,allow_other)
lxcfs on /proc/stat type fuse.lxcfs (rw,nosuid,nodev,relatime,user_id=0,group_id=0,allow_other)
lxcfs on /proc/uptime type fuse.lxcfs (rw,nosuid,nodev,relatime,user_id=0,group_id=0,allow_other)
lxcfs on /sys/fs/cgroup/blkio type fuse.lxcfs (rw,nosuid,nodev,relatime,user_id=0,group_id=0,allow_other)
lxcfs on /sys/fs/cgroup/cpu,cpuacct type fuse.lxcfs (rw,nosuid,nodev,relatime,user_id=0,group_id=0,allow_other)
lxcfs on /sys/fs/cgroup/cpuset type fuse.lxcfs (rw,nosuid,nodev,relatime,user_id=0,group_id=0,allow_other)
lxcfs on /sys/fs/cgroup/devices type fuse.lxcfs (rw,nosuid,nodev,relatime,user_id=0,group_id=0,allow_other)
lxcfs on /sys/fs/cgroup/freezer type fuse.lxcfs (rw,nosuid,nodev,relatime,user_id=0,group_id=0,allow_other)
lxcfs on /sys/fs/cgroup/hugetlb type fuse.lxcfs (rw,nosuid,nodev,relatime,user_id=0,group_id=0,allow_other)
lxcfs on /sys/fs/cgroup/memory type fuse.lxcfs (rw,nosuid,nodev,relatime,user_id=0,group_id=0,allow_other)
lxcfs on /sys/fs/cgroup/systemd type fuse.lxcfs (rw,nosuid,nodev,relatime,user_id=0,group_id=0,allow_other)
lxcfs on /sys/fs/cgroup/net_cls,net_prio type fuse.lxcfs (rw,nosuid,nodev,relatime,user_id=0,group_id=0,allow_other)
lxcfs on /sys/fs/cgroup/perf_event type fuse.lxcfs (rw,nosuid,nodev,relatime,user_id=0,group_id=0,allow_other)
devpts on /dev/console type devpts (rw,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode=000)
devpts on /dev/pts type devpts (rw,relatime,gid=5,mode=620,ptmxmode=666)
devpts on /dev/tty1 type devpts (rw,relatime,gid=5,mode=620,ptmxmode=666)
devpts on /dev/tty2 type devpts (rw,relatime,gid=5,mode=620,ptmxmode=666)
tmpfs on /run type tmpfs (rw,nosuid,noexec,relatime,size=6594316k,mode=755)
tmpfs on /run/lock type tmpfs (rw,nosuid,nodev,noexec,relatime,size=5120k)
tmpfs on /run/shm type tmpfs (rw,nosuid,nodev,noexec,relatime,size=5138020k)
# ls -la /dev
total 9
drwxr-xr-x 3 root root 400 Feb 8 08:39 .
drwxr-xr-x 23 root root 23 Feb 8 08:39 ..
crw------- 1 root tty 136, 3 Feb 8 08:39 console
lrwxrwxrwx 1 root root 13 Feb 8 08:39 fd -> /proc/self/fd
crw-rw-rw- 1 root root 1, 7 Feb 8 08:39 full
srw-rw-rw- 1 root root 0 Feb 8 08:39 log
crw-rw-rw- 1 root root 1, 3 Feb 8 08:39 null
lrwxrwxrwx 1 root root 13 Feb 8 08:39 ptmx -> /dev/pts/ptmx
drwxr-xr-x 2 root root 0 Feb 8 08:39 pts
crw-rw-rw- 1 root root 1, 8 Feb 8 08:39 random
lrwxrwxrwx 1 root root 8 Feb 8 08:39 shm -> /run/shm
lrwxrwxrwx 1 root root 15 Feb 8 08:39 stderr -> /proc/self/fd/2
lrwxrwxrwx 1 root root 15 Feb 8 08:39 stdin -> /proc/self/fd/0
lrwxrwxrwx 1 root root 15 Feb 8 08:39 stdout -> /proc/self/fd/1
crw-rw-rw- 1 root root 5, 0 Feb 8 08:39 tty
crw-rw---- 1 root tty 136, 0 Feb 8 08:39 tty1
crw-rw---- 1 root tty 136, 1 Feb 8 08:39 tty2
crw-rw-rw- 1 root root 1, 9 Feb 8 08:39 urandom
prw-r----- 1 root adm 0 Feb 9 11:54 xconsole
crw-rw-rw- 1 root root 1, 5 Feb 8 08:39 zero
https://github.com/piwik/piwik/blob/master/core/CliMulti/Process.php#L140 and https://github.com/piwik/piwik/blob/master/core/CliMulti/Process.php#L246 for the function itselfIs it possible to give us a copy of the php script you use (the one that calls ps)? If not, could you at least give us those snippets where "ps" is called?
I'll try and set one upIf you already have a non-productive copy of the container that exhibits the problem, you could also try wrapping the ps calls with strace and collect detailled information about what ps actually does when it messes with lxc-freeze..
Me neither, trying to freeze the container with the ps command running in a loop freezes it (takes a few more seconds than without it running, but thats it)While I can reproduce the issue with explicit FUSE mounts in a container (e.g., sshfs), I cannot reproduce it using a vanilla wheezy or jessie container using ps or accessing files in /proc/ ...
I'll give that a try aswellOne more question: does the problem also occur if you setup a new debian wheezy container from scratch and only setup this cron job / php script?
@iMer please open a new thread or a bug report if you can reproduce that
We were not able to reproduce the problem here on a variety of file systems so we need more debugging information
so for people having the problem with their lxc backup not finising in snapshot/suspend mode, can you please post the following debugging information, preferably on pastebin or something similar and link it from here if you can
you need to install the package iotop on the host
1. while the backup process hangs ( not before, not after ), on the host
a) enter the command
ps faxo f,uid,pid,ppid,pri,ni,vsz,rss,wchan:20,stat,tty,time,comm
(don't grep the output we want everything !)
b) enter the command iotop --batch --only --quiet -qq, let it run five seconds, stop it with Ctrl-C and copy the ouput
c) from the host copy the last five lines of the vzdump command output
2. At any time check that the host kernel do not see system wide problem I/O problem with
zgrep 'blocked for more' /var/log/*
you can also add at the end of the document the content of
tasks and cgroup.procs files in /sys/fs/cgroup/freezer/lxc/<CTID>
as recommended by fabian
in the end you should have a document similar to this
https://gist.github.com/EmmanuelKasper/39fe1f64a5420858a266
# gdb
[gdb version output removed]
(gdb) attach 22928
Attaching to process 22928
[symbol reading output removed]
(gdb) bt
#0 __lll_lock_wait_private () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:95
#1 0x00007f9552b816db in _L_lock_11305 () from /lib/x86_64-linux-gnu/libc.so.6
#2 0x00007f9552b7f838 in __GI___libc_realloc (oldmem=0x7f9552ea8620 <main_arena>, bytes=bytes@entry=567) at malloc.c:3025
#3 0x00007f9552b7524b in _IO_vasprintf (result_ptr=0x7f95520fc748, format=<optimized out>, args=args@entry=0x7f95520fc628) at vasprintf.c:84
#4 0x00007f9552b54027 in ___asprintf (string_ptr=string_ptr@entry=0x7f95520fc748, format=format@entry=0x7f9552c6a688 "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n") at asprintf.c:35
#5 0x00007f9552b311c2 in __assert_fail_base (fmt=0x7f9552c6a688 "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n",
assertion=assertion@entry=0x7f9552c6d9a0 "({ __typeof (self->tid) __value; if (sizeof (__value) == 1) asm volatile (\"movb %%fs:%P2,%b0\" : \"=q\" (__value) : \"0\" (0), \"i\" (__builtin_offsetof (struct pthread, tid))); else if (sizeof (__value) == "..., file=file@entry=0x7f9552c6d968 "../nptl/sysdeps/unix/sysv/linux/x86_64/../fork.c", line=line@entry=141,
function=function@entry=0x7f9552c681b2 <__PRETTY_FUNCTION__.11206> "__libc_fork") at assert.c:57
#6 0x00007f9552b31312 in __GI___assert_fail (
assertion=0x7f9552c6d9a0 "({ __typeof (self->tid) __value; if (sizeof (__value) == 1) asm volatile (\"movb %%fs:%P2,%b0\" : \"=q\" (__value) : \"0\" (0), \"i\" (__builtin_offsetof (struct pthread, tid))); else if (sizeof (__value) == "..., file=0x7f9552c6d968 "../nptl/sysdeps/unix/sysv/linux/x86_64/../fork.c", line=141, function=0x7f9552c681b2 <__PRETTY_FUNCTION__.112
06> "__libc_fork") at assert.c:101
#7 0x00007f9552bbd385 in __libc_fork () at ../nptl/sysdeps/unix/sysv/linux/x86_64/../fork.c:141
#8 0x00007f9552ebf425 in __fork () at ../nptl/sysdeps/unix/sysv/linux/pt-fork.c:25
#9 0x0000000000404dd8 in write_task_init_pid_exit (sock=4, target=<optimized out>) at lxcfs.c:2530
#10 0x0000000000406fac in get_task_reaper_pid (task=<optimized out>) at lxcfs.c:2565
#11 get_reaper_busy (task=<optimized out>) at lxcfs.c:2581
#12 proc_uptime_read (fi=<optimized out>, offset=<optimized out>, size=<optimized out>, buf=<optimized out>) at lxcfs.c:2613
#13 proc_read (path=<optimized out>, fi=<optimized out>, offset=<optimized out>, size=<optimized out>, buf=<optimized out>) at lxcfs.c:2891
#14 lxcfs_read (path=0xfffffffffffffe00 <error: Cannot access memory at address 0xfffffffffffffe00>, buf=0x80 <error: Cannot access memory at address 0x80>, size=8191, offset=0, fi=0x7f953c002eb0) at lxcfs.c:2980
#15 0x00007f95530d8574 in fuse_fs_read_buf () from /lib/x86_64-linux-gnu/libfuse.so.2
#16 0x00007f95530d8732 in ?? () from /lib/x86_64-linux-gnu/libfuse.so.2
#17 0x00007f95530e108e in ?? () from /lib/x86_64-linux-gnu/libfuse.so.2
#18 0x00007f95530e1895 in ?? () from /lib/x86_64-linux-gnu/libfuse.so.2
#19 0x00007f95530de394 in ?? () from /lib/x86_64-linux-gnu/libfuse.so.2
#20 0x00007f9552eb60a4 in start_thread (arg=0x7f95520fd700) at pthread_create.c:309
#21 0x00007f9552beb9cd in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:111