Hi Alwin (and others),
Thanks for taking the time to look at my issue.
Full error:
task started by HA resource agent
TASK ERROR: start failed: command '/usr/bin/kvm -id 100 -chardev 'socket,id=qmp,path=/var/run/qemu-server/100.qmp,server,nowait' -mon 'chardev=qmp,mode=control' -pidfile /var/run/qemu-server/100.pid -daemonize -smbios 'type=1,uuid=ca2cb04c-43ed-428c-adfb-6098c7036a0d' -name deb9 -smp '4,sockets=2,cores=2,maxcpus=4' -nodefaults -boot 'menu=on,strict=on,reboot-timeout=1000,splash=/usr/share/qemu-server/bootsplash.jpg' -vga std -vnc unix:/var/run/qemu-server/100.vnc,x509,password -cpu qemu64 -m 512 -k de -device 'pci-bridge,id=pci.2,chassis_nr=2,bus=pci.0,addr=0x1f' -device 'pci-bridge,id=pci.1,chassis_nr=1,bus=pci.0,addr=0x1e' -device 'piix3-usb-uhci,id=uhci,bus=pci.0,addr=0x1.0x2' -device 'usb-tablet,id=tablet,bus=uhci.0,port=1' -device 'virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x3' -iscsi 'initiator-name=iqn.1993-08.org.debian:01:f1d993d5b9ef' -drive 'if=none,id=drive-ide2,media=cdrom,aio=threads' -device 'ide-cd,bus=ide.1,unit=0,drive=drive-ide2,id=ide2,bootindex=200' -device 'lsi,id=scsihw0,bus=pci.0,addr=0x5' -drive 'file=rbd:sus-pool/vm-100-disk-1:conf=/etc/pve/ceph.conf:id=admin:keyring=/etc/pve/priv/ceph/sus-pool.keyring,if=none,id=drive-scsi0,format=raw,cache=none,aio=native,detect-zeroes=on' -device 'scsi-hd,bus=scsihw0.0,scsi-id=0,drive=drive-scsi0,id=scsi0,bootindex=100' -netdev 'type=tap,id=net0,ifname=tap100i0,script=/var/lib/qemu-server/pve-bridge,downscript=/var/lib/qemu-server/pve-bridgedown' -device 'e1000,mac=7A:9D:7E:F4:F9:59,netdev=net0,bus=pci.0,addr=0x12,id=net0,bootindex=300' -machine 'accel=tcg'' failed: got timeout
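In case it helps, this is what I would run next on "mox1" to reproduce the timeout by hand (just a sketch of a next step, not something I have run yet; VMID 100 is taken from the error above):

root@mox1:~# qm start 100

That should hit the same timeout and maybe show where it hangs.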
'ceph osd tree', healthy:
root@mox2:~# ceph osd tree
ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF
-1 0.03879 root default
-3 0.01939 host mox1
0 hdd 0.01939 osd.0 up 1.00000 1.00000
-5 0.01939 host mox2
1 hdd 0.01939 osd.1 up 1.00000 1.00000
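For completeness, I can also provide the overall cluster status from the same point in time if that is useful (standard commands, nothing special):

root@mox2:~# ceph -s
root@mox2:~# ceph health detail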
'ceph osd tree', unhealthy (after powering off node "mox2"; note that osd.1 is still reported "up"):
root@mox1:~# ceph osd tree
ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF
-1 0.03879 root default
-3 0.01939 host mox1
0 hdd 0.01939 osd.0 up 1.00000 1.00000
-5 0.01939 host mox2
1 hdd 0.01939 osd.1 up 1.00000 1.00000
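My assumption is that the monitor simply has not marked osd.1 down yet. If it is safe to do, I could force the map to reflect reality and re-check (a sketch; 'mon.mox1' is my guess at the monitor name on that node):

root@mox1:~# ceph osd down 1
root@mox1:~# ceph osd tree
root@mox1:~# ceph daemon mon.mox1 config get mon_osd_min_down_reporters

The last command is me guessing at the cause: with only one surviving OSD, I am not sure the down-reporter threshold can even be reached.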
'ceph osd dump', healthy:
root@mox2:~# ceph osd dump
epoch 121
fsid 92157505-5d27-465e-9feb-b8e71d32224b
created 2017-12-24 01:52:17.703537
modified 2017-12-28 09:50:18.352252
flags sortbitwise,recovery_deletes,purged_snapdirs
crush_version 5
full_ratio 0.95
backfillfull_ratio 0.9
nearfull_ratio 0.85
require_min_compat_client jewel
min_compat_client jewel
require_osd_release luminous
pool 4 'sus-pool' replicated size 2 min_size 1 crush_rule 0 object_hash rjenkins pg_num 64 pgp_num 64 last_change 84 flags hashpspool stripe_width 0 application rbd
removed_snaps [1~3]
max_osd 2
osd.0 up in weight 1 up_from 109 up_thru 119 down_at 105 last_clean_interval [86,104) 192.168.99.101:6800/1553 192.168.99.101:6801/1553 192.168.99.101:6802/1553 192.168.99.101:6803/1553 exists,up b05a32d2-4b51-4f49-9a20-52a4b16e679d
osd.1 up in weight 1 up_from 119 up_thru 119 down_at 117 last_clean_interval [101,115) 192.168.99.102:6801/1581 192.168.99.102:6802/1581 192.168.99.102:6803/1581 192.168.99.102:6804/1581 exists,up 1e198f49-4fe6-476e-87f3-c3c50411efed
blacklist 192.168.99.102:0/3451113584 expires 2017-12-28 10:50:18.301930
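The blacklist entry at the end of the dump can also be listed directly, in case that matters:

root@mox2:~# ceph osd blacklist ls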
'ceph osd dump', unhealthy (after powering off node "mox2"; note that osd.1 is still reported "up"):
root@mox1:~# ceph osd dump
epoch 121
fsid 92157505-5d27-465e-9feb-b8e71d32224b
created 2017-12-24 01:52:17.703537
modified 2017-12-28 09:50:18.352252
flags sortbitwise,recovery_deletes,purged_snapdirs
crush_version 5
full_ratio 0.95
backfillfull_ratio 0.9
nearfull_ratio 0.85
require_min_compat_client jewel
min_compat_client jewel
require_osd_release luminous
pool 4 'sus-pool' replicated size 2 min_size 1 crush_rule 0 object_hash rjenkins pg_num 64 pgp_num 64 last_change 84 flags hashpspool stripe_width 0 application rbd
removed_snaps [1~3]
max_osd 2
osd.0 up in weight 1 up_from 109 up_thru 119 down_at 105 last_clean_interval [86,104) 192.168.99.101:6800/1553 192.168.99.101:6801/1553 192.168.99.101:6802/1553 192.168.99.101:6803/1553 exists,up b05a32d2-4b51-4f49-9a20-52a4b16e679d
osd.1 up in weight 1 up_from 119 up_thru 119 down_at 117 last_clean_interval [101,115) 192.168.99.102:6801/1581 192.168.99.102:6802/1581 192.168.99.102:6803/1581 192.168.99.102:6804/1581 exists,up 1e198f49-4fe6-476e-87f3-c3c50411efed
blacklist 192.168.99.102:0/3451113584 expires 2017-12-28 10:50:18.301930
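To see whether the disk image itself is still reachable from "mox1", I could also check (pool and image names are taken from the kvm command line in the error above):

root@mox1:~# rbd -p sus-pool ls
root@mox1:~# rbd info sus-pool/vm-100-disk-1

If these hang the same way the VM start does, that would point at the stale "up" state of osd.1.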
VM 100 was first running on "mox2". I was able to ping and SSH into the VM, so I generated the 'ceph' output as requested on "mox2". Then I switched off "mox2", and a few minutes later I generated new 'ceph' output from "mox1", at a time when the UI showed VM 100 as running on "mox1". Now I am not able to ping or SSH into the VM. I think it only looks started but isn't.
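To double-check that guess, I would verify on "mox1" (the PID file path is the one from the kvm command line above):

root@mox1:~# qm status 100
root@mox1:~# ls -l /var/run/qemu-server/100.pid

If there is no qemu process behind the PID file, the VM never actually started, despite what the UI shows.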