Liebe Leute,
ich bekomme nun seit wenigen Tagen ca. 1-2 Mal täglich die Fehlermeldung:
HEALTH_WARN - daemons have recently crashed
Mons, Mgrs und OSDs sind online und alles läuft meines Erachtens einwandfrei.
Mit
# ceph crash archive-all
deaktiviere ich die Fehlermeldungen. Nun bin ich jedoch neugierig, was den Fehler verursacht. Der Fehler tritt bei unterschiedlichen Knoten auf.
# ceph crash info <ID>
gibt folgende Fehler aus:
Node4
{
"os_version_id": "10",
"utsname_machine": "x86_64",
"entity_name": "mon.promo4",
"backtrace": [
"(()+0x12730) [0x7f30ca142730]",
"(gsignal()+0x10b) [0x7f30c9c257bb]",
"(abort()+0x121) [0x7f30c9c10535]",
"(ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x1a3) [0x7f30cb27be79]",
"(()+0x282000) [0x7f30cb27c000]",
"(Paxos::store_state(MMonPaxos*)+0xaa8) [0x5602540626f8]",
"(Paxos::handle_commit(boost::intrusive_ptr<MonOpRequest>)+0x2ea) [0x560254062a5a]",
"(Paxos::dispatch(boost::intrusive_ptr<MonOpRequest>)+0x223) [0x560254068213]",
"(Monitor::dispatch_op(boost::intrusive_ptr<MonOpRequest>)+0x131c) [0x560253f9db1c]",
"(Monitor::_ms_dispatch(Message*)+0x4aa) [0x560253f9e10a]",
"(Monitor::ms_dispatch(Message*)+0x26) [0x560253fcda36]",
"(Dispatcher::ms_dispatch2(boost::intrusive_ptr<Message> const&)+0x26) [0x560253fc9f66]",
"(DispatchQueue::entry()+0x1a49) [0x7f30cb4b4e69]",
"(DispatchQueue::DispatchThread::entry()+0xd) [0x7f30cb5629ed]",
"(()+0x7fa3) [0x7f30ca137fa3]",
"(clone()+0x3f) [0x7f30c9ce74cf]"
],
"process_name": "ceph-mon",
"assert_line": 485,
"archived": "2020-01-21 07:02:49.036123",
"assert_file": "/mnt/npool/tlamprecht/pve-ceph/ceph-14.2.6/src/common/ceph_time.h",
"utsname_sysname": "Linux",
"os_version": "10 (buster)",
"os_id": "10",
"assert_msg": "/mnt/npool/tlamprecht/pve-ceph/ceph-14.2.6/src/common/ceph_time.h: In function 'ceph::time_detail::timespan ceph::to_timespan(ceph::time_detail::signedspan)' thread 7f30c11fe700 time 2020-01-21 03:43:48.848411\n/mnt/npool/tlamprecht/pve-ceph/ceph-14.2.6/src/common/ceph_time.h: 485: FAILED ceph_assert(z >= signedspan::zero())\n",
"assert_func": "ceph::time_detail::timespan ceph::to_timespan(ceph::time_detail::signedspan)",
"ceph_version": "14.2.6",
"os_name": "Debian GNU/Linux 10 (buster)",
"timestamp": "2020-01-21 02:43:48.891122Z",
"assert_thread_name": "ms_dispatch",
"utsname_release": "5.3.13-1-pve",
"utsname_hostname": "promo4",
"crash_id": "2020-01-21_02:43:48.891122Z_0aade13c-463f-43fe-9b05-76ca71f6bc1b",
"assert_condition": "z >= signedspan::zero()",
"utsname_version": "#1 SMP PVE 5.3.13-1 (Thu, 05 Dec 2019 07:18:14 +0100)"
}
Node2
{
"os_version_id": "10",
"utsname_machine": "x86_64",
"entity_name": "mon.promo2",
"backtrace": [
"(()+0x12730) [0x7f74f6c3f730]",
"(gsignal()+0x10b) [0x7f74f67227bb]",
"(abort()+0x121) [0x7f74f670d535]",
"(ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x1a3) [0x7f74f7d78e79]",
"(()+0x282000) [0x7f74f7d79000]",
"(Paxos::store_state(MMonPaxos*)+0xaa8) [0x55b9540ae6f8]",
"(Paxos::handle_commit(boost::intrusive_ptr<MonOpRequest>)+0x2ea) [0x55b9540aea5a]",
"(Paxos::dispatch(boost::intrusive_ptr<MonOpRequest>)+0x223) [0x55b9540b4213]",
"(Monitor::dispatch_op(boost::intrusive_ptr<MonOpRequest>)+0x131c) [0x55b953fe9b1c]",
"(Monitor::_ms_dispatch(Message*)+0x4aa) [0x55b953fea10a]",
"(Monitor::ms_dispatch(Message*)+0x26) [0x55b954019a36]",
"(Dispatcher::ms_dispatch2(boost::intrusive_ptr<Message> const&)+0x26) [0x55b954015f66]",
"(DispatchQueue::entry()+0x1a49) [0x7f74f7fb1e69]",
"(DispatchQueue::DispatchThread::entry()+0xd) [0x7f74f805f9ed]",
"(()+0x7fa3) [0x7f74f6c34fa3]",
"(clone()+0x3f) [0x7f74f67e44cf]"
],
"process_name": "ceph-mon",
"assert_line": 485,
"archived": "2020-01-21 07:02:49.041386",
"assert_file": "/mnt/npool/tlamprecht/pve-ceph/ceph-14.2.6/src/common/ceph_time.h",
"utsname_sysname": "Linux",
"os_version": "10 (buster)",
"os_id": "10",
"assert_msg": "/mnt/npool/tlamprecht/pve-ceph/ceph-14.2.6/src/common/ceph_time.h: In function 'ceph::time_detail::timespan ceph::to_timespan(ceph::time_detail::signedspan)' thread 7f74edcfb700 time 2020-01-20 22:32:56.933800\n/mnt/npool/tlamprecht/pve-ceph/ceph-14.2.6/src/common/ceph_time.h: 485: FAILED ceph_assert(z >= signedspan::zero())\n",
"assert_func": "ceph::time_detail::timespan ceph::to_timespan(ceph::time_detail::signedspan)",
"ceph_version": "14.2.6",
"os_name": "Debian GNU/Linux 10 (buster)",
"timestamp": "2020-01-20 21:32:56.947402Z",
"assert_thread_name": "ms_dispatch",
"utsname_release": "5.3.13-1-pve",
"utsname_hostname": "promo2",
"crash_id": "2020-01-20_21:32:56.947402Z_3ae7220c-23c9-478a-a22d-626c2fa34414",
"assert_condition": "z >= signedspan::zero()",
"utsname_version": "#1 SMP PVE 5.3.13-1 (Thu, 05 Dec 2019 07:18:14 +0100)"
}
Das sind zwei Ausgaben von unterschiedlichen Crash-Reports.
Vielleicht hat ja jemand eine Idee.
LG
ff
ich bekomme nun seit wenigen Tagen ca. 1-2 Mal täglich die Fehlermeldung:
HEALTH_WARN - daemons have recently crashed
Mons, Mgrs und OSDs sind online und alles läuft meines Erachtens einwandfrei.
Mit
# ceph crash archive-all
deaktiviere ich die Fehlermeldungen. Nun bin ich jedoch neugierig, was den Fehler verursacht. Der Fehler tritt bei unterschiedlichen Knoten auf.
# ceph crash info <ID>
gibt folgende Fehler aus:
Node4
{
"os_version_id": "10",
"utsname_machine": "x86_64",
"entity_name": "mon.promo4",
"backtrace": [
"(()+0x12730) [0x7f30ca142730]",
"(gsignal()+0x10b) [0x7f30c9c257bb]",
"(abort()+0x121) [0x7f30c9c10535]",
"(ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x1a3) [0x7f30cb27be79]",
"(()+0x282000) [0x7f30cb27c000]",
"(Paxos::store_state(MMonPaxos*)+0xaa8) [0x5602540626f8]",
"(Paxos::handle_commit(boost::intrusive_ptr<MonOpRequest>)+0x2ea) [0x560254062a5a]",
"(Paxos::dispatch(boost::intrusive_ptr<MonOpRequest>)+0x223) [0x560254068213]",
"(Monitor::dispatch_op(boost::intrusive_ptr<MonOpRequest>)+0x131c) [0x560253f9db1c]",
"(Monitor::_ms_dispatch(Message*)+0x4aa) [0x560253f9e10a]",
"(Monitor::ms_dispatch(Message*)+0x26) [0x560253fcda36]",
"(Dispatcher::ms_dispatch2(boost::intrusive_ptr<Message> const&)+0x26) [0x560253fc9f66]",
"(DispatchQueue::entry()+0x1a49) [0x7f30cb4b4e69]",
"(DispatchQueue::DispatchThread::entry()+0xd) [0x7f30cb5629ed]",
"(()+0x7fa3) [0x7f30ca137fa3]",
"(clone()+0x3f) [0x7f30c9ce74cf]"
],
"process_name": "ceph-mon",
"assert_line": 485,
"archived": "2020-01-21 07:02:49.036123",
"assert_file": "/mnt/npool/tlamprecht/pve-ceph/ceph-14.2.6/src/common/ceph_time.h",
"utsname_sysname": "Linux",
"os_version": "10 (buster)",
"os_id": "10",
"assert_msg": "/mnt/npool/tlamprecht/pve-ceph/ceph-14.2.6/src/common/ceph_time.h: In function 'ceph::time_detail::timespan ceph::to_timespan(ceph::time_detail::signedspan)' thread 7f30c11fe700 time 2020-01-21 03:43:48.848411\n/mnt/npool/tlamprecht/pve-ceph/ceph-14.2.6/src/common/ceph_time.h: 485: FAILED ceph_assert(z >= signedspan::zero())\n",
"assert_func": "ceph::time_detail::timespan ceph::to_timespan(ceph::time_detail::signedspan)",
"ceph_version": "14.2.6",
"os_name": "Debian GNU/Linux 10 (buster)",
"timestamp": "2020-01-21 02:43:48.891122Z",
"assert_thread_name": "ms_dispatch",
"utsname_release": "5.3.13-1-pve",
"utsname_hostname": "promo4",
"crash_id": "2020-01-21_02:43:48.891122Z_0aade13c-463f-43fe-9b05-76ca71f6bc1b",
"assert_condition": "z >= signedspan::zero()",
"utsname_version": "#1 SMP PVE 5.3.13-1 (Thu, 05 Dec 2019 07:18:14 +0100)"
}
Node2
{
"os_version_id": "10",
"utsname_machine": "x86_64",
"entity_name": "mon.promo2",
"backtrace": [
"(()+0x12730) [0x7f74f6c3f730]",
"(gsignal()+0x10b) [0x7f74f67227bb]",
"(abort()+0x121) [0x7f74f670d535]",
"(ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x1a3) [0x7f74f7d78e79]",
"(()+0x282000) [0x7f74f7d79000]",
"(Paxos::store_state(MMonPaxos*)+0xaa8) [0x55b9540ae6f8]",
"(Paxos::handle_commit(boost::intrusive_ptr<MonOpRequest>)+0x2ea) [0x55b9540aea5a]",
"(Paxos::dispatch(boost::intrusive_ptr<MonOpRequest>)+0x223) [0x55b9540b4213]",
"(Monitor::dispatch_op(boost::intrusive_ptr<MonOpRequest>)+0x131c) [0x55b953fe9b1c]",
"(Monitor::_ms_dispatch(Message*)+0x4aa) [0x55b953fea10a]",
"(Monitor::ms_dispatch(Message*)+0x26) [0x55b954019a36]",
"(Dispatcher::ms_dispatch2(boost::intrusive_ptr<Message> const&)+0x26) [0x55b954015f66]",
"(DispatchQueue::entry()+0x1a49) [0x7f74f7fb1e69]",
"(DispatchQueue::DispatchThread::entry()+0xd) [0x7f74f805f9ed]",
"(()+0x7fa3) [0x7f74f6c34fa3]",
"(clone()+0x3f) [0x7f74f67e44cf]"
],
"process_name": "ceph-mon",
"assert_line": 485,
"archived": "2020-01-21 07:02:49.041386",
"assert_file": "/mnt/npool/tlamprecht/pve-ceph/ceph-14.2.6/src/common/ceph_time.h",
"utsname_sysname": "Linux",
"os_version": "10 (buster)",
"os_id": "10",
"assert_msg": "/mnt/npool/tlamprecht/pve-ceph/ceph-14.2.6/src/common/ceph_time.h: In function 'ceph::time_detail::timespan ceph::to_timespan(ceph::time_detail::signedspan)' thread 7f74edcfb700 time 2020-01-20 22:32:56.933800\n/mnt/npool/tlamprecht/pve-ceph/ceph-14.2.6/src/common/ceph_time.h: 485: FAILED ceph_assert(z >= signedspan::zero())\n",
"assert_func": "ceph::time_detail::timespan ceph::to_timespan(ceph::time_detail::signedspan)",
"ceph_version": "14.2.6",
"os_name": "Debian GNU/Linux 10 (buster)",
"timestamp": "2020-01-20 21:32:56.947402Z",
"assert_thread_name": "ms_dispatch",
"utsname_release": "5.3.13-1-pve",
"utsname_hostname": "promo2",
"crash_id": "2020-01-20_21:32:56.947402Z_3ae7220c-23c9-478a-a22d-626c2fa34414",
"assert_condition": "z >= signedspan::zero()",
"utsname_version": "#1 SMP PVE 5.3.13-1 (Thu, 05 Dec 2019 07:18:14 +0100)"
}
Das sind zwei Ausgaben von unterschiedlichen Crash-Reports.
Vielleicht hat ja jemand eine Idee.
LG
ff