Our PVE environment:
Proxmox 8.4.1
CEPH: 19.2.1
Funnily enough, no failover happens and the dmesg of the server is clean. If I move the mgr role to another node, it crashes on that node as well. It happens roughly every 3-7 days, and here are the logs:
Any ideas / suggestions? Performance-wise, and in every other respect, things are pretty fine.
Proxmox 8.4.1
CEPH: 19.2.1
Funnily enough, no failover happens and the dmesg of the server is clean. If I move the mgr role to another node, it crashes on that node as well. It happens roughly every 3-7 days, and here are the logs:
Code:
root@pve-node01:~# ceph crash info 2025-07-19T08:31:27.419469Z_19831497-06d1-4405-a012-7cd2cd9646ae
{
"assert_condition": "nref == 0",
"assert_file": "./src/common/RefCountedObj.cc",
"assert_func": "virtual ceph::common::RefCountedObject::~RefCountedObject()",
"assert_line": 14,
"assert_msg": "./src/common/RefCountedObj.cc: In function 'virtual ceph::common::RefCountedObject::~RefCountedObject()' thread 7a4a1305a6c0 time 2025-07-19T10:31:27.418023+0200\n./src/common/RefCountedObj.cc: 14: FAILED ceph_assert(nref == 0)\n",
"backtrace": [
"/lib/x86_64-linux-gnu/libc.so.6(+0x3c050) [0x7a4a2b37c050]",
"/lib/x86_64-linux-gnu/libc.so.6(+0x8aeec) [0x7a4a2b3caeec]",
"gsignal()",
"abort()",
"(ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x17b) [0x7a4a2bb0484c]",
"/usr/lib/ceph/libceph-common.so.2(+0x2c798f) [0x7a4a2bb0498f]",
"/usr/lib/ceph/libceph-common.so.2(+0x3c0c15) [0x7a4a2bbfdc15]",
"(MMgrCommand::~MMgrCommand()+0x7a) [0x57c55f1e077a]",
"(ceph::common::RefCountedObject::put() const+0x1ad) [0x7a4a2bbfdf2d]",
"(TrackedOp::put()+0x25a) [0x57c55f03db3a]",
"(OpHistoryServiceThread::entry()+0x143) [0x57c55f0b57f3]",
"/lib/x86_64-linux-gnu/libc.so.6(+0x891f5) [0x7a4a2b3c91f5]",
"/lib/x86_64-linux-gnu/libc.so.6(+0x10989c) [0x7a4a2b44989c]"
],
"ceph_version": "19.2.1",
"crash_id": "2025-07-19T08:31:27.419469Z_19831497-06d1-4405-a012-7cd2cd9646ae",
"entity_name": "mgr.pve-node01",
"os_id": "12",
"os_name": "Debian GNU/Linux 12 (bookworm)",
"os_version": "12 (bookworm)",
"os_version_id": "12",
"process_name": "ceph-mgr",
"stack_sig": "34573e4c3543433958d462fb8fbe67add0880797713bcf5217f9638195366242",
"timestamp": "2025-07-19T08:31:27.419469Z",
"utsname_hostname": "pve-node01",
"utsname_machine": "x86_64",
"utsname_release": "6.8.12-11-pve",
"utsname_sysname": "Linux",
"utsname_version": "#1 SMP PREEMPT_DYNAMIC PMX 6.8.12-11 (2025-05-22T09:39Z)"
}
root@pve-node01:~#
root@pve-node01:~# ceph crash info 2025-07-23T21:51:02.425311Z_47376f06-a90c-46c0-ba1f-480b9a27821c
{
"assert_condition": "nref == 0",
"assert_file": "./src/common/RefCountedObj.cc",
"assert_func": "virtual ceph::common::RefCountedObject::~RefCountedObject()",
"assert_line": 14,
"assert_msg": "./src/common/RefCountedObj.cc: In function 'virtual ceph::common::RefCountedObject::~RefCountedObject()' thread 743b911bf6c0 time 2025-07-23T23:51:02.423924+0200\n./src/common/RefCountedObj.cc: 14: FAILED ceph_assert(nref == 0)\n",
"backtrace": [
"/lib/x86_64-linux-gnu/libc.so.6(+0x3c050) [0x743ba95e1050]",
"/lib/x86_64-linux-gnu/libc.so.6(+0x8aeec) [0x743ba962feec]",
"gsignal()",
"abort()",
"(ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x17b) [0x743ba9d6984c]",
"/usr/lib/ceph/libceph-common.so.2(+0x2c798f) [0x743ba9d6998f]",
"/usr/lib/ceph/libceph-common.so.2(+0x3c0c15) [0x743ba9e62c15]",
"(MMgrCommand::~MMgrCommand()+0x7a) [0x5cb60d7fd77a]",
"(ceph::common::RefCountedObject::put() const+0x1ad) [0x743ba9e62f2d]",
"(TrackedOp::put()+0x25a) [0x5cb60d65ab3a]",
"(OpHistoryServiceThread::entry()+0x143) [0x5cb60d6d27f3]",
"/lib/x86_64-linux-gnu/libc.so.6(+0x891f5) [0x743ba962e1f5]",
"/lib/x86_64-linux-gnu/libc.so.6(+0x10989c) [0x743ba96ae89c]"
],
"ceph_version": "19.2.1",
"crash_id": "2025-07-23T21:51:02.425311Z_47376f06-a90c-46c0-ba1f-480b9a27821c",
"entity_name": "mgr.pve-node01",
"os_id": "12",
"os_name": "Debian GNU/Linux 12 (bookworm)",
"os_version": "12 (bookworm)",
"os_version_id": "12",
"process_name": "ceph-mgr",
"stack_sig": "34573e4c3543433958d462fb8fbe67add0880797713bcf5217f9638195366242",
"timestamp": "2025-07-23T21:51:02.425311Z",
"utsname_hostname": "pve-node01",
"utsname_machine": "x86_64",
"utsname_release": "6.8.12-11-pve",
"utsname_sysname": "Linux",
"utsname_version": "#1 SMP PREEMPT_DYNAMIC PMX 6.8.12-11 (2025-05-22T09:39Z)"
}
Any ideas / suggestions? Performance-wise, and in every other respect, things are pretty fine.
Last edited: