I have one OSD that crashes and goes out every 3-7 days.
It is an OSD in a 4-node Ceph cluster running under Proxmox, a member of a 16-OSD pool (4 OSDs per node). The issue is recent; the pool has been up for almost 2 years, and this has happened 3 times in the last two weeks. I checked the drive with SMART but it did not find anything and reports the drive as healthy. The cluster is otherwise fine: I have two pools in this cluster and everything else seems to work well. I can start the OSD and it comes back up and works for 3-7 days before the issue reappears.
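For reference, the SMART check and the restart were along these lines (the device path below is just an example; the real path is whatever disk backs osd.14):

smartctl -a /dev/sdX        # health summary and attributes -- came back clean
smartctl -t long /dev/sdX   # extended self-test; results appear in -a output later
systemctl start ceph-osd@14 # how I bring the OSD back up after a crash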
Here is the message I am getting:
ceph crash info 2021-10-25_16:30:16.449522Z_ddd6e537-0e6e-4bc0-9754-6e06cd063b02
{
"os_version_id": "10",
"assert_condition": "r == 0",
"utsname_release": "5.3.13-1-pve",
"os_name": "Debian GNU/Linux 10 (buster)",
"entity_name": "osd.14",
"assert_file": "/mnt/npool/tlamprecht/pve-ceph/ceph-14.2.6/src/os/bluestore/BlueStore.cc",
"timestamp": "2021-10-25 16:30:16.449522Z",
"process_name": "ceph-osd",
"utsname_machine": "x86_64",
"assert_line": 9152,
"utsname_sysname": "Linux",
"os_version": "10 (buster)",
"os_id": "10",
"assert_thread_name": "tp_osd_tp",
"utsname_version": "#1 SMP PVE 5.3.13-1 (Thu, 05 Dec 2019 07:18:14 +0100)",
"backtrace": [
"(()+0x12730) [0x7f269fe77730]",
"(gsignal()+0x10b) [0x7f269f95a7bb]",
"(abort()+0x121) [0x7f269f945535]",
"(ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x1a3) [0x55a037398ad7]",
"(()+0x518c5e) [0x55a037398c5e]",
"(BlueStore::_do_read(BlueStore::Collection*, boost::intrusive_ptr<BlueStore::Onode>, unsigned long, unsigned long, ceph::buffer::v14_2_0::list&, unsigned int, unsigned long)+0x39e7) [0x55a0379037f7]",
"(BlueStore::read(boost::intrusive_ptr<ObjectStore::CollectionImpl>&, ghobject_t const&, unsigned long, unsigned long, ceph::buffer::v14_2_0::list&, unsigned int)+0x1b5) [0x55a037907595]",
"(ReplicatedBackend::be_deep_scrub(hobject_t const&, ScrubMap&, ScrubMapBuilder&, ScrubMap::object&)+0x2cb) [0x55a03777b89b]",
"(PGBackend::be_scan_list(ScrubMap&, ScrubMapBuilder&)+0x6db) [0x55a037696e3b]",
"(PG::build_scrub_map_chunk(ScrubMap&, ScrubMapBuilder&, hobject_t, hobject_t, bool, ThreadPool::TPHandle&)+0x83) [0x55a037535c53]",
"(PG::chunky_scrub(ThreadPool::TPHandle&)+0x192b) [0x55a03756202b]",
"(PG::scrub(unsigned int, ThreadPool::TPHandle&)+0x4bb) [0x55a03756310b]",
"(PGScrub::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&)+0x1a) [0x55a03771b6ea]",
"(OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0x7d7) [0x55a037494d17]",
"(ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x5b4) [0x55a037a53864]",
"(ShardedThreadPool::WorkThreadSharded::entry()+0x10) [0x55a037a56270]",
"(()+0x7fa3) [0x7f269fe6cfa3]",
"(clone()+0x3f) [0x7f269fa1c4cf]"
],
"utsname_hostname": "ceph03",
"assert_msg": "/mnt/npool/tlamprecht/pve-ceph/ceph-14.2.6/src/os/bluestore/BlueStore.cc: In function 'int BlueStore::_do_read(BlueStore::Collection*, BlueStore::OnodeRef, uint64_t, size_t, ceph::bufferlist&, uint32_t, uint64_t)' thread 7f2682ee3700 time 2021-10-25 12:30:16.437928\n/mnt/npool/tlamprecht/pve-ceph/ceph-14.2.6/src/os/bluestore/BlueStore.cc: 9152: FAILED ceph_assert(r == 0)\n",
"crash_id": "2021-10-25_16:30:16.449522Z_ddd6e537-0e6e-4bc0-9754-6e06cd063b02",
"assert_func": "int BlueStore::_do_read(BlueStore::Collection*, BlueStore::OnodeRef, uint64_t, size_t, ceph::bufferlist&, uint32_t, uint64_t)",
"ceph_version": "14.2.6"
}
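If I read the backtrace correctly, the assert fires in BlueStore::_do_read during a deep scrub, which I take to mean a low-level read from the device returned an error. Would checking the kernel log around the crash time be a sensible next step? Something like the following (the assert_msg shows 12:30 local time, so I would search around there):

journalctl -k --since "2021-10-25 12:00" --until "2021-10-25 13:00" | grep -iE 'error|sd[a-z]'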
Does anybody have any suggestions? I have not yet tried removing the OSD and re-adding it, or taking the drive out of the bay and reseating it.
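If removing and re-creating the OSD is the right direction, I assume the procedure under Proxmox would be roughly the following (from memory, so please correct me if any step is off):

ceph osd out 14                      # stop mapping new data to the OSD, wait for rebalance
systemctl stop ceph-osd@14           # stop the daemon once the cluster is healthy again
pveceph osd destroy 14 --cleanup 1   # remove the OSD and wipe its disk
pveceph osd create /dev/sdX          # re-create it on the (re-seated) drive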
Any help is appreciated.
Thank you