A couple of days ago, 2 of my 3 Ceph nodes stopped working for no apparent reason. I have tried to start the OSDs multiple times; each time the OSD service starts fine, but after about a minute it receives a kill signal and restarts. At the moment I don't understand what could be wrong. Nothing was changed, and the system had worked very well for a long time. The only thing I have done was apply the latest Proxmox update a couple of weeks ago. I also noticed that the OSD is currently using a lot of memory. Any ideas?
Crash info:
Ceph service:
System running Ceph version 16.2.9 (a569859f5e07da0c4c39da81d5fb5675cd95da49) pacific (stable)
Crash info:
Code:
root@pve1:~# ceph crash info 2022-11-15T20:54:42.302236Z_4c17a3fb-f317-4235-b54b-7b60ceb09e0c
{
"assert_condition": "abort",
"assert_file": "./src/os/bluestore/BlueFS.cc",
"assert_func": "int BlueFS::_flush_range(BlueFS::FileWriter*, uint64_t, uint64_t)",
"assert_line": 2768,
"assert_msg": "./src/os/bluestore/BlueFS.cc: In function 'int BlueFS::_flush_range(BlueFS::FileWriter*, uint64_t, uint64_t)' thread 7f2a01c42080 time 2022-11-15T22:54:42.266441+0200\n./src/os/bluestore/BlueFS.cc: 2768: ceph_abort_msg(\"bluefs enospc\")\n",
"assert_thread_name": "ceph-osd",
"backtrace": [
"/lib/x86_64-linux-gnu/libpthread.so.0(+0x13140) [0x7f2a022a8140]",
"gsignal()",
"abort()",
"(ceph::__ceph_abort(char const*, int, char const*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)+0x18a) [0x56240f4774de]",
"(BlueFS::_flush_range(BlueFS::FileWriter*, unsigned long, unsigned long)+0x9bd) [0x56240fb69cbd]",
"(BlueFS::_flush(BlueFS::FileWriter*, bool, bool*)+0x9a) [0x56240fb6a2ca]",
"(BlueFS::_flush(BlueFS::FileWriter*, bool, std::unique_lock<std::mutex>&)+0x2f) [0x56240fb7b55f]",
"(BlueRocksWritableFile::Append(rocksdb::Slice const&)+0x100) [0x56240fb938d0]",
"(rocksdb::LegacyWritableFileWrapper::Append(rocksdb::Slice const&, rocksdb::IOOptions const&, rocksdb::IODebugContext*)+0x48) [0x56241005be8e]",
"(rocksdb::WritableFileWriter::WriteBuffered(char const*, unsigned long)+0x338) [0x562410236958]",
"(rocksdb::WritableFileWriter::Append(rocksdb::Slice const&)+0x5d7) [0x562410234edb]",
"(rocksdb::BlockBasedTableBuilder::WriteRawBlock(rocksdb::Slice const&, rocksdb::CompressionType, rocksdb::BlockHandle*, bool)+0x11d) [0x5624103fef17]",
"(rocksdb::BlockBasedTableBuilder::WriteBlock(rocksdb::Slice const&, rocksdb::BlockHandle*, bool)+0x7d0) [0x5624103fecfe]",
"(rocksdb::BlockBasedTableBuilder::WriteBlock(rocksdb::BlockBuilder*, rocksdb::BlockHandle*, bool)+0x48) [0x5624103fe51a]",
"(rocksdb::BlockBasedTableBuilder::Flush()+0x9a) [0x5624103fe4ca]",
"(rocksdb::BlockBasedTableBuilder::Add(rocksdb::Slice const&, rocksdb::Slice const&)+0x197) [0x5624103fdfff]",
"(rocksdb::BuildTable(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, rocksdb::Env*, rocksdb::FileSystem*, rocksdb::ImmutableCFOptions const&, rocksdb::MutableCFOptions const&, rocksdb::FileOptions const&, rocksdb::TableCache*, rocksdb::InternalIteratorBase<rocksdb::Slice>*, std::vector<std::unique_ptr<rocksdb::FragmentedRangeTombstoneIterator, std::default_delete<rocksdb::FragmentedRangeTombstoneIterator> >, std::allocator<std::unique_ptr<rocksdb::FragmentedRangeTombstoneIterator, std::default_delete<rocksdb::FragmentedRangeTombstoneIterator> > > >, rocksdb::FileMetaData*, rocksdb::InternalKeyComparator const&, std::vector<std::unique_ptr<rocksdb::IntTblPropCollectorFactory, std::default_delete<rocksdb::IntTblPropCollectorFactory> >, std::allocator<std::unique_ptr<rocksdb::IntTblPropCollectorFactory, std::default_delete<rocksdb::IntTblPropCollectorFactory> > > > const*, unsigned int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::vector<unsigned long, std::allocator<unsigned long> >, unsigned long, rocksdb::SnapshotChecker*, rocksdb::CompressionType, unsigned long, rocksdb::CompressionOptions const&, bool, rocksdb::InternalStats*, rocksdb::TableFileCreationReason, rocksdb::EventLogger*, int, rocksdb::Env::IOPriority, rocksdb::TableProperties*, int, unsigned long, unsigned long, rocksdb::Env::WriteLifeTimeHint, unsigned long)+0x782) [0x562410381372]",
"(rocksdb::DBImpl::WriteLevel0TableForRecovery(int, rocksdb::ColumnFamilyData*, rocksdb::MemTable*, rocksdb::VersionEdit*)+0x5ea) [0x5624100f9e66]",
"(rocksdb::DBImpl::RecoverLogFiles(std::vector<unsigned long, std::allocator<unsigned long> > const&, unsigned long*, bool, bool*)+0x1ad1) [0x5624100f8add]",
"(rocksdb::DBImpl::Recover(std::vector<rocksdb::ColumnFamilyDescriptor, std::allocator<rocksdb::ColumnFamilyDescriptor> > const&, bool, bool, bool, unsigned long*)+0x159e) [0x5624100f6014]",
"(rocksdb::DBImpl::Open(rocksdb::DBOptions const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::vector<rocksdb::ColumnFamilyDescriptor, std::allocator<rocksdb::ColumnFamilyDescriptor> > const&, std::vector<rocksdb::ColumnFamilyHandle*, std::allocator<rocksdb::ColumnFamilyHandle*> >*, rocksdb::DB**, bool, bool)+0x677) [0x5624100fb30d]",
"(rocksdb::DB::Open(rocksdb::DBOptions const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::vector<rocksdb::ColumnFamilyDescriptor, std::allocator<rocksdb::ColumnFamilyDescriptor> > const&, std::vector<rocksdb::ColumnFamilyHandle*, std::allocator<rocksdb::ColumnFamilyHandle*> >*, rocksdb::DB**)+0x52) [0x5624100fa6e4]",
"(RocksDBStore::do_open(std::ostream&, bool, bool, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)+0x1096) [0x56241000a1f6]",
"(BlueStore::_open_db(bool, bool, bool)+0xa19) [0x56240fa845d9]",
"(BlueStore::_open_db_and_around(bool, bool)+0x332) [0x56240facb642]",
"(BlueStore::_mount()+0x191) [0x56240facdfe1]",
"(OSD::init()+0x58d) [0x56240f56e9fd]",
"main()",
"__libc_start_main()",
"_start()"
],
"ceph_version": "16.2.9",
"crash_id": "2022-11-15T20:54:42.302236Z_4c17a3fb-f317-4235-b54b-7b60ceb09e0c",
"entity_name": "osd.1",
"os_id": "11",
"os_name": "Debian GNU/Linux 11 (bullseye)",
"os_version": "11 (bullseye)",
"os_version_id": "11",
"process_name": "ceph-osd",
"stack_sig": "5de9bd31e89a4834761d2b48a6b1a027f6daa68faa1c6d824cb2e45a549bbe1e",
"timestamp": "2022-11-15T20:54:42.302236Z",
"utsname_hostname": "pve2",
"utsname_machine": "x86_64",
"utsname_release": "5.15.64-1-pve",
"utsname_sysname": "Linux",
"utsname_version": "#1 SMP PVE 5.15.64-1 (Thu, 13 Oct 2022 10:30:34 +0200)"
}
Ceph service:
Code:
root@pve1:~# systemctl status ceph-osd@0
● ceph-osd@0.service - Ceph object storage daemon osd.0
Loaded: loaded (/lib/systemd/system/ceph-osd@.service; enabled-runtime; vendor preset: enabled)
Drop-In: /usr/lib/systemd/system/ceph-osd@.service.d
└─ceph-after-pve-cluster.conf
Active: active (running) since Tue 2022-11-15 23:22:14 EET; 1min 15s ago
Main PID: 1011521 (ceph-osd)
Tasks: 25
Memory: 1.9G
CPU: 1min 1.999s
CGroup: /system.slice/system-ceph\x2dosd.slice/ceph-osd@0.service
└─1011521 /usr/bin/ceph-osd -f --cluster ceph --id 0 --setuser ceph --setgroup ceph
Nov 15 23:22:14 pve1 systemd[1]: Starting Ceph object storage daemon osd.0...
Nov 15 23:22:14 pve1 systemd[1]: Started Ceph object storage daemon osd.0.
root@pve1:~# systemctl status ceph-osd@0
● ceph-osd@0.service - Ceph object storage daemon osd.0
Loaded: loaded (/lib/systemd/system/ceph-osd@.service; enabled-runtime; vendor preset: enabled)
Drop-In: /usr/lib/systemd/system/ceph-osd@.service.d
└─ceph-after-pve-cluster.conf
Active: activating (auto-restart) (Result: signal) since Tue 2022-11-15 23:23:31 EET; 566ms ago
Process: 1011521 ExecStart=/usr/bin/ceph-osd -f --cluster ${CLUSTER} --id 0 --setuser ceph --setgroup ceph (code=killed, signal=ABRT)
Main PID: 1011521 (code=killed, signal=ABRT)
CPU: 1min 2.912s
root@pve1:~# systemctl status ceph-osd@0
● ceph-osd@0.service - Ceph object storage daemon osd.0
Loaded: loaded (/lib/systemd/system/ceph-osd@.service; enabled-runtime; vendor preset: enabled)
Drop-In: /usr/lib/systemd/system/ceph-osd@.service.d
└─ceph-after-pve-cluster.conf
Active: active (running) since Tue 2022-11-15 23:23:41 EET; 800ms ago
Process: 1012814 ExecStartPre=/usr/libexec/ceph/ceph-osd-prestart.sh --cluster ${CLUSTER} --id 0 (code=exited, status=0/SUCCESS)
Main PID: 1012818 (ceph-osd)
Tasks: 12
Memory: 10.8M
CPU: 58ms
CGroup: /system.slice/system-ceph\x2dosd.slice/ceph-osd@0.service
└─1012818 /usr/bin/ceph-osd -f --cluster ceph --id 0 --setuser ceph --setgroup ceph
Nov 15 23:23:41 pve1 systemd[1]: Starting Ceph object storage daemon osd.0...
Nov 15 23:23:41 pve1 systemd[1]: Started Ceph object storage daemon osd.0.
System running Ceph version 16.2.9 (a569859f5e07da0c4c39da81d5fb5675cd95da49) pacific (stable)