목차

1 daemons have recently crashed

Ceph 운영 중에 간혹 1 daemons have recently crashed 와 같은 메시지가 나오면서 클러스터가 HEALTH_WARN 상태가 되는 경우를 볼 수 있다.

[root@mgmt ~]# ceph -s 
  cluster: 
    id:     7025ab16-5810-4382-9318-1bd4a704ef48 
    health: HEALTH_WARN 
            1 daemons have recently crashed 

  services: 
    mon: 2 daemons, quorum mgmt,mon (age 3m) 
    mgr: mgmt(active, since 47h) 
    mds:  1 up:standby 
    osd: 9 osds: 9 up (since 5m), 9 in (since 47h) 

  data: 
    pools:   1 pools, 128 pgs 
    objects: 4 objects, 35 B 
    usage:   9.8 GiB used, 80 GiB / 90 GiB avail 
    pgs:     128 active+clean

이런 경우는 간혹 OSD 디스크에 문제가 생겼거나 디스크 교체를 수행하고 난 뒤에 발생하는데,
상세한 내용은 아래 명령어로 확인할 수 있다.

[root@mgmt ~]# ceph crash ls
ID                                                               ENTITY   NEW
2021-02-20_11:41:52.234574Z_fac113ad-5fa2-40fd-bb00-a0410e0472dc mon.mgmt  * 


[root@mgmt ~]# ceph crash info 2021-02-20_11:41:52.234574Z_fac113ad-5fa2-40fd-bb00-a0410e0472dc
{
    "os_version_id": "7",
    "assert_condition": "(*__errno_location ()) == 4",
    "utsname_release": "3.10.0-1127.10.1.el7.x86_64",
    "os_name": "CentOS Linux",
    "entity_name": "mon.mgmt",
    "assert_file": "/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos7/DIST/centos7/MACHINE_SIZE/gigantic/release/14.2.16/rpm/el7/BUILD/ceph-14.2.16/src/common/fork_function.h",
    "timestamp": "2021-02-20 11:41:52.234574Z",
    "process_name": "ceph-mon",
    "utsname_machine": "x86_64",
    "assert_line": 34,
    "utsname_sysname": "Linux",
    "os_version": "7 (Core)",
    "os_id": "centos",
    "assert_thread_name": "ms_dispatch",
    "utsname_version": "#1 SMP Wed Jun 3 14:28:03 UTC 2020",
    "backtrace": [
        "(()+0xf630) [0x7fbbf4045630]",
        "(gsignal()+0x37) [0x7fbbf2e243d7]",
        "(abort()+0x148) [0x7fbbf2e25ac8]",
        "(ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x199) [0x7fbbf7292b76]",
        "(()+0x25ccef) [0x7fbbf7292cef]",
        "(CrushTester::test_with_fork(int)+0x799) [0x7fbbf77fd839]",
        "(OSDMonitor::prepare_new_pool(std::string&, int, std::string const&, unsigned int, unsigned int, unsigned int, unsigned long, unsigned long, float, std::string const&, unsigned int, unsigned long, OSDMonitor::FastReadType, std::ostream*)+0x460) [0x5568226a9aa0]",
        "(OSDMonitor::prepare_command_impl(boost::intrusive_ptr<MonOpRequest>, std::map<std::string, boost::variant<std::string, bool, long, double, std::vector<std::string, std::allocator<std::string> >, std::vector<long, std::allocator<long> >, std::vector<double, std::allocator<double> > >, std::less<void>, std::allocator<std::pair<std::string const, boost::variant<std::string, bool, long, double, std::vector<std::string, std::allocator<std::string> >, std::vector<long, std::allocator<long> >, std::vector<double, std::allocator<double> > > > > > const&)+0x1919b) [0x5568226c998b]",
        "(OSDMonitor::prepare_command(boost::intrusive_ptr<MonOpRequest>)+0x10d) [0x5568226d217d]",
        "(OSDMonitor::prepare_update(boost::intrusive_ptr<MonOpRequest>)+0x2a6) [0x5568226d5b26]",
        "(PaxosService::dispatch(boost::intrusive_ptr<MonOpRequest>)+0x66d) [0x55682266325d]",
        "(Monitor::handle_command(boost::intrusive_ptr<MonOpRequest>)+0x23ab) [0x55682257c98b]",
        "(Monitor::dispatch_op(boost::intrusive_ptr<MonOpRequest>)+0x805) [0x556822581ca5]",
        "(Monitor::_ms_dispatch(Message*)+0xca0) [0x5568225833f0]",
        "(Monitor::ms_dispatch(Message*)+0x26) [0x5568225b0736]",
        "(Dispatcher::ms_dispatch2(boost::intrusive_ptr<Message> const&)+0x26) [0x5568225ad116]",
        "(DispatchQueue::entry()+0x1699) [0x7fbbf74b50e9]",
        "(DispatchQueue::DispatchThread::entry()+0xd) [0x7fbbf756277d]",
        "(()+0x7ea5) [0x7fbbf403dea5]",
        "(clone()+0x6d) [0x7fbbf2eec9fd]"
    ],
    "utsname_hostname": "mgmt",
    "assert_msg": "/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos7/DIST/centos7/MACHINE_SIZE/gigantic/release/14.2.16/rpm/el7/BUILD/ceph-14.2.16/src/common/fork_function.h: In function 'int fork_function(int, std::ostream&, std::function<signed char()>)' thread 7fbbe7ff0700 time 2021-02-20 20:41:52.224795\n/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos7/DIST/centos7/MACHINE_SIZE/gigantic/release/14.2.16/rpm/el7/BUILD/ceph-14.2.16/src/common/fork_function.h: 34: FAILED ceph_assert((*__errno_location ()) == 4)\n",
    "crash_id": "2021-02-20_11:41:52.234574Z_fac113ad-5fa2-40fd-bb00-a0410e0472dc",
    "assert_func": "int fork_function(int, std::ostream&, std::function<signed char()>)",
    "ceph_version": "14.2.16"
}

Crash 명령어

Crash log에 대한 명령어는 아래와 같다.

crash list (아직 archive되지 않은 새 crash만 표시; 전체 목록은 ceph crash ls)

# ceph crash ls-new

crash info

#ceph crash info <crash-id>

crash archive (확인 처리: archive된 crash는 더 이상 HEALTH_WARN 경고와 ls-new 목록에 포함되지 않는다)

#ceph crash archive <crash-id>

또는 전체 archive

#ceph crash archive-all

경고 비활성화

아예 recent crash 경고(RECENT_CRASH) 자체를 비활성화하려면 아래 명령어를 사용한다. (crash 기록과 archive 기능은 그대로 유지된다.)

#ceph config set mgr mgr/crash/warn_recent_interval 0

참조링크