Benjamin Renard committed on 2013-12-03 22:49:19
              Showing 3 changed files, with 283 additions and 0 deletions.
            
| ... | ... | 
                      @@ -0,0 +1 @@  | 
                  
| 1 | 
                        +*~  | 
                    
| ... | ... | 
                      @@ -0,0 +1,49 @@  | 
                  
| 1 | 
                        +Nagios plugin to check Ceph cluster status  | 
                    |
| 2 | 
                        +==========================================  | 
                    |
| 3 | 
                        +  | 
                    |
| 4 | 
                        +This plugin checks ceph health, number of OSDs UP, number of MONs UP  | 
                    |
| 5 | 
                        +and PGs states to determine Ceph cluster status.  | 
                    |
| 6 | 
                        +  | 
                    |
| 7 | 
                        +Usage  | 
                    |
| 8 | 
                        +-----  | 
                    |
| 9 | 
                        +  | 
                    |
| 10 | 
                        + Usage: check_ceph_status [options]  | 
                    |
| 11 | 
                        +  | 
                    |
| 12 | 
                        + Options:  | 
                    |
| 13 | 
                        + -h, --help show this help message and exit  | 
                    |
| 14 | 
                        + -d, --debug  | 
                    |
| 15 | 
                        + -b BIN, --bin=BIN Ceph binary (default : /usr/bin/ceph)  | 
                    |
| 16 | 
                        + --conf=CONF Ceph configuration file  | 
                    |
| 17 | 
                        + -m MON, --mon=MON Ceph monitor address[:port]  | 
                    |
| 18 | 
                        + -i ID, --id=ID Ceph client id  | 
                    |
| 19 | 
                        + -k KEYRING, --keyring=KEYRING  | 
                    |
| 20 | 
                        + Ceph client keyring file  | 
                    |
| 21 | 
                        + -w WARNLOSTOSD, --warning-lost-osd=WARNLOSTOSD  | 
                    |
| 22 | 
                        + Warning number of non-up OSDs (default : 1)  | 
                    |
| 23 | 
                        + -c CRITLOSTOSD, --critical-lost-osd=CRITLOSTOSD  | 
                    |
| 24 | 
                        + Critical number of non-up OSDs (default : 2)  | 
                    |
| 25 | 
                        + -W WARNLOSTMON, --warning-lost-mon=WARNLOSTMON  | 
                    |
| 26 | 
                        + Warning number of non-up MONs (default : 1)  | 
                    |
| 27 | 
                        + -C CRITLOSTMON, --critical-lost-mon=CRITLOSTMON  | 
                    |
| 28 | 
                        + Critical number of non-up MONs (default : 2)  | 
                    |
| 29 | 
                        +  | 
                    |
| 30 | 
                        +Copyright  | 
                    |
| 31 | 
                        +---------  | 
                    |
| 32 | 
                        +  | 
                    |
| 33 | 
                        +Copyright (c) 2013 Benjamin Renard <brenard@zionetrix.net>  | 
                    |
| 34 | 
                        +  | 
                    |
| 35 | 
                        +License  | 
                    |
| 36 | 
                        +-------  | 
                    |
| 37 | 
                        +  | 
                    |
| 38 | 
                        +This program is free software; you can redistribute it and/or  | 
                    |
| 39 | 
                        +modify it under the terms of the GNU General Public License version 2  | 
                    |
| 40 | 
                        +as published by the Free Software Foundation.  | 
                    |
| 41 | 
                        +  | 
                    |
| 42 | 
                        +This program is distributed in the hope that it will be useful,  | 
                    |
| 43 | 
                        +but WITHOUT ANY WARRANTY; without even the implied warranty of  | 
                    |
| 44 | 
                        +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the  | 
                    |
| 45 | 
                        +GNU General Public License for more details.  | 
                    |
| 46 | 
                        +  | 
                    |
| 47 | 
                        +You should have received a copy of the GNU General Public License  | 
                    |
| 48 | 
                        +along with this program; if not, write to the Free Software  | 
                    |
| 49 | 
                        +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  | 
                    
| ... | ... | 
                      @@ -0,0 +1,233 @@  | 
                  
#!/usr/bin/python
#
# Nagios plugin to check Ceph cluster state
#
# This plugin checks ceph health, number of OSDs UP, number of MONs UP
# and PGs states to determine Ceph cluster status.
#
# Usage: check_ceph_status [options]
#
# Options:
#   -h, --help            show this help message and exit
#   -d, --debug
#   -b BIN, --bin=BIN     Ceph binary (default : /usr/bin/ceph)
#   --conf=CONF           Ceph configuration file
#   -m MON, --mon=MON     Ceph monitor address[:port]
#   -i ID, --id=ID        Ceph client id
#   -k KEYRING, --keyring=KEYRING
#                         Ceph client keyring file
#   -w WARNLOSTOSD, --warning-lost-osd=WARNLOSTOSD
#                         Warning number of non-up OSDs (default : 1)
#   -c CRITLOSTOSD, --critical-lost-osd=CRITLOSTOSD
#                         Critical number of non-up OSDs (default : 2)
#   -W WARNLOSTMON, --warning-lost-mon=WARNLOSTMON
#                         Warning number of non-up MONs (default : 1)
#   -C CRITLOSTMON, --critical-lost-mon=CRITLOSTMON
#                         Critical number of non-up MONs (default : 2)
#
# Copyright (c) 2013 Benjamin Renard <brenard@zionetrix.net>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License version 2
# as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#

import json
import os
import re
import subprocess
import sys
from optparse import OptionParser

# default ceph values
CEPH_COMMAND = '/usr/bin/ceph'
WARN_LOST_OSD = 1
CRIT_LOST_OSD = 2
WARN_LOST_MON = 1
CRIT_LOST_MON = 2

# nagios exit code
STATUS = {
    'OK': 0,
    'WARNING': 1,
    'CRITICAL': 2,
    'UNKNOWN': 3,
}

# Ceph overall health value -> Nagios status. Ceph reports errors as
# HEALTH_ERR; HEALTH_CRIT is kept only because earlier versions of this
# plugin tested for it.
HEALTH_STATUS = {
    'HEALTH_WARN': 'WARNING',
    'HEALTH_ERR': 'CRITICAL',
    'HEALTH_CRIT': 'CRITICAL',
}

# PG state name fragments that raise the plugin status.
CRITICAL_PG_STATES = re.compile(
    r'(down|inconsistent|incomplete|stale)', re.IGNORECASE)
WARNING_PG_STATES = re.compile(
    r'(replay|degraded|repair|recovering|backfill)', re.IGNORECASE)

parser = OptionParser()
parser.add_option('-d',
                  '--debug',
                  action="store_true",
                  dest="debug",
                  default=False)

parser.add_option('-b',
                  '--bin',
                  action="store",
                  dest="bin",
                  help="Ceph binary (default : %s)" % CEPH_COMMAND,
                  type='string',
                  default=CEPH_COMMAND)

parser.add_option('--conf',
                  action="store",
                  dest="conf",
                  help="Ceph configuration file",
                  type='string',
                  default=None)

parser.add_option('-m',
                  '--mon',
                  action="store",
                  dest="mon",
                  help="Ceph monitor address[:port]",
                  type='string',
                  default=None)

parser.add_option('-i',
                  '--id',
                  action="store",
                  dest="id",
                  help="Ceph client id",
                  type='string',
                  default=None)

parser.add_option('-k',
                  '--keyring',
                  action="store",
                  dest="keyring",
                  help="Ceph client keyring file",
                  type='string',
                  default=None)

parser.add_option('-w',
                  '--warning-lost-osd',
                  action="store",
                  dest="warnlostosd",
                  help="Warning number of non-up OSDs (default : %s)" % WARN_LOST_OSD,
                  type='int',
                  default=WARN_LOST_OSD)

parser.add_option('-c',
                  '--critical-lost-osd',
                  action="store",
                  dest="critlostosd",
                  help="Critical number of non-up OSDs (default : %s)" % CRIT_LOST_OSD,
                  type='int',
                  default=CRIT_LOST_OSD)

parser.add_option('-W',
                  '--warning-lost-mon',
                  action="store",
                  dest="warnlostmon",
                  help="Warning number of non-up MONs (default : %s)" % WARN_LOST_MON,
                  type='int',
                  default=WARN_LOST_MON)

parser.add_option('-C',
                  '--critical-lost-mon',
                  action="store",
                  dest="critlostmon",
                  help="Critical number of non-up MONs (default : %s)" % CRIT_LOST_MON,
                  type='int',
                  default=CRIT_LOST_MON)


def _escalate(current, candidate):
    """Return whichever of the two status names has the higher exit code."""
    if STATUS[candidate] > STATUS[current]:
        return candidate
    return current


def build_command(options):
    """Return the argv list running ``ceph status --format=json``.

    ``options`` is the object returned by ``parser.parse_args()``; the
    ``mon``, ``conf``, ``id`` and ``keyring`` settings are forwarded to the
    ceph binary only when they are set.
    """
    ceph_cmd = [options.bin]
    for flag, value in (('-m', options.mon),
                        ('-c', options.conf),
                        ('--id', options.id),
                        ('--keyring', options.keyring)):
        if value:
            ceph_cmd.extend([flag, value])
    ceph_cmd.extend(['status', '--format=json'])
    return ceph_cmd


def evaluate_status(data,
                    warnlostosd=WARN_LOST_OSD,
                    critlostosd=CRIT_LOST_OSD,
                    warnlostmon=WARN_LOST_MON,
                    critlostmon=CRIT_LOST_MON):
    """Compute the Nagios status and message from decoded ``ceph status``.

    ``data`` is the dict decoded from the JSON output of ``ceph status``.
    The thresholds are the numbers of non-up OSDs / MONs from which the
    WARNING and CRITICAL statuses are raised.

    Returns a ``(status, message)`` tuple where ``status`` is a key of
    ``STATUS``.  Raises ``KeyError`` / ``TypeError`` when ``data`` does not
    contain the fields read below.
    """
    health = data['health']['overall_status']
    status = HEALTH_STATUS.get(health, 'OK')

    # MONs listed in the health timechecks are the ones counted as up.
    total_mon = len(data['monmap']['mons'])
    total_mon_up = len(data['health']['timechecks']['mons'])
    num_lost_mon = total_mon - total_mon_up
    if num_lost_mon == 0:
        monstate = "(MONs UP : %s/%s)" % (total_mon_up, total_mon)
    else:
        monstate = "%s MONs down (MONs UP : %s/%s)" % (
            num_lost_mon, total_mon_up, total_mon)
        # MON thresholds are only evaluated when at least one MON is missing.
        if num_lost_mon >= critlostmon:
            status = _escalate(status, 'CRITICAL')
        elif num_lost_mon >= warnlostmon:
            status = _escalate(status, 'WARNING')

    total_osd = data['osdmap']['osdmap']['num_osds']
    total_osd_up = data['osdmap']['osdmap']['num_up_osds']
    num_lost_osd = total_osd - total_osd_up
    if num_lost_osd >= critlostosd:
        status = _escalate(status, 'CRITICAL')
    elif num_lost_osd >= warnlostosd:
        status = _escalate(status, 'WARNING')

    total_pg = data['pgmap']['num_pgs']
    pg_parts = []
    for st in data['pgmap']['pgs_by_state']:
        name = st['state_name']
        if CRITICAL_PG_STATES.search(name):
            status = _escalate(status, 'CRITICAL')
            pg_parts.append(" / %s PGs %s" % (st['count'], name))
        elif WARNING_PG_STATES.search(name):
            status = _escalate(status, 'WARNING')
            pg_parts.append(" / %s PGs %s" % (st['count'], name))
        elif name == "active+clean":
            pg_parts.append(
                " / %s/%s PGs active+clean" % (st['count'], total_pg))
    pgstate = "".join(pg_parts)

    msg = "%s : %s%s %s" % (status, health, pgstate, monstate)
    if num_lost_osd == 0:
        msg = "%s (OSDs UP : %s/%s)" % (msg, total_osd_up, total_osd)
    else:
        msg = "%s / %s OSDs down (OSDs UP : %s/%s)" % (
            msg, num_lost_osd, total_osd_up, total_osd)
    return status, msg


def main(argv=None):
    """Run the check and return the Nagios exit code.

    ``argv`` is the list of command line arguments; ``None`` makes optparse
    read ``sys.argv[1:]``.  The status line is printed on stdout.
    """
    (options, args) = parser.parse_args(argv)

    # validate args
    if not os.path.exists(options.bin):
        print("ERROR: ceph executable '%s' doesn't exist" % options.bin)
        return STATUS['UNKNOWN']

    if options.conf and not os.path.exists(options.conf):
        print("ERROR: ceph conf file '%s' doesn't exist" % options.conf)
        return STATUS['UNKNOWN']

    if options.keyring and not os.path.exists(options.keyring):
        print("ERROR: keyring file '%s' doesn't exist" % options.keyring)
        return STATUS['UNKNOWN']

    ceph_cmd = build_command(options)

    # exec command
    try:
        p = subprocess.Popen(ceph_cmd,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        output, err = p.communicate()
    except OSError as exc:
        # e.g. the binary exists but is not executable
        print("UNKNOWN : fail to execute ceph status command (%s)" % exc)
        return STATUS['UNKNOWN']

    if not output:
        print("UNKNOWN : fail to execute ceph status command")
        if options.debug and err:
            # Extra lines after the first one are shown by Nagios as
            # long plugin output.
            print(err.decode('utf-8', 'replace'))
        return STATUS['UNKNOWN']

    # An uncaught exception would exit with code 1, which Nagios reads as
    # WARNING; report unparsable output as UNKNOWN instead.
    try:
        data = json.loads(output.decode('utf-8'))
        status, msg = evaluate_status(data,
                                      warnlostosd=options.warnlostosd,
                                      critlostosd=options.critlostosd,
                                      warnlostmon=options.warnlostmon,
                                      critlostmon=options.critlostmon)
    except (ValueError, KeyError, TypeError) as exc:
        print("UNKNOWN : unable to parse ceph status output (%s)" % exc)
        if options.debug and err:
            print(err.decode('utf-8', 'replace'))
        return STATUS['UNKNOWN']

    print(msg)
    return STATUS[status]


if __name__ == '__main__':
    sys.exit(main())