Benjamin Renard committed on 2013-12-03 22:49:19
Showing 3 changed files, with 283 additions and 0 deletions.
| ... | ... |
@@ -0,0 +1 @@ |
| 1 |
+*~ |
| ... | ... |
@@ -0,0 +1,49 @@ |
| 1 |
+Nagios plugin to check Ceph cluster status |
|
| 2 |
+========================================== |
|
| 3 |
+ |
|
| 4 |
+This plugin checks ceph health, number of OSDs UP, number of MONs UP |
|
| 5 |
+and PGs states to determine Ceph cluster status. |
|
| 6 |
+ |
|
| 7 |
+Usage |
|
| 8 |
+----- |
|
| 9 |
+ |
|
| 10 |
+ Usage: check_ceph_status [options] |
|
| 11 |
+ |
|
| 12 |
+ Options: |
|
| 13 |
+ -h, --help show this help message and exit |
|
| 14 |
+ -d, --debug |
|
| 15 |
+ -b BIN, --bin=BIN Ceph binary (default : /usr/bin/ceph) |
|
| 16 |
+ --conf=CONF Ceph configuration file |
|
| 17 |
+ -m MON, --mon=MON Ceph monitor address[:port] |
|
| 18 |
+ -i ID, --id=ID Ceph client id |
|
| 19 |
+ -k KEYRING, --keyring=KEYRING |
|
| 20 |
+ Ceph client keyring file |
|
| 21 |
+ -w WARNLOSTOSD, --warning-lost-osd=WARNLOSTOSD |
|
| 22 |
+ Warning number of non-up OSDs (default : 1) |
|
| 23 |
+ -c CRITLOSTOSD, --critical-lost-osd=CRITLOSTOSD |
|
| 24 |
+ Critical number of non-up OSDs (default : 2) |
|
| 25 |
+ -W WARNLOSTMON, --warning-lost-mon=WARNLOSTMON |
|
| 26 |
+ Warning number of non-up MONs (default : 1) |
|
| 27 |
+ -C CRITLOSTMON, --critical-lost-mon=CRITLOSTMON |
|
| 28 |
+ Critical number of non-up MONs (default : 2) |
|
| 29 |
+ |
|
| 30 |
+Copyright |
|
| 31 |
+--------- |
|
| 32 |
+ |
|
| 33 |
+Copyright (c) 2013 Benjamin Renard <brenard@zionetrix.net> |
|
| 34 |
+ |
|
| 35 |
+License |
|
| 36 |
+------- |
|
| 37 |
+ |
|
| 38 |
+This program is free software; you can redistribute it and/or |
|
| 39 |
+modify it under the terms of the GNU General Public License version 2 |
|
| 40 |
+as published by the Free Software Foundation. |
|
| 41 |
+ |
|
| 42 |
+This program is distributed in the hope that it will be useful, |
|
| 43 |
+but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
| 44 |
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
| 45 |
+GNU General Public License for more details. |
|
| 46 |
+ |
|
| 47 |
+You should have received a copy of the GNU General Public License |
|
| 48 |
+along with this program; if not, write to the Free Software |
|
| 49 |
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
| ... | ... |
@@ -0,0 +1,233 @@ |
| 1 |
+#!/usr/bin/python |
|
| 2 |
+# |
|
| 3 |
+# Nagios plugin to check Ceph cluster state |
|
| 4 |
+# |
|
| 5 |
+# This plugin checks ceph health, number of OSDs UP, number of MONs UP |
|
| 6 |
+# and PGs states to determine Ceph cluster status. |
|
| 7 |
+# |
|
| 8 |
+# Usage: check_ceph_status [options] |
|
| 9 |
+# |
|
| 10 |
+# Options: |
|
| 11 |
+# -h, --help show this help message and exit |
|
| 12 |
+# -d, --debug |
|
| 13 |
+# -b BIN, --bin=BIN Ceph binary (default : /usr/bin/ceph) |
|
| 14 |
+# --conf=CONF Ceph configuration file |
|
| 15 |
+# -m MON, --mon=MON Ceph monitor address[:port] |
|
| 16 |
+# -i ID, --id=ID Ceph client id |
|
| 17 |
+# -k KEYRING, --keyring=KEYRING |
|
| 18 |
+# Ceph client keyring file |
|
| 19 |
+# -w WARNLOSTOSD, --warning-lost-osd=WARNLOSTOSD |
|
| 20 |
+# Warning number of non-up OSDs (default : 1) |
|
| 21 |
+# -c CRITLOSTOSD, --critical-lost-osd=CRITLOSTOSD |
|
| 22 |
+# Critical number of non-up OSDs (default : 2) |
|
| 23 |
+# -W WARNLOSTMON, --warning-lost-mon=WARNLOSTMON |
|
| 24 |
+# Warning number of non-up MONs (default : 1) |
|
| 25 |
+# -C CRITLOSTMON, --critical-lost-mon=CRITLOSTMON |
|
| 26 |
+# Critical number of non-up MONs (default : 2) |
|
| 27 |
+# |
|
| 28 |
+# Copyright (c) 2013 Benjamin Renard <brenard@zionetrix.net> |
|
| 29 |
+# |
|
| 30 |
+# This program is free software; you can redistribute it and/or |
|
| 31 |
+# modify it under the terms of the GNU General Public License version 2 |
|
| 32 |
+# as published by the Free Software Foundation. |
|
| 33 |
+# |
|
| 34 |
+# This program is distributed in the hope that it will be useful, |
|
| 35 |
+# but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
| 36 |
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
| 37 |
+# GNU General Public License for more details. |
|
| 38 |
+# |
|
| 39 |
+# You should have received a copy of the GNU General Public License |
|
| 40 |
+# along with this program; if not, write to the Free Software |
|
| 41 |
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
|
| 42 |
+# |
|
| 43 |
+ |
|
| 44 |
import json
import os
import re
import subprocess
import sys
from optparse import OptionParser

# default ceph values
CEPH_COMMAND = '/usr/bin/ceph'
WARN_LOST_OSD = 1
CRIT_LOST_OSD = 2
WARN_LOST_MON = 1
CRIT_LOST_MON = 2

# nagios exit code
STATUS = {
    'OK': 0,
    'WARNING': 1,
    'CRITICAL': 2,
    'UNKNOWN': 3,
}

# Ceph overall health value -> nagios status name. Any value not listed
# here leaves the status untouched (i.e. is treated like HEALTH_OK).
HEALTH_STATUS = {
    'HEALTH_WARN': 'WARNING',
    # Ceph reports errors as HEALTH_ERR; the historical check only looked
    # for HEALTH_CRIT, which is kept so that nothing that matched before
    # stops matching.
    'HEALTH_ERR': 'CRITICAL',
    'HEALTH_CRIT': 'CRITICAL',
}

# PG state names (possibly combined with '+', e.g. "active+degraded") that
# raise the check to CRITICAL / WARNING. Compiled once instead of on every
# loop iteration.
CRITICAL_PG_STATES = re.compile(
    r'(down|inconsistent|incomplete|stale)', re.IGNORECASE)
WARNING_PG_STATES = re.compile(
    r'(replay|degraded|repair|recovering|backfill)', re.IGNORECASE)


def build_parser():
    """Return the OptionParser describing the plugin command line."""
    parser = OptionParser()
    parser.add_option('-d', '--debug',
                      action="store_true",
                      dest="debug",
                      default=False)
    parser.add_option('-b', '--bin',
                      action="store",
                      dest="bin",
                      help="Ceph binary (default : %s)" % CEPH_COMMAND,
                      type='string',
                      default=CEPH_COMMAND)
    parser.add_option('--conf',
                      action="store",
                      dest="conf",
                      help="Ceph configuration file",
                      type='string',
                      default=None)
    parser.add_option('-m', '--mon',
                      action="store",
                      dest="mon",
                      help="Ceph monitor address[:port]",
                      type='string',
                      default=None)
    parser.add_option('-i', '--id',
                      action="store",
                      dest="id",
                      help="Ceph client id",
                      type='string',
                      default=None)
    parser.add_option('-k', '--keyring',
                      action="store",
                      dest="keyring",
                      help="Ceph client keyring file",
                      type='string',
                      default=None)
    parser.add_option('-w', '--warning-lost-osd',
                      action="store",
                      dest="warnlostosd",
                      help="Warning number of non-up OSDs (default : %s)"
                      % WARN_LOST_OSD,
                      type='int',
                      default=WARN_LOST_OSD)
    parser.add_option('-c', '--critical-lost-osd',
                      action="store",
                      dest="critlostosd",
                      help="Critical number of non-up OSDs (default : %s)"
                      % CRIT_LOST_OSD,
                      type='int',
                      default=CRIT_LOST_OSD)
    parser.add_option('-W', '--warning-lost-mon',
                      action="store",
                      dest="warnlostmon",
                      help="Warning number of non-up MONs (default : %s)"
                      % WARN_LOST_MON,
                      type='int',
                      default=WARN_LOST_MON)
    parser.add_option('-C', '--critical-lost-mon',
                      action="store",
                      dest="critlostmon",
                      help="Critical number of non-up MONs (default : %s)"
                      % CRIT_LOST_MON,
                      type='int',
                      default=CRIT_LOST_MON)
    return parser


def build_command(options):
    """Return the argv list running ``ceph status --format=json``.

    ``options`` is the parsed option object; only the connection related
    options that were actually supplied are forwarded to the ceph binary.
    """
    ceph_cmd = [options.bin]
    for flag, value in (('-m', options.mon),
                        ('-c', options.conf),
                        ('--id', options.id),
                        ('--keyring', options.keyring)):
        if value:
            ceph_cmd.extend([flag, value])
    ceph_cmd.extend(['status', '--format=json'])
    return ceph_cmd


def _worst(current, candidate):
    """Return whichever of the two status names is the more severe."""
    if STATUS[candidate] > STATUS[current]:
        return candidate
    return current


def evaluate_status(data,
                    warnlostosd=WARN_LOST_OSD, critlostosd=CRIT_LOST_OSD,
                    warnlostmon=WARN_LOST_MON, critlostmon=CRIT_LOST_MON):
    """Derive the nagios status and message from a decoded ceph status.

    ``data`` is the dictionary decoded from the JSON printed by
    ``ceph status --format=json``. The keys read are
    ``health.overall_status``, ``health.timechecks.mons``, ``monmap.mons``,
    ``osdmap.osdmap.num_osds``/``num_up_osds``, ``pgmap.num_pgs`` and
    ``pgmap.pgs_by_state``.

    Returns a ``(status, message)`` tuple where ``status`` is a key of
    ``STATUS``. Raises ``KeyError``/``TypeError`` when ``data`` does not
    have the expected layout.
    """
    health = data['health']['overall_status']
    status = _worst('OK', HEALTH_STATUS.get(health, 'OK'))

    # MONs: the monitors listed in the timechecks are counted as the ones
    # that are up.
    total_mon = len(data['monmap']['mons'])
    total_mon_up = len(data['health']['timechecks']['mons'])
    num_lost_mon = total_mon - total_mon_up
    if num_lost_mon == 0:
        monstate = "(MONs UP : %s/%s)" % (total_mon_up, total_mon)
    else:
        monstate = "%s MONs down (MONs UP : %s/%s)" % (
            num_lost_mon, total_mon_up, total_mon)
        if num_lost_mon >= critlostmon:
            status = 'CRITICAL'
        elif num_lost_mon >= warnlostmon:
            status = _worst(status, 'WARNING')

    # OSDs: unlike the MON check the thresholds are compared even when no
    # OSD is lost, so a threshold of 0 always triggers (historical
    # behaviour, kept as is).
    total_osd = data['osdmap']['osdmap']['num_osds']
    total_osd_up = data['osdmap']['osdmap']['num_up_osds']
    num_lost_osd = total_osd - total_osd_up
    if num_lost_osd >= critlostosd:
        status = 'CRITICAL'
    elif num_lost_osd >= warnlostosd:
        status = _worst(status, 'WARNING')

    # PGs: states matching neither pattern and different from
    # "active+clean" are neither reported nor counted against the status.
    total_pg = data['pgmap']['num_pgs']
    pg_parts = []
    for st in data['pgmap']['pgs_by_state']:
        state_name = st['state_name']
        if CRITICAL_PG_STATES.search(state_name):
            status = 'CRITICAL'
            pg_parts.append("%s PGs %s" % (st['count'], state_name))
        elif WARNING_PG_STATES.search(state_name):
            status = _worst(status, 'WARNING')
            pg_parts.append("%s PGs %s" % (st['count'], state_name))
        elif state_name == "active+clean":
            pg_parts.append("%s/%s PGs active+clean" % (st['count'], total_pg))
    pgstate = "".join(" / %s" % part for part in pg_parts)

    msg = "%s : %s%s %s" % (status, health, pgstate, monstate)
    if num_lost_osd == 0:
        msg = "%s (OSDs UP : %s/%s)" % (msg, total_osd_up, total_osd)
    else:
        msg = "%s / %s OSDs down (OSDs UP : %s/%s)" % (
            msg, num_lost_osd, total_osd_up, total_osd)
    return status, msg


def main():
    """Run the check, print the nagios status line, return the exit code."""
    (options, _args) = build_parser().parse_args()

    # validate args
    if not os.path.exists(options.bin):
        print("ERROR: ceph executable '%s' doesn't exist" % options.bin)
        return STATUS['UNKNOWN']

    if options.conf and not os.path.exists(options.conf):
        print("ERROR: ceph conf file '%s' doesn't exist" % options.conf)
        return STATUS['UNKNOWN']

    if options.keyring and not os.path.exists(options.keyring):
        print("ERROR: keyring file '%s' doesn't exist" % options.keyring)
        return STATUS['UNKNOWN']

    # build command
    ceph_cmd = build_command(options)
    if options.debug:
        # stderr, so that the nagios status line on stdout stays intact
        sys.stderr.write("DEBUG : command : %s\n" % ' '.join(ceph_cmd))

    # exec command
    try:
        p = subprocess.Popen(ceph_cmd,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        output, err = p.communicate()
    except OSError:
        # e.g. the binary exists but can not be executed
        print("UNKNOWN : fail to execute ceph status command")
        return STATUS['UNKNOWN']

    if not output:
        if options.debug and err:
            sys.stderr.write("DEBUG : stderr : %r\n" % err)
        print("UNKNOWN : fail to execute ceph status command")
        return STATUS['UNKNOWN']

    # An uncaught exception would end the plugin with exit code 1, which
    # nagios reads as WARNING: report unusable output as UNKNOWN instead.
    try:
        data = json.loads(output.decode('utf-8'))
        status, msg = evaluate_status(data,
                                      warnlostosd=options.warnlostosd,
                                      critlostosd=options.critlostosd,
                                      warnlostmon=options.warnlostmon,
                                      critlostmon=options.critlostmon)
    except (ValueError, KeyError, TypeError) as e:
        print("UNKNOWN : unable to parse ceph status output (%s)" % e)
        return STATUS['UNKNOWN']

    print(msg)
    return STATUS[status]


if __name__ == '__main__':
    sys.exit(main())
|
| 0 | 234 |