Initial commit
Benjamin Renard

Benjamin Renard committed on 2013-12-03 22:49:19
Showing 3 changed files, with 283 additions and 0 deletions.

... ...
@@ -0,0 +1 @@
1
+*~
... ...
@@ -0,0 +1,49 @@
1
+Nagios plugin to check Ceph cluster status
2
+==========================================
3
+
4
+This plugin checks Ceph health, the number of OSDs UP, the number of MONs UP
5
+and PG states to determine the Ceph cluster status.
6
+
7
+Usage
8
+-----
9
+
10
+  Usage: check_ceph_status [options]
11
+  
12
+  Options:
13
+    -h, --help            show this help message and exit
14
+    -d, --debug           
15
+    -b BIN, --bin=BIN     Ceph binary (default : /usr/bin/ceph)
16
+    --conf=CONF           Ceph configuration file
17
+    -m MON, --mon=MON     Ceph monitor address[:port]
18
+    -i ID, --id=ID        Ceph client id
19
+    -k KEYRING, --keyring=KEYRING
20
+                          Ceph client keyring file
21
+    -w WARNLOSTOSD, --warning-lost-osd=WARNLOSTOSD
22
+                          Warning number of non-up OSDs (default : 1)
23
+    -c CRITLOSTOSD, --critical-lost-osd=CRITLOSTOSD
24
+                          Critical number of non-up OSDs (default : 2)
25
+    -W WARNLOSTMON, --warning-lost-mon=WARNLOSTMON
26
+                          Warning number of non-up MONs (default : 1)
27
+    -C CRITLOSTMON, --critical-lost-mon=CRITLOSTMON
28
+                          Critical number of non-up MONs (default : 2)
29
+
30
+Copyright
31
+---------
32
+
33
+Copyright (c) 2013 Benjamin Renard <brenard@zionetrix.net>
34
+
35
+License
36
+-------
37
+
38
+This program is free software; you can redistribute it and/or
39
+modify it under the terms of the GNU General Public License version 2
40
+as published by the Free Software Foundation.
41
+
42
+This program is distributed in the hope that it will be useful,
43
+but WITHOUT ANY WARRANTY; without even the implied warranty of
44
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
45
+GNU General Public License for more details.
46
+
47
+You should have received a copy of the GNU General Public License
48
+along with this program; if not, write to the Free Software
49
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
... ...
@@ -0,0 +1,233 @@
1
+#!/usr/bin/python
2
+#
3
+# Nagios plugin to check Ceph cluster state
4
+#
5
+# This plugin checks Ceph health, the number of OSDs UP, the number of MONs UP
6
+# and PG states to determine the Ceph cluster status.
7
+#
8
+#  Usage: check_ceph_status [options]
9
+#  
10
+#  Options:
11
+#    -h, --help            show this help message and exit
12
+#    -d, --debug           
13
+#    -b BIN, --bin=BIN     Ceph binary (default : /usr/bin/ceph)
14
+#    --conf=CONF           Ceph configuration file
15
+#    -m MON, --mon=MON     Ceph monitor address[:port]
16
+#    -i ID, --id=ID        Ceph client id
17
+#    -k KEYRING, --keyring=KEYRING
18
+#                          Ceph client keyring file
19
+#    -w WARNLOSTOSD, --warning-lost-osd=WARNLOSTOSD
20
+#                          Warning number of non-up OSDs (default : 1)
21
+#    -c CRITLOSTOSD, --critical-lost-osd=CRITLOSTOSD
22
+#                          Critical number of non-up OSDs (default : 2)
23
+#    -W WARNLOSTMON, --warning-lost-mon=WARNLOSTMON
24
+#                          Warning number of non-up MONs (default : 1)
25
+#    -C CRITLOSTMON, --critical-lost-mon=CRITLOSTMON
26
+#                          Critical number of non-up MONs (default : 2)
27
+#
28
+# Copyright (c) 2013 Benjamin Renard <brenard@zionetrix.net>
29
+#
30
+# This program is free software; you can redistribute it and/or
31
+# modify it under the terms of the GNU General Public License version 2
32
+# as published by the Free Software Foundation.
33
+# 
34
+# This program is distributed in the hope that it will be useful,
35
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
36
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
37
+# GNU General Public License for more details.
38
+# 
39
+# You should have received a copy of the GNU General Public License
40
+# along with this program; if not, write to the Free Software
41
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
42
+#
43
+
44
+import sys,os,json,subprocess,re
45
+from optparse import OptionParser
46
+
47
+# default ceph values
48
+CEPH_COMMAND = '/usr/bin/ceph'
49
+WARN_LOST_OSD = 1
50
+CRIT_LOST_OSD = 2
51
+WARN_LOST_MON = 1
52
+CRIT_LOST_MON = 2
53
+
54
+# nagios exit code
55
+STATUS = {
56
+	'OK': 0,
57
+	'WARNING': 1,
58
+	'CRITICAL': 2,
59
+	'UNKNOWN': 3
60
+}
61
+
62
+parser = OptionParser()
63
+parser.add_option('-d',
64
+                  '--debug',
65
+                  action="store_true",
66
+                  dest="debug",
67
+                  default=False)
68
+
69
+parser.add_option('-b',
70
+                  '--bin',
71
+                  action="store",
72
+                  dest="bin",
73
+                  help="Ceph binary (default : %s)" % CEPH_COMMAND,
74
+                  type='string',
75
+                  default=CEPH_COMMAND)
76
+
77
+parser.add_option('--conf',
78
+                  action="store",
79
+                  dest="conf",
80
+                  help="Ceph configuration file",
81
+                  type='string',
82
+                  default=None)
83
+
84
+parser.add_option('-m',
85
+                  '--mon',
86
+                  action="store",
87
+                  dest="mon",
88
+                  help="Ceph monitor address[:port]",
89
+                  type='string',
90
+                  default=None)
91
+
92
+parser.add_option('-i',
93
+                  '--id',
94
+                  action="store",
95
+                  dest="id",
96
+                  help="Ceph client id",
97
+                  type='string',
98
+                  default=None)
99
+
100
+parser.add_option('-k',
101
+                  '--keyring',
102
+                  action="store",
103
+                  dest="keyring",
104
+                  help="Ceph client keyring file",
105
+                  type='string',
106
+                  default=None)
107
+
108
+parser.add_option('-w',
109
+                  '--warning-lost-osd',
110
+                  action="store",
111
+                  dest="warnlostosd",
112
+                  help="Warning number of non-up OSDs (default : %s)" % WARN_LOST_OSD,
113
+                  type='int',
114
+                  default=WARN_LOST_OSD)
115
+
116
+parser.add_option('-c',
117
+                  '--critical-lost-osd',
118
+                  action="store",
119
+                  dest="critlostosd",
120
+                  help="Critical number of non-up OSDs (default : %s)" % CRIT_LOST_OSD,
121
+                  type='int',
122
+                  default=CRIT_LOST_OSD)
123
+
124
+parser.add_option('-W',
125
+                  '--warning-lost-mon',
126
+                  action="store",
127
+                  dest="warnlostmon",
128
+                  help="Warning number of non-up MONs (default : %s)" % WARN_LOST_MON,
129
+                  type='int',
130
+                  default=WARN_LOST_MON)
131
+
132
+parser.add_option('-C',
133
+                  '--critical-lost-mon',
134
+                  action="store",
135
+                  dest="critlostmon",
136
+                  help="Critical number of non-up MONs (default : %s)" % CRIT_LOST_MON,
137
+                  type='int',
138
+                  default=CRIT_LOST_MON)
139
+
140
+(options, args) = parser.parse_args()
141
+
142
+ # validate args
143
+if not os.path.exists(options.bin):
144
+    print "ERROR: ceph executable '%s' doesn't exist" % options.bin
145
+    sys.exit(STATUS['UNKNOWN'])
146
+
147
+if options.conf and not os.path.exists(options.conf):
148
+    print "ERROR: ceph conf file '%s' doesn't exist" % options.conf
149
+    sys.exit(STATUS['UNKNOWN'])
150
+
151
+if options.keyring and not os.path.exists(options.keyring):
152
+    print "ERROR: keyring file '%s' doesn't exist" % options.keyring
153
+    sys.exit(STATUS['UNKNOWN'])
154
+
155
+# build command
156
+ceph_cmd = [options.bin]
157
+if options.mon:
158
+    ceph_cmd.append('-m')
159
+    ceph_cmd.append(options.mon)
160
+if options.conf:
161
+    ceph_cmd.append('-c')
162
+    ceph_cmd.append(options.conf)
163
+if options.id:
164
+    ceph_cmd.append('--id')
165
+    ceph_cmd.append(options.id)
166
+if options.keyring:
167
+    ceph_cmd.append('--keyring')
168
+    ceph_cmd.append(options.keyring)
169
+ceph_cmd.append('status')
170
+ceph_cmd.append('--format=json')
171
+    
172
+# exec command
173
+p = subprocess.Popen(ceph_cmd,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
174
+output, err = p.communicate()
175
+
176
+if output:
177
+	data=json.loads(output)
178
+
179
+	status='OK'
180
+
181
+	health=data['health']['overall_status']  # overall cluster health string reported by 'ceph status'
182
+	if health=='HEALTH_WARN':
183
+		status='WARNING'
184
+	elif health in ('HEALTH_ERR','HEALTH_CRIT'):  # Ceph's error state is HEALTH_ERR; HEALTH_CRIT kept for backward compatibility
185
+		status='CRITICAL'
186
+
187
+	total_mon=len(data['monmap']['mons'])
188
+	total_mon_up=len(data['health']['timechecks']['mons'])
189
+
190
+	num_lost_mon=total_mon-total_mon_up
191
+	if num_lost_mon==0:
192
+		monstate="(MONs UP : %s/%s)" % (total_mon_up,total_mon)
193
+	else:
194
+		monstate="%s MONs down (MONs UP : %s/%s)" % (num_lost_mon,total_mon_up,total_mon)
195
+		if num_lost_mon >= options.critlostmon:
196
+			status='CRITICAL'
197
+		elif num_lost_mon >= options.warnlostmon and status!='CRITICAL':
198
+			status='WARNING'
199
+
200
+	total_osd=data['osdmap']['osdmap']['num_osds']
201
+	total_osd_up=data['osdmap']['osdmap']['num_up_osds']
202
+
203
+	num_lost_osd=total_osd-total_osd_up
204
+
205
+	if num_lost_osd>=options.critlostosd:
206
+		status='CRITICAL'
207
+	elif num_lost_osd>=options.warnlostosd and status!='CRITICAL':
208
+		status='WARNING'
209
+
210
+	total_pg=data['pgmap']['num_pgs']
211
+	pgstate=""
212
+	for st in data['pgmap']['pgs_by_state']:  # one entry per PG state combination, with its PG count
213
+		if re.search('(down|inconsistent|incomplete|stale)',st['state_name'],re.IGNORECASE):  # states treated as CRITICAL
214
+			status='CRITICAL'
215
+			pgstate="%s / %s PGs %s" % (pgstate,st['count'],st['state_name'])
216
+		elif re.search('(replay|degraded|repair|recovering|backfill)',st['state_name'],re.IGNORECASE):  # states treated as WARNING
217
+			if status!='CRITICAL':  # never downgrade an already CRITICAL status
218
+				status="WARNING"
219
+			pgstate="%s / %s PGs %s" % (pgstate,st['count'],st['state_name'])
220
+		elif st['state_name']=="active+clean":
221
+			pgstate="%s / %s/%s PGs active+clean" % (pgstate,st['count'],total_pg)
222
+	
223
+	msg="%s : %s%s %s" % (status,health,pgstate,monstate)
224
+
225
+
226
+	if num_lost_osd==0:
227
+		print "%s (OSDs UP : %s/%s)" % (msg,total_osd_up,total_osd)
228
+	else:
229
+		print "%s / %s OSDs down (OSDs UP : %s/%s)" % (msg,num_lost_osd,total_osd_up,total_osd)
230
+	sys.exit(STATUS[status])
231
+else:
232
+	print "UNKNOWN : fail to execute ceph status command"
233
+	sys.exit(STATUS['UNKNOWN'])
0 234