Initial import from http://exchange.nagios.org/directory/Uncategorized/Check-SMART-status-modified/details
Benjamin Renard

Benjamin Renard committed on 2011-05-13 13:02:32
Showing 1 changed file, with 324 additions and 0 deletions.

... ...
@@ -0,0 +1,324 @@
1
+#!/usr/bin/perl -w
2
+# Check SMART status of ATA/SCSI disks, returning any usable metrics as perfdata.
3
+# For usage information, run ./check_smart -h
4
+#
5
+# This script was created under contract for the US Government and is therefore Public Domain
6
+#
7
+# Changes and Modifications
8
+# =========================
9
+# Feb 3, 2009: Kurt Yoder - initial version of script 1.0
10
+# Jan 27, 2010: Philippe Genonceaux - modifications for compatibility with megaraid, use smartmontool version >= 5.39 
11
+# Add this line to /etc/sudoers: "nagios        ALL=(root) NOPASSWD: /usr/sbin/smartctl"
12
+
13
+use strict;
14
+use Getopt::Long;
15
+
16
+use File::Basename qw(basename);
17
+my $basename = basename($0);
18
+
19
+my $revision = '$Revision: 1.0.1 $';
20
+
21
+use lib '/usr/lib/nagios/plugins/';
22
+use utils qw(%ERRORS &print_revision &support &usage);
23
+
24
+$ENV{'PATH'}='/bin:/usr/bin:/sbin:/usr/sbin';
25
+$ENV{'BASH_ENV'}=''; 
26
+$ENV{'ENV'}='';
27
+
28
+use vars qw($opt_d $opt_debug $opt_h $opt_i $opt_n $opt_v);
29
+Getopt::Long::Configure('bundling');
30
+GetOptions(
31
+	                  "debug"       => \$opt_debug,
32
+	"d=s" => \$opt_d, "device=s"    => \$opt_d,
33
+	"h"   => \$opt_h, "help"        => \$opt_h,
34
+	"i=s" => \$opt_i, "interface=s" => \$opt_i,
35
+	"n=s" => \$opt_n, "number=s"	=> \$opt_n,
36
+	"v"   => \$opt_v, "version"     => \$opt_v,
37
+);
38
+
39
+if ($opt_v) {
40
+	print_revision($basename,$revision);
41
+	exit $ERRORS{'OK'};
42
+}
43
+
44
+if ($opt_h) {
45
+	print_help(); 
46
+	exit $ERRORS{'OK'};
47
+}
48
+my ($device, $interface, $number) = qw//;
49
+if ($opt_d) {
50
+	unless($opt_i){
51
+		print "must specify an interface for $opt_d using -i/--interface!\n\n";
52
+		print_help();
53
+		exit $ERRORS{'UNKNOWN'};
54
+	}
55
+
56
+	if (-b $opt_d){
57
+		$device = $opt_d;
58
+	}
59
+	else{
60
+		print "$opt_d is not a valid block device!\n\n";
61
+		print_help();
62
+		exit $ERRORS{'UNKNOWN'};
63
+	}
64
+
65
+	if(grep {$opt_i eq $_} ('ata', 'scsi', 'megaraid')){
66
+		$interface = $opt_i;
67
+                if($interface eq 'megaraid'){ # megaraid needs the physical disk number appended: "megaraid,N"
68
+                    if(defined($opt_n) && $opt_n =~ /\A\d+\z/){ # digits only: the value is interpolated into a shell command run via sudo
69
+                        $number = $opt_n;
70
+                        $interface = $opt_i.",".$number;
71
+                    }
72
+                    else{
73
+                        print "must specify a physical disk number within the MegaRAID controller!\n\n";
74
+                        print_help();
75
+                        exit $ERRORS{'UNKNOWN'};
76
+                    }
77
+                }
78
+	}
79
+	else{
80
+		print "invalid interface $opt_i for $opt_d!\n\n";
81
+		print_help();
82
+		exit $ERRORS{'UNKNOWN'};
83
+	}
84
+}
85
+else{
86
+	print "must specify a device!\n\n";
87
+	print_help();
88
+	exit $ERRORS{'UNKNOWN'};
89
+}
90
+my $smart_command = '/usr/bin/sudo /usr/sbin/smartctl';
91
+my @error_messages = qw//;
92
+my $exit_status = 'OK';
93
+
94
+
95
+warn "###########################################################\n" if $opt_debug;
96
+warn "(debug) CHECK 1: getting overall SMART health status\n" if $opt_debug;
97
+warn "###########################################################\n\n\n" if $opt_debug;
98
+
99
+my $full_command = "$smart_command -d $interface -H $device";
100
+warn "(debug) executing:\n$full_command\n\n" if $opt_debug;
101
+
102
+my @output = `$full_command`;
103
+warn "(debug) output:\n@output\n\n" if $opt_debug;
104
+
105
+# parse ata output, looking for "health status: passed"
106
+my $found_status = 0;
107
+my $line_str = 'SMART overall-health self-assessment test result: '; # ATA SMART line
108
+my $ok_str = 'PASSED'; # ATA SMART OK string
109
+
110
+if ($interface ne 'ata'){ # validated earlier as ata, scsi or "megaraid,N"; the old "... or 'scsi'" test was always true, so ATA disks got the SCSI strings too
111
+	$line_str = 'SMART Health Status: '; # SCSI OR MEGARAID SMART line
112
+	$ok_str = 'OK'; #SCSI OR MEGARAID SMART OK string
113
+}
114
+
115
+foreach my $line (@output){
116
+	if($line =~ /$line_str(.+)/){
117
+		$found_status = 1;
118
+		warn "(debug) parsing line:\n$line\n\n" if $opt_debug;
119
+		if ($1 eq $ok_str) {
120
+			warn "(debug) found string '$ok_str'; status OK\n\n" if $opt_debug;
121
+		}
122
+		else {
123
+			warn "(debug) no '$ok_str' status; failing\n\n" if $opt_debug;
124
+			push(@error_messages, "Health status: $1");
125
+			escalate_status('CRITICAL');
126
+		}
127
+	}
128
+}
129
+
130
+unless ($found_status) {
131
+	push(@error_messages, 'No health status line found');
132
+	escalate_status('UNKNOWN');
133
+}
134
+
135
+
136
+warn "###########################################################\n" if $opt_debug;
137
+warn "(debug) CHECK 2: getting silent SMART health check\n" if $opt_debug;
138
+warn "###########################################################\n\n\n" if $opt_debug;
139
+
140
+$full_command = "$smart_command -d $interface -q silent -A $device";
141
+warn "(debug) executing:\n$full_command\n\n" if $opt_debug;
142
+
143
+system($full_command);
144
+my $return_code = ($? == -1 || ($? & 0x7f)) ? 0x100 : ($? >> 8); # smartctl's status bits are its exit code, i.e. the high byte of $?; 0x100 marks "could not run / killed by signal"
145
+warn "(debug) exit code:\n$return_code\n\n" if $opt_debug;
146
+
147
+if ($return_code & 0x01) {
148
+	push(@error_messages, 'Commandline parse failure');
149
+	escalate_status('UNKNOWN');
150
+}
151
+if ($return_code & 0x02) {
152
+	push(@error_messages, 'Device could not be opened');
153
+	escalate_status('UNKNOWN');
154
+}
155
+if ($return_code & 0x04) {
156
+	push(@error_messages, 'Checksum failure');
157
+	escalate_status('WARNING');
158
+}
159
+if ($return_code & 0x08) {
160
+	push(@error_messages, 'Disk is failing');
161
+	escalate_status('CRITICAL');
162
+}
163
+if ($return_code & 0x10) {
164
+	push(@error_messages, 'Disk is in prefail');
165
+	escalate_status('WARNING');
166
+}
167
+if ($return_code & 0x20) {
168
+	push(@error_messages, 'Disk may be close to failure');
169
+	escalate_status('WARNING');
170
+}
171
+if ($return_code & 0x40) {
172
+	push(@error_messages, 'Error log contains errors');
173
+	escalate_status('WARNING');
174
+}
175
+if ($return_code & 0x80) {
176
+	push(@error_messages, 'Self-test log contains errors');
177
+	escalate_status('WARNING');
178
+}
179
+if ($return_code && !($return_code & 0xff)) { # non-zero but none of the eight smartctl bits: abnormal termination (the old "!$exit_status" test could never be true)
180
+	push(@error_messages, 'Unknown return code');
181
+	escalate_status('CRITICAL');
182
+}
183
+
184
+if ($return_code) {
185
+	warn "(debug) non-zero exit code, generating error condition\n\n" if $opt_debug;
186
+}
187
+else {
188
+	warn "(debug) zero exit code, status OK\n\n" if $opt_debug;
189
+}
190
+
191
+
192
+warn "###########################################################\n" if $opt_debug;
193
+warn "(debug) CHECK 3: getting detailed statistics\n" if $opt_debug;
194
+warn "(debug) information contains a few more potential trouble spots\n" if $opt_debug;
195
+warn "(debug) plus, we can also use the information for perfdata/graphing\n" if $opt_debug;
196
+warn "###########################################################\n\n\n" if $opt_debug;
197
+
198
+$full_command = "$smart_command -d $interface -A $device";
199
+warn "(debug) executing:\n$full_command\n\n" if $opt_debug;
200
+@output = `$full_command`;
201
+warn "(debug) output:\n@output\n\n" if $opt_debug;
202
+my @perfdata = qw//;
203
+
204
+# separate metric-gathering and output analysis for ATA vs SCSI SMART output
205
+if ($interface eq 'ata'){
206
+	foreach my $line(@output){
207
+		# get lines that look like this:
208
+		#    9 Power_On_Minutes        0x0032   241   241   000    Old_age   Always       -       113h+12m
209
+		next unless $line =~ /^\s*\d+\s(\S+)\s+(?:\S+\s+){6}(\S+)\s+(\d+)/;
210
+		my ($attribute_name, $when_failed, $raw_value) = ($1, $2, $3);
211
+		if ($when_failed ne '-'){
212
+			push(@error_messages, "Attribute $attribute_name failed at $when_failed");
213
+			escalate_status('WARNING');
214
+			warn "(debug) parsed SMART attribute $attribute_name with error condition:\n$when_failed\n\n" if $opt_debug;
215
+		}
216
+		# some attributes produce questionable data; no need to graph them
217
+		if (grep {$_ eq $attribute_name} ('Unknown_Attribute', 'Power_On_Minutes') ){
218
+			next;
219
+		}
220
+		push (@perfdata, "$attribute_name=$raw_value");
221
+
222
+		# do some manual checks
223
+		if ( ($attribute_name eq 'Current_Pending_Sector') && $raw_value ) {
224
+			push(@error_messages, "Sectors pending re-allocation");
225
+			escalate_status('WARNING');
226
+			warn "(debug) Current_Pending_Sector is non-zero ($raw_value)\n\n" if $opt_debug;
227
+		}
228
+	}
229
+}
230
+else{
231
+	my ($current_temperature, $max_temperature, $current_start_stop, $max_start_stop) = qw//;
232
+	foreach my $line(@output){
233
+		if ($line =~ /Current Drive Temperature:\s+(\d+)/){
234
+			$current_temperature = $1;
235
+		}
236
+		elsif ($line =~ /Drive Trip Temperature:\s+(\d+)/){
237
+			$max_temperature = $1;
238
+		}
239
+		elsif ($line =~ /Current start stop count:\s+(\d+)/){
240
+			$current_start_stop = $1;
241
+		}
242
+		elsif ($line =~ /Recommended maximum start stop count:\s+(\d+)/){
243
+			$max_start_stop = $1;
244
+		}
245
+		elsif ($line =~ /Elements in grown defect list:\s+(\d+)/){
246
+			push (@perfdata, "defect_list=$1");
247
+		}
248
+		elsif ($line =~ /Blocks sent to initiator =\s+(\d+)/){
249
+			push (@perfdata, "sent_blocks=$1");
250
+		}
251
+	}
252
+	if($current_temperature){
253
+		if($max_temperature){
254
+			push (@perfdata, "temperature=$current_temperature;;$max_temperature");
255
+			if($current_temperature > $max_temperature){
256
+				warn "(debug) Disk temperature is greater than max ($current_temperature > $max_temperature)\n\n" if $opt_debug;
257
+				push(@error_messages, 'Disk temperature is higher than maximum');
258
+				escalate_status('CRITICAL');
259
+			}
260
+		}
261
+		else{
262
+			push (@perfdata, "temperature=$current_temperature");
263
+		}
264
+	}
265
+	if($current_start_stop){
266
+		if($max_start_stop){
267
+			push (@perfdata, "start_stop=$current_start_stop;$max_start_stop");
268
+			if($current_start_stop > $max_start_stop){
269
+				warn "(debug) Disk start_stop is greater than max ($current_start_stop > $max_start_stop)\n\n" if $opt_debug;
270
+				push(@error_messages, 'Disk start_stop is higher than maximum');
271
+				escalate_status('WARNING');
272
+			}
273
+		}
274
+		else{
275
+			push (@perfdata, "start_stop=$current_start_stop");
276
+		}
277
+	}
278
+}
279
+warn "(debug) gathered perfdata:\n@perfdata\n\n" if $opt_debug;
280
+my $perf_string = join(' ', @perfdata);
281
+
282
+warn "###########################################################\n" if $opt_debug;
283
+warn "(debug) FINAL STATUS: $exit_status\n" if $opt_debug;
284
+warn "###########################################################\n\n\n" if $opt_debug;
285
+
286
+warn "(debug) final status/output:\n" if $opt_debug;
287
+
288
+my $status_string = '';
289
+
290
+if($exit_status ne 'OK'){
291
+	$status_string = "$exit_status: ".join(', ', @error_messages);
292
+}
293
+else {
294
+	$status_string = "OK: no SMART errors detected";
295
+}
296
+
297
+print "$status_string|$perf_string\n";
298
+exit $ERRORS{$exit_status};
299
+
300
+sub print_help { # print the plugin revision, usage synopsis, option list and the support notice to STDOUT
301
+	print_revision($basename,$revision);
302
+	print "Usage: $basename (--device=<SMART device> --interface=(ata|scsi|megaraid) [--number=<disk number>]|-h|-v) [--debug]\n";
303
+	print "  --debug: show debugging information\n";
304
+	print "  -d/--device: a device to be SMART monitored, eg /dev/sda\n";
305
+	print "  -i/--interface: ata, scsi, megaraid, depending upon the device's interface type\n";
306
+        print "  -n/--number: where in the argument megaraid, it is the physical disk number within the MegaRAID controller\n";
307
+	print "  -h/--help: this help\n";
308
+	print "  -v/--version: Version number\n";
309
+	support();
310
+}
311
+
312
+# escalate an exit status IFF it's more severe than the previous exit status
313
+sub escalate_status {
314
+	my $requested_status = shift;
315
+	# no test for 'CRITICAL'; automatically escalates upwards
316
+	if ($requested_status eq 'WARNING') {
317
+		return if $exit_status eq 'CRITICAL';
318
+	}
319
+	if ($requested_status eq 'UNKNOWN') {
320
+		return if $exit_status eq 'WARNING';
321
+		return if $exit_status eq 'CRITICAL';
322
+	}
323
+	$exit_status = $requested_status;
324
+}
0 325