Initial import from http://exchange.nagios.org/directory/Uncategorized/Check-SMART-status-modified/details (dc16b29)

check_smart

...	...	@@ -0,0 +1,324 @@
	1	+#!/usr/bin/perl -w
	2	+# Check SMART status of ATA/SCSI disks, returning any usable metrics as perfdata.
	3	+# For usage information, run ./check_smart -h
	4	+#
	5	+# This script was created under contract for the US Government and is therefore Public Domain
	6	+#
	7	+# Changes and Modifications
	8	+# =========================
	9	+# Feb 3, 2009: Kurt Yoder - initial version of script 1.0
	10	+# Jan 27, 2010: Philippe Genonceaux - modifications for compatibility with MegaRAID, use smartmontools version >= 5.39
	11	+# Add this line to /etc/sudoers: "nagios ALL=(root) NOPASSWD: /usr/sbin/smartctl"
	12	+
	13	+use strict;
	14	+use Getopt::Long;
	15	+
	16	+use File::Basename qw(basename);
	17	+my $basename = basename($0);
	18	+
	19	+my $revision = '$Revision: 1.0.1 $';
	20	+
	21	+use lib '/usr/lib/nagios/plugins/';
	22	+use utils qw(%ERRORS &print_revision &support &usage);
	23	+
	24	+$ENV{PATH} = '/bin:/usr/bin:/sbin:/usr/sbin';
	25	+# BASH_ENV and ENV are emptied as well, before any external command is spawned
	26	+@ENV{qw(BASH_ENV ENV)} = ('', '');
	27	+
	28	+use vars qw($opt_d $opt_debug $opt_h $opt_i $opt_n $opt_v);
	29	+Getopt::Long::Configure('bundling');
	30	+GetOptions(
	31	+    'debug'         => \$opt_debug,
	32	+    'd|device=s'    => \$opt_d,
	33	+    'h|help'        => \$opt_h,
	34	+    'i|interface=s' => \$opt_i,
	35	+    'n|number=s'    => \$opt_n,
	36	+    'v|version'     => \$opt_v,
	37	+);
	38	+# -v and -h are answered immediately, before any device validation takes place
	39	+if ($opt_v) {
	40	+    print_revision($basename, $revision);
	41	+    exit $ERRORS{OK};
	42	+}
	43	+# reached only when -v was absent, because the branch above exits
	44	+if ($opt_h) {
	45	+    print_help();
	46	+    exit $ERRORS{OK};
	47	+}
	48	+my ($device, $interface, $number);   # set from -d / -i / -n below, each only after it has been validated
	49	+if ($opt_d) {
	50	+    unless ($opt_i) {
	51	+        print "must specify an interface for $opt_d using -i/--interface!\n\n";
	52	+        print_help();
	53	+        exit $ERRORS{'UNKNOWN'};
	54	+    }
	55	+    # only an existing block device is accepted as the SMART target
	56	+    if (-b $opt_d) {
	57	+        $device = $opt_d;
	58	+    }
	59	+    else {
	60	+        print "$opt_d is not a valid block device!\n\n";
	61	+        print_help();
	62	+        exit $ERRORS{'UNKNOWN'};
	63	+    }
	64	+    # the interface must be one of the three supported names; megaraid additionally needs -n
	65	+    if (grep {$opt_i eq $_} ('ata', 'scsi', 'megaraid')) {
	66	+        $interface = $opt_i;
	67	+        if ($interface eq 'megaraid') {
	68	+            if (defined($opt_n) && $opt_n =~ /\A\d+\z/) {   # digits only: the value ends up inside a shell command line
	69	+                $number = $opt_n;
	70	+                $interface = $opt_i.",".$number;   # smartctl is later called with "-d megaraid,N"
	71	+            }
	72	+            else {
	73	+                print "must specify a physical disk number within the MegaRAID controller!\n\n";
	74	+                print_help();
	75	+                exit $ERRORS{'UNKNOWN'};
	76	+            }
	77	+        }
	78	+    }
	79	+    else {
	80	+        print "invalid interface $opt_i for $opt_d!\n\n";
	81	+        print_help();
	82	+        exit $ERRORS{'UNKNOWN'};
	83	+    }
	84	+}
	85	+else {
	86	+    print "must specify a device!\n\n";
	87	+    print_help();
	88	+    exit $ERRORS{'UNKNOWN'};
	89	+}
	90	+my $smart_command = '/usr/bin/sudo /usr/sbin/smartctl';
	91	+my @error_messages = ();   # every problem found by the checks; joined into the final status line
	92	+my $exit_status = 'OK';    # changed only through escalate_status()
	93	+
	94	+# CHECK 1: run "smartctl -H" and look for the overall health verdict line in its output.
	95	+warn "###########################################################\n" if $opt_debug;
	96	+warn "(debug) CHECK 1: getting overall SMART health status\n" if $opt_debug;
	97	+warn "###########################################################\n\n\n" if $opt_debug;
	98	+
	99	+my $full_command = "$smart_command -d $interface -H $device";
	100	+warn "(debug) executing:\n$full_command\n\n" if $opt_debug;
	101	+
	102	+my @output = `$full_command`;
	103	+warn "(debug) output:\n@output\n\n" if $opt_debug;
	104	+
	105	+# the defaults are the ATA wording; the SCSI wording is swapped in just below when needed
	106	+my $found_status = 0;
	107	+my $line_str = 'SMART overall-health self-assessment test result: '; # ATA SMART line
	108	+my $ok_str = 'PASSED'; # ATA SMART OK string
	109	+# $interface is 'ata', 'scsi' or 'megaraid,N' at this point; only plain ATA keeps the defaults
	110	+if ($interface ne 'ata') {
	111	+    $line_str = 'SMART Health Status: '; # SCSI OR MEGARAID SMART line
	112	+    $ok_str = 'OK'; #SCSI OR MEGARAID SMART OK string
	113	+}
	114	+
	115	+foreach my $line (@output) {
	116	+    if ($line =~ /$line_str(.+)/) {   # $1 is whatever follows the label on that line
	117	+        $found_status = 1;
	118	+        warn "(debug) parsing line:\n$line\n\n" if $opt_debug;
	119	+        if ($1 eq $ok_str) {
	120	+            warn "(debug) found string '$ok_str'; status OK\n\n" if $opt_debug;
	121	+        }
	122	+        else {
	123	+            warn "(debug) no '$ok_str' status; failing\n\n" if $opt_debug;
	124	+            push(@error_messages, "Health status: $1");
	125	+            escalate_status('CRITICAL');
	126	+        }
	127	+    }
	128	+}
	129	+# no verdict line at all (e.g. the command produced no usable output) is reported as UNKNOWN
	130	+unless ($found_status) {
	131	+    push(@error_messages, 'No health status line found');
	132	+    escalate_status('UNKNOWN');
	133	+}
	134	+
	135	+
	136	+warn "###########################################################\n" if $opt_debug;
	137	+warn "(debug) CHECK 2: getting silent SMART health check\n" if $opt_debug;
	138	+warn "###########################################################\n\n\n" if $opt_debug;
	139	+# CHECK 2: run "smartctl -q silent -A" and turn each bit of its exit status into a message and a severity.
	140	+$full_command = "$smart_command -d $interface -q silent -A $device";
	141	+warn "(debug) executing:\n$full_command\n\n" if $opt_debug;
	142	+# system() returns the raw wait status: -1 if the command could not be started, else exit code in the high byte
	143	+my $wait_status = system($full_command);
	144	+my $return_code = $wait_status == -1 ? 0 : $wait_status >> 8;
	145	+warn "(debug) exit code:\n$return_code\n\n" if $opt_debug;
	146	+# one test per bit of the 8-bit exit code; several bits may be set at once
	147	+if ($return_code & 0x01) {
	148	+    push(@error_messages, 'Commandline parse failure');
	149	+    escalate_status('UNKNOWN');
	150	+}
	151	+if ($return_code & 0x02) {
	152	+    push(@error_messages, 'Device could not be opened');
	153	+    escalate_status('UNKNOWN');
	154	+}
	155	+if ($return_code & 0x04) {
	156	+    push(@error_messages, 'Checksum failure');
	157	+    escalate_status('WARNING');
	158	+}
	159	+if ($return_code & 0x08) {
	160	+    push(@error_messages, 'Disk is failing');
	161	+    escalate_status('CRITICAL');
	162	+}
	163	+if ($return_code & 0x10) {
	164	+    push(@error_messages, 'Disk is in prefail');
	165	+    escalate_status('WARNING');
	166	+}
	167	+if ($return_code & 0x20) {
	168	+    push(@error_messages, 'Disk may be close to failure');
	169	+    escalate_status('WARNING');
	170	+}
	171	+if ($return_code & 0x40) {
	172	+    push(@error_messages, 'Error log contains errors');
	173	+    escalate_status('WARNING');
	174	+}
	175	+if ($return_code & 0x80) {
	176	+    push(@error_messages, 'Self-test log contains errors');
	177	+    escalate_status('WARNING');
	178	+}
	179	+if ($wait_status == -1 or $wait_status & 0x7f) {   # never started, or ended by a signal: no exit code to decode
	180	+    push(@error_messages, 'Unknown return code');
	181	+    escalate_status('CRITICAL');
	182	+}
	183	+
	184	+if ($wait_status) {
	185	+    warn "(debug) non-zero exit code, generating error condition\n\n" if $opt_debug;
	186	+}
	187	+else {
	188	+    warn "(debug) zero exit code, status OK\n\n" if $opt_debug;
	189	+}
	190	+
	191	+
	192	+warn "###########################################################\n" if $opt_debug;
	193	+warn "(debug) CHECK 3: getting detailed statistics\n" if $opt_debug;
	194	+warn "(debug) information contains a few more potential trouble spots\n" if $opt_debug;
	195	+warn "(debug) plus, we can also use the information for perfdata/graphing\n" if $opt_debug;
	196	+warn "###########################################################\n\n\n" if $opt_debug;
	197	+# CHECK 3: run "smartctl -A" again, this time capturing the output, to collect perfdata and a few extra warnings
	198	+$full_command = "$smart_command -d $interface -A $device";
	199	+warn "(debug) executing:\n$full_command\n\n" if $opt_debug;
	200	+@output = `$full_command`;
	201	+warn "(debug) output:\n@output\n\n" if $opt_debug;
	202	+my @perfdata = qw//; # starts empty; filled with "name=value" strings below
	203	+
	204	+# ATA output is parsed as an attribute table; any other interface value is parsed as SCSI-style "label: value" lines
	205	+if ($interface eq 'ata'){
	206	+ foreach my $line(@output){
	207	+ # only table rows are considered, i.e. lines that look like this:
	208	+ # 9 Power_On_Minutes 0x0032 241 241 000 Old_age Always - 113h+12m
	209	+ next unless $line =~ /^\s*\d+\s(\S+)\s+(?:\S+\s+){6}(\S+)\s+(\d+)/; # $1 = name, $2 = field after six more columns, $3 = leading digits of the next field
	210	+ my ($attribute_name, $when_failed, $raw_value) = ($1, $2, $3);
	211	+ if ($when_failed ne '-'){ # anything but a dash in that column is reported as a failed attribute
	212	+ push(@error_messages, "Attribute $attribute_name failed at $when_failed");
	213	+ escalate_status('WARNING');
	214	+ warn "(debug) parsed SMART attribute $attribute_name with error condition:\n$when_failed\n\n" if $opt_debug;
	215	+ }
	216	+ # some attributes produce questionable data; no need to graph them (the next also skips the manual checks below)
	217	+ if (grep {$_ eq $attribute_name} ('Unknown_Attribute', 'Power_On_Minutes') ){
	218	+ next;
	219	+ }
	220	+ push (@perfdata, "$attribute_name=$raw_value");
	221	+
	222	+ # manual check: any non-zero Current_Pending_Sector raw value raises a WARNING
	223	+ if ( ($attribute_name eq 'Current_Pending_Sector') && $raw_value ) {
	224	+ push(@error_messages, "Sectors pending re-allocation");
	225	+ escalate_status('WARNING');
	226	+ warn "(debug) Current_Pending_Sector is non-zero ($raw_value)\n\n" if $opt_debug;
	227	+ }
	228	+ }
	229	+}
	230	+else{
	231	+ my ($current_temperature, $max_temperature, $current_start_stop, $max_start_stop) = qw//; # all four stay undef unless a matching line is seen
	232	+ foreach my $line(@output){
	233	+ if ($line =~ /Current Drive Temperature:\s+(\d+)/){
	234	+ $current_temperature = $1;
	235	+ }
	236	+ elsif ($line =~ /Drive Trip Temperature:\s+(\d+)/){
	237	+ $max_temperature = $1;
	238	+ }
	239	+ elsif ($line =~ /Current start stop count:\s+(\d+)/){
	240	+ $current_start_stop = $1;
	241	+ }
	242	+ elsif ($line =~ /Recommended maximum start stop count:\s+(\d+)/){
	243	+ $max_start_stop = $1;
	244	+ }
	245	+ elsif ($line =~ /Elements in grown defect list:\s+(\d+)/){
	246	+ push (@perfdata, "defect_list=$1");
	247	+ }
	248	+ elsif ($line =~ /Blocks sent to initiator =\s+(\d+)/){
	249	+ push (@perfdata, "sent_blocks=$1");
	250	+ }
	251	+ }
	252	+ if($current_temperature){ # truthiness test: a reading of 0 is treated like no reading
	253	+ if($max_temperature){
	254	+ push (@perfdata, "temperature=$current_temperature;;$max_temperature"); # two semicolons: the trip temperature is the second field after the value
	255	+ if($current_temperature > $max_temperature){
	256	+ warn "(debug) Disk temperature is greater than max ($current_temperature > $max_temperature)\n\n" if $opt_debug;
	257	+ push(@error_messages, 'Disk temperature is higher than maximum');
	258	+ escalate_status('CRITICAL');
	259	+ }
	260	+ }
	261	+ else{
	262	+ push (@perfdata, "temperature=$current_temperature");
	263	+ }
	264	+ }
	265	+ if($current_start_stop){ # same truthiness test: a count of 0 produces no perfdata entry
	266	+ if($max_start_stop){
	267	+ push (@perfdata, "start_stop=$current_start_stop;$max_start_stop"); # one semicolon: the recommended maximum is the first field after the value
	268	+ if($current_start_stop > $max_start_stop){
	269	+ warn "(debug) Disk start_stop is greater than max ($current_start_stop > $max_start_stop)\n\n" if $opt_debug;
	270	+ push(@error_messages, 'Disk start_stop is higher than maximum');
	271	+ escalate_status('WARNING');
	272	+ }
	273	+ }
	274	+ else{
	275	+ push (@perfdata, "start_stop=$current_start_stop");
	276	+ }
	277	+ }
	278	+}
	279	+warn "(debug) gathered perfdata:\n@perfdata\n\n" if $opt_debug;
	280	+my $perf_string = join(' ', @perfdata);
	281	+
	282	+if ($opt_debug) {
	283	+    warn "###########################################################\n";
	284	+    warn "(debug) FINAL STATUS: $exit_status\n";
	285	+    warn "###########################################################\n\n\n";
	286	+    warn "(debug) final status/output:\n";
	287	+}
	288	+
	289	+# one status line: the OK banner, or the status name followed by every collected message
	290	+my $status_string = $exit_status eq 'OK'
	291	+    ? "OK: no SMART errors detected"
	292	+    : "$exit_status: ".join(', ', @error_messages);
	293	+
	294	+# emitted as "<status text>|<perfdata>"; the pipe is printed even when no
	295	+# perfdata was gathered
	296	+
	297	+print "$status_string\|$perf_string\n";
	298	+exit $ERRORS{$exit_status};
	299	+
	300	+sub print_help {   # prints the revision, a usage synopsis, one line per option, then the support() footer; takes no arguments
	301	+    print_revision($basename,$revision);
	302	+    print "Usage: $basename (--device=<SMART device> --interface=(ata\|scsi\|megaraid) [--number=<disk number>]\|-h\|-v) [--debug]\n";
	303	+    print " --debug: show debugging information\n";
	304	+    print " -d/--device: a device to be SMART monitored, eg /dev/sda\n";
	305	+    print " -i/--interface: ata, scsi, megaraid, depending upon the device's interface type\n";
	306	+    print " -n/--number: physical disk number within the MegaRAID controller (required with --interface=megaraid)\n";
	307	+    print " -h/--help: this help\n";
	308	+    print " -v/--version: Version number\n";
	309	+    support();
	310	+}
	311	+
	312	+# raise the global exit status to the requested one, unless a status that outranks it is already set
	313	+sub escalate_status {
	314	+    my $requested_status = shift;
	315	+    # statuses that, once set, must not be replaced by the key; CRITICAL has no entry and always wins
	316	+    my %outranked_by = (
	317	+        WARNING => ['CRITICAL'],
	318	+        UNKNOWN => ['WARNING', 'CRITICAL'],
	319	+    );
	320	+    foreach my $stronger (@{ $outranked_by{$requested_status} || [] }) {
	321	+        return if $exit_status eq $stronger;
	322	+    }
	323	+    $exit_status = $requested_status;
	324	+}
0	325