Benjamin Renard committed on 2011-05-13 13:02:32
              Showing 1 changed files, with 324 additions and 0 deletions.
            
| ... | ... | @@ -0,0 +1,324 @@ | 
| 1 | +#!/usr/bin/perl -w | |
| 2 | +# Check SMART status of ATA/SCSI disks, returning any usable metrics as perfdata. | |
| 3 | +# For usage information, run ./check_smart -h | |
| 4 | +# | |
| 5 | +# This script was created under contract for the US Government and is therefore Public Domain | |
| 6 | +# | |
| 7 | +# Changes and Modifications | |
| 8 | +# ========================= | |
| 9 | +# Feb 3, 2009: Kurt Yoder - initial version of script 1.0 | |
| 10 | +# Jan 27, 2010: Philippe Genonceaux - modifications for compatibility with megaraid, use smartmontool version >= 5.39 | |
| 11 | +# Add this line to /etc/sudoers: "nagios ALL=(root) NOPASSWD: /usr/sbin/smartctl" | |
| 12 | + | |
| 13 | +use strict; | |
| 14 | +use Getopt::Long; | |
| 15 | + | |
| 16 | +use File::Basename qw(basename); | |
| 17 | +my $basename = basename($0); | |
| 18 | + | |
| 19 | +my $revision = '$Revision: 1.0.1 $'; | |
| 20 | + | |
| 21 | +use lib '/usr/lib/nagios/plugins/'; | |
| 22 | +use utils qw(%ERRORS &print_revision &support &usage); | |
| 23 | + | |
# Pin PATH and blank BASH_ENV/ENV before any external command is run
# (presumably so that shells spawned later do not pick up caller-controlled
# start-up files -- the code itself only shows the assignments).
@ENV{qw(PATH BASH_ENV ENV)} = ('/bin:/usr/bin:/sbin:/usr/sbin', '', '');

# Command line switches. Each short option shares its variable with the
# matching long option; 'bundling' makes single-letter options bundleable.
our ($opt_d, $opt_debug, $opt_h, $opt_i, $opt_n, $opt_v);
Getopt::Long::Configure('bundling');
GetOptions(
    'debug'         => \$opt_debug,    # verbose diagnostics on STDERR
    'd|device=s'    => \$opt_d,        # block device to check
    'h|help'        => \$opt_h,        # print help and exit
    'i|interface=s' => \$opt_i,        # ata, scsi or megaraid
    'n|number=s'    => \$opt_n,        # physical disk number (megaraid)
    'v|version'     => \$opt_v,        # print revision and exit
);
| 38 | + | |
# -v/--version and -h/--help short-circuit the plugin: print the requested
# text and leave with the OK exit code. When both are given, --version wins.
if ($opt_v || $opt_h) {
    if ($opt_v) {
        print_revision($basename,$revision);
    }
    else {
        print_help();
    }
    exit $ERRORS{'OK'};
}
# Print an argument error followed by the help text, then exit with the
# UNKNOWN status. Never returns.
sub _reject_arguments {
    my ($message) = @_;
    print "$message\n\n";
    print_help();
    exit $ERRORS{'UNKNOWN'};
}

# Validated copies of the command line arguments. $interface ends up as
# 'ata', 'scsi' or 'megaraid,<number>' (the form passed to smartctl -d).
my ($device, $interface, $number) = qw//;

_reject_arguments("must specify a device!")
    unless $opt_d;

_reject_arguments("must specify an interface for $opt_d using -i/--interface!")
    unless $opt_i;

_reject_arguments("$opt_d is not a valid block device!")
    unless -b $opt_d;
$device = $opt_d;

_reject_arguments("invalid interface $opt_i for $opt_d!")
    unless grep {$opt_i eq $_} ('ata', 'scsi', 'megaraid');
$interface = $opt_i;

if ($interface eq 'megaraid') {
    _reject_arguments("must specify a physical disk number within the MegaRAID controller!")
        unless defined($opt_n);

    # The number is interpolated into a command line that is run through the
    # shell later on, so accept nothing but plain digits.
    _reject_arguments("physical disk number must be a non-negative integer!")
        unless $opt_n =~ /\A\d+\z/;

    $number    = $opt_n;
    $interface = $opt_i.",".$number;
}
# smartctl is run through sudo (see the sudoers note in the file header).
my $smart_command = '/usr/bin/sudo /usr/sbin/smartctl';
# Human readable problems collected by all checks, and the worst status
# reached so far (only ever changed through escalate_status()).
my @error_messages = qw//;
my $exit_status = 'OK';


warn "###########################################################\n" if $opt_debug;
warn "(debug) CHECK 1: getting overall SMART health status\n" if $opt_debug;
warn "###########################################################\n\n\n" if $opt_debug;

my $full_command = "$smart_command -d $interface -H $device";
warn "(debug) executing:\n$full_command\n\n" if $opt_debug;

my @output = `$full_command`;
warn "(debug) output:\n@output\n\n" if $opt_debug;

# Look for the health line in the -H output. The defaults match the line
# this script expects for ATA devices.
my $found_status = 0;
my $line_str = 'SMART overall-health self-assessment test result: '; # ATA SMART line
my $ok_str = 'PASSED'; # ATA SMART OK string

# SCSI and MegaRAID ('megaraid,<n>') devices are matched against a different
# line. Both alternatives must be real comparisons: a bare string on the
# right-hand side of 'or' is always true and would select this branch for
# ATA disks as well.
if ($interface eq 'scsi' or $interface =~ /\Amegaraid,/){
    $line_str = 'SMART Health Status: '; # SCSI OR MEGARAID SMART line
    $ok_str = 'OK'; #SCSI OR MEGARAID SMART OK string
}

foreach my $line (@output){
    # \Q..\E keeps the literal text from being read as regex syntax; trailing
    # whitespace is excluded from the captured status.
    next unless $line =~ /\Q$line_str\E(.+?)\s*$/;
    my $reported_status = $1;

    $found_status = 1;
    warn "(debug) parsing line:\n$line\n\n" if $opt_debug;
    if ($reported_status eq $ok_str) {
        warn "(debug) found string '$ok_str'; status OK\n\n" if $opt_debug;
    }
    else {
        warn "(debug) no '$ok_str' status; failing\n\n" if $opt_debug;
        push(@error_messages, "Health status: $reported_status");
        escalate_status('CRITICAL');
    }
}

# No health line at all is reported as UNKNOWN.
unless ($found_status) {
    push(@error_messages, 'No health status line found');
    escalate_status('UNKNOWN');
}
| 134 | + | |
| 135 | + | |
warn "###########################################################\n" if $opt_debug;
warn "(debug) CHECK 2: getting silent SMART health check\n" if $opt_debug;
warn "###########################################################\n\n\n" if $opt_debug;

$full_command = "$smart_command -d $interface -q silent -A $device";
warn "(debug) executing:\n$full_command\n\n" if $opt_debug;

system($full_command);
# $? is the raw wait status, not the exit code: -1 means the command could
# not be started, the low 7 bits hold a terminating signal, and the exit
# code of the child is in the high byte.
my ($wait_status, $exec_error) = ($?, "$!");
my $return_code = 0;
if ($wait_status == -1) {
    push(@error_messages, "Could not execute smartctl: $exec_error");
    escalate_status('UNKNOWN');
}
elsif ($wait_status & 0x7f) {
    push(@error_messages, 'smartctl terminated by signal '.($wait_status & 0x7f));
    escalate_status('UNKNOWN');
}
else {
    $return_code = $wait_status >> 8;
}
warn "(debug) exit code:\n$return_code\n\n" if $opt_debug;

# Exit code bit => [ message, status to escalate to ], checked in this order.
my @return_code_bits = (
    [0x01, 'Commandline parse failure',     'UNKNOWN'],
    [0x02, 'Device could not be opened',    'UNKNOWN'],
    [0x04, 'Checksum failure',              'WARNING'],
    [0x08, 'Disk is failing',               'CRITICAL'],
    [0x10, 'Disk is in prefail',            'WARNING'],
    [0x20, 'Disk may be close to failure',  'WARNING'],
    [0x40, 'Error log contains errors',     'WARNING'],
    [0x80, 'Self-test log contains errors', 'WARNING'],
);
my $recognized_bits = 0;
foreach my $bit_info (@return_code_bits) {
    my ($mask, $message, $status) = @{$bit_info};
    next unless $return_code & $mask;
    $recognized_bits++;
    push(@error_messages, $message);
    escalate_status($status);
}

# Defensive fallback: a non-zero code none of the known bits accounts for.
if ($return_code && !$recognized_bits) {
    push(@error_messages, 'Unknown return code');
    escalate_status('CRITICAL');
}

if ($wait_status) {
    warn "(debug) non-zero exit code, generating error condition\n\n" if $opt_debug;
}
else {
    warn "(debug) zero exit code, status OK\n\n" if $opt_debug;
}
| 190 | + | |
| 191 | + | |
warn "###########################################################\n" if $opt_debug;
warn "(debug) CHECK 3: getting detailed statistics\n" if $opt_debug;
warn "(debug) information contains a few more potential trouble spots\n" if $opt_debug;
warn "(debug) plus, we can also use the information for perfdata/graphing\n" if $opt_debug;
warn "###########################################################\n\n\n" if $opt_debug;

$full_command = "$smart_command -d $interface -A $device";
warn "(debug) executing:\n$full_command\n\n" if $opt_debug;
@output = `$full_command`;
warn "(debug) output:\n@output\n\n" if $opt_debug;
# "label=value" items that end up after the '|' in the plugin output.
my @perfdata = qw//;

# separate metric-gathering and output analysis for ATA vs SCSI SMART output
if ($interface eq 'ata'){
    foreach my $line(@output){
        # get lines that look like this:
        #    9 Power_On_Minutes        0x0032   241   241   000    Old_age   Always       -       113h+12m
        # captures: attribute name, WHEN_FAILED column, leading digits of the raw value
        next unless $line =~ /^\s*\d+\s(\S+)\s+(?:\S+\s+){6}(\S+)\s+(\d+)/;
        my ($attribute_name, $when_failed, $raw_value) = ($1, $2, $3);

        # anything but '-' in the WHEN_FAILED column is reported as a warning
        if ($when_failed ne '-'){
            push(@error_messages, "Attribute $attribute_name failed at $when_failed");
            escalate_status('WARNING');
            warn "(debug) parsed SMART attribute $attribute_name with error condition:\n$when_failed\n\n" if $opt_debug;
        }

        # some attributes produce questionable data; no need to graph them
        if (grep {$_ eq $attribute_name} ('Unknown_Attribute', 'Power_On_Minutes') ){
            next;
        }
        push (@perfdata, "$attribute_name=$raw_value");

        # do some manual checks
        if ( ($attribute_name eq 'Current_Pending_Sector') && $raw_value ) {
            push(@error_messages, "Sectors pending re-allocation");
            escalate_status('WARNING');
            warn "(debug) Current_Pending_Sector is non-zero ($raw_value)\n\n" if $opt_debug;
        }
    }
}
else{
    # Every other interface: pick individual values out of the -A output.
    my ($current_temperature, $max_temperature, $current_start_stop, $max_start_stop) = qw//;
    foreach my $line(@output){
        if ($line =~ /Current Drive Temperature:\s+(\d+)/){
            $current_temperature = $1;
        }
        elsif ($line =~ /Drive Trip Temperature:\s+(\d+)/){
            $max_temperature = $1;
        }
        elsif ($line =~ /Current start stop count:\s+(\d+)/){
            $current_start_stop = $1;
        }
        elsif ($line =~ /Recommended maximum start stop count:\s+(\d+)/){
            $max_start_stop = $1;
        }
        elsif ($line =~ /Elements in grown defect list:\s+(\d+)/){
            push (@perfdata, "defect_list=$1");
        }
        elsif ($line =~ /Blocks sent to initiator =\s+(\d+)/){
            push (@perfdata, "sent_blocks=$1");
        }
    }

    # 'defined' rather than truthiness: a parsed value of 0 is still a
    # measurement and must not be dropped from the perfdata.
    if(defined $current_temperature){
        if($max_temperature){
            # the trip temperature is emitted as the third perfdata field
            push (@perfdata, "temperature=$current_temperature;;$max_temperature");
            if($current_temperature > $max_temperature){
                warn "(debug) Disk temperature is greater than max ($current_temperature > $max_temperature)\n\n" if $opt_debug;
                push(@error_messages, 'Disk temperature is higher than maximum');
                escalate_status('CRITICAL');
            }
        }
        else{
            push (@perfdata, "temperature=$current_temperature");
        }
    }
    if(defined $current_start_stop){
        if($max_start_stop){
            # the recommended maximum is emitted as the second perfdata field
            push (@perfdata, "start_stop=$current_start_stop;$max_start_stop");
            if($current_start_stop > $max_start_stop){
                warn "(debug) Disk start_stop is greater than max ($current_start_stop > $max_start_stop)\n\n" if $opt_debug;
                push(@error_messages, 'Disk start_stop is higher than maximum');
                escalate_status('WARNING');
            }
        }
        else{
            push (@perfdata, "start_stop=$current_start_stop");
        }
    }
}
# Final report: one line of "<status text>|<perfdata>" on STDOUT, then exit
# with the numeric code that %ERRORS maps the final status name to.
if ($opt_debug) {
    warn "(debug) gathered perfdata:\n@perfdata\n\n";
}
my $perf_string = join(' ', @perfdata);

if ($opt_debug) {
    warn "###########################################################\n";
    warn "(debug) FINAL STATUS: $exit_status\n";
    warn "###########################################################\n\n\n";
    warn "(debug) final status/output:\n";
}

# Either the fixed OK text, or the status name followed by every problem
# that was collected along the way.
my $status_string = ($exit_status eq 'OK')
    ? "OK: no SMART errors detected"
    : "$exit_status: ".join(', ', @error_messages);

print "$status_string|$perf_string\n";
exit $ERRORS{$exit_status};
| 299 | + | |
# Print the plugin revision, a usage synopsis and one line per option to
# STDOUT, then call support() from the imported utils module.
# Takes no arguments; the return value is not meaningful.
sub print_help {
    print_revision($basename,$revision);
    # The synopsis lists every accepted interface and the megaraid-only
    # --number option, matching what the argument validation accepts.
    print "Usage: $basename (--device=<SMART device> --interface=(ata|scsi|megaraid) [--number=<disk number>]|-h|-v) [--debug]\n";
    print " --debug: show debugging information\n";
    print " -d/--device: a device to be SMART monitored, eg /dev/sda\n";
    print " -i/--interface: ata, scsi, megaraid, depending upon the device's interface type\n";
    print " -n/--number: physical disk number within the MegaRAID controller (required with --interface=megaraid)\n";
    print " -h/--help: this help\n";
    print " -v/--version: Version number\n";
    support();
}
| 311 | + | |
# Set the global exit status to the requested one, unless the status already
# recorded outranks it: WARNING never replaces CRITICAL, and UNKNOWN never
# replaces WARNING or CRITICAL. Any other requested status (CRITICAL
# included) is applied unconditionally.
sub escalate_status {
    my ($requested_status) = @_;

    # requested status => current statuses that must be left untouched
    my %kept_over = (
        'WARNING' => ['CRITICAL'],
        'UNKNOWN' => ['WARNING', 'CRITICAL'],
    );

    my $protected = $kept_over{$requested_status} || [];
    return if grep { $exit_status eq $_ } @{$protected};

    $exit_status = $requested_status;
}
| 0 | 325 |