Update check-netapp-ng.pl

Hello Ran

This is Laurent DUFOUR (laurent.dufour@havas.com) from Paris France

I propose a small change to the handling of timeouts in this check-netapp-ng script, in order to avoid the message "Return code of 142 is out of bounds" from Nagios. In fact, we have to deal with two types of timeouts, as I explain below. Feel free to contact me if you need more explanation.

CLARIFICATION FOR TIMEOUTS
There are multiple timeouts we depend on:

Perl plugins timeout (utils.pm) --- > $TIMEOUT --- > originally 15 sec --> recommendation: raise it to 180 sec
  
Net::SNMP timeout --- > Used in Net::SNMP->session --- > originally 5 sec --> recommendation: raise it to 60 sec
Beware that the maximum value is 60 seconds. If it is set higher, you get the error message "Can't create snmp session".

Do not forget that in Nagios you also need to increase service_check_timeout to a value above $TIMEOUT_PLUGINS:
Nagios service check timeout (nagios.cfg) --- > service_check_timeout=240 --- > originally 30 sec
This commit is contained in:
dufourl 2017-08-07 15:27:37 +02:00 committed by GitHub
parent fc9c1902ab
commit c9271972a6
1 changed files with 769 additions and 740 deletions

View File

@ -30,8 +30,21 @@
## AUTOSUPPORTSTATUS|NFSOPS|
## CIFSOPS|SHELFINFO|...
##
#####################################
#####################################
##
##
## CLARIFICATION FOR TIMEOUTS
## There are multiple timeouts we depend on:
##
## Perl plugins timeout (utils.pm) --- > $TIMEOUT --- > originally 15 sec --> recommendation: raise it to 180 sec
##
## Net::SNMP timeout --- > Used in Net::SNMP->session --- > originally 5 sec --> recommendation: raise it to 60 sec
## Beware that the maximum value is 60 seconds. If it is set higher, you get the error message "Can't create snmp session".
##
## Do not forget that in Nagios you also need to increase service_check_timeout to a value above $TIMEOUT_PLUGINS:
## Nagios service check timeout (nagios.cfg) --- > service_check_timeout=240 --- > originally 30 sec
##
####################################################################################################################################################
####################################################################################################################################################
use strict;
@ -47,13 +60,16 @@ use Getopt::Long;
use Time::Local;
use IPC::Cmd qw(run_forked);
Getopt::Long::Configure('bundling');
my $TIMEOUT_PLUGINS=$TIMEOUT;
my $stat = 0;
my $msg;
my $perf;
my $script_name = basename($0);
my $script_version = 1.3;
# Version shown in the usage text ("This is <script> in version <version>.").
# Quoted on purpose: a bare literal with two or more dots (1.3.1) is parsed by
# Perl as a v-string, i.e. the three characters chr(1).chr(3).chr(1), which
# would be printed as control characters instead of the text "1.3.1".
my $script_version = '1.3.1';
my $counterFilePath="/tmp";
my $counterFile;
@ -227,7 +243,7 @@ my %nvramBatteryStatus = (
5 => 'near end of life',
6 => 'at end of life',
7 => 'unknown',
);
);
my %GlobalStatusIndex = (
1 => 'other',
2 => 'unknown',
@ -235,7 +251,7 @@ my %GlobalStatusIndex = (
4 => 'nonCritical',
5 => 'critical',
6 => 'nonRecoverable',
);
);
my %AutoSupportStatusIndex = (
1 => 'ok',
@ -243,7 +259,7 @@ my %AutoSupportStatusIndex = (
3 => 'postFailure',
4 => 'smtpPostFailure',
5 => 'unknown',
);
);
my %cfSettingsIndex = (
1 => 'notConfigured',
@ -251,7 +267,7 @@ my %cfSettingsIndex = (
3 => 'disabled',
4 => 'takeoverByPartnerDisabled',
5 => 'thisNodeDead',
);
);
my %cfStateIndex = (
@ -259,7 +275,7 @@ my %cfStateIndex = (
2 => 'canTakeover',
3 => 'cannotTakeover',
4 => 'takeover',
);
);
my %cfCannotTakeoverCauseIndex = (
1 => 'ok',
@ -280,20 +296,20 @@ my %cfCannotTakeoverCauseIndex = (
16 => 'alreadyInTakenoverMode',
17 => 'nvramLogUnsynchronized',
18 => 'backupMailboxProblems',
);
);
my %cfPartnerStatusIndex = (
1 => 'maybeDown',
2 => 'ok',
3 => 'dead',
);
);
my %cfInterconnectStatusIndex = (
1 => 'notPresent',
2 => 'down',
3 => 'partialFailure',
4 => 'up',
);
);
my %EcnlStatusIndex = (
1 => 'initializing',
@ -302,13 +318,13 @@ my %EcnlStatusIndex = (
4 => 'inactive',
5 => 'reconfiguring',
6 => 'nonexistent',
);
);
my %fsOverallStatusIndex = (
1 => 'ok',
2 => 'Nearly Full',
3 => 'Full',
);
);
### Functions
###############
@ -338,10 +354,8 @@ sub _create_session(@) {
sub FSyntaxError($) {
my $err = shift;
print <<EOU;
$err
This is $script_name in version $script_version.
$err
This is $script_name in version $script_version.
Syntax:
-H <IP_or_Hostname> Ip/Dns Name of the Filer
-C <community_name> SNMP Community Name for read
@ -356,7 +370,6 @@ This is $script_name in version $script_version.
-e <vol1[,vol2[,...]]> Exclude volumes from snap check (SNAPSHOT/SNAPSHOTAGE)
-I Inform only, return OK every time (ignore -w and -c values)
-h This help
Available check types:
TEMP - Temperature
FAN - Fan Fail
@ -382,20 +395,15 @@ This is $script_name in version $script_version.
UPTIME - Only show\'s uptime
CACHEAGE - Cache Age (-w -c)
FSSTATUS - Overall file system health
Examples:
$script_name -H netapp.mydomain -C public -T UPTIME
UPTIME: 2 days, 23:03:21.09 | uptime=255801s
$script_name -H netapp.mydomain -C public -T DISKUSED -v /vol/data/ -w 90 -c 95 -V 2c
OK: DISKUSED 79% | /vol/data/=8104595240k
$script_name -H netapp.mydomain -C public -T GLOBALSTATUS
CRIT: GLOBALSTATUS nonCritical 4 Disk on adapter 1a, shelf 1, bay 9, failed. | globalstatus=4
$script_name -H netapp.mydomain -C public -T DISKUSED -v wtf
WARN: Unknown volume path or aggregate name 'wtf'. Available values: aggr_p1a_sas2_mirror /vol/vol0/ /vol/esx/ /vol/xen_a/
EOU
exit($ERRORS{'UNKNOWN'});
}
@ -493,6 +501,8 @@ $opt{'crit'} = 500;
$opt{'warn'} = 500;
$opt{'version'} = 2;
$opt{'timeout'} = 60;
$TIMEOUT_PLUGINS = 180 ;
my $result = GetOptions(\%opt,
'filer|H=s',
'community|C=s',
@ -507,6 +517,17 @@ my $result = GetOptions(\%opt,
"help|h",
);
# The plugin-level alarm always honours the full timeout received on the
# command line (or its default).
$TIMEOUT_PLUGINS = $opt{'timeout'};

# The SNMP session timeout is capped at its maximum of 60 seconds; larger
# values make session creation fail ("Can't create snmp session").
$opt{'timeout'} = 60 if $opt{'timeout'} > 60;
FSyntaxError("") if defined $opt{'help'};
FSyntaxError("Missing -H") unless defined $opt{'filer'};
FSyntaxError("Missing -C") unless defined $opt{'community'};
@ -536,9 +557,16 @@ if (!defined($counterFilePath)) {
# Never let a stuck check hang Nagios: when the alarm fires, print a readable
# "No response in time" result and exit with the CRITICAL status, rather than
# letting Nagios report "Return code of 142 is out of bounds".
$SIG{ALRM} = sub {
    print "CRITICAL: No response in time\n";
    exit $ERRORS{'CRITICAL'};
};
# Starting Alarm
alarm($TIMEOUT);
alarm($TIMEOUT_PLUGINS);
# Establish SNMP Session
our $snmp_session = _create_session($opt{'filer'},$opt{'community'},$opt{'version'},$opt{'timeout'});
@ -661,7 +689,7 @@ if("$opt{'check_type'}" eq "TEMP") {
$msg = "CRIT: Over $opt{'check_type'} !";
}
$perf = "overtemperature=$check";
### Fan ###
### Fan ###
} elsif("$opt{'check_type'}" eq "FAN") {
my $check = _get_oid_value($snmp_session,$snmpFailedFanCount);
if($check == 0) {
@ -672,7 +700,7 @@ if("$opt{'check_type'}" eq "TEMP") {
$msg = "CRIT: $opt{'check_type'} $check !";
}
$perf = "failedfans=$check";
### PS ###
### PS ###
} elsif("$opt{'check_type'}" eq "PS") {
my $check = _get_oid_value($snmp_session,$snmpFailPowerSupplyCount);
if($check == 0) {
@ -683,12 +711,12 @@ if("$opt{'check_type'}" eq "TEMP") {
$msg = "CRIT: $opt{'check_type'} Fail $check !";
}
$perf = "failedpowersupplies=$check";
### CPULOAD ###
### CPULOAD ###
} elsif("$opt{'check_type'}" eq "CPULOAD") {
my $check = _get_oid_value($snmp_session,$snmpcpuBusyTimePerCent);
($msg,$stat) = _clac_err_stat($check,$opt{'check_type'},$opt{'warn'},$opt{'crit'});
$perf = "cpuload=$check\%;$opt{'warn'};$opt{'crit'};;";
### NFSOPS ###
### NFSOPS ###
} elsif("$opt{'check_type'}" eq "NFSOPS") {
my $nfsops_per_seconds=floor ( ($total_nfs_ops-$fileNfsOps)/$elapsedtime );
@ -696,7 +724,7 @@ if("$opt{'check_type'}" eq "TEMP") {
($msg,$stat) = _clac_absolute_err_stat($check,$opt{'check_type'},$opt{'warn'},$opt{'crit'});
$perf = "nfsops=$check";
### CIFSOPS ###
### CIFSOPS ###
} elsif("$opt{'check_type'}" eq "CIFSOPS") {
my $cifsops_per_seconds=floor ( ($total_cifs_ops-$fileCifsOps)/$elapsedtime );
@ -704,7 +732,7 @@ if("$opt{'check_type'}" eq "TEMP") {
($msg,$stat) = _clac_absolute_err_stat($check,$opt{'check_type'},$opt{'warn'},$opt{'crit'});
$perf = "cifsops=$check";
### ISCSIOPS ###
### ISCSIOPS ###
} elsif("$opt{'check_type'}" eq "ISCSIOPS") {
my $iscsiops_per_seconds=floor ( ($blocks_iscsi_ops-$fileIscsiOps)/$elapsedtime );
my $iscsiread_per_seconds=floor ( ($blocks_iscsi_read-$fileIscsi64ReadBytes)/$elapsedtime );
@ -716,7 +744,7 @@ if("$opt{'check_type'}" eq "TEMP") {
($msg,$stat) = _clac_absolute_err_stat($check,$opt{'check_type'},$opt{'warn'},$opt{'crit'});
$msg = "$msg ops/s (iscsi read=$iscsiread_per_seconds B/s, iscsi write=$iscsiwrite_per_seconds B/s, disk read=$diskread_per_seconds B/s, disk write=$diskwrite_per_seconds B/s)";
$perf = "iscsiops=$check iscsiread=$iscsiread_per_seconds iscsiwrite=$iscsiwrite_per_seconds diskread=$diskread_per_seconds diskwrite=$diskwrite_per_seconds";
### FCPOPS ###
### FCPOPS ###
} elsif("$opt{'check_type'}" eq "FCPOPS") {
my $fcpops_per_seconds=floor ( ($blocks_fcp_ops-$fileFcpOps)/$elapsedtime );
my $fcpread_per_seconds=floor ( ($blocks_fcp_read-$fileFcp64ReadBytes)/$elapsedtime );
@ -729,7 +757,7 @@ if("$opt{'check_type'}" eq "TEMP") {
($msg,$stat) = _clac_absolute_err_stat($check,$opt{'check_type'},$opt{'warn'},$opt{'crit'});
$msg = "$msg ops/s (fcp read=$fcpread_per_seconds B/s, fcp write=$fcpwrite_per_seconds B/s, disk read=$diskread_per_seconds B/s, disk write=$diskwrite_per_seconds B/s)";
$perf = "fcpops=$check fcpread=$fcpread_per_seconds fcpwrite=$fcpwrite_per_seconds diskread=$diskread_per_seconds diskwrite=$diskwrite_per_seconds";
### NVRAM ###
### NVRAM ###
} elsif("$opt{'check_type'}" eq "NVRAM") {
my $check = _get_oid_value($snmp_session,$snmpnvramBatteryStatus);
if($check == 1) {
@ -740,7 +768,7 @@ if("$opt{'check_type'}" eq "TEMP") {
$msg = "CRIT: $opt{'check_type'} $nvramBatteryStatus{$check}";
}
$perf = "nvrambatterystatus=$check";
### DISKUSED ###
### DISKUSED ###
} elsif("$opt{'check_type'}" eq "DISKUSED") {
FSyntaxError("Missing -v") unless defined $opt{'vol'};
@ -779,7 +807,7 @@ if("$opt{'check_type'}" eq "TEMP") {
$msg .= " $$r_vol_tbl{$key}"
}
}
### SNAPSHOTAGE ###
### SNAPSHOTAGE ###
} elsif("$opt{'check_type'}" eq "SNAPSHOTAGE") {
my @exc_list = split(',',$opt{'exclude'});
@ -885,9 +913,9 @@ if("$opt{'check_type'}" eq "TEMP") {
$stat = $ERRORS{'UNKNOWN'};
$msg = "UNKNOW Errors";
}
$perf = "outdated_snapshots=$badcount";
$perf = "outdated_snapshots=$badcount";
### SNAPSHOT ###
### SNAPSHOT ###
} elsif("$opt{'check_type'}" eq "SNAPSHOT") {
my @exc_list = split(',',$opt{'exclude'});
my @vol_err;
@ -923,7 +951,7 @@ $perf = "outdated_snapshots=$badcount";
}
$perf = "snapoff=$err_count";
### FAILEDDISK ###
### FAILEDDISK ###
} elsif("$opt{'check_type'}" eq "FAILEDDISK") {
my $check = _get_oid_value($snmp_session,$snmpFailedDiskCount);
if($check == 0) {
@ -935,7 +963,7 @@ $perf = "outdated_snapshots=$badcount";
}
$perf = "faileddisks=$check";
### DISKSUMMARY ###
### DISKSUMMARY ###
} elsif("$opt{'check_type'}" eq "DISKSUMMARY") {
my $diskTotal = _get_oid_value($snmp_session,$snmp_netapp_disksummary_diskTotalCount);
my $diskActive = _get_oid_value($snmp_session,$snmp_netapp_disksummary_diskActiveCount);
@ -958,7 +986,7 @@ $perf = "outdated_snapshots=$badcount";
}
$perf = "faileddisks=$check total=$diskTotal active=$diskActive spare=$diskSpare reconstructing=$diskReconstructing";
### HA ###
### HA ###
} elsif("$opt{'check_type'}" eq "HA") {
my $cfSettings = _get_oid_value($snmp_session,$snmp_netapp_cfSettings);
@ -991,18 +1019,18 @@ $perf = "outdated_snapshots=$badcount";
$perf = "hasettings=$check";
### UPTIME ###
### UPTIME ###
} elsif("$opt{'check_type'}" eq "UPTIME") {
# Fetch the uptime string (e.g. "2 days, 23:03:21.09") and report it verbatim.
my $check = _get_oid_value($snmp_session,$snmpUpTime);
$msg = "$opt{'check_type'}: $check";
# Derive perfdata (uptime in whole seconds) only when the string really has the
# "<N> day(s), HH:MM:SS" shape. Using $1..$4 after a failed match would read
# stale or undefined captures and emit a bogus uptime value.
# NOTE(review): the singular "day" is accepted on the assumption that the SNMP
# layer prints "1 day, ..." for exactly one day; uptimes below one day are
# presumably formatted differently and are reported without perfdata. Confirm
# against _get_oid_value.
if (defined $check
    and $check =~ m/^\s*(\d+)\s+days?,\s+(\d+):(\d+):(\d+).*$/) {
    $perf = "uptime=" . ($1*86400 + $2*3600 + $3*60 + $4) . "s";
}
### CACHEAGE ###
### CACHEAGE ###
} elsif("$opt{'check_type'}" eq "CACHEAGE") {
my $check = _get_oid_value($snmp_session,$snmpCacheAge);
($msg,$stat) = _clac_minutes_err_stat($check,$opt{'check_type'},$opt{'warn'},$opt{'crit'});
$perf = "cache_age=$check";
### GLOBALSTATUS ###
### GLOBALSTATUS ###
} elsif("$opt{'check_type'}" eq "GLOBALSTATUS") {
my $check = _get_oid_value($snmp_session,$snmpGlobalStatus);
my $global_stat_txt = _get_oid_value($snmp_session,$snmpGlobalStatus_text);
@ -1014,7 +1042,7 @@ $perf = "outdated_snapshots=$badcount";
$msg = "CRIT: $opt{'check_type'} $GlobalStatusIndex{$check} $check $global_stat_txt";
}
$perf = "globalstatus=$check";
### AUTOSUPPORTSTATUS ###
### AUTOSUPPORTSTATUS ###
} elsif("$opt{'check_type'}" eq "AUTOSUPPORTSTATUS") {
my $check = _get_oid_value($snmp_session,$snmpAutoSupportStatus);
my $autosupport_stat_txt = _get_oid_value($snmp_session,$snmpAutoSupportStatus_text);
@ -1026,17 +1054,17 @@ $perf = "outdated_snapshots=$badcount";
$msg = "CRIT: $opt{'check_type'} $AutoSupportStatusIndex{$check} $check $autosupport_stat_txt";
}
$perf = "autosupportstatus=$check";
### NDMPSESSIONS ###
### NDMPSESSIONS ###
} elsif("$opt{'check_type'}" eq "NDMPSESSIONS") {
my $check = _get_oid_value($snmp_session,$snmpNdmpSessions);
($msg,$stat) = _clac_absolute_err_stat($check,$opt{'check_type'},$opt{'warn'},$opt{'crit'});
$perf = "ndmpsess=$check";
### CIFSSESSIONS ###
### CIFSSESSIONS ###
} elsif("$opt{'check_type'}" eq "CIFSSESSIONS") {
my $check = _get_oid_value($snmp_session,$snmpCifsSessions);
($msg,$stat) = _clac_absolute_err_stat($check,$opt{'check_type'},$opt{'warn'},$opt{'crit'});
$perf = "cifssess=$check";
### SHELF ###
### SHELF ###
} elsif ( ("$opt{'check_type'}" eq "SHELF") or ("$opt{'check_type'}" eq "SHELFINFO") ) {
my @errs;
my $r_shelf = $snmp_session->get_table($snmpEnclTableIndex);
@ -1124,7 +1152,7 @@ $perf = "outdated_snapshots=$badcount";
else
{ $perf = "shelf=1"; }
}
### FSSTATUS ###
### FSSTATUS ###
} elsif("$opt{'check_type'}" eq "FSSTATUS") {
my $check = _get_oid_value($snmp_session,$snmpfsOverallStatus);
my $global_stat_txt = _get_oid_value($snmp_session,$snmpfsOverallStatus_text);
@ -1140,7 +1168,7 @@ $perf = "outdated_snapshots=$badcount";
}
$perf = "fsstatus=$check";
### Syntax Error ###
### Syntax Error ###
} else {
FSyntaxError("$opt{'check_type'} invalid parameter !");
}
@ -1149,3 +1177,4 @@ $msg =~ s/\n//g;
$perf ? print "$msg | $perf\n" : print "$msg\n";
exit($stat);