mirror of
https://github.com/ranl/monitor-utils.git
synced 2024-11-04 15:33:42 +01:00
Update check-netapp-ng.pl
Hello Ran, this is Laurent DUFOUR (laurent.dufour@havas.com) from Paris, France. I propose a small change to the handling of timeouts in this check-netapp-ng script, in order to avoid the message "Return code of 142 is out of bounds" from Nagios. In fact, we have to deal with two types of timeouts, as I explain below; feel free to contact me if you need more explanation. CLARIFICATION FOR TIMEOUTS: There are multiple timeouts we depend on. Perl plugins timeout (utils.pm) ---> $TIMEOUT ---> originally 15 sec --> recommendation: raise it to 180. Net::SNMP timeout ---> used in Net::SNMP->session ---> originally 5 sec --> recommendation: raise it to 60. Beware that the maximum value is 60 seconds; if it is set higher, you get the error message "Can't create snmp session". Do not forget that in Nagios you need to increase service_check_timeout to a value above $TIMEOUT_PLUGINS. Nagios service check timeout (nagios.cfg) ---> service_check_timeout=240 ---> originally 30 sec.
This commit is contained in:
parent
fc9c1902ab
commit
c9271972a6
@ -30,8 +30,21 @@
|
||||
## AUTOSUPPORTSTATUS|NFSOPS|
|
||||
## CIFSOPS|SHELFINFO|...
|
||||
##
|
||||
#####################################
|
||||
#####################################
|
||||
##
|
||||
##
|
||||
## CLARIFICATION FOR TIMEOUTS
|
||||
## There are multiple timeouts we depend on
|
||||
##
|
||||
## Perl plugins timeout (utils.pm)--- > $TIMEOUT --- > originally 15 sec --> recommendation to raise it to 180
|
||||
##
|
||||
## Net::SNMP timeout --- > Used in Net::SNMP->session --- > originally 5 sec --> recommendation to raise it to 60
|
||||
## Beware that the max value is 60 seconds. If it is set higher, you get the error message "Can't create snmp session"
|
||||
##
|
||||
## Do not forget that in nagios you need to increase service_check_timeout to a value above $TIMEOUT_PLUGINS
|
||||
## Nagios service check timeout (nagios.cfg) --- > service_check_timeout=240 --- > originally 30 sec
|
||||
##
|
||||
####################################################################################################################################################
|
||||
####################################################################################################################################################
|
||||
|
||||
|
||||
use strict;
|
||||
@ -47,13 +60,16 @@ use Getopt::Long;
|
||||
use Time::Local;
|
||||
use IPC::Cmd qw(run_forked);
|
||||
|
||||
|
||||
|
||||
Getopt::Long::Configure('bundling');
|
||||
|
||||
my $TIMEOUT_PLUGINS=$TIMEOUT;
|
||||
my $stat = 0;
|
||||
my $msg;
|
||||
my $perf;
|
||||
my $script_name = basename($0);
|
||||
my $script_version = 1.3;
|
||||
my $script_version = 1.3.1;
|
||||
|
||||
my $counterFilePath="/tmp";
|
||||
my $counterFile;
|
||||
@ -227,7 +243,7 @@ my %nvramBatteryStatus = (
|
||||
5 => 'near end of life',
|
||||
6 => 'at end of life',
|
||||
7 => 'unknown',
|
||||
);
|
||||
);
|
||||
my %GlobalStatusIndex = (
|
||||
1 => 'other',
|
||||
2 => 'unknown',
|
||||
@ -235,7 +251,7 @@ my %GlobalStatusIndex = (
|
||||
4 => 'nonCritical',
|
||||
5 => 'critical',
|
||||
6 => 'nonRecoverable',
|
||||
);
|
||||
);
|
||||
|
||||
my %AutoSupportStatusIndex = (
|
||||
1 => 'ok',
|
||||
@ -243,7 +259,7 @@ my %AutoSupportStatusIndex = (
|
||||
3 => 'postFailure',
|
||||
4 => 'smtpPostFailure',
|
||||
5 => 'unknown',
|
||||
);
|
||||
);
|
||||
|
||||
my %cfSettingsIndex = (
|
||||
1 => 'notConfigured',
|
||||
@ -251,7 +267,7 @@ my %cfSettingsIndex = (
|
||||
3 => 'disabled',
|
||||
4 => 'takeoverByPartnerDisabled',
|
||||
5 => 'thisNodeDead',
|
||||
);
|
||||
);
|
||||
|
||||
|
||||
my %cfStateIndex = (
|
||||
@ -259,7 +275,7 @@ my %cfStateIndex = (
|
||||
2 => 'canTakeover',
|
||||
3 => 'cannotTakeover',
|
||||
4 => 'takeover',
|
||||
);
|
||||
);
|
||||
|
||||
my %cfCannotTakeoverCauseIndex = (
|
||||
1 => 'ok',
|
||||
@ -280,20 +296,20 @@ my %cfCannotTakeoverCauseIndex = (
|
||||
16 => 'alreadyInTakenoverMode',
|
||||
17 => 'nvramLogUnsynchronized',
|
||||
18 => 'backupMailboxProblems',
|
||||
);
|
||||
);
|
||||
|
||||
my %cfPartnerStatusIndex = (
|
||||
1 => 'maybeDown',
|
||||
2 => 'ok',
|
||||
3 => 'dead',
|
||||
);
|
||||
);
|
||||
|
||||
my %cfInterconnectStatusIndex = (
|
||||
1 => 'notPresent',
|
||||
2 => 'down',
|
||||
3 => 'partialFailure',
|
||||
4 => 'up',
|
||||
);
|
||||
);
|
||||
|
||||
my %EcnlStatusIndex = (
|
||||
1 => 'initializing',
|
||||
@ -302,13 +318,13 @@ my %EcnlStatusIndex = (
|
||||
4 => 'inactive',
|
||||
5 => 'reconfiguring',
|
||||
6 => 'nonexistent',
|
||||
);
|
||||
);
|
||||
|
||||
my %fsOverallStatusIndex = (
|
||||
1 => 'ok',
|
||||
2 => 'Nearly Full',
|
||||
3 => 'Full',
|
||||
);
|
||||
);
|
||||
|
||||
### Functions
|
||||
###############
|
||||
@ -338,10 +354,8 @@ sub _create_session(@) {
|
||||
sub FSyntaxError($) {
|
||||
my $err = shift;
|
||||
print <<EOU;
|
||||
$err
|
||||
|
||||
This is $script_name in version $script_version.
|
||||
|
||||
$err
|
||||
This is $script_name in version $script_version.
|
||||
Syntax:
|
||||
-H <IP_or_Hostname> Ip/Dns Name of the Filer
|
||||
-C <community_name> SNMP Community Name for read
|
||||
@ -356,7 +370,6 @@ This is $script_name in version $script_version.
|
||||
-e <vol1[,vol2[,...]]> Exclude volumes from snap check (SNAPSHOT/SNAPSHOTAGE)
|
||||
-I Inform only, return OK every time (ignore -w and -c values)
|
||||
-h This help
|
||||
|
||||
Available check types:
|
||||
TEMP - Temperature
|
||||
FAN - Fan Fail
|
||||
@ -382,20 +395,15 @@ This is $script_name in version $script_version.
|
||||
UPTIME - Only show\'s uptime
|
||||
CACHEAGE - Cache Age (-w -c)
|
||||
FSSTATUS - Overall file system health
|
||||
|
||||
Examples:
|
||||
$script_name -H netapp.mydomain -C public -T UPTIME
|
||||
UPTIME: 2 days, 23:03:21.09 | uptime=255801s
|
||||
|
||||
$script_name -H netapp.mydomain -C public -T DISKUSED -v /vol/data/ -w 90 -c 95 -V 2c
|
||||
OK: DISKUSED 79% | /vol/data/=8104595240k
|
||||
|
||||
$script_name -H netapp.mydomain -C public -T GLOBALSTATUS
|
||||
CRIT: GLOBALSTATUS nonCritical 4 Disk on adapter 1a, shelf 1, bay 9, failed. | globalstatus=4
|
||||
|
||||
$script_name -H netapp.mydomain -C public -T DISKUSED -v wtf
|
||||
WARN: Unknown volume path or aggregate name 'wtf'. Available values: aggr_p1a_sas2_mirror /vol/vol0/ /vol/esx/ /vol/xen_a/
|
||||
|
||||
EOU
|
||||
exit($ERRORS{'UNKNOWN'});
|
||||
}
|
||||
@ -493,6 +501,8 @@ $opt{'crit'} = 500;
|
||||
$opt{'warn'} = 500;
|
||||
$opt{'version'} = 2;
|
||||
$opt{'timeout'} = 60;
|
||||
$TIMEOUT_PLUGINS = 180 ;
|
||||
|
||||
my $result = GetOptions(\%opt,
|
||||
'filer|H=s',
|
||||
'community|C=s',
|
||||
@ -507,6 +517,17 @@ my $result = GetOptions(\%opt,
|
||||
"help|h",
|
||||
);
|
||||
|
||||
if ( $opt{'timeout'} > 60)
|
||||
{
|
||||
#Set timeout for plugin to the parameter received via command line, but set snmp timeout to the max (60 seconds) if CLI timeout is above 60 seconds
|
||||
$TIMEOUT_PLUGINS=$opt{'timeout'};
|
||||
$opt{'timeout'}=60;
|
||||
} else
|
||||
{
|
||||
$TIMEOUT_PLUGINS=$opt{'timeout'};
|
||||
}
|
||||
|
||||
|
||||
FSyntaxError("") if defined $opt{'help'};
|
||||
FSyntaxError("Missing -H") unless defined $opt{'filer'};
|
||||
FSyntaxError("Missing -C") unless defined $opt{'community'};
|
||||
@ -536,9 +557,16 @@ if (!defined($counterFilePath)) {
|
||||
|
||||
|
||||
|
||||
# Just in case of problems, let's not hang Nagios
|
||||
# with "Return code of 142 is out of bounds", instead we set the message "No response in time"
|
||||
|
||||
$SIG{'ALRM'} = sub {
|
||||
print ("CRITICAL: No response in time\n");
|
||||
exit $ERRORS{"CRITICAL"};
|
||||
};
|
||||
|
||||
# Starting Alarm
|
||||
alarm($TIMEOUT);
|
||||
alarm($TIMEOUT_PLUGINS);
|
||||
|
||||
# Establish SNMP Session
|
||||
our $snmp_session = _create_session($opt{'filer'},$opt{'community'},$opt{'version'},$opt{'timeout'});
|
||||
@ -661,7 +689,7 @@ if("$opt{'check_type'}" eq "TEMP") {
|
||||
$msg = "CRIT: Over $opt{'check_type'} !";
|
||||
}
|
||||
$perf = "overtemperature=$check";
|
||||
### Fan ###
|
||||
### Fan ###
|
||||
} elsif("$opt{'check_type'}" eq "FAN") {
|
||||
my $check = _get_oid_value($snmp_session,$snmpFailedFanCount);
|
||||
if($check == 0) {
|
||||
@ -672,7 +700,7 @@ if("$opt{'check_type'}" eq "TEMP") {
|
||||
$msg = "CRIT: $opt{'check_type'} $check !";
|
||||
}
|
||||
$perf = "failedfans=$check";
|
||||
### PS ###
|
||||
### PS ###
|
||||
} elsif("$opt{'check_type'}" eq "PS") {
|
||||
my $check = _get_oid_value($snmp_session,$snmpFailPowerSupplyCount);
|
||||
if($check == 0) {
|
||||
@ -683,12 +711,12 @@ if("$opt{'check_type'}" eq "TEMP") {
|
||||
$msg = "CRIT: $opt{'check_type'} Fail $check !";
|
||||
}
|
||||
$perf = "failedpowersupplies=$check";
|
||||
### CPULOAD ###
|
||||
### CPULOAD ###
|
||||
} elsif("$opt{'check_type'}" eq "CPULOAD") {
|
||||
my $check = _get_oid_value($snmp_session,$snmpcpuBusyTimePerCent);
|
||||
($msg,$stat) = _clac_err_stat($check,$opt{'check_type'},$opt{'warn'},$opt{'crit'});
|
||||
$perf = "cpuload=$check\%;$opt{'warn'};$opt{'crit'};;";
|
||||
### NFSOPS ###
|
||||
### NFSOPS ###
|
||||
} elsif("$opt{'check_type'}" eq "NFSOPS") {
|
||||
my $nfsops_per_seconds=floor ( ($total_nfs_ops-$fileNfsOps)/$elapsedtime );
|
||||
|
||||
@ -696,7 +724,7 @@ if("$opt{'check_type'}" eq "TEMP") {
|
||||
|
||||
($msg,$stat) = _clac_absolute_err_stat($check,$opt{'check_type'},$opt{'warn'},$opt{'crit'});
|
||||
$perf = "nfsops=$check";
|
||||
### CIFSOPS ###
|
||||
### CIFSOPS ###
|
||||
} elsif("$opt{'check_type'}" eq "CIFSOPS") {
|
||||
my $cifsops_per_seconds=floor ( ($total_cifs_ops-$fileCifsOps)/$elapsedtime );
|
||||
|
||||
@ -704,7 +732,7 @@ if("$opt{'check_type'}" eq "TEMP") {
|
||||
|
||||
($msg,$stat) = _clac_absolute_err_stat($check,$opt{'check_type'},$opt{'warn'},$opt{'crit'});
|
||||
$perf = "cifsops=$check";
|
||||
### ISCSIOPS ###
|
||||
### ISCSIOPS ###
|
||||
} elsif("$opt{'check_type'}" eq "ISCSIOPS") {
|
||||
my $iscsiops_per_seconds=floor ( ($blocks_iscsi_ops-$fileIscsiOps)/$elapsedtime );
|
||||
my $iscsiread_per_seconds=floor ( ($blocks_iscsi_read-$fileIscsi64ReadBytes)/$elapsedtime );
|
||||
@ -716,7 +744,7 @@ if("$opt{'check_type'}" eq "TEMP") {
|
||||
($msg,$stat) = _clac_absolute_err_stat($check,$opt{'check_type'},$opt{'warn'},$opt{'crit'});
|
||||
$msg = "$msg ops/s (iscsi read=$iscsiread_per_seconds B/s, iscsi write=$iscsiwrite_per_seconds B/s, disk read=$diskread_per_seconds B/s, disk write=$diskwrite_per_seconds B/s)";
|
||||
$perf = "iscsiops=$check iscsiread=$iscsiread_per_seconds iscsiwrite=$iscsiwrite_per_seconds diskread=$diskread_per_seconds diskwrite=$diskwrite_per_seconds";
|
||||
### FCPOPS ###
|
||||
### FCPOPS ###
|
||||
} elsif("$opt{'check_type'}" eq "FCPOPS") {
|
||||
my $fcpops_per_seconds=floor ( ($blocks_fcp_ops-$fileFcpOps)/$elapsedtime );
|
||||
my $fcpread_per_seconds=floor ( ($blocks_fcp_read-$fileFcp64ReadBytes)/$elapsedtime );
|
||||
@ -729,7 +757,7 @@ if("$opt{'check_type'}" eq "TEMP") {
|
||||
($msg,$stat) = _clac_absolute_err_stat($check,$opt{'check_type'},$opt{'warn'},$opt{'crit'});
|
||||
$msg = "$msg ops/s (fcp read=$fcpread_per_seconds B/s, fcp write=$fcpwrite_per_seconds B/s, disk read=$diskread_per_seconds B/s, disk write=$diskwrite_per_seconds B/s)";
|
||||
$perf = "fcpops=$check fcpread=$fcpread_per_seconds fcpwrite=$fcpwrite_per_seconds diskread=$diskread_per_seconds diskwrite=$diskwrite_per_seconds";
|
||||
### NVRAM ###
|
||||
### NVRAM ###
|
||||
} elsif("$opt{'check_type'}" eq "NVRAM") {
|
||||
my $check = _get_oid_value($snmp_session,$snmpnvramBatteryStatus);
|
||||
if($check == 1) {
|
||||
@ -740,7 +768,7 @@ if("$opt{'check_type'}" eq "TEMP") {
|
||||
$msg = "CRIT: $opt{'check_type'} $nvramBatteryStatus{$check}";
|
||||
}
|
||||
$perf = "nvrambatterystatus=$check";
|
||||
### DISKUSED ###
|
||||
### DISKUSED ###
|
||||
} elsif("$opt{'check_type'}" eq "DISKUSED") {
|
||||
|
||||
FSyntaxError("Missing -v") unless defined $opt{'vol'};
|
||||
@ -779,7 +807,7 @@ if("$opt{'check_type'}" eq "TEMP") {
|
||||
$msg .= " $$r_vol_tbl{$key}"
|
||||
}
|
||||
}
|
||||
### SNAPSHOTAGE ###
|
||||
### SNAPSHOTAGE ###
|
||||
} elsif("$opt{'check_type'}" eq "SNAPSHOTAGE") {
|
||||
|
||||
my @exc_list = split(',',$opt{'exclude'});
|
||||
@ -885,9 +913,9 @@ if("$opt{'check_type'}" eq "TEMP") {
|
||||
$stat = $ERRORS{'UNKNOWN'};
|
||||
$msg = "UNKNOW Errors";
|
||||
}
|
||||
$perf = "outdated_snapshots=$badcount";
|
||||
$perf = "outdated_snapshots=$badcount";
|
||||
|
||||
### SNAPSHOT ###
|
||||
### SNAPSHOT ###
|
||||
} elsif("$opt{'check_type'}" eq "SNAPSHOT") {
|
||||
my @exc_list = split(',',$opt{'exclude'});
|
||||
my @vol_err;
|
||||
@ -923,7 +951,7 @@ $perf = "outdated_snapshots=$badcount";
|
||||
}
|
||||
$perf = "snapoff=$err_count";
|
||||
|
||||
### FAILEDDISK ###
|
||||
### FAILEDDISK ###
|
||||
} elsif("$opt{'check_type'}" eq "FAILEDDISK") {
|
||||
my $check = _get_oid_value($snmp_session,$snmpFailedDiskCount);
|
||||
if($check == 0) {
|
||||
@ -935,7 +963,7 @@ $perf = "outdated_snapshots=$badcount";
|
||||
}
|
||||
$perf = "faileddisks=$check";
|
||||
|
||||
### DISKSUMMARY ###
|
||||
### DISKSUMMARY ###
|
||||
} elsif("$opt{'check_type'}" eq "DISKSUMMARY") {
|
||||
my $diskTotal = _get_oid_value($snmp_session,$snmp_netapp_disksummary_diskTotalCount);
|
||||
my $diskActive = _get_oid_value($snmp_session,$snmp_netapp_disksummary_diskActiveCount);
|
||||
@ -958,7 +986,7 @@ $perf = "outdated_snapshots=$badcount";
|
||||
}
|
||||
$perf = "faileddisks=$check total=$diskTotal active=$diskActive spare=$diskSpare reconstructing=$diskReconstructing";
|
||||
|
||||
### HA ###
|
||||
### HA ###
|
||||
} elsif("$opt{'check_type'}" eq "HA") {
|
||||
|
||||
my $cfSettings = _get_oid_value($snmp_session,$snmp_netapp_cfSettings);
|
||||
@ -991,18 +1019,18 @@ $perf = "outdated_snapshots=$badcount";
|
||||
$perf = "hasettings=$check";
|
||||
|
||||
|
||||
### UPTIME ###
|
||||
### UPTIME ###
|
||||
} elsif("$opt{'check_type'}" eq "UPTIME") {
|
||||
my $check = _get_oid_value($snmp_session,$snmpUpTime);
|
||||
$msg = "$opt{'check_type'}: $check";
|
||||
$check =~ m/^\s*(\d+)\s+days,\s+(\d+):(\d+):(\d+).*$/;
|
||||
$perf = "uptime=" . ($1*86400 + $2*3600 + $3*60 + $4) . "s";
|
||||
### CACHEAGE ###
|
||||
### CACHEAGE ###
|
||||
} elsif("$opt{'check_type'}" eq "CACHEAGE") {
|
||||
my $check = _get_oid_value($snmp_session,$snmpCacheAge);
|
||||
($msg,$stat) = _clac_minutes_err_stat($check,$opt{'check_type'},$opt{'warn'},$opt{'crit'});
|
||||
$perf = "cache_age=$check";
|
||||
### GLOBALSTATUS ###
|
||||
### GLOBALSTATUS ###
|
||||
} elsif("$opt{'check_type'}" eq "GLOBALSTATUS") {
|
||||
my $check = _get_oid_value($snmp_session,$snmpGlobalStatus);
|
||||
my $global_stat_txt = _get_oid_value($snmp_session,$snmpGlobalStatus_text);
|
||||
@ -1014,7 +1042,7 @@ $perf = "outdated_snapshots=$badcount";
|
||||
$msg = "CRIT: $opt{'check_type'} $GlobalStatusIndex{$check} $check $global_stat_txt";
|
||||
}
|
||||
$perf = "globalstatus=$check";
|
||||
### AUTOSUPPORTSTATUS ###
|
||||
### AUTOSUPPORTSTATUS ###
|
||||
} elsif("$opt{'check_type'}" eq "AUTOSUPPORTSTATUS") {
|
||||
my $check = _get_oid_value($snmp_session,$snmpAutoSupportStatus);
|
||||
my $autosupport_stat_txt = _get_oid_value($snmp_session,$snmpAutoSupportStatus_text);
|
||||
@ -1026,17 +1054,17 @@ $perf = "outdated_snapshots=$badcount";
|
||||
$msg = "CRIT: $opt{'check_type'} $AutoSupportStatusIndex{$check} $check $autosupport_stat_txt";
|
||||
}
|
||||
$perf = "autosupportstatus=$check";
|
||||
### NDMPSESSIONS ###
|
||||
### NDMPSESSIONS ###
|
||||
} elsif("$opt{'check_type'}" eq "NDMPSESSIONS") {
|
||||
my $check = _get_oid_value($snmp_session,$snmpNdmpSessions);
|
||||
($msg,$stat) = _clac_absolute_err_stat($check,$opt{'check_type'},$opt{'warn'},$opt{'crit'});
|
||||
$perf = "ndmpsess=$check";
|
||||
### CIFSSESSIONS ###
|
||||
### CIFSSESSIONS ###
|
||||
} elsif("$opt{'check_type'}" eq "CIFSSESSIONS") {
|
||||
my $check = _get_oid_value($snmp_session,$snmpCifsSessions);
|
||||
($msg,$stat) = _clac_absolute_err_stat($check,$opt{'check_type'},$opt{'warn'},$opt{'crit'});
|
||||
$perf = "cifssess=$check";
|
||||
### SHELF ###
|
||||
### SHELF ###
|
||||
} elsif ( ("$opt{'check_type'}" eq "SHELF") or ("$opt{'check_type'}" eq "SHELFINFO") ) {
|
||||
my @errs;
|
||||
my $r_shelf = $snmp_session->get_table($snmpEnclTableIndex);
|
||||
@ -1124,7 +1152,7 @@ $perf = "outdated_snapshots=$badcount";
|
||||
else
|
||||
{ $perf = "shelf=1"; }
|
||||
}
|
||||
### FSSTATUS ###
|
||||
### FSSTATUS ###
|
||||
} elsif("$opt{'check_type'}" eq "FSSTATUS") {
|
||||
my $check = _get_oid_value($snmp_session,$snmpfsOverallStatus);
|
||||
my $global_stat_txt = _get_oid_value($snmp_session,$snmpfsOverallStatus_text);
|
||||
@ -1140,7 +1168,7 @@ $perf = "outdated_snapshots=$badcount";
|
||||
}
|
||||
$perf = "fsstatus=$check";
|
||||
|
||||
### Syntax Error ###
|
||||
### Syntax Error ###
|
||||
} else {
|
||||
FSyntaxError("$opt{'check_type'} invalid parameter !");
|
||||
}
|
||||
@ -1149,3 +1177,4 @@ $msg =~ s/\n//g;
|
||||
$perf ? print "$msg | $perf\n" : print "$msg\n";
|
||||
|
||||
exit($stat);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user