1
0
mirror of https://github.com/ranl/monitor-utils.git synced 2025-04-12 02:03:39 +02:00
This commit is contained in:
dufourl 2017-08-07 13:27:54 +00:00 committed by GitHub
commit f296f7465f

View File

@ -30,8 +30,21 @@
## AUTOSUPPORTSTATUS|NFSOPS|
## CIFSOPS|SHELFINFO|...
##
#####################################
#####################################
##
##
## CLARIFICATION FOR TIMEOUTS
## There are multiple timeouts we depend on
##
## Perl plugins timeout (utils.pm)--- > $TIMEOUT --- > originally 15 sec --> recommendation to raise it to 180
##
## Net::SNMP timeout --- > Used in Net::SNMP->session --- > originally 5 sec --> recommendation to raise it to 60
## Beware that the max value is 60 seconds. If it is set above that, you get the error message "Can't create snmp session"
##
## Do not forget that in nagios you need to increase service_check_timeout to a value above $TIMEOUT_PLUGINS
## Nagios service check timeout (nagios.cfg) --- > service_check_timeout=240 --- > originally 30 sec
##
####################################################################################################################################################
####################################################################################################################################################
use strict;
@ -47,13 +60,16 @@ use Getopt::Long;
use Time::Local;
use IPC::Cmd qw(run_forked);
Getopt::Long::Configure('bundling');
# Timeout (seconds) used for the plugin-wide alarm; starts out as the
# $TIMEOUT value that is provided from outside this block.
my $TIMEOUT_PLUGINS = $TIMEOUT;
# Exit status passed to exit() at the end of the script.
my $stat = 0;
# Status message and performance data printed at the end of the script.
my $msg;
my $perf;
# Name under which the script was invoked, shown in the usage text.
my $script_name = basename($0);
# Must be a quoted string: a bare 1.3.1 literal has two dots, which Perl
# parses as a v-string (chr(1).chr(3).chr(1)), so the usage text would
# print control characters instead of the version number.
my $script_version = '1.3.1';
# Directory for the counter file; the file name itself is set later.
my $counterFilePath = "/tmp";
my $counterFile;
@ -227,7 +243,7 @@ my %nvramBatteryStatus = (
5 => 'near end of life',
6 => 'at end of life',
7 => 'unknown',
);
);
my %GlobalStatusIndex = (
1 => 'other',
2 => 'unknown',
@ -235,7 +251,7 @@ my %GlobalStatusIndex = (
4 => 'nonCritical',
5 => 'critical',
6 => 'nonRecoverable',
);
);
my %AutoSupportStatusIndex = (
1 => 'ok',
@ -243,7 +259,7 @@ my %AutoSupportStatusIndex = (
3 => 'postFailure',
4 => 'smtpPostFailure',
5 => 'unknown',
);
);
my %cfSettingsIndex = (
1 => 'notConfigured',
@ -251,7 +267,7 @@ my %cfSettingsIndex = (
3 => 'disabled',
4 => 'takeoverByPartnerDisabled',
5 => 'thisNodeDead',
);
);
my %cfStateIndex = (
@ -259,7 +275,7 @@ my %cfStateIndex = (
2 => 'canTakeover',
3 => 'cannotTakeover',
4 => 'takeover',
);
);
my %cfCannotTakeoverCauseIndex = (
1 => 'ok',
@ -280,20 +296,20 @@ my %cfCannotTakeoverCauseIndex = (
16 => 'alreadyInTakenoverMode',
17 => 'nvramLogUnsynchronized',
18 => 'backupMailboxProblems',
);
);
# Maps a numeric partner-status code to its textual name.
# NOTE(review): presumably keyed by the integer returned over SNMP for the
# cluster-failover partner status - confirm against the HA check.
my %cfPartnerStatusIndex = (
    1 => 'maybeDown',
    2 => 'ok',
    3 => 'dead',
);
# Maps a numeric interconnect-status code to its textual name.
# NOTE(review): presumably keyed by the integer returned over SNMP for the
# cluster-failover interconnect status - confirm against the HA check.
my %cfInterconnectStatusIndex = (
    1 => 'notPresent',
    2 => 'down',
    3 => 'partialFailure',
    4 => 'up',
);
my %EcnlStatusIndex = (
1 => 'initializing',
@ -302,13 +318,13 @@ my %EcnlStatusIndex = (
4 => 'inactive',
5 => 'reconfiguring',
6 => 'nonexistent',
);
);
# Maps a numeric overall file-system status code to its textual name.
# NOTE(review): presumably keyed by the integer read for the FSSTATUS
# check - confirm against that branch.
my %fsOverallStatusIndex = (
    1 => 'ok',
    2 => 'Nearly Full',
    3 => 'Full',
);
### Functions
###############
@ -338,10 +354,8 @@ sub _create_session(@) {
sub FSyntaxError($) {
my $err = shift;
print <<EOU;
$err
This is $script_name in version $script_version.
$err
This is $script_name in version $script_version.
Syntax:
-H <IP_or_Hostname> Ip/Dns Name of the Filer
-C <community_name> SNMP Community Name for read
@ -356,7 +370,6 @@ This is $script_name in version $script_version.
-e <vol1[,vol2[,...]]> Exclude volumes from snap check (SNAPSHOT/SNAPSHOTAGE)
-I Inform only, return OK every time (ignore -w and -c values)
-h This help
Available check types:
TEMP - Temperature
FAN - Fan Fail
@ -382,20 +395,15 @@ This is $script_name in version $script_version.
UPTIME - Only show\'s uptime
CACHEAGE - Cache Age (-w -c)
FSSTATUS - Overall file system health
Examples:
$script_name -H netapp.mydomain -C public -T UPTIME
UPTIME: 2 days, 23:03:21.09 | uptime=255801s
$script_name -H netapp.mydomain -C public -T DISKUSED -v /vol/data/ -w 90 -c 95 -V 2c
OK: DISKUSED 79% | /vol/data/=8104595240k
$script_name -H netapp.mydomain -C public -T GLOBALSTATUS
CRIT: GLOBALSTATUS nonCritical 4 Disk on adapter 1a, shelf 1, bay 9, failed. | globalstatus=4
$script_name -H netapp.mydomain -C public -T DISKUSED -v wtf
WARN: Unknown volume path or aggregate name 'wtf'. Available values: aggr_p1a_sas2_mirror /vol/vol0/ /vol/esx/ /vol/xen_a/
EOU
exit($ERRORS{'UNKNOWN'});
}
@ -493,6 +501,8 @@ $opt{'crit'} = 500;
$opt{'warn'} = 500;
$opt{'version'} = 2;
$opt{'timeout'} = 60;
$TIMEOUT_PLUGINS = 180 ;
my $result = GetOptions(\%opt,
'filer|H=s',
'community|C=s',
@ -507,6 +517,17 @@ my $result = GetOptions(\%opt,
"help|h",
);
# The plugin-level timeout always follows the timeout option as given.
# NOTE(review): this overwrites whatever value $TIMEOUT_PLUGINS held before
# this point, including any earlier default - confirm that is intended.
$TIMEOUT_PLUGINS = $opt{'timeout'};
# The SNMP timeout itself is capped at 60 seconds, the stated maximum for
# the SNMP session; larger values only extend the plugin-level timeout.
$opt{'timeout'} = 60 if $opt{'timeout'} > 60;
FSyntaxError("") if defined $opt{'help'};
FSyntaxError("Missing -H") unless defined $opt{'filer'};
FSyntaxError("Missing -C") unless defined $opt{'community'};
@ -536,9 +557,16 @@ if (!defined($counterFilePath)) {
# Just in case of problems, let's not hang Nagios
# with "Return code of 142 is out of bounds", instead we set the message "No response in time"
# The handler prints the timeout message and exits with the CRITICAL status.
$SIG{'ALRM'} = sub {
    print ("CRITICAL: No response in time\n");
    exit $ERRORS{"CRITICAL"};
};
# Starting Alarm
# Only one alarm timer can be pending at a time and each alarm() call
# replaces the previous one, so a single call with the plugin-level
# timeout is all that is needed.
alarm($TIMEOUT_PLUGINS);
# Establish SNMP Session
our $snmp_session = _create_session($opt{'filer'},$opt{'community'},$opt{'version'},$opt{'timeout'});
@ -661,7 +689,7 @@ if("$opt{'check_type'}" eq "TEMP") {
$msg = "CRIT: Over $opt{'check_type'} !";
}
$perf = "overtemperature=$check";
### Fan ###
### Fan ###
} elsif("$opt{'check_type'}" eq "FAN") {
my $check = _get_oid_value($snmp_session,$snmpFailedFanCount);
if($check == 0) {
@ -672,7 +700,7 @@ if("$opt{'check_type'}" eq "TEMP") {
$msg = "CRIT: $opt{'check_type'} $check !";
}
$perf = "failedfans=$check";
### PS ###
### PS ###
} elsif("$opt{'check_type'}" eq "PS") {
my $check = _get_oid_value($snmp_session,$snmpFailPowerSupplyCount);
if($check == 0) {
@ -683,12 +711,12 @@ if("$opt{'check_type'}" eq "TEMP") {
$msg = "CRIT: $opt{'check_type'} Fail $check !";
}
$perf = "failedpowersupplies=$check";
### CPULOAD ###
### CPULOAD ###
} elsif("$opt{'check_type'}" eq "CPULOAD") {
my $check = _get_oid_value($snmp_session,$snmpcpuBusyTimePerCent);
($msg,$stat) = _clac_err_stat($check,$opt{'check_type'},$opt{'warn'},$opt{'crit'});
$perf = "cpuload=$check\%;$opt{'warn'};$opt{'crit'};;";
### NFSOPS ###
### NFSOPS ###
} elsif("$opt{'check_type'}" eq "NFSOPS") {
my $nfsops_per_seconds=floor ( ($total_nfs_ops-$fileNfsOps)/$elapsedtime );
@ -696,7 +724,7 @@ if("$opt{'check_type'}" eq "TEMP") {
($msg,$stat) = _clac_absolute_err_stat($check,$opt{'check_type'},$opt{'warn'},$opt{'crit'});
$perf = "nfsops=$check";
### CIFSOPS ###
### CIFSOPS ###
} elsif("$opt{'check_type'}" eq "CIFSOPS") {
my $cifsops_per_seconds=floor ( ($total_cifs_ops-$fileCifsOps)/$elapsedtime );
@ -704,7 +732,7 @@ if("$opt{'check_type'}" eq "TEMP") {
($msg,$stat) = _clac_absolute_err_stat($check,$opt{'check_type'},$opt{'warn'},$opt{'crit'});
$perf = "cifsops=$check";
### ISCSIOPS ###
### ISCSIOPS ###
} elsif("$opt{'check_type'}" eq "ISCSIOPS") {
my $iscsiops_per_seconds=floor ( ($blocks_iscsi_ops-$fileIscsiOps)/$elapsedtime );
my $iscsiread_per_seconds=floor ( ($blocks_iscsi_read-$fileIscsi64ReadBytes)/$elapsedtime );
@ -716,7 +744,7 @@ if("$opt{'check_type'}" eq "TEMP") {
($msg,$stat) = _clac_absolute_err_stat($check,$opt{'check_type'},$opt{'warn'},$opt{'crit'});
$msg = "$msg ops/s (iscsi read=$iscsiread_per_seconds B/s, iscsi write=$iscsiwrite_per_seconds B/s, disk read=$diskread_per_seconds B/s, disk write=$diskwrite_per_seconds B/s)";
$perf = "iscsiops=$check iscsiread=$iscsiread_per_seconds iscsiwrite=$iscsiwrite_per_seconds diskread=$diskread_per_seconds diskwrite=$diskwrite_per_seconds";
### FCPOPS ###
### FCPOPS ###
} elsif("$opt{'check_type'}" eq "FCPOPS") {
my $fcpops_per_seconds=floor ( ($blocks_fcp_ops-$fileFcpOps)/$elapsedtime );
my $fcpread_per_seconds=floor ( ($blocks_fcp_read-$fileFcp64ReadBytes)/$elapsedtime );
@ -729,7 +757,7 @@ if("$opt{'check_type'}" eq "TEMP") {
($msg,$stat) = _clac_absolute_err_stat($check,$opt{'check_type'},$opt{'warn'},$opt{'crit'});
$msg = "$msg ops/s (fcp read=$fcpread_per_seconds B/s, fcp write=$fcpwrite_per_seconds B/s, disk read=$diskread_per_seconds B/s, disk write=$diskwrite_per_seconds B/s)";
$perf = "fcpops=$check fcpread=$fcpread_per_seconds fcpwrite=$fcpwrite_per_seconds diskread=$diskread_per_seconds diskwrite=$diskwrite_per_seconds";
### NVRAM ###
### NVRAM ###
} elsif("$opt{'check_type'}" eq "NVRAM") {
my $check = _get_oid_value($snmp_session,$snmpnvramBatteryStatus);
if($check == 1) {
@ -740,7 +768,7 @@ if("$opt{'check_type'}" eq "TEMP") {
$msg = "CRIT: $opt{'check_type'} $nvramBatteryStatus{$check}";
}
$perf = "nvrambatterystatus=$check";
### DISKUSED ###
### DISKUSED ###
} elsif("$opt{'check_type'}" eq "DISKUSED") {
FSyntaxError("Missing -v") unless defined $opt{'vol'};
@ -779,7 +807,7 @@ if("$opt{'check_type'}" eq "TEMP") {
$msg .= " $$r_vol_tbl{$key}"
}
}
### SNAPSHOTAGE ###
### SNAPSHOTAGE ###
} elsif("$opt{'check_type'}" eq "SNAPSHOTAGE") {
my @exc_list = split(',',$opt{'exclude'});
@ -885,9 +913,9 @@ if("$opt{'check_type'}" eq "TEMP") {
$stat = $ERRORS{'UNKNOWN'};
$msg = "UNKNOW Errors";
}
$perf = "outdated_snapshots=$badcount";
$perf = "outdated_snapshots=$badcount";
### SNAPSHOT ###
### SNAPSHOT ###
} elsif("$opt{'check_type'}" eq "SNAPSHOT") {
my @exc_list = split(',',$opt{'exclude'});
my @vol_err;
@ -923,7 +951,7 @@ $perf = "outdated_snapshots=$badcount";
}
$perf = "snapoff=$err_count";
### FAILEDDISK ###
### FAILEDDISK ###
} elsif("$opt{'check_type'}" eq "FAILEDDISK") {
my $check = _get_oid_value($snmp_session,$snmpFailedDiskCount);
if($check == 0) {
@ -935,7 +963,7 @@ $perf = "outdated_snapshots=$badcount";
}
$perf = "faileddisks=$check";
### DISKSUMMARY ###
### DISKSUMMARY ###
} elsif("$opt{'check_type'}" eq "DISKSUMMARY") {
my $diskTotal = _get_oid_value($snmp_session,$snmp_netapp_disksummary_diskTotalCount);
my $diskActive = _get_oid_value($snmp_session,$snmp_netapp_disksummary_diskActiveCount);
@ -958,7 +986,7 @@ $perf = "outdated_snapshots=$badcount";
}
$perf = "faileddisks=$check total=$diskTotal active=$diskActive spare=$diskSpare reconstructing=$diskReconstructing";
### HA ###
### HA ###
} elsif("$opt{'check_type'}" eq "HA") {
my $cfSettings = _get_oid_value($snmp_session,$snmp_netapp_cfSettings);
@ -991,18 +1019,18 @@ $perf = "outdated_snapshots=$badcount";
$perf = "hasettings=$check";
### UPTIME ###
### UPTIME ###
} elsif("$opt{'check_type'}" eq "UPTIME") {
my $check = _get_oid_value($snmp_session,$snmpUpTime);
$msg = "$opt{'check_type'}: $check";
$check =~ m/^\s*(\d+)\s+days,\s+(\d+):(\d+):(\d+).*$/;
$perf = "uptime=" . ($1*86400 + $2*3600 + $3*60 + $4) . "s";
### CACHEAGE ###
### CACHEAGE ###
} elsif("$opt{'check_type'}" eq "CACHEAGE") {
my $check = _get_oid_value($snmp_session,$snmpCacheAge);
($msg,$stat) = _clac_minutes_err_stat($check,$opt{'check_type'},$opt{'warn'},$opt{'crit'});
$perf = "cache_age=$check";
### GLOBALSTATUS ###
### GLOBALSTATUS ###
} elsif("$opt{'check_type'}" eq "GLOBALSTATUS") {
my $check = _get_oid_value($snmp_session,$snmpGlobalStatus);
my $global_stat_txt = _get_oid_value($snmp_session,$snmpGlobalStatus_text);
@ -1014,7 +1042,7 @@ $perf = "outdated_snapshots=$badcount";
$msg = "CRIT: $opt{'check_type'} $GlobalStatusIndex{$check} $check $global_stat_txt";
}
$perf = "globalstatus=$check";
### AUTOSUPPORTSTATUS ###
### AUTOSUPPORTSTATUS ###
} elsif("$opt{'check_type'}" eq "AUTOSUPPORTSTATUS") {
my $check = _get_oid_value($snmp_session,$snmpAutoSupportStatus);
my $autosupport_stat_txt = _get_oid_value($snmp_session,$snmpAutoSupportStatus_text);
@ -1026,17 +1054,17 @@ $perf = "outdated_snapshots=$badcount";
$msg = "CRIT: $opt{'check_type'} $AutoSupportStatusIndex{$check} $check $autosupport_stat_txt";
}
$perf = "autosupportstatus=$check";
### NDMPSESSIONS ###
### NDMPSESSIONS ###
} elsif("$opt{'check_type'}" eq "NDMPSESSIONS") {
my $check = _get_oid_value($snmp_session,$snmpNdmpSessions);
($msg,$stat) = _clac_absolute_err_stat($check,$opt{'check_type'},$opt{'warn'},$opt{'crit'});
$perf = "ndmpsess=$check";
### CIFSSESSIONS ###
### CIFSSESSIONS ###
} elsif("$opt{'check_type'}" eq "CIFSSESSIONS") {
my $check = _get_oid_value($snmp_session,$snmpCifsSessions);
($msg,$stat) = _clac_absolute_err_stat($check,$opt{'check_type'},$opt{'warn'},$opt{'crit'});
$perf = "cifssess=$check";
### SHELF ###
### SHELF ###
} elsif ( ("$opt{'check_type'}" eq "SHELF") or ("$opt{'check_type'}" eq "SHELFINFO") ) {
my @errs;
my $r_shelf = $snmp_session->get_table($snmpEnclTableIndex);
@ -1124,7 +1152,7 @@ $perf = "outdated_snapshots=$badcount";
else
{ $perf = "shelf=1"; }
}
### FSSTATUS ###
### FSSTATUS ###
} elsif("$opt{'check_type'}" eq "FSSTATUS") {
my $check = _get_oid_value($snmp_session,$snmpfsOverallStatus);
my $global_stat_txt = _get_oid_value($snmp_session,$snmpfsOverallStatus_text);
@ -1140,7 +1168,7 @@ $perf = "outdated_snapshots=$badcount";
}
$perf = "fsstatus=$check";
### Syntax Error ###
### Syntax Error ###
} else {
FSyntaxError("$opt{'check_type'} invalid parameter !");
}
@ -1149,3 +1177,4 @@ $msg =~ s/\n//g;
$perf ? print "$msg | $perf\n" : print "$msg\n";
exit($stat);