nagios-plugins/check_ibm_bladecenter/check_ibm_bladecenter.py

592 lines
20 KiB
Python

#!/usr/bin/python
#
# Copyright 2010, Pall Sigurdsson <palli@opensource.is>
#
# This script is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This script is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# About this script
#
# This script will check the status of a remote IBM Bladecenter via SNMP.
# No real need to change anything below here
version = "1.1.2"
ok = 0
warning = 1
critical = 2
unknown = 3
not_present = -1
exit_status = -1
state = {}
state[not_present] = "Not Present"
state[ok] = "OK"
state[warning] = "Warning"
state[critical] = "Critical"
state[unknown] = "Unknown"
longserviceoutput = "\n"
perfdata = ""
summary = ""
sudo = False
from sys import exit
from sys import argv
from os import getenv, putenv, environ
import subprocess
# Parse some Arguments
from optparse import OptionParser
parser = OptionParser()
parser.add_option("-m", "--mode", dest="mode",
help="Which check mode is in use (powermodules,system-health,temperature,chassis-status,bladehealth,blowers,switchmodules)")
parser.add_option("-H", "--host", dest="host",
help="Hostname or IP address of the host to check")
parser.add_option("-w", "--warning", dest="warning_threshold",
help="Warning threshold", type="int", default=None)
parser.add_option("-c", "--critical", type="int", dest="critical_threshold",
help="Critical threshold", default=None)
parser.add_option("-e", "--exclude", dest="exclude",
help="Exclude specific object", default=None)
parser.add_option("-v", "--snmp_version", dest="snmp_version",
help="SNMP Version to use (1, 2c or 3)", default="1")
parser.add_option("-u", "--snmp_username", dest="snmp_username",
help="SNMP username (only with SNMP v3)", default=None)
parser.add_option("-C", "--snmp_community", dest="snmp_community",
help="SNMP Community (only with SNMP v1|v2c)", default=None)
parser.add_option("-p", "--snmp_password", dest="snmp_password",
help="SNMP password (only with SNMP v3)", default=None)
parser.add_option("-l", "--snmp_security_level", dest="snmp_seclevel",
help="SNMP security level (only with SNMP v3) (noAuthNoPriv|authNoPriv|authPriv)", default=None)
parser.add_option("-t", "--snmp_timeout", dest="snmp_timeout",
help="Timeout in seconds for SNMP", default=10)
parser.add_option("-d", "--debug", dest="debug",
help="Enable debugging (for troubleshooting", action="store_true", default=False)
(opts, args) = parser.parse_args()
if opts.host is None:
parser.error("Hostname (-H) is required.")
if opts.mode is None:
parser.error("Mode (--mode) is required.")
snmp_options = ""
def set_snmp_options():
global snmp_options
if opts.snmp_version is not None:
snmp_options = snmp_options + " -v%s" % opts.snmp_version
if opts.snmp_version == "3":
if opts.snmp_username is None:
parser.error("--snmp_username required with --snmp_version=3")
if opts.snmp_seclevel is None:
parser.error(
"--snmp_security_level required with --snmp_version=3")
if opts.snmp_password is None:
parser.error("--snmp_password required with --snmp_version=3")
snmp_options = snmp_options + " -u %s -l %s -A %s " % (
opts.snmp_username, opts.snmp_seclevel, opts.snmp_password)
else:
if opts.snmp_community is None:
parser.error(
"--snmp_community is required with --snmp_version=1|2c")
snmp_options = snmp_options + " -c %s " % opts.snmp_community
snmp_options += " -t %s " % (opts.snmp_timeout)
def error(errortext):
print "* Error: %s" % errortext
exit(unknown)
def debug(debugtext):
if opts.debug:
print debugtext
def nagios_status(newStatus):
global exit_status
exit_status = max(exit_status, newStatus)
return exit_status
def runCommand(command):
'''runCommand: Runs command from the shell prompt. Exit Nagios style if unsuccessful'''
debug("Executing: %s" % command)
proc = subprocess.Popen(
command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE,)
stdout, stderr = proc.communicate('through stdin to stdout')
if proc.returncode > 0:
print "Error %s: %s\n command was: '%s'" % (proc.returncode, stderr.strip(), command)
debug("results: %s" % (stdout.strip()))
if proc.returncode == 127: # File not found, lets print path
path = getenv("PATH")
print "Check if your path is correct %s" % (path)
if stderr.find('Password:') == 0 and command.find('sudo') == 0:
print "Check if user is in the sudoers file"
if stderr.find('sorry, you must have a tty to run sudo') == 0 and command.find('sudo') == 0:
print "Please remove 'requiretty' from /etc/sudoers"
exit(unknown)
else:
return stdout
def end():
global summary
global longserviceoutput
global perfdata
global exit_status
print "%s - %s | %s" % (state[exit_status], summary, perfdata)
print longserviceoutput
if exit_status < 0:
exit_status = unknown
exit(exit_status)
def add_perfdata(text):
global perfdata
text = text.strip()
perfdata = perfdata + " %s " % (text)
def add_long(text):
global longserviceoutput
longserviceoutput = longserviceoutput + text + '\n'
def add_summary(text):
global summary
summary = summary + text
def set_path(path):
current_path = getenv('PATH')
if current_path.find('C:\\') > -1: # We are on this platform
if path == '':
pass
else:
path = ';' + path
else: # Unix/Linux, etc
if path == '':
path = ":/usr/sbin"
else:
path = ':' + path
current_path = "%s%s" % (current_path, path)
environ['PATH'] = current_path
def snmpget(oid):
snmpgetcommand = "snmpget %s %s %s" % (snmp_options, opts.host, oid)
output = runCommand(snmpgetcommand)
oid, result = output.strip().split(' = ', 1)
resultType, resultValue = result.split(': ', 1)
if resultType == 'STRING': # strip quotes of the string
resultValue = resultValue[1:-1]
return resultValue
# snmpwalk -v3 -u v3get mgmt-rek-proxy-p02 -A proxy2011 -l authNoPriv
# 1.3.6.1.4.1.15497
def snmpwalk(base_oid):
snmpwalkcommand = "snmpwalk %s %s %s" % (snmp_options, opts.host, base_oid)
output = runCommand(snmpwalkcommand + " " + base_oid)
return output
def getTable(base_oid):
myTable = {}
output = snmpwalk(base_oid)
for line in output.split('\n'):
tmp = line.strip().split(' = ', 1)
if len(tmp) == 2:
oid, result = tmp
else:
continue
tmp = result.split(': ', 1)
if len(tmp) > 1:
resultType, resultValue = tmp[0], tmp[1]
else:
resultType = None
resultValue = tmp[0]
if resultType == 'STRING': # strip quotes of the string
resultValue = resultValue[1:-1]
index = oid.strip().split('.')
column = int(index.pop())
row = int(index.pop())
if not myTable.has_key(column):
myTable[column] = {}
myTable[column][row] = resultValue
return myTable
def check_powermodules():
powermodules = getTable('1.3.6.1.4.1.2.3.51.2.2.4')
index = 1
exists = 2
status = 3
details = 4
num_ok = 0
num_no = 0
for i in powermodules.values():
myIndex = i[index]
myStatus = i[status]
myDetails = i[details]
myExists = i[exists]
if myIndex == opts.exclude:
continue
if myExists == "0":
num_no = num_no + 1
else:
if myStatus != "1":
nagios_status(warning)
add_summary('Powermodule "%s" status "%s". %s. ' %
(myIndex, myStatus, myDetails))
else:
num_ok = num_ok + 1
add_long('Powersupply "%s" status "%s". %s. ' %
(myIndex, myStatus, myDetails))
add_summary("%s out of %s powermodules are healthy" %
(num_ok, len(powermodules)))
add_perfdata("'Number of powermodules'=%s" %
(len(powermodules) - num_no))
nagios_status(ok)
def check_switchmodules():
switchmodules = getTable("1.3.6.1.4.1.2.3.51.2.22.3.1.1")
# The following oid is undocumented, but contains some useful extra info
try:
extrainfo = getTable("1.3.6.1.4.1.2.3.51.2.22.3.1.7").values()
except:
extrainfo = []
for module in switchmodules.values():
myIndex = module[1]
healthstate = module[15]
resultavailable = module[3]
resultvalue = module[4]
enabledisable = module[6]
if resultavailable == "1":
'this module is installed'
if healthstate == "1":
nagios_status(ok)
add_long("Module%s health good.\n post=%s" %
(myIndex, resultvalue))
else:
nagios_status(warning)
add_long("Module%s health bad(%s).\n post=%s" %
(myIndex, healthstate, resultvalue))
add_summary("Problem with Module %s. " % (myIndex))
if len(extrainfo) > int(myIndex):
try:
myExtraInfo = extrainfo[int(myIndex) - 1]
module_type = myExtraInfo[22]
module_ip = myExtraInfo[6]
add_long(" type=%s ip=%s" % (module_type, module_ip))
except:
pass
if exit_status == ok:
add_summary("All switchmodules healthy")
def check_blowers():
" Check blower status "
blowers = getTable("1.3.6.1.4.1.2.3.51.2.2.3")
# This mib only seems to support 2 blowers.
blower1speed = snmpget("1.3.6.1.4.1.2.3.51.2.2.3.1.0")
blower1state = snmpget("1.3.6.1.4.1.2.3.51.2.2.3.10.0")
blower2speed = snmpget("1.3.6.1.4.1.2.3.51.2.2.3.2.0")
blower2state = snmpget("1.3.6.1.4.1.2.3.51.2.2.3.11.0")
add_long("Blower 1 state=%s speed=%s" % (blower1state, blower1speed))
add_long("Blower 2 state=%s speed=%s" % (blower2state, blower2speed))
add_perfdata("blower1=%s" % (blower1speed.split(None, 1)[0]))
add_perfdata("blower2=%s" % (blower2speed.split(None, 1)[0]))
# Check blower 1
if blower1state == "1":
nagios_status(ok)
add_summary("Blower1 OK. ")
else:
add_summary("Blower1 NOT OK. ")
nagios_status(warning)
# Check blower 2
if blower2state == "1":
nagios_status(ok)
add_summary("Blower2 OK. ")
else:
add_summary("Blower2 NOT OK. ")
nagios_status(warning)
if blower1state != "1" and blower2state != "1":
nagios_status(critical)
def check_chassis_status():
chassis = getTable('1.3.6.1.4.1.2.3.51.2.2.5.2')
oids = chassis.values()[0]
chassis_oid = {
1: "bistSdram",
10: "bistBootRomFlashImage",
11: "bistEthernetPort1",
113: "bistSwitchModulesCommunicating",
12: "bistEthernetPort2",
13: "bistInternalPCIBus",
14: "bistExternalI2CDevices",
15: "bistUSBController",
16: "bistVideoCompressorBoard",
17: "bistPrimaryBus",
18: "bistInternalEthernetSwitch",
2: "bistRs485Port1",
3: "bistRs485Port2",
33: "bistBladesInstalled",
4: "bistNvram",
49: "bistBladesCommunicating",
6: "bistRtc",
65: "bistBlowersInstalled",
7: "bistLocalI2CBus",
73: "bistBlowersFunctional",
74: "bistMediaTrayInstalled",
75: "bistMediaTrayCommunicating",
8: "bistPrimaryMainAppFlashImage",
81: "bistPowerModulesInstalled",
89: "bistPowerModulesFunctional",
9: "bistSecondaryMainAppFlashImage",
97: "bistSwitchModulesInstalled",
}
# Check if all blades are working
bistBladesInstalled = 33
bistBlowersInstalled = 65
bistMediaTrayInstalled = 74
bistPowerModulesInstalled = 81
bistSwitchModulesInstalled = 97
bistSwitchModulesCommunicating = 113
bistBladesCommunicating = 49
bistMediaTrayCommunicating = 75
bistBlowersFunctional = 73
bistPowerModulesFunctional = 89
# Check Blade Communications
if not oids.has_key(bistBladesInstalled) or not oids.has_key(bistBladesCommunicating):
add_summary("Blades N/A. ")
elif oids[bistBladesInstalled] == oids[bistBladesCommunicating]:
nagios_status(ok)
add_summary("Blades OK. ")
else:
nagios_status(warning)
add_summary("Blades NOT OK. ")
# Check PowerModule Status
if not oids.has_key(bistPowerModulesFunctional) or not oids.has_key(bistPowerModulesInstalled):
add_summary("Powermodules N/A. ")
elif oids[bistPowerModulesFunctional] == oids[bistPowerModulesInstalled]:
nagios_status(ok)
add_summary("PowerModules OK. ")
else:
nagios_status(warning)
add_summary("PowerModules NOT OK. ")
# Check SwitcModule Communications
if not oids.has_key(bistSwitchModulesCommunicating) or not oids.has_key(bistSwitchModulesInstalled):
add_summary("SwitchModules N/A. ")
if oids[bistSwitchModulesCommunicating] == oids[bistSwitchModulesInstalled]:
nagios_status(ok)
add_summary("Switchmodules OK. ")
else:
nagios_status(warning)
add_summary("Switchmodules NOT OK. ")
# Check blower status
if not oids.has_key(bistBlowersInstalled) or not oids.has_key(bistBlowersFunctional):
add_summary("Blowers N/A. ")
elif oids[bistBlowersInstalled] == oids[bistBlowersFunctional]:
nagios_status(ok)
add_summary("Blowers OK. ")
else:
nagios_status(warning)
add_summary("Blowers NOT OK. ")
# Check Media Tray Status
if not oids.has_key(bistMediaTrayCommunicating) or not oids.has_key(bistMediaTrayInstalled):
nagios_status(ok)
add_summary("Media Trays N/A. ")
elif oids[bistMediaTrayCommunicating] == oids[bistMediaTrayInstalled]:
add_summary("Media Trays OK. ")
else:
nagios_status(warning)
add_summary("Media Trays NOT OK. ")
# status_oids, oids that where 0 == ok
status_oids = (2, 3, 5, 7, 8, 9, 10, 11, 14, 18,
19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, )
add_long("Other Sensors: ")
sensor_status = ok
for oid in status_oids:
if not chassis_oid.has_key(oid):
continue
oidValue = oids[oid]
oidName = chassis_oid[oid]
if oidValue == "0":
friendly_status = "%s (ok)" % oidValue
else:
friendly_status = "%s (not ok)" % oidValue
nagios_status(warning)
sensor_status = warning
add_summary("%s is %s" % oidName, friendly_status)
add_long(" %s status: %s" % (oidName, friendly_status))
if sensor_status == ok:
add_summary("Other Sensors: OK. ")
def check_bladehealth():
blades = getTable('1.3.6.1.4.1.2.3.51.2.22.1.5.2.1')
bladestate = getTable('1.3.6.1.4.1.2.3.51.2.22.1.5.1.1').values()
index, bladeid, severity, description = (1, 2, 3, 4)
good_blades = 0
total_blades = 0
for i, row in enumerate(blades.values()):
myIndex = row[index]
myBladeid = row[bladeid]
mySeverity = row[severity]
myDescription = row[description]
try:
myName = bladestate[i][6]
except:
myName = ""
if mySeverity == "(No severity)":
continue
add_long("blade%s (%s): %s %s" %
(myBladeid, myName, mySeverity, myDescription))
if opts.exclude:
if myDescription.find(opts.exclude) > -1:
continue
total_blades += 1
if mySeverity == 'Good':
nagios_status(ok)
good_blades += 1
else:
nagios_status(warning)
add_summary("blade%s (%s): %s %s. " %
(myBladeid, myName, mySeverity, myDescription))
if good_blades == total_blades:
add_summary("%s out of %s blades in Good health. " %
(good_blades, total_blades))
nagios_status(ok)
else:
nagios_status(warning)
def check_systemhealth():
systemhealthstat = snmpget('1.3.6.1.4.1.2.3.51.2.2.7.1.0')
summary = getTable('1.3.6.1.4.1.2.3.51.2.2.7.2.1')
index, severity, description, date = (1, 2, 3, 4)
# Sometimes chassis delivers warning when absolutely nothing is going on.
# Lets work around that
workaround = [{1: '1', 2: 'Good', 3: 'No critical or warning events', 4: 'No timestamp'}]
# Check overall health
if systemhealthstat == '255':
nagios_status(ok)
add_summary("Bladecenter health: OK. ")
elif summary.values() == workaround:
add_summary("Non-Critical Error (bug in firmware): '%s' " %
workaround[0][description])
nagios_status(ok)
return
elif systemhealthstat == "2":
nagios_status(warning)
add_summary("Non-Critical Error. ")
elif systemhealthstat == "4":
nagios_status(critical)
add_summary("System-Level Error. ")
elif systemhealthstat == "0":
nagios_status(critical)
add_summary("Critical. ")
else:
nagios_status(unknown)
add_summary(
"Bladecenter health unkown (oid 1.3.6.1.4.1.2.3.51.2.2.7.1.0 returns %s). " %
systemhealthstat)
for row in summary.values():
if row[severity] == 'Good':
nagios_status(ok)
elif row[severity] == 'Warning':
nagios_status(warning)
elif row[severity] == 'System Level':
nagios_status(warning)
else:
nagios_status(critical)
add_summary("%s. " % (row[description]))
add_long("* %s. " % (row[description]))
def check_temperature():
# set some sensible defaults
if opts.warning_threshold is None:
opts.warning_threshold = 28
if opts.critical_threshold is None:
opts.critical_threshold = 35
str_temp = snmpget('1.3.6.1.4.1.2.3.51.2.2.1.5.1.0')
float_temp, measurement = str_temp.split(None, 1)
float_temp = float(float_temp)
if opts.critical_threshold is not None and float_temp > opts.critical_threshold:
nagios_status(critical)
add_summary(
"ambient temperature (%s) is over critical thresholds (%s). " %
(str_temp, opts.critical_threshold))
elif opts.warning_threshold is not None and float_temp > opts.warning_threshold:
nagios_status(warning)
add_summary(
"ambient temperature (%s) is over warning thresholds (%s). " %
(str_temp, opts.warning_threshold))
else:
add_summary("Ambient temperature = %s. " % (str_temp))
add_perfdata("'ambient_temp'=%s;%s;%s " %
(float_temp, opts.warning_threshold, opts.critical_threshold))
#add_long( "Ambient Temperature = %s" % (str_temp) )
nagios_status(ok)
if __name__ == '__main__':
try:
set_snmp_options()
if opts.mode == 'powermodules':
check_powermodules()
elif opts.mode == 'system-health':
check_systemhealth()
elif opts.mode == 'temperature':
check_temperature()
elif opts.mode == 'chassis-status':
check_chassis_status()
elif opts.mode == 'bladehealth':
check_bladehealth()
elif opts.mode == 'blowers':
check_blowers()
elif opts.mode == 'switchmodules':
check_switchmodules()
else:
parser.error("%s is not a valid option for --mode" % opts.mode)
except Exception, e:
print "Unhandled exception while running script: %s" % e
exit(unknown)
end()