mirror of
https://github.com/opinkerfi/nagios-plugins.git
synced 2024-11-05 01:53:44 +01:00
check_rhcs added
This commit is contained in:
parent
f8503f699c
commit
c2c93c05fd
163
misc/check_rhcs
Normal file
163
misc/check_rhcs
Normal file
@ -0,0 +1,163 @@
|
||||
#!/bin/env python
|
||||
|
||||
#
|
||||
# Gather the cluster state and the current node state
|
||||
#
|
||||
# Output example:
|
||||
#<clustat version="4.1.1">
|
||||
# <cluster name="LabCluster" id="22068" generation="172"/>
|
||||
# <quorum quorate="1" groupmember="1"/>
|
||||
# <nodes>
|
||||
# <node name="clusternode1.lab.inetu.net" state="1" local="0" \
|
||||
# estranged="0" rgmanager="1" rgmanager_master="0" qdisk="0" nodeid="0x00000001"/>
|
||||
# <node name="clusternode2.lab.inetu.net" state="1" local="1" \
|
||||
# estranged="0" rgmanager="1" rgmanager_master="0" qdisk="0" nodeid="0x00000002"/>
|
||||
# <node name="/dev/disk/by-id/scsi-36002219000b9642b000027124a3b61f1-part1" state="1" \
|
||||
# local="0" estranged="0" rgmanager="0" rgmanager_master="0" qdisk="1" nodeid="0x00000000"/>
|
||||
# </nodes>
|
||||
# <groups>
|
||||
# <group name="service:MySQL" state="112" state_str="started" flags="0" flags_str="" \
|
||||
# owner="clusternode2.lab.inetu.net" last_owner="clusternode1.lab.inetu.net" restarts="0" \
|
||||
# last_transition="1245765274" last_transition_str="Tue Jun 23 09:54:34 2009"/>
|
||||
# </groups>
|
||||
#</clustat>
|
||||
#
|
||||
# Frank Clements <frank @ sixthtoe.net>
|
||||
|
||||
import xml.dom.minidom
|
||||
import os
|
||||
import sys, socket
|
||||
import getopt
|
||||
|
||||
def usage():
    """
    Print the command-line help text to standard output.

    The text names the invoking script via sys.argv[0] and lists the
    -c/--cluster, -s/--service and -h/--help options.
    """
    helpLines = [
        "",
        "Usage: " + sys.argv[0] + " ([-s serviceName] | [-c])",
        "",
        "-c, --cluster",
        "Gathers the overall cluster status for the local node",
        "-s, --service",
        "Gets the stats of the named service",
        "-h, --help",
        "Display this",
        "",
    ]
    # A single-argument print(...) emits the same text under the Python 2
    # print statement and the Python 3 print function.
    print("\n".join(helpLines))
|
||||
|
||||
def getQuorumState(dom):
    """
    Return the 'quorate' attribute of the first <quorum> element.

    <quorum> is a single attribute-only element with no children, so
    only the first match in the document is consulted.  The value is
    returned exactly as it appears in the XML (a string such as "1").
    Raises IndexError when there is no <quorum> element and KeyError
    when it has no 'quorate' attribute.
    """
    return dom.getElementsByTagName('quorum')[0].attributes['quorate'].value
|
||||
|
||||
|
||||
def getClusterName(dom):
    """
    Return the 'name' attribute of the first <cluster> element.

    Only the first <cluster> element is read, i.e. a single running
    cluster is assumed.  Raises IndexError when the document has no
    <cluster> element and KeyError when it carries no 'name' attribute.
    """
    firstCluster = dom.getElementsByTagName('cluster')[0]
    nameAttr = firstCluster.attributes['name']
    return nameAttr.value
|
||||
|
||||
|
||||
def getLocalNodeState(dom):
    """
    Get the state of the local node.

    A <node> element is treated as the local node when its 'name'
    equals socket.gethostname() or when it carries local="1".  Accepting
    the 'local' flag keeps the check working when the system hostname
    and the cluster node name differ (for example short name versus
    fully qualified name).

    Returns a dict with the keys 'name', 'state' and 'rgmanager', each
    holding the attribute string from the XML.  The dict is empty when
    no node could be identified as local.

    Side effect: every other node flagged qdisk="1" is checked on the
    way; if one has a state other than "1" a CRITICAL line is printed
    and the process exits with status 2.
    """
    hostname = socket.gethostname()
    nodeState = {}

    for node in dom.getElementsByTagName('node'):
        name = node.attributes['name'].value

        # getAttribute() yields "" for a missing attribute, so nodes
        # without 'local' or 'qdisk' are simply not matched instead of
        # raising KeyError.
        if name == hostname or node.getAttribute('local') == "1":
            nodeState['name'] = name
            nodeState['state'] = node.attributes['state'].value
            nodeState['rgmanager'] = node.attributes['rgmanager'].value
        elif node.getAttribute('qdisk') == "1":
            if node.getAttribute('state') != "1":
                print("CRITICAL: Quorum disk " + name + " is unavailable!")
                sys.exit(2)

    return nodeState
|
||||
|
||||
|
||||
def getServiceState(dom, service):
    """
    Get the state of the named service.

    Looks for a <group> element whose 'name' is "service:" followed by
    the given service name.

    Returns a dict with the keys 'owner' (the group's 'owner'
    attribute) and 'state' (its 'state_str' attribute).  The dict is
    empty when no group with that name exists, so callers must check
    for that before indexing.  If several groups share the name, the
    last one in document order wins.
    """
    wanted = "service:" + service
    serviceState = {}

    for group in dom.getElementsByTagName('group'):
        if group.attributes['name'].value == wanted:
            serviceState['owner'] = group.attributes['owner'].value
            serviceState['state'] = group.attributes['state_str'].value

    return serviceState
|
||||
|
||||
|
||||
def main():
    """
    Parse the command line, run 'clustat -fx' and report a plugin status.

    Options (see usage()):
      -c, --cluster   check quorum, then the local node and rgmanager
      -s, --service   check that the named service is in state 'started'
      -h, --help      print the usage text and exit with status 0

    Exit status follows the Nagios plugin convention:
      0 OK, 1 WARNING, 2 CRITICAL, 3 UNKNOWN.
    Status 3 is used when no check type was requested, when the clustat
    output cannot be parsed, and when the local node or the requested
    service does not appear in the clustat output.  An invalid option
    prints the usage text and exits with status 2.
    """
    # Imported here so the file's top-level import block stays untouched.
    from xml.parsers.expat import ExpatError

    try:
        opts, args = getopt.getopt(sys.argv[1:], 's:ch', ['service=', 'cluster', 'help'])
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    # Pre-set so that running without -c or -s is reported cleanly
    # instead of failing later on an unbound variable.
    typeCheck = None
    serviceName = None

    for o, a in opts:
        if o in ('-c', '--cluster'):
            typeCheck = 'cluster'
        if o in ('-s', '--service'):
            typeCheck = 'service'
            serviceName = a
        if o in ('-h', '--help'):
            usage()
            sys.exit()

    if typeCheck is None:
        usage()
        sys.exit(3)

    clustatOutput = os.popen('clustat -fx')
    try:
        try:
            dom = xml.dom.minidom.parse(clustatOutput)
        finally:
            # Always release the pipe, even when parsing fails.
            clustatOutput.close()
    except ExpatError:
        # clustat missing, failing, or printing something other than XML.
        print("UNKNOWN: Unable to parse the output of 'clustat -fx'")
        sys.exit(3)

    if typeCheck == 'cluster':

        # First we query for the state of the cluster itself.
        # Should it be found that the cluster is not quorate we alert and exit immediately.
        cluster = getClusterName(dom)
        qState = getQuorumState(dom)

        # There are some serious problems if the cluster is inquorate so we simply alert immediately!
        if qState != "1":
            print("CRITICAL: Cluster " + cluster + " is inquorate!")
            sys.exit(2)

        # Now we find the status of the local node from clustat.
        # We only care about the local state since this way we can tie the alert to the host.
        nodeStates = getLocalNodeState(dom)
        if not nodeStates:
            # No <node> entry matched this host; report it rather than
            # crash on the missing keys below.
            print("UNKNOWN: Local node " + socket.gethostname() + " not found in clustat output")
            sys.exit(3)

        if nodeStates['state'] != "1":
            print("WARNING: Local node state is offline!")
            sys.exit(1)
        elif nodeStates['rgmanager'] != "1":
            print("CRITICAL: RGManager service not running on " + nodeStates['name'] + "!")
            # Exit status now matches the CRITICAL label (was 1 = WARNING).
            sys.exit(2)
        else:
            print("OK: Cluster node " + nodeStates['name'] + " is online and cluster is quorate.")
            sys.exit(0)

    elif typeCheck == 'service':
        serviceState = getServiceState(dom, serviceName)
        if not serviceState:
            # Unknown service name: nothing to index into.
            print("UNKNOWN: Service " + serviceName + " not found in clustat output")
            sys.exit(3)

        if serviceState['state'] != 'started':
            print("CRITICAL: Service " + serviceName + " on " + serviceState['owner'] + " is in " + serviceState['state'] + " state")
            sys.exit(2)
        else:
            print("OK: Service " + serviceName + " on " + serviceState['owner'] + " is in " + serviceState['state'] + " state")
            sys.exit(0)
|
||||
|
||||
|
||||
# Run the check only when this file is executed as a script; importing it
# as a module defines the functions without running anything.
if __name__ == "__main__":
    main()
|
3
misc/nrpe.d/check_rhcs.cfg
Normal file
3
misc/nrpe.d/check_rhcs.cfg
Normal file
@ -0,0 +1,3 @@
|
||||
command[check_rhcs]=/usr/lib64/nagios/plugins/check_rhcs -H rek-oraheart-p04 -c
|
||||
command[check_rhcs_service]=/usr/lib64/nagios/plugins/check_rhcs -s '$ARG1$'
|
||||
|
Loading…
Reference in New Issue
Block a user