1
0
mirror of https://github.com/opinkerfi/nagios-plugins.git synced 2024-09-28 16:33:46 +02:00
This commit is contained in:
Tómas Edwardsson 2010-12-20 15:43:57 +00:00
parent 821d8654df
commit a6dc1900c7

View File

@ -1,163 +1,163 @@
#!/bin/env python #!/bin/env python
# #
# Gather the cluster state and the current node state # Gather the cluster state and the current node state
# #
# Output example: # Output example:
#<clustat version="4.1.1"> #<clustat version="4.1.1">
# <cluster name="LabCluster" id="22068" generation="172"/> # <cluster name="LabCluster" id="22068" generation="172"/>
# <quorum quorate="1" groupmember="1"/> # <quorum quorate="1" groupmember="1"/>
# <nodes> # <nodes>
# <node name="clusternode1.lab.inetu.net" state="1" local="0" \ # <node name="clusternode1.lab.inetu.net" state="1" local="0" \
# estranged="0" rgmanager="1" rgmanager_master="0" qdisk="0" nodeid="0x00000001"/> # estranged="0" rgmanager="1" rgmanager_master="0" qdisk="0" nodeid="0x00000001"/>
# <node name="clusternode2.lab.inetu.net" state="1" local="1" \ # <node name="clusternode2.lab.inetu.net" state="1" local="1" \
# estranged="0" rgmanager="1" rgmanager_master="0" qdisk="0" nodeid="0x00000002"/> # estranged="0" rgmanager="1" rgmanager_master="0" qdisk="0" nodeid="0x00000002"/>
# <node name="/dev/disk/by-id/scsi-36002219000b9642b000027124a3b61f1-part1" state="1" \ # <node name="/dev/disk/by-id/scsi-36002219000b9642b000027124a3b61f1-part1" state="1" \
# local="0" estranged="0" rgmanager="0" rgmanager_master="0" qdisk="1" nodeid="0x00000000"/> # local="0" estranged="0" rgmanager="0" rgmanager_master="0" qdisk="1" nodeid="0x00000000"/>
# </nodes> # </nodes>
# <groups> # <groups>
# <group name="service:MySQL" state="112" state_str="started" flags="0" flags_str="" \ # <group name="service:MySQL" state="112" state_str="started" flags="0" flags_str="" \
# owner="clusternode2.lab.inetu.net" last_owner="clusternode1.lab.inetu.net" restarts="0" \ # owner="clusternode2.lab.inetu.net" last_owner="clusternode1.lab.inetu.net" restarts="0" \
# last_transition="1245765274" last_transition_str="Tue Jun 23 09:54:34 2009"/> # last_transition="1245765274" last_transition_str="Tue Jun 23 09:54:34 2009"/>
# </groups> # </groups>
#</clustat> #</clustat>
# #
# Frank Clements <frank @ sixthtoe.net> # Frank Clements <frank @ sixthtoe.net>
import xml.dom.minidom import xml.dom.minidom
import os import os
import sys, socket import sys, socket
import getopt import getopt
def usage():
    """
    Display usage information.

    Writes the command synopsis (built from ``sys.argv[0]``) and the list of
    supported options to standard output.  Returns nothing.
    """
    # A single parenthesised string prints the same text under the Python 2
    # print statement and the Python 3 print function, so the plugin keeps
    # working on either interpreter.
    print("""
Usage: """ + sys.argv[0] + """ ([-s serviceName] | [-c])
-c, --cluster
Gathers the overall cluster status for the local node
-s, --service
Gets the stats of the named service
-h, --help
Display this
""")
def getQuorumState(dom):
    """
    Report the quorum flag found in the clustat document.

    The first ``<quorum>`` element of ``dom`` is looked up and the value of
    its ``quorate`` attribute is returned as-is (a string).  The element
    carries only attributes, so no child nodes are inspected.

    Raises IndexError when the document has no ``<quorum>`` element and
    KeyError when that element has no ``quorate`` attribute.
    """
    return dom.getElementsByTagName('quorum')[0].attributes['quorate'].value
def getClusterName(dom):
    """
    Return the cluster name reported by clustat.

    Only the first ``<cluster>`` element of ``dom`` is consulted, i.e. a
    single running cluster is assumed.  The value of its ``name`` attribute
    is returned as a string.

    Raises IndexError when the document has no ``<cluster>`` element and
    KeyError when that element has no ``name`` attribute.
    """
    return dom.getElementsByTagName('cluster')[0].attributes['name'].value
def getLocalNodeState(dom):
    """
    Get the state of the local node.

    Every ``<node>`` element of the parsed clustat document is visited:

    * a node whose ``name`` equals ``socket.gethostname()`` is the local
      node;
    * any other node flagged ``qdisk="1"`` is a quorum disk; if its
      ``state`` is not ``"1"`` a CRITICAL line is printed and the process
      exits with status 2;
    * if no node name matches the hostname, the first remaining node that
      clustat itself marks with ``local="1"`` is used instead.  This covers
      hosts whose hostname and cluster node name differ (for example short
      name versus fully qualified name).

    Returns a dict with the keys ``'name'``, ``'state'`` and ``'rgmanager'``
    holding the attribute strings of the local node, or an empty dict when
    no local node could be identified.
    """
    wanted = ('name', 'state', 'rgmanager')
    hostname = socket.gethostname()
    nodeState = {}
    flaggedLocal = None
    for node in dom.getElementsByTagName('node'):
        attrs = node.attributes
        if attrs['name'].value == hostname:
            nodeState = dict((key, attrs[key].value) for key in wanted)
        elif attrs['qdisk'].value == "1":
            if attrs['state'].value != "1":
                print("CRITICAL: Quorum disk " + attrs['name'].value + " is unavailable!")
                sys.exit(2)
        elif flaggedLocal is None and node.getAttribute('local') == "1":
            # getAttribute returns "" when the attribute is absent, so
            # documents without a "local" flag behave as before.
            flaggedLocal = node
    # A hostname match always wins; the local="1" flag is only a fallback.
    if not nodeState and flaggedLocal is not None:
        nodeState = dict((key, flaggedLocal.attributes[key].value) for key in wanted)
    return nodeState
def getServiceState(dom, service):
    """
    Get the state of the named service.

    Looks through the ``<group>`` elements of the parsed clustat document
    for one whose ``name`` is ``"service:" + service``.

    Returns a dict with ``'owner'`` (the group's ``owner`` attribute) and
    ``'state'`` (its ``state_str`` attribute).  When several groups match,
    the last one wins; when none matches, an empty dict is returned.
    """
    wantedName = "service:" + service
    serviceState = {}
    for group in dom.getElementsByTagName('group'):
        if group.attributes['name'].value == wantedName:
            serviceState['owner'] = group.attributes['owner'].value
            serviceState['state'] = group.attributes['state_str'].value
    return serviceState
def main():
    """
    Parse the command line, query clustat and report a Nagios status.

    Options: ``-c/--cluster`` checks quorum and the local node,
    ``-s/--service NAME`` checks one service, ``-h/--help`` prints usage.
    The XML output of ``/usr/sbin/clustat -fx`` is parsed and exactly one
    status line is printed before the process exits with a Nagios code:

    * 0 OK       - node online and cluster quorate / service started
    * 1 WARNING  - local node state is offline
    * 2 CRITICAL - cluster inquorate, RGManager not running, service not
                   started (also used for unrecognised options, as before)
    * 3 UNKNOWN  - no check selected, clustat output unusable, local node
                   or requested service missing from the output

    This function never returns normally; it always ends in ``sys.exit``.
    """
    # Imported locally so this block does not depend on the import header.
    from xml.parsers.expat import ExpatError

    # Nagios plugin exit codes.
    OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3

    try:
        opts, args = getopt.getopt(sys.argv[1:], 's:ch', ['service=', 'cluster', 'help'])
    except getopt.GetoptError:
        usage()
        # Historical exit code for bad options; kept for existing callers.
        sys.exit(2)

    typeCheck = None
    serviceName = None
    for o, a in opts:
        if o in ('-c', '--cluster'):
            typeCheck = 'cluster'
        if o in ('-s', '--service'):
            typeCheck = 'service'
            serviceName = a
        if o in ('-h', '--help'):
            usage()
            sys.exit()

    # Without -c or -s there is nothing to check; say so before running
    # clustat instead of failing later on an unbound variable.
    if typeCheck is None:
        usage()
        sys.exit(UNKNOWN)

    clustatOutput = os.popen('/usr/sbin/clustat -fx')
    try:
        try:
            dom = xml.dom.minidom.parse(clustatOutput)
        except ExpatError:
            # Empty or malformed output, e.g. clustat missing or failing.
            print("UNKNOWN: Unable to parse the output of /usr/sbin/clustat -fx")
            sys.exit(UNKNOWN)
    finally:
        clustatOutput.close()

    if typeCheck == 'cluster':
        # First we query for the state of the cluster itself.
        # Should it be found that the cluster is not quorate we alert and exit immediately.
        cluster = getClusterName(dom)
        qState = getQuorumState(dom)

        # There are some serious problems if the cluster is inquorate so we simply alert immediately!
        if qState != "1":
            print("CRITICAL: Cluster " + cluster + " is inquorate!")
            sys.exit(CRITICAL)

        # Now we find the status of the local node from clustat.
        # We only care about the local state since this way we can tie the alert to the host.
        nodeStates = getLocalNodeState(dom)
        if not nodeStates:
            # No node entry could be matched to this host.
            print("UNKNOWN: Local node was not found in the clustat output")
            sys.exit(UNKNOWN)

        if nodeStates['state'] != "1":
            print("WARNING: Local node state is offline!")
            sys.exit(WARNING)
        elif nodeStates['rgmanager'] != "1":
            print("CRITICAL: RGManager service not running on " + nodeStates['name'] + "!")
            # Exit code now matches the CRITICAL text (it used to be 1).
            sys.exit(CRITICAL)
        else:
            print("OK: Cluster node " + nodeStates['name'] + " is online and cluster is quorate.")
            sys.exit(OK)
    else:
        # typeCheck can only be 'service' here.
        serviceState = getServiceState(dom, serviceName)
        if not serviceState:
            # The requested service does not appear in the clustat output.
            print("UNKNOWN: Service " + serviceName + " was not found in the clustat output")
            sys.exit(UNKNOWN)

        if serviceState['state'] != 'started':
            print("CRITICAL: Service " + serviceName + " on " + serviceState['owner'] + " is in " + serviceState['state'] + " state")
            sys.exit(CRITICAL)
        else:
            print("OK: Service " + serviceName + " on " + serviceState['owner'] + " is in " + serviceState['state'] + " state")
            sys.exit(OK)
# Run the check only when this file is executed as a script.
if __name__ == "__main__":
    main()