#!/usr/bin/env python
#
# check_rhcs - Nagios/NRPE plugin for a Red Hat Cluster Suite node.
#
# Gather the cluster state and the current node state by parsing the XML
# printed by `clustat -fx`.
#
# Elements and attributes read from that XML:
#   <cluster name=...>                          cluster name
#   <quorum quorate=...>                        "1" when the cluster is quorate
#   <node name=... state=... rgmanager=... qdisk=...>
#   <group name="service:NAME" owner=... state_str=...>
#
# Exit codes follow the Nagios plugin convention:
#   0 = OK, 1 = WARNING, 2 = CRITICAL, 3 = UNKNOWN
#
# Original author: Frank Clements
# Added in commit c2c93c05fd6df8ce13f5c416c6a939110e9407c4
# (Pall Gudjon Sigurdsson, 2010-12-01, "check_rhcs added") as misc/check_rhcs.
#
# Companion NRPE configuration from the same commit
# (misc/nrpe.d/check_rhcs.cfg):
#
#   command[check_rhcs]=/usr/lib64/nagios/plugins/check_rhcs -H rek-oraheart-p04 -c
#   command[check_rhcs_service]=/usr/lib64/nagios/plugins/check_rhcs -s '$ARG1$'
#
# The code deliberately sticks to syntax that is valid on both Python 2 and
# Python 3 (single-argument print(), no exception binding, no `with`).

import xml.dom.minidom
import xml.parsers.expat
import os
import sys, socket
import getopt

# Nagios plugin exit codes.
STATE_OK = 0
STATE_WARNING = 1
STATE_CRITICAL = 2
STATE_UNKNOWN = 3


def usage():
    """
    Display usage information
    """
    print("""
Usage: """ + sys.argv[0] + """ ([-s serviceName] | [-c]) [-H nodeName]

-c, --cluster
    Gathers the overall cluster status for the local node
-s, --service
    Gets the stats of the named service
-H, --hostname
    Node name to look up in the clustat output
    (defaults to the local hostname)
-h, --help
    Display this
""")


def getQuorumState(dom):
    """
    Get the quorum state.  This is a single inline element which only
    has attributes and no children elements.

    Returns the value of the 'quorate' attribute of the first <quorum>
    element.  Raises IndexError when there is no <quorum> element and
    KeyError when it has no 'quorate' attribute.
    """
    quorumList = dom.getElementsByTagName('quorum')
    quorumElement = quorumList[0]

    return quorumElement.attributes['quorate'].value


def getClusterName(dom):
    """
    Get the name of the cluster from the clustat output.
    This assumes only a single cluster is running for the moment.

    Raises IndexError when there is no <cluster> element and KeyError
    when it has no 'name' attribute.
    """
    clusterList = dom.getElementsByTagName('cluster')
    clusterElement = clusterList[0]

    return clusterElement.attributes['name'].value


def getLocalNodeState(dom, hostname=None):
    """
    Get the state of the local node.

    dom      -- parsed clustat XML document
    hostname -- node name to look for; defaults to socket.gethostname()

    Returns a dict with the keys 'name', 'state' and 'rgmanager' for the
    matching <node> element, or an empty dict when no node matches.

    Side effect: while scanning the other nodes, a quorum disk
    (qdisk == "1") whose state is not "1" makes the plugin print a
    CRITICAL message and exit immediately with status 2.
    """
    if hostname is None:
        hostname = socket.gethostname()

    nodeState = {}

    for node in dom.getElementsByTagName('node'):
        # getAttribute() returns '' for a missing attribute, so a <node>
        # without e.g. 'qdisk' no longer aborts the check with a KeyError.
        name = node.getAttribute('name')

        if name == hostname:
            nodeState['name'] = name
            nodeState['state'] = node.getAttribute('state')
            nodeState['rgmanager'] = node.getAttribute('rgmanager')

        elif node.getAttribute('qdisk') == "1":
            if node.getAttribute('state') != "1":
                print("CRITICAL: Quorum disk " + name + " is unavailable!")
                sys.exit(STATE_CRITICAL)

    return nodeState


def getServiceState(dom, service):
    """
    Get the state of the named service.

    Looks for a <group> element named "service:<service>" and returns a
    dict with the keys 'owner' and 'state' (taken from the 'state_str'
    attribute).  Returns an empty dict when the service is not listed.
    """
    serviceState = {}

    for group in dom.getElementsByTagName('group'):
        if group.getAttribute('name') == "service:" + service:
            serviceState['owner'] = group.getAttribute('owner')
            serviceState['state'] = group.getAttribute('state_str')

    return serviceState


def evaluateCluster(dom, hostname=None):
    """
    Work out the plugin result for the cluster/node check.

    Returns a (exitCode, message) tuple.  May exit directly with
    CRITICAL through getLocalNodeState() when a quorum disk is down.
    """
    if hostname is None:
        hostname = socket.gethostname()

    # First we query for the state of the cluster itself.
    # Should it be found that the cluster is not quorate we alert immediately;
    # there are some serious problems if the cluster is inquorate.
    cluster = getClusterName(dom)
    if getQuorumState(dom) != "1":
        return STATE_CRITICAL, "CRITICAL: Cluster " + cluster + " is inquorate!"

    # Now we find the status of the local node from clustat.
    # We only care about the local state since this way we can tie the
    # alert to the host.
    nodeStates = getLocalNodeState(dom, hostname)
    if not nodeStates:
        # The node name did not match any <node>; report that instead of
        # crashing on a missing dict key.
        return STATE_UNKNOWN, ("UNKNOWN: Node " + hostname +
                               " not found in clustat output")

    if nodeStates['state'] != "1":
        return STATE_WARNING, "WARNING: Local node state is offline!"

    if nodeStates['rgmanager'] != "1":
        # The message says CRITICAL, so the exit code has to be 2 (it
        # used to be 1, which Nagios displays as WARNING).
        return STATE_CRITICAL, ("CRITICAL: RGManager service not running on " +
                                nodeStates['name'] + "!")

    return STATE_OK, ("OK: Cluster node " + nodeStates['name'] +
                      " is online and cluster is quorate.")


def evaluateService(dom, serviceName):
    """
    Work out the plugin result for the named service.

    Returns a (exitCode, message) tuple: OK when the service state is
    'started', CRITICAL for any other state, UNKNOWN when the service
    does not appear in the clustat output.
    """
    serviceState = getServiceState(dom, serviceName)
    if not serviceState:
        return STATE_UNKNOWN, ("UNKNOWN: Service " + serviceName +
                               " not found in clustat output")

    summary = ("Service " + serviceName + " on " + serviceState['owner'] +
               " is in " + serviceState['state'] + " state")

    if serviceState['state'] != 'started':
        return STATE_CRITICAL, "CRITICAL: " + summary

    return STATE_OK, "OK: " + summary


def readClusterStatus():
    """
    Run `clustat -fx` and return its output as a parsed minidom document.

    Raises xml.parsers.expat.ExpatError when the output is not
    well-formed XML (for example when clustat printed nothing).
    """
    clustatOutput = os.popen('clustat -fx')
    try:
        return xml.dom.minidom.parse(clustatOutput)
    finally:
        # Always release the pipe, even when parsing fails.
        clustatOutput.close()


def main():
    """
    Parse the command line, run the requested check, print the status
    line and exit with the matching Nagios status code.
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], 's:cH:h',
                                   ['service=', 'cluster', 'hostname=', 'help'])
    except getopt.GetoptError:
        usage()
        sys.exit(STATE_UNKNOWN)

    # Pre-set everything so a missing -c/-s is reported as a usage error
    # rather than a NameError further down.
    typeCheck = None
    serviceName = None
    hostname = None

    for o, a in opts:
        if o in ('-c', '--cluster'):
            typeCheck = 'cluster'
        if o in ('-s', '--service'):
            typeCheck = 'service'
            serviceName = a
        if o in ('-H', '--hostname'):
            hostname = a
        if o in ('-h', '--help'):
            usage()
            sys.exit(STATE_OK)

    if typeCheck is None:
        usage()
        sys.exit(STATE_UNKNOWN)

    try:
        dom = readClusterStatus()
    except xml.parsers.expat.ExpatError:
        print("UNKNOWN: Unable to parse the output of 'clustat -fx'")
        sys.exit(STATE_UNKNOWN)

    try:
        if typeCheck == 'cluster':
            code, message = evaluateCluster(dom, hostname)
        else:
            code, message = evaluateService(dom, serviceName)
    except (IndexError, KeyError):
        # Raised by getClusterName()/getQuorumState() when the expected
        # element or attribute is absent.
        print("UNKNOWN: Unexpected XML structure in the output of 'clustat -fx'")
        sys.exit(STATE_UNKNOWN)

    print(message)
    sys.exit(code)


if __name__ == "__main__":
    main()