## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
## You should have received a copy of the GNU General Public License
## along with this program; if not, write to the Free Software
## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
import sos.plugintools
import commands, os, re
import libxml2, glob

# pylint: disable-msg = C0301

# libxml2 error handler
def noerr(ctx, str):
    pass
class cluster(sos.plugintools.PluginBase):
    """cluster suite and GFS related information
    """
    optionList = [("gfslockdump", 'gather output of gfs lockdumps', 'slow', False),
                  ('lockdump', 'gather dlm lockdumps', 'slow', False)]
    def checkenabled(self):
        # enable if any related package is installed
        for pkg in [ "rgmanager", "luci", "ricci", "system-config-cluster",
                     "gfs-utils", "gnbd", "kmod-gfs", "kmod-gnbd", "lvm2-cluster" ]:
            if self.cInfo["policy"].pkgByName(pkg) != None:
                return True
        # enable if any related file is present
        for fname in [ "/etc/cluster/cluster.conf" ]:
            try:
                os.stat(fname)
            except:
                pass
            else:
                return True
        # no data related to RHCS/GFS exists
        return False
    def has_gfs(self):
        fp = open("/proc/mounts", "r")
        for line in fp.readlines():
            mntline = line.split(" ")
            if mntline[2] == "gfs":
                fp.close()
                return True
        fp.close()
        return False
    def diagnose(self):
        try:
            rhelver = self.cInfo["policy"].rhelVersion()
        except:
            rhelver = None
        # FIXME: we should only run the tests specific to this version; for now just run them all regardless
        if rhelver == 5:
            # check that kernel module packages are installed for
            # the running kernel version
            pkgs_check = [ ]
            if self.has_gfs():
                pkgs_check.append("kmod-gfs")
            for pkgname in pkgs_check:
                if not self.cInfo["policy"].pkgByName(pkgname):
                    self.addDiagnose("required package is missing: %s" % pkgname)
        # check if the minimum set of packages is installed
        # for RHEL4 RHCS (ccs, cman, cman-kernel, magma, magma-plugins, (dlm, dlm-kernel) || gulm, perl-Net-Telnet, rgmanager, fence)
        # RHEL4 GFS (GFS, GFS-kernel, ccs, lvm2-cluster, fence)
        for pkg in [ "cman", "perl-Net-Telnet", "rgmanager" ]:
            if self.cInfo["policy"].pkgByName(pkg) == None:
                self.addDiagnose("required package is missing: %s" % pkg)
        # let's make sure the required modules are loaded
        mods_check = [ "dlm" ]
        if self.has_gfs():
            mods_check.append("gfs")
        for module in mods_check:
            if len(self.fileGrep("^%s " % module, "/proc/modules")) == 0:
                self.addDiagnose("required module is not loaded: %s" % module)
        # check if all the needed daemons are active at sosreport time
        # check if they are started at boot time in RHEL5 RHCS (rgmanager, cman)
        # and GFS (gfs, ccsd, clvmd, fenced)
        checkserv = [ "cman", "rgmanager" ]
        if self.has_gfs():
            checkserv.extend( ["gfs", "clvmd"] )
        for service in checkserv:
            status, output = commands.getstatusoutput("/sbin/service %s status" % service)
            if status:
                self.addDiagnose("service %s is not running" % service)
            if not self.cInfo["policy"].runlevelDefault() in self.cInfo["policy"].runlevelByService(service):
                self.addDiagnose("service %s is not started in default runlevel" % service)
        # FIXME: any cman service whose state != run ?
        # Fence Domain: "default" 2 2 run -
        # is the cluster quorate
        if not self.is_cluster_quorate():
            self.addDiagnose("cluster node is not quorate")
        # if there is no cluster.conf, diagnose() finishes here.
        try:
            os.stat("/etc/cluster/cluster.conf")
        except:
            self.addDiagnose("/etc/cluster/cluster.conf is missing")
            return
        # suppress libxml2 console output
        libxml2.registerErrorHandler(noerr, None)
        # setup XML xpath context
        try:
            xml = libxml2.parseFile("/etc/cluster/cluster.conf")
        except libxml2.parserError:
            self.addDiagnose("/etc/cluster/cluster.conf contains malformed XML")
            return
        xpathContext = xml.xpathNewContext()
        # check fencing (warn on no fencing)
        if len(xpathContext.xpathEval("/cluster/clusternodes/clusternode[not(fence/method/device)]")):
            if self.has_gfs():
                self.addDiagnose("one or more nodes have no fencing agent configured: fencing is required for GFS to work")
            else:
                self.addDiagnose("one or more nodes have no fencing agent configured: the cluster infrastructure might not work as intended")
        # check fencing (warn on manual fencing)
        if len(xpathContext.xpathEval("/cluster/clusternodes/clusternode[/cluster/fencedevices/fencedevice[@agent='fence_manual']/@name=fence/method/device/@name]")):
            self.addDiagnose("one or more nodes have a manual fencing agent configured (data integrity is not guaranteed)")
        # if this node uses fence_rsa or fence_drac, make sure acpid is not running
        hostname = commands.getoutput("/bin/uname -n").split(".")[0]
        if len(xpathContext.xpathEval('/cluster/clusternodes/clusternode[@name = "%s" and /cluster/fencedevices/fencedevice[@agent="fence_rsa" or @agent="fence_drac"]/@name=fence/method/device/@name]' % hostname )):
            status, output = commands.getstatusoutput("/sbin/service acpid status")
            if status == 0 or self.cInfo["policy"].runlevelDefault() in self.cInfo["policy"].runlevelByService("acpid"):
                self.addDiagnose("acpid is enabled: this may cause problems with your fencing method.")
        # check for fs resources exported via nfs without an fsid attribute
        if len(xpathContext.xpathEval("/cluster/rm/service//fs[not(@fsid)]/nfsexport")):
            for xmlNode in xpathContext.xpathEval("/cluster/rm/service//fs[not(@fsid)]"):
                fsRefAttribute = xmlNode.xpathEval("@ref")
                if (len(fsRefAttribute) > 0):
                    fsRefName = fsRefAttribute[0].content
                    if len(xpathContext.xpathEval("cluster/rm/resources/fs[@name='%s'][not(@fsid)]" % fsRefName)):
                        self.addDiagnose("one or more nfs exports do not have a fsid attribute set.")
                        break
                else:
                    self.addDiagnose("one or more nfs exports do not have a fsid attribute set.")
        # check that the cluster.conf file version and the in-memory cluster configuration version match
        status, cluster_version = commands.getstatusoutput("cman_tool status | grep 'Config version'")
        if not status:
            # strip the leading "Config version: " label, keeping only the number
            cluster_version = cluster_version[16:]
        else:
            cluster_version = None
        conf_version = xpathContext.xpathEval("/cluster/@config_version")[0].content
        if status == 0 and conf_version != cluster_version:
            self.addDiagnose("cluster.conf and in-memory configuration versions differ (%s != %s)" % (conf_version, cluster_version) )
        # make sure the first part of the lock table matches the cluster name
        # and that the locking protocol is sane
        cluster_name = xpathContext.xpathEval("/cluster/@name")[0].content
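        # scan /etc/fstab for active (non-comment) entries whose filesystem type is gfs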
        for fs in self.fileGrep(r'^[^#][/\w]*\W*[/\w]*\W*gfs', "/etc/fstab"):
            # for each gfs entry
            fs = fs.split()
            lockproto = self.get_gfs_sb_field(fs[0], "sb_lockproto")
            if lockproto and lockproto != self.get_locking_proto():
                self.addDiagnose("gfs mountpoint (%s) is using the wrong locking protocol (%s)" % (fs[0], lockproto) )
            locktable = self.get_gfs_sb_field(fs[0], "sb_locktable")
            try:
                locktable = locktable.split(":")[0]
            except:
                continue
            if locktable != cluster_name:
                self.addDiagnose("gfs mountpoint (%s) is using the wrong locking table" % fs[0])
        # Test fence groups for valid id and state
        self.test_fence_id()
        # Check for existence of weak-updates in gfs2 prior to 2.6.18-128
        if rhelver == 5:
            vermagic = commands.getoutput("modinfo -F vermagic gfs2")
            # keep just the kernel release from the vermagic line
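            # i.e. strip the leading "2.6.18-" and keep the digits up to the next '.',
            # which is the kernel build number the gfs2 module was built against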
            vermagic = vermagic[len("2.6.18-"):]
            vermagic = vermagic[:vermagic.find('.')]
            if int(vermagic) < 128:
                self.addDiagnose('GFS2 is being used via weak-updates, kmod-gfs2 should be uninstalled and the system rebooted '
                                 'to allow the kernel-provided gfs2 module to be used.')
        # libxml2 python binding objects are not reference counted
        xpathContext.xpathFreeContext()
        xml.freeDoc()
    def setup(self):
        self.collectExtOutput("/sbin/fdisk -l")
        self.addCopySpec("/etc/cluster.conf")
        self.addCopySpec("/etc/cluster.xml")
        self.addCopySpec("/etc/cluster")
        self.collectExtOutput("/usr/sbin/rg_test test /etc/cluster/cluster.conf")
        self.collectExtOutput("cman_tool status")
        self.collectExtOutput("cman_tool services")
        self.collectExtOutput("cman_tool -af nodes")
        self.collectExtOutput("ccs_tool lsnode")
        self.collectExtOutput("openais-cfgtool -s")
        self.collectExtOutput("clustat")
        # On RHEL5, grab openais-cfgtool and group_tool output for multicast and group information.
        try:
            rhelver = self.cInfo["policy"].rhelVersion()
        except:
            rhelver = None
        if rhelver == 5:
            self.collectExtOutput("openais-cfgtool -s")
            self.collectExtOutput("group_tool -v")
            self.collectExtOutput("group_tool dump fence")
            self.collectExtOutput("group_tool dump gfs")
            self.collectExtOutput("group_tool dump")
        if self.isOptionEnabled('gfslockdump'):
            self.do_gfslockdump()
        if self.isOptionEnabled('lockdump'):
            self.do_lockdump()
        return
    def do_lockdump(self):
        status, output, time = self.callExtProgWithOutput("group_tool")
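        # group_tool lists one group per line; for lines of type "dlm" the
        # third whitespace-separated field is the lockspace name, which the
        # regex below captures so each lockspace can be dumped individually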
        for lockspace in re.compile(r'^dlm\s+[^\s]+\s+([^\s]+)', re.MULTILINE).findall(output):
            self.collectExtOutput("dlm_tool lockdebug '%s'" % lockspace,
                                  suggest_filename = "dlm_locks_%s" % lockspace)
    def get_locking_proto(self):
        # FIXME: what's the best way to find out ?
        return "lock_dlm"
        return "lock_gulm"
    def do_gfslockdump(self):
        fp = open("/proc/mounts", "r")
        for line in fp.readlines():
            mntline = line.split(" ")
            if mntline[2] == "gfs":
                self.collectExtOutput("/sbin/gfs_tool lockdump %s" % mntline[1],
                                      symlink = "gfs_lockdump_" + self.mangleCommand(mntline[1]) )
        fp.close()
    def do_rgmgr_bt(self):
        # FIXME: threads backtrace
        return
    def postproc(self):
        for cluster_conf in glob.glob("/etc/cluster/cluster.conf*"):
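            # mask fence device passwords in the collected copies of cluster.conf
            # by replacing the value of any passwd=... attribute with "***"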
            self.doRegexSub(cluster_conf, r"(\s*\<fencedevice\s*.*\s*passwd\s*=\s*)\S+(\")", r"\1%s" % ('"***"'))
        return
    def is_cluster_quorate(self):
        output = commands.getoutput("cman_tool status | grep '^Membership state: '")
        try:
            if output.split(":")[1].strip() == "Cluster-Member":
                return True
            else:
                return False
        except:
            pass
        return None
    def get_gfs_sb_field(self, device, field):
        for line in commands.getoutput("/sbin/gfs_tool sb %s all" % device).split("\n"):
            if re.match('^\W*%s = ' % field, line):
                return line.split("=")[1].strip()
        return False
    # Diagnostic testing functions
    def test_fence_id(self):
        # resolves rhbz 499468 and 499472
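        # each non-header line of "group_tool ls" describes one group
        # ("type level name id state"); an id of all zeroes is invalid, and a
        # state other than "none" is flagged as possibly incorrect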
        for line in commands.getoutput("/sbin/group_tool ls | grep -v '^\['").split("\n")[1:]:
            for a in line.split():
                # we can do this since the fence id is a fixed-width field
                if re.match('00000000', a):
                    self.addDiagnose('Invalid fence id: %s' % (line,))
            if line.split()[-1] != 'none':
                self.addDiagnose("Possible incorrect state: %s, for group: %s" % (line.split()[-1], line))
        return