KGRKJGETMRETU895U-589TY5MIGM5JGB5SDFESFREWTGR54TY
Server : Apache/2.2.17 (Unix) mod_ssl/2.2.17 OpenSSL/0.9.8e-fips-rhel5 DAV/2 PHP/5.2.17
System : Linux localhost 2.6.18-419.el5 #1 SMP Fri Feb 24 22:47:42 UTC 2017 x86_64
User : nobody ( 99)
PHP Version : 5.2.17
Disable Function : NONE
Directory :  /proc/21585/root/usr/lib/python2.4/site-packages/sos/plugins/

Upload File :
current_dir [ Writeable ] document_root [ Writeable ]

 

Current File : //proc/21585/root/usr/lib/python2.4/site-packages/sos/plugins/cluster.py
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.

## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.

## You should have received a copy of the GNU General Public License
## along with this program; if not, write to the Free Software
## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

import sos.plugintools
import commands, os, re
import libxml2, glob

# pylint: disable-msg = C0301

# libxml2 error handler
def noerr(ctx, str):
    """libxml2 error-handler callback: swallow parser errors silently.

    Registered via libxml2.registerErrorHandler() so that malformed
    cluster.conf files do not spew warnings onto the console.
    """
    return None

class cluster(sos.plugintools.PluginBase):
    """cluster suite and GFS related information
    """

    # (name, description, speed-class, default) tuples understood by the
    # sos option machinery; both dump options are off by default ('slow').
    optionList = [("gfslockdump", 'gather output of gfs lockdumps', 'slow', False),
                  ('lockdump', 'gather dlm lockdumps', 'slow', False)]

    def checkenabled(self):
        """Return True if any RHCS/GFS package or config file is present."""
        # enable if any related package is installed
        for pkg in [ "rgmanager", "luci", "ricci", "system-config-cluster",
                     "gfs-utils", "gnbd", "kmod-gfs", "kmod-gnbd", "lvm2-cluster" ]:
            if self.cInfo["policy"].pkgByName(pkg) != None:
                return True

        # enable if any related file is present
        # (os.path.exists gives the same False-on-any-stat-error semantics
        # as the previous bare try/except around os.stat)
        for fname in [ "/etc/cluster/cluster.conf" ]:
            if os.path.exists(fname):
                return True

        # no data related to RHCS/GFS exists
        return False

    def has_gfs(self):
        """Return True if a gfs filesystem is currently mounted.

        Parses /proc/mounts; field 2 of each line is the fs type.
        """
        fp = open("/proc/mounts", "r")
        # try/finally so the early return below cannot leak the handle
        try:
            for line in fp.readlines():
                mntline = line.split(" ")
                if len(mntline) > 2 and mntline[2] == "gfs":
                    return True
        finally:
            fp.close()
        return False

    def diagnose(self):
        """Run RHEL5 cluster sanity checks, reporting via addDiagnose().

        Checks package set, loaded modules, service state, quorum,
        cluster.conf consistency, fencing configuration, NFS fsid
        attributes, GFS superblock lock settings and fence-group state.
        """
        try:
            rhelver = self.cInfo["policy"].rhelVersion()
        except:
            rhelver = None

        # FIXME: we should only run tests specific for the version, now just do them all regardless
        if rhelver == 5:
            # check that kernel module packages are installed for
            # running kernel version
            pkgs_check = [ ]
            if self.has_gfs():
                pkgs_check.append("kmod-gfs")

            for pkgname in pkgs_check:
                if not self.cInfo["policy"].pkgByName(pkgname):
                    self.addDiagnose("required package is missing: %s" % pkgname)

            # check if the minimum set of packages is installed
            # for RHEL4 RHCS(ccs, cman, cman-kernel, magma, magma-plugins, (dlm, dlm-kernel) || gulm, perl-Net-Telnet, rgmanager, fence)
            # RHEL4 GFS (GFS, GFS-kernel, ccs, lvm2-cluster, fence)
            # NOTE: this check is independent of the kmod check above and so
            # runs unconditionally (it was previously mis-indented inside the
            # missing-kmod branch and only ran when a kmod package was absent)
            for pkg in [ "cman", "perl-Net-Telnet", "rgmanager" ]:
                if self.cInfo["policy"].pkgByName(pkg) == None:
                    self.addDiagnose("required package is missing: %s" % pkg)

            # let's make modules sure are loaded
            mods_check = [ "dlm" ]
            if self.has_gfs():
                mods_check.append("gfs")
            for module in mods_check:
                if len(self.fileGrep("^%s " % module, "/proc/modules")) == 0:
                    self.addDiagnose("required module is not loaded: %s" % module)

            # check if all the needed daemons are active at sosreport time
            # check if they are started at boot time in RHEL5 RHCS (rgmanager, cman)
            # and GFS (gfs, ccsd, clvmd, fenced)
            checkserv = [ "cman", "rgmanager" ]
            if self.has_gfs():
                checkserv.extend( ["gfs", "clvmd"] )
            for service in checkserv:
                status, output = commands.getstatusoutput("/sbin/service %s status" % service)
                if status:
                    self.addDiagnose("service %s is not running" % service)

                if not self.cInfo["policy"].runlevelDefault() in self.cInfo["policy"].runlevelByService(service):
                    self.addDiagnose("service %s is not started in default runlevel" % service)

            # FIXME: any cman service whose state != run ?
            # Fence Domain:    "default"                                  2   2 run       -

            # is cluster quorate
            if not self.is_cluster_quorate():
                self.addDiagnose("cluster node is not quorate")

            # if there is no cluster.conf, diagnose() finishes here.
            try:
                os.stat("/etc/cluster/cluster.conf")
            except:
                self.addDiagnose("/etc/cluster/cluster.conf is missing")
                return

            # suppress libxml2 console output
            libxml2.registerErrorHandler(noerr, None)
            # setup XML xpath context
            try:
                xml = libxml2.parseFile("/etc/cluster/cluster.conf")
            except libxml2.parserError:
                self.addDiagnose("/etc/cluster/cluster.conf contains malformed XML")
                return

            xpathContext = xml.xpathNewContext()

            # check fencing (warn on no fencing)
            if len(xpathContext.xpathEval("/cluster/clusternodes/clusternode[not(fence/method/device)]")):
                if self.has_gfs():
                    self.addDiagnose("one or more nodes have no fencing agent configured: fencing is required for GFS to work")
                else:
                    self.addDiagnose("one or more nodes have no fencing agent configured: the cluster infrastructure might not work as intended")

            # check fencing (warn on manual)
            if len(xpathContext.xpathEval("/cluster/clusternodes/clusternode[/cluster/fencedevices/fencedevice[@agent='fence_manual']/@name=fence/method/device/@name]")):
                self.addDiagnose("one or more nodes have manual fencing agent configured (data integrity is not guaranteed)")

            # if fence_ilo or fence_drac, make sure acpid is not running
            hostname = commands.getoutput("/bin/uname -n").split(".")[0]
            if len(xpathContext.xpathEval('/cluster/clusternodes/clusternode[@name = "%s" and /cluster/fencedevices/fencedevice[@agent="fence_rsa" or @agent="fence_drac"]/@name=fence/method/device/@name]' % hostname )):
                status, output = commands.getstatusoutput("/sbin/service acpid status")
                if status == 0 or self.cInfo["policy"].runlevelDefault() in self.cInfo["policy"].runlevelByService("acpid"):
                    self.addDiagnose("acpid is enabled, this may cause problems with your fencing method.")

            # check for fs exported via nfs without nfsid attribute
            if len(xpathContext.xpathEval("/cluster/rm/service//fs[not(@fsid)]/nfsexport")):
                for xmlNode in xpathContext.xpathEval("/cluster/rm/service//fs[not(@fsid)]"):
                    fsRefAttribute = xmlNode.xpathEval("@ref")
                    if (len(fsRefAttribute) > 0) :
                        # fs is a reference: the fsid may live on the shared
                        # resource definition, so check there before warning
                        fsRefName = fsRefAttribute[0].content
                        if len(xpathContext.xpathEval("cluster/rm/resources/fs[@name='%s'][not(@fsid)]" % fsRefName)):
                            self.addDiagnose("one or more nfs export do not have a fsid attribute set.")
                            break
                    else:
                        self.addDiagnose("one or more nfs export do not have a fsid attribute set.")

            # cluster.conf file version and the in-memory cluster configuration version matches
            status, cluster_version = commands.getstatusoutput("cman_tool status | grep 'Config version'")
            if not status:
                # drop the leading 'Config version: ' (16 chars), keep the number
                cluster_version = cluster_version[16:].strip()
            else:
                cluster_version = None
            # guard the xpath result: a cluster.conf without a config_version
            # attribute would otherwise raise IndexError here
            conf_version_nodes = xpathContext.xpathEval("/cluster/@config_version")
            if conf_version_nodes:
                conf_version = conf_version_nodes[0].content
            else:
                conf_version = None

            if status == 0 and conf_version is not None and conf_version != cluster_version:
                self.addDiagnose("cluster.conf and in-memory configuration version differ (%s != %s)" % (conf_version, cluster_version) )

            # make sure the first part of the lock table matches the cluster name
            # and that the locking protocol is sane
            cluster_name_nodes = xpathContext.xpathEval("/cluster/@name")
            if cluster_name_nodes:
                cluster_name = cluster_name_nodes[0].content
            else:
                cluster_name = None

            for fs in self.fileGrep(r'^[^#][/\w]*\W*[/\w]*\W*gfs', "/etc/fstab"):
                # for each gfs entry
                fs = fs.split()

                lockproto = self.get_gfs_sb_field(fs[0], "sb_lockproto")
                if lockproto and lockproto != self.get_locking_proto():
                    self.addDiagnose("gfs mountpoint (%s) is using the wrong locking protocol (%s)" % (fs[0], lockproto) )

                locktable = self.get_gfs_sb_field(fs[0], "sb_locktable")
                if not locktable:
                    # superblock could not be read; nothing to compare
                    # (explicit check replaces a bare except that silently
                    # swallowed the AttributeError on False.split)
                    continue
                locktable = locktable.split(":")[0]
                if locktable != cluster_name:
                    self.addDiagnose("gfs mountpoint (%s) is using the wrong locking table" % fs[0])

            # Test fence groups for valid id and stat
            self.test_fence_id()

            # Check for existence of weak-updates in gfs2 prior to 2.6.18-128
            if rhelver == 5:
                vermagic = commands.getoutput("modinfo -F vermagic gfs2")
                # just kernel release from vermagic line
                vermagic = vermagic[len("2.6.18-"):]
                vermagic = vermagic[:vermagic.find('.')]
                # modinfo may fail (no gfs2 module) or report an unexpected
                # kernel string; skip the check rather than crash on int()
                try:
                    update_release = int(vermagic)
                except ValueError:
                    update_release = None
                if update_release is not None and update_release < 128:
                    self.addDiagnose('GFS2 is being used via weak-updates, kmod-gfs2 should be uninstalled and system rebooted ' \
                                         'to allow for kernel provided gfs2 module to be used.')

            # libxml2 python binding objects are not reference counted
            xpathContext.xpathFreeContext()
            xml.freeDoc()

    def setup(self):
        """Collect cluster configuration files and tool output."""
        self.collectExtOutput("/sbin/fdisk -l")
        self.addCopySpec("/etc/cluster.conf")
        self.addCopySpec("/etc/cluster.xml")
        self.addCopySpec("/etc/cluster")
        self.collectExtOutput("/usr/sbin/rg_test test /etc/cluster/cluster.conf")

        self.collectExtOutput("cman_tool status")
        self.collectExtOutput("cman_tool services")
        self.collectExtOutput("cman_tool -af nodes")

        self.collectExtOutput("ccs_tool lsnode")
        # multicast information (also meaningful outside the RHEL5 branch)
        self.collectExtOutput("openais-cfgtool -s")
        self.collectExtOutput("clustat")

        # Check RHEL version, if 5, then grab group_tool output.
        try:
            rhelver = self.cInfo["policy"].rhelVersion()
        except:
            rhelver = None

        if rhelver == 5:
            # openais-cfgtool -s is already collected unconditionally above;
            # the duplicate collection has been removed
            self.collectExtOutput("group_tool -v")
            self.collectExtOutput("group_tool dump fence")
            self.collectExtOutput("group_tool dump gfs")
            self.collectExtOutput("group_tool dump")

        if self.isOptionEnabled('gfslockdump'):
            self.do_gfslockdump()
        if self.isOptionEnabled('lockdump'):
            self.do_lockdump()

        return

    def do_lockdump(self):
        """Collect a dlm lock debug dump for every dlm lockspace."""
        status, output, time = self.callExtProgWithOutput("group_tool")
        # group_tool lines look like: "dlm  <id>  <name> ..."; capture <name>
        for lockspace in re.compile(r'^dlm\s+[^\s]+\s+([^\s]+)', re.MULTILINE).findall(output):
            self.collectExtOutput("dlm_tool lockdebug '%s'" % lockspace,
                suggest_filename = "dlm_locks_%s" % lockspace)

    def get_locking_proto(self):
        """Return the expected GFS locking protocol name.

        FIXME: what's the best way to find out?  lock_gulm clusters are not
        detected; lock_dlm is assumed (unreachable lock_gulm return removed).
        """
        return "lock_dlm"

    def do_gfslockdump(self):
        """Collect a gfs_tool lockdump for every mounted gfs filesystem."""
        fp = open("/proc/mounts", "r")
        try:
            for line in fp.readlines():
                mntline = line.split(" ")
                if len(mntline) > 2 and mntline[2] == "gfs":
                    self.collectExtOutput("/sbin/gfs_tool lockdump %s" % mntline[1], symlink = "gfs_lockdump_" + self.mangleCommand(mntline[1]) )
        finally:
            fp.close()

    def do_rgmgr_bt(self):
        # FIXME: threads backtrace
        return

    def postproc(self):
        """Mask fence-device passwords in every collected cluster.conf copy."""
        for cluster_conf in glob.glob("/etc/cluster/cluster.conf*"):
            self.doRegexSub(cluster_conf, r"(\s*\<fencedevice\s*.*\s*passwd\s*=\s*)\S+(\")", r"\1%s" %('"***"'))
        return

    def is_cluster_quorate(self):
        """Return True if this node is a quorate cluster member.

        Returns False when cman reports another membership state and
        None when the state cannot be determined.
        """
        output = commands.getoutput("cman_tool status | grep '^Membership state: '")
        try:
            state = output.split(":")[1].strip()
        except IndexError:
            # cman_tool missing/failed: no ':' in output, state unknown
            return None
        if state == "Cluster-Member":
            return True
        return False

    def get_gfs_sb_field(self, device, field):
        """Return the value of *field* from the gfs superblock on *device*.

        Returns False when the field is not found (e.g. gfs_tool failed).
        """
        for line in commands.getoutput("/sbin/gfs_tool sb %s all" % device).split("\n"):
            if re.match(r'^\W*%s = ' % field, line):
                return line.split("=")[1].strip()
        return False

    # Diagnostic testing functions
    def test_fence_id(self):
        """Flag fence groups with an invalid (all-zero) id or a bad state."""
        # resolves rhbz 499468 and 499472
        for line in commands.getoutput(r"/sbin/group_tool ls | grep -v '^\['").split("\n")[1:]:
            fields = line.split()
            if not fields:
                # blank line: indexing fields[-1] would raise IndexError
                continue
            for a in fields:
                # we can do this since fence id is a fix field
                if re.match('00000000', a):
                    self.addDiagnose('Invalid fence id: %s' % (line,))
            if fields[-1] != 'none':
                self.addDiagnose("Possible incorrect state: %s, for group: %s" % (fields[-1], line))
        return


Anon7 - 2021