#!/usr/bin/python3
#
# PowerFlex DAX issues detection & remediation utilities V1.0.0
#

###############################################################################
# GENERAL UTILITIES START
###############################################################################
import logging
import argparse
import subprocess
import sys
import os
import glob
import shutil
import collections
import re
import json

logger = logging.getLogger(__name__)


###############################################################################
def verifyRequiredCommands(*args):
    """Check that every named executable can be found on PATH.

    Args:
        *args: Command names to look up with shutil.which().

    Returns:
        True when all commands are available (also when none are given),
        False as soon as the first missing command is encountered.
    """
    for curCmd in args:
        if shutil.which(curCmd) is None:
            logger.error(
                "Required command {} is not available.".format(curCmd))
            return False
        logger.info("Required command {} is available.".format(curCmd))
    return True


###############################################################################
def setupLogging(logFileName):
    """Configure root logging to write to stdout and to a log file.

    Args:
        logFileName: Log file path. It is joined onto the directory holding
            this script, so a relative path is resolved against the script's
            directory (not the current working directory), while an absolute
            path is used unchanged.
    """
    logging.basicConfig(
        level=logging.INFO,
        handlers=(logging.StreamHandler(sys.stdout),
                  logging.FileHandler(
                      os.path.join(os.path.dirname(__file__), logFileName))),
        format="%(asctime)s - %(levelname)s - %(message)s")


###############################################################################
# badDimms:      dict of DIMM name -> set of bad flag names
# badNamespaces: dict of namespace name -> dict describing the namespace
#                (keys "region", "mode", "size", "align", "region_align",
#                "flags", "devices" and, for devdax namespaces, "map")
BadNvdimmStatus = collections.namedtuple(
    "BadNvdimmStatus", [
        "badDimms",
        "badNamespaces"
    ])

###############################################################################
# badNvdimmStatus is a BadNvdimmStatus, or None when nothing bad was found or
# the host could not be analyzed.
HostStatus = collections.namedtuple(
    "HostStatus", [
        "hostId",
        "hostDescr",
        "badNvdimmStatus"
    ])


###############################################################################
class NdctlAnalyzer(object):
    """Classifies hosts as good / bad / unknown from parsed ndctl JSON.

    Hosts are accumulated through the process*() methods and reported with
    writeHealthSummary().
    """

    # Per-DIMM boolean flags that mark a DIMM as unhealthy when set.
    _DIMM_BAD_FLAG_NAMES = (
        "flag_failed_arm",
        "flag_failed_map",
        "flag_failed_save",
        "flag_failed_restore",
        "flag_failed_flush",
        "flag_smart_event",
    )

    def __init__(self):
        self.__goodHostsList = []
        self.__badHostsList = []
        self.__unkHostsList = []

    def addUnknownHost(self, hostId, hostDescr, badNvdimmStatus=None):
        """Record a host whose NVDIMM state could not be determined."""
        self.__unkHostsList.append(
            HostStatus(hostId = hostId,
                       hostDescr = hostDescr,
                       badNvdimmStatus = badNvdimmStatus))

    def processNdctlFileByPath(self, hostId, hostDescr, ndctlFilePath):
        """Open an ndctl JSON output file and classify the host from it.

        Never raises for ordinary errors: a file that cannot be opened is
        logged and the host is recorded as unknown, so that it still shows
        up in the summary instead of silently disappearing.

        Args:
            hostId: Host identifier written to the summary files.
            hostDescr: Free-form host description used in log output.
            ndctlFilePath: Path of the file holding the ndctl JSON output.
        """
        try:
            with open(ndctlFilePath, "r") as ndctlFile:
                self.processNdctlFile(hostId, hostDescr, ndctlFile)
        except Exception as ex:
            # processNdctlFile() handles its own exceptions, so what lands
            # here is a failure to open (or close) the file itself.
            logger.error(
                "Caught exception {} while processing file {}.".format(
                    ex, ndctlFilePath))
            self.addUnknownHost(hostId, hostDescr)

    def processNdctlFile(self, hostId, hostDescr, ndctlFile):
        """Parse ndctl JSON from an open file object and classify the host.

        Any exception raised while parsing or analyzing is logged and the
        host is recorded as unknown.

        Args:
            hostId: Host identifier written to the summary files.
            hostDescr: Free-form host description used in log output.
            ndctlFile: Readable file object containing the ndctl JSON.
        """
        try:
            ndctlJson = json.load(ndctlFile)
            badNvdimmStatus = NdctlAnalyzer.sGetBadNvdimmStatusFromNdctlJson(
                ndctlJson)
            curHostStatus = HostStatus(hostId = hostId,
                                       hostDescr = hostDescr,
                                       badNvdimmStatus = badNvdimmStatus)
            if badNvdimmStatus is None:
                # No bad NVDIMM status means that the host is good.
                self.__goodHostsList.append(curHostStatus)
            elif NdctlAnalyzer.sIsBadStatusUnknown(badNvdimmStatus):
                # Maybe it's just "unknown" flags due to /dev/nmem permissions
                # errors
                self.__unkHostsList.append(curHostStatus)
            else:
                # Definitely bad.
                self.__badHostsList.append(curHostStatus)
        except Exception as ex:
            logger.error(
                "Caught exception {} while processing host {} {}!".format(
                    ex, hostId, hostDescr))
            self.__unkHostsList.append(
                HostStatus(hostId = hostId,
                           hostDescr = hostDescr,
                           badNvdimmStatus = None))

    @staticmethod
    def sGetBadNvdimmStatusFromNdctlJson(ndctlJson):
        """Extract unhealthy DIMMs and affected namespaces from ndctl JSON.

        Only the first element of the top-level JSON array is inspected.

        1. Walk over dimms and collect bad ones
        2. Walk over regions
           2a. Collect a union of bad flags for the region, based on its
               dimms (plus "badblock_count" when the region reports one).
           2b. Collect all namespaces in the bad region, associating the
               union of bad flags with each.

        Args:
            ndctlJson: Parsed ndctl JSON; a sequence whose first element is
                a dict with "dimms" and "regions" arrays.

        Returns:
            A BadNvdimmStatus when at least one bad DIMM or namespace was
            found, otherwise None.

        Raises:
            TypeError, KeyError, IndexError, AttributeError: when the JSON
                lacks the expected sections or mandatory keys.
        """
        badDimmsByName = {}
        badNamespacesByName = {}
        busJson = ndctlJson[0]

        dimmsJsonArray = busJson.get("dimms")
        logger.info(
            "ndctl JSON contains {} dimms.".format(len(dimmsJsonArray)))
        for curDimmJson in dimmsJsonArray:
            (curDimmName, curDimmBadStatusSet) = \
                NdctlAnalyzer.sGetDimmJsonNodeBadStatus(curDimmJson)
            if len(curDimmBadStatusSet) > 0:
                logger.error("DIMM {} is NOT HEALTHY {}.".format(
                    curDimmName, curDimmBadStatusSet))
                badDimmsByName[curDimmName] = curDimmBadStatusSet
            else:
                logger.info("DIMM {} is OK".format(curDimmName))

        regionsJsonArray = busJson.get("regions")
        for curRegionJson in regionsJsonArray:
            curRegionBadFlags = set()
            curRegionName = curRegionJson["dev"]
            if curRegionJson.get("badblock_count") is not None:
                curRegionBadFlags.add("badblock_count")
            for curMappingJson in curRegionJson["mappings"]:
                curDimmName = curMappingJson["dimm"]
                curDimmBadFlags = badDimmsByName.get(curDimmName)
                if curDimmBadFlags is not None:
                    curRegionBadFlags.update(curDimmBadFlags)
                    logger.error(
                        "Region {} has mapping to an unhealthy dimm {}".format(
                            curRegionName, curDimmName))

            if len(curRegionBadFlags) == 0:
                continue

            # This region is bad, so let's add all its namespaces to the bad
            # dictionary. A region without a "namespaces" key is treated as
            # having none, so the bad DIMMs found above are still reported.
            for curNsJson in curRegionJson.get("namespaces") or []:
                curBadNsData = NdctlAnalyzer._sBuildBadNamespaceData(
                    curRegionJson, curRegionBadFlags, curNsJson)
                if curBadNsData is None:
                    continue
                curBadNsName = curNsJson["dev"]
                logger.info(
                    "Adding namespace {} (mode {}) to the bad list.".format(
                        curBadNsName, curBadNsData["mode"]))
                badNamespacesByName[curBadNsName] = curBadNsData

        # Return the status tuple if we have something
        if len(badDimmsByName) + len(badNamespacesByName) > 0:
            return BadNvdimmStatus(
                badDimms = badDimmsByName,
                badNamespaces = badNamespacesByName
            )
        return None

    @staticmethod
    def _sBuildBadNamespaceData(regionJson, regionBadFlags, nsJson):
        """Describe one namespace of a bad region, or None to skip it.

        A devdax namespace is skipped (with a warning) when its "daxregion"
        object or that object's "devices" list is missing; any other
        namespace is skipped when its size is 0.

        Args:
            regionJson: JSON dict of the region owning the namespace.
            regionBadFlags: Set of bad flag names collected for the region.
            nsJson: JSON dict of the namespace.

        Returns:
            Dict describing the namespace, or None when it must be skipped.
        """
        nsName = nsJson["dev"]
        nsData = {}
        nsData["region"] = regionJson["dev"]
        nsData["mode"] = nsJson["mode"]
        nsData["size"] = nsJson["size"]
        nsData["align"] = nsJson.get("align")
        nsData["region_align"] = regionJson["align"]
        # Sorted so that logs and reports are stable between runs.
        nsData["flags"] = sorted(regionBadFlags)
        nsData["devices"] = []

        if nsJson["mode"] == "devdax":
            # map is "dev" or "mem"
            nsData["map"] = nsJson["map"]
            daxregionJson = nsJson.get("daxregion")
            if daxregionJson is None:
                logger.warning(
                    ("Unable to find \"daxregion\" " +
                     "object in {}").format(nsName))
                return None
            # .get() so that a missing "devices" key reaches the warning
            # below instead of raising KeyError.
            daxDevicesJson = daxregionJson.get("devices")
            if daxDevicesJson is None:
                logger.warning(
                    ("No \"devices\" section in " +
                     "\"daxregion\" namespace {}").format(nsName))
                return None
            for curDaxDevJson in daxDevicesJson:
                curDaxDevName = curDaxDevJson.get("chardev")
                if curDaxDevName is None:
                    logger.warning(
                        "Unable to find name for device in {}".format(
                            nsName))
                    continue
                logger.info("Adding DAX device {} to bad list.".format(
                    curDaxDevName))
                nsData["devices"].append(curDaxDevName)
        elif nsJson["size"] == 0:
            logger.info("Skipping dummy namespace {}".format(nsName))
            return None

        return nsData

    @staticmethod
    def sGetDimmJsonNodeBadStatus(dimmJson):
        """Collect the bad-status flags of a single DIMM JSON node.

        Args:
            dimmJson: JSON dict of one DIMM; must contain a "health" object.

        Returns:
            Tuple (dimmName, badStatusSet). The set holds
            "health_state_<state>" when the health state is not "ok", plus
            the name of every failure flag that is set on the DIMM.

        Raises:
            AttributeError: when the DIMM node has no "health" object.
        """
        badStatusSet = set()
        dimmJsonName = dimmJson.get("dev")
        dimmJsonHealth = dimmJson.get("health")
        dimmJsonHealthState = dimmJsonHealth.get("health_state")
        if dimmJsonHealthState != "ok":
            badStatusSet.add("health_state_{}".format(dimmJsonHealthState))
        for curFlagName in NdctlAnalyzer._DIMM_BAD_FLAG_NAMES:
            # "== True" is deliberate: it accepts JSON true but rejects
            # other truthy values such as non-empty strings.
            if dimmJson.get(curFlagName) == True:
                badStatusSet.add(curFlagName)
        return (dimmJsonName, badStatusSet)

    @staticmethod
    def _sWriteHostsFile(outFileName, hostsList, hostIdPrefix):
        """Log each host and write one host id per line to outFileName.

        The file is always (re)created, even when hostsList is empty.
        """
        with open(outFileName, "w") as outFile:
            for curHost in hostsList:
                logger.info("{} {} {}:".format(hostIdPrefix,
                                               curHost.hostId,
                                               curHost.hostDescr))
                if curHost.badNvdimmStatus:
                    NdctlAnalyzer.sLogBadNvdimmStatus(
                        curHost.badNvdimmStatus)
                outFile.write("{}\n".format(curHost.hostId))
        logger.info("Wrote {}".format(outFileName))
        logger.info("")

    def writeHealthSummary(self, logsDir, hostIdPrefix = ""):
        """Log the classification summary and write the host list files.

        Creates "hosts_ok", "hosts_unknown" and "hosts_with_errors" inside
        logsDir, each with one host id per line (possibly empty).

        Args:
            logsDir: Existing directory that receives the three files.
            hostIdPrefix: Text logged in front of every host id.
        """
        logger.info("")
        logger.info("--------------------")
        logger.info("NVDIMM CHECK SUMMARY")
        logger.info("--------------------")
        logger.info("")

        # Print good hosts
        if len(self.__goodHostsList) > 0:
            logger.info("Hosts without errors")
            logger.info("====================")
        NdctlAnalyzer._sWriteHostsFile(
            "{}/hosts_ok".format(logsDir),
            self.__goodHostsList, hostIdPrefix)

        # Print unknown hosts
        if len(self.__unkHostsList) > 0:
            logger.info("Hosts with UNKNOWN status")
            logger.info("=========================")
        NdctlAnalyzer._sWriteHostsFile(
            "{}/hosts_unknown".format(logsDir),
            self.__unkHostsList, hostIdPrefix)

        # Print bad hosts
        if len(self.__badHostsList) == 0:
            logger.info("There are no hosts with NVDIMM errors.")
        else:
            logger.info("HOSTS WITH ERRORS")
            logger.info("=================")
        NdctlAnalyzer._sWriteHostsFile(
            "{}/hosts_with_errors".format(logsDir),
            self.__badHostsList, hostIdPrefix)

    @staticmethod
    def sLogBadNvdimmStatus(badNvdimmStatus):
        """Log every bad DIMM and bad namespace of a BadNvdimmStatus."""
        for (curBadDimmName, curBadDimmFlags) in \
                badNvdimmStatus.badDimms.items():
            logger.info("DIMM: {} ({})".format(
                curBadDimmName, curBadDimmFlags))
        for (curBadNsName, curBadNsData) in \
                badNvdimmStatus.badNamespaces.items():
            logger.info("NAMESPACE: {} ({}) {} ({})".format(
                curBadNsName,
                curBadNsData["mode"],
                curBadNsData["devices"],
                curBadNsData["flags"]))

    @staticmethod
    def sIsBadStatusCritical(badNvdimmStatus):
        """Tell whether a bad status contains critical conditions.

        Critical means a DIMM with "flag_failed_restore" or
        "flag_failed_save", or a namespace flagged with "badblock_count".
        All critical findings are logged, not only the first one.

        Returns:
            True when at least one critical condition is present.
        """
        bCritical = False
        logger.info("Checking for critical flags in:")
        NdctlAnalyzer.sLogBadNvdimmStatus(badNvdimmStatus)
        for (curDimmName, curDimmFlags) in badNvdimmStatus.badDimms.items():
            if (("flag_failed_restore" in curDimmFlags) or
                    ("flag_failed_save" in curDimmFlags)):
                logger.error("Critical flags found in DIMM {} ({})".format(
                    curDimmName, curDimmFlags))
                bCritical = True
        for (curBadNsName, curBadNsData) in \
                badNvdimmStatus.badNamespaces.items():
            if "badblock_count" in curBadNsData["flags"]:
                logger.error("Bad blocks are present in namespace {}".format(
                    curBadNsName))
                bCritical = True
        return bCritical

    @staticmethod
    def sIsBadStatusUnknown(badNvdimmStatus):
        """Tell whether the only problem is an unknown DIMM health state.

        Returns:
            True only when every bad DIMM carries exactly the single flag
            "health_state_unknown" and no bad namespace carries any other
            flag; False otherwise. In particular a status whose namespaces
            are flagged with "badblock_count" is never "unknown", even when
            no DIMM is listed as bad.
        """
        for curDimmFlags in badNvdimmStatus.badDimms.values():
            if (len(curDimmFlags) != 1 or
                    "health_state_unknown" not in curDimmFlags):
                return False
        for curBadNsData in badNvdimmStatus.badNamespaces.values():
            for curNsFlag in curBadNsData["flags"]:
                if curNsFlag != "health_state_unknown":
                    return False
        return True


###############################################################################
def getNvdimmBadStatusWithNdctl():
    """Run "ndctl list -vvv" on this machine and analyze its output.

    Returns:
        A BadNvdimmStatus, or None when nothing bad was found.

    Raises:
        Exception: whatever running ndctl, parsing its output or analyzing
            the JSON raised; it is logged and then re-raised. A missing
            ndctl binary surfaces as FileNotFoundError.
    """
    try:
        # Argument list instead of a shell string: no shell is involved.
        ndctlOutput = subprocess.check_output(["ndctl", "list", "-vvv"])
        ndctlJson = json.loads(ndctlOutput)
        badNvdimmStatus = NdctlAnalyzer.sGetBadNvdimmStatusFromNdctlJson(
            ndctlJson)
    except Exception as ex:
        logger.error("Caught exception {} during ndctl run.".format(ex))
        raise
    return badNvdimmStatus


###############################################################################
def destroyNamespace(nsName):
    """Force-destroy an ndctl namespace.

    Args:
        nsName: Name of the namespace to destroy. It is passed to ndctl as
            a single argument and is never interpreted by a shell.

    Returns:
        True when ndctl succeeded, False when it failed or could not be run.
    """
    try:
        subprocess.check_output(
            ["ndctl", "destroy-namespace", nsName, "-f"])
    except Exception as ex:
        logger.error("Failed to destroy namespace {}".format(nsName))
        logger.error("Caught exception {} during ndctl run.".format(ex))
        return False
    return True


###############################################################################
class UnexpectedData(RuntimeError):
    """Signals input data that does not have the expected shape."""
    pass

###############################################################################
# GENERAL UTILITIES END
###############################################################################


###############################################################################
def processNdctlOutputsDir(logsDir, ndctlOutputsDir):
    """Analyze every ndctl output file in a directory and write the summary.

    A file named "<hostId>-<hostDescr>" (exactly one "-") is split into id
    and description; any other file name is used whole as the host id with
    an empty description.

    Args:
        logsDir: Directory that receives the summary files.
        ndctlOutputsDir: Directory holding one ndctl JSON file per host.
    """
    analyzer = NdctlAnalyzer()
    # Sorted so that the summary order does not depend on directory order.
    for curNdctlOutputFilePath in sorted(
            glob.glob("{}/*".format(ndctlOutputsDir))):
        logger.info("Processing ndctl output file {}...".format(
            curNdctlOutputFilePath))
        curFileName = os.path.basename(curNdctlOutputFilePath)
        curFileNameParts = curFileName.split("-")
        if len(curFileNameParts) == 2:
            hostId = curFileNameParts[0]
            hostDescr = curFileNameParts[1]
        else:
            hostId = curFileName
            hostDescr = ""
        analyzer.processNdctlFileByPath(hostId,
                                        hostDescr,
                                        curNdctlOutputFilePath)
    analyzer.writeHealthSummary(logsDir, "SDS")


###############################################################################
def main():
    """Command-line entry point.

    Expects two arguments: the logs directory and the directory holding the
    ndctl output files.

    Returns:
        0 on success, 2 when the argument count is wrong.
    """
    if len(sys.argv) != 3:
        print(
            "Expected 2 arguments: <logs_dir> <ndctl_outputs_dir>",
            file=sys.stderr)
        return 2
    setupLogging("{}/analyze_ndctl.log".format(sys.argv[1]))
    logger.info("Started")
    processNdctlOutputsDir(sys.argv[1], sys.argv[2])
    logger.info("Finished")
    return 0


###############################################################################
if __name__ == '__main__':
    sys.exit(main())