########################################################################
# File  : Watchdog.py
# Author: Stuart Paterson
########################################################################
"""
The Watchdog class is used by the Job Wrapper to resolve and monitor
the system resource consumption. The Watchdog can determine if a running
job is stalled and indicate this to the Job Wrapper. Furthermore, the
Watchdog will identify when the Job CPU limit has been exceeded and fail
jobs meaningfully.

Information is returned to the WMS via the heart-beat mechanism. This
also interprets control signals from the WMS, e.g. to kill a running job.

- Still to implement:
  - CPU normalization for correct comparison with job limit
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

__RCSID__ = "4d690cae8 (2022-03-22 10:49:54 +0100) Federico Stagni "

import os
import re
import time
import resource
import errno
import socket
import getpass
import psutil

from DIRAC import S_OK, S_ERROR, gLogger
from DIRAC.Core.Utilities import Time
from DIRAC.Core.Utilities import MJF
from DIRAC.Core.Utilities.Profiler import Profiler
from DIRAC.Core.Utilities.Os import getDiskSpace
from DIRAC.Core.Utilities.Subprocess import getChildrenPIDs
from DIRAC.ConfigurationSystem.Client.Config import gConfig
from DIRAC.ConfigurationSystem.Client.PathFinder import getSystemInstance
from DIRAC.Resources.Computing.BatchSystems.TimeLeft.TimeLeft import TimeLeft
from DIRAC.WorkloadManagementSystem.Client.JobStateUpdateClient import JobStateUpdateClient
from DIRAC.WorkloadManagementSystem.Client import JobMinorStatus


class Watchdog(object):

    #############################################################################
    def __init__(self, pid, exeThread, spObject, jobCPUTime, memoryLimit=0, processors=1, jobArgs=None):
        """Constructor, takes the watched process PID, the execution thread and the job limits as arguments."""
        if jobArgs is None:  # avoid a mutable default argument
            jobArgs = {}
        self.stopSigStartSeconds = int(jobArgs.get("StopSigStartSeconds", 1800))  # 30 minutes
        self.stopSigFinishSeconds = int(jobArgs.get("StopSigFinishSeconds", 1800))  # 30 minutes
        self.stopSigNumber = int(jobArgs.get("StopSigNumber", 2))  # SIGINT
        self.stopSigRegex = jobArgs.get("StopSigRegex", None)
        self.stopSigSent = False

        self.log = gLogger.getSubLogger("Watchdog")

        self.exeThread = exeThread
        self.wrapperPID = pid
        self.appPID = self.exeThread.getCurrentPID()
        self.spObject = spObject

        self.jobCPUTime = jobCPUTime
        self.memoryLimit = memoryLimit
        self.calibration = 0
        self.initialValues = {}
        self.parameters = {}
        self.peekFailCount = 0
        self.peekRetry = 5

        self.profiler = Profiler(pid)

        self.checkError = ""
        self.currentStats = {}
        self.initialized = False
        self.count = 0

        # defaults
        self.testWallClock = 1
        self.testDiskSpace = 1
        self.testLoadAvg = 1
        self.maxWallClockTime = 3 * 24 * 60 * 60
        self.testCPUConsumed = 1
        self.testCPULimit = 0
        self.testMemoryLimit = 0
        self.testTimeLeft = 1
        self.pollingTime = 10  # 10 seconds
        self.checkingTime = 30 * 60  # 30 minute period
        self.minCheckingTime = 20 * 60  # 20 mins
        self.wallClockCheckSeconds = 5 * 60  # 5 minutes
        self.maxWallClockTime = 3 * 24 * 60 * 60  # e.g. 3 days
        self.jobPeekFlag = 1  # on / off
        self.minDiskSpace = 10  # MB
        self.loadAvgLimit = 1000  # > 1000 and jobs killed
        self.sampleCPUTime = 30 * 60  # e.g. up to 30 mins sample
        self.jobCPUMargin = 20  # percentage buffer before killing job
        self.minCPUWallClockRatio = 5  # ratio percentage
        self.nullCPULimit = 5  # kill the job after 5 consecutive samples with null CPU consumption
        self.checkCount = 0
        self.wallClockCheckCount = 0
        self.nullCPUCount = 0

        self.grossTimeLeftLimit = 10 * self.checkingTime
        self.timeLeftUtil = TimeLeft()
        self.timeLeft = 0
        self.littleTimeLeft = False
        self.cpuPower = 1.0
        self.processors = processors
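    # Illustrative usage sketch (hypothetical values; in production the JobWrapper
    # supplies the real pid, execution thread, Subprocess object and JDL arguments):
    #
    #   watchdog = Watchdog(
    #       pid=os.getpid(),
    #       exeThread=exeThread,          # ExecutionThread running the payload
    #       spObject=spObject,            # Subprocess wrapper used to kill the payload
    #       jobCPUTime=100000,            # CPU requirement from the job description (s)
    #       memoryLimit=2 * 1024 ** 2,    # kB
    #       processors=1,
    #       jobArgs={"StopSigRegex": ".*payload.*"},  # optional stop-signal settings
    #   )
    #   watchdog.calibrate()  # record initial values (also sets StartTime)
    #   watchdog.run()        # poll until the payload thread ends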
    #############################################################################
    def initialize(self):
        """Watchdog initialization."""
        if self.initialized:
            self.log.info("Watchdog already initialized")
            return S_OK()
        else:
            self.initialized = True

        setup = gConfig.getValue("/DIRAC/Setup", "")
        if not setup:
            return S_ERROR("Cannot get the DIRAC Setup value")
        wms_instance = getSystemInstance("WorkloadManagement")
        if not wms_instance:
            return S_ERROR("Cannot get the WorkloadManagement system instance")
        self.section = "/Systems/WorkloadManagement/%s/JobWrapper" % wms_instance

        self.log.verbose("Watchdog initialization")

        # Test control flags
        self.testWallClock = gConfig.getValue(self.section + "/CheckWallClockFlag", 1)
        self.testDiskSpace = gConfig.getValue(self.section + "/CheckDiskSpaceFlag", 1)
        self.testLoadAvg = gConfig.getValue(self.section + "/CheckLoadAvgFlag", 1)
        self.testCPUConsumed = gConfig.getValue(self.section + "/CheckCPUConsumedFlag", 1)
        self.testCPULimit = gConfig.getValue(self.section + "/CheckCPULimitFlag", 0)
        self.testMemoryLimit = gConfig.getValue(self.section + "/CheckMemoryLimitFlag", 0)
        self.testTimeLeft = gConfig.getValue(self.section + "/CheckTimeLeftFlag", 1)

        # Other parameters
        self.pollingTime = gConfig.getValue(self.section + "/PollingTime", 10)  # 10 seconds
        self.checkingTime = gConfig.getValue(self.section + "/CheckingTime", 30 * 60)  # 30 minute period
        self.minCheckingTime = gConfig.getValue(self.section + "/MinCheckingTime", 20 * 60)  # 20 mins
        self.maxWallClockTime = gConfig.getValue(self.section + "/MaxWallClockTime", 3 * 24 * 60 * 60)  # e.g. 3 days
        self.jobPeekFlag = gConfig.getValue(self.section + "/JobPeekFlag", 1)  # on / off
        self.minDiskSpace = gConfig.getValue(self.section + "/MinDiskSpace", 10)  # MB
        self.loadAvgLimit = gConfig.getValue(self.section + "/LoadAverageLimit", 1000)  # > 1000 and jobs killed
        self.sampleCPUTime = gConfig.getValue(self.section + "/CPUSampleTime", 30 * 60)  # e.g. up to 30 mins sample
        self.jobCPUMargin = gConfig.getValue(self.section + "/JobCPULimitMargin", 20)  # percentage buffer before killing job
        self.minCPUWallClockRatio = gConfig.getValue(self.section + "/MinCPUWallClockRatio", 5)  # ratio percentage
        # Kill the job after 5 consecutive samples with null CPU consumption
        self.nullCPULimit = gConfig.getValue(self.section + "/NullCPUCountLimit", 5)
        if self.checkingTime < self.minCheckingTime:
            self.log.info(
                "Requested CheckingTime of %s, setting to %s seconds (minimum)"
                % (self.checkingTime, self.minCheckingTime)
            )
            self.checkingTime = self.minCheckingTime

        # The time left is returned in seconds @ 250 SI00 = 1 HS06,
        # the self.checkingTime and self.pollingTime are in seconds,
        # thus they need to be multiplied by a large enough factor
        self.fineTimeLeftLimit = gConfig.getValue(self.section + "/TimeLeftLimit", 150 * self.pollingTime)
        self.cpuPower = gConfig.getValue("/LocalSite/CPUNormalizationFactor", 1.0)
        return S_OK()
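    # The options read above live under /Systems/WorkloadManagement/<instance>/JobWrapper.
    # A minimal CS snippet spelling out the built-in defaults would look roughly like
    # this (illustrative only, not taken from a real configuration):
    #
    #   JobWrapper
    #   {
    #     CheckWallClockFlag = 1
    #     CheckDiskSpaceFlag = 1
    #     CheckLoadAvgFlag = 1
    #     CheckCPUConsumedFlag = 1
    #     CheckCPULimitFlag = 0
    #     CheckMemoryLimitFlag = 0
    #     CheckTimeLeftFlag = 1
    #     PollingTime = 10           # seconds
    #     CheckingTime = 1800        # seconds
    #     MinCheckingTime = 1200     # seconds
    #     MaxWallClockTime = 259200  # seconds (3 days)
    #     JobPeekFlag = 1
    #     MinDiskSpace = 10          # MB
    #     LoadAverageLimit = 1000
    #     CPUSampleTime = 1800       # seconds
    #     JobCPULimitMargin = 20     # percent
    #     MinCPUWallClockRatio = 5   # percent
    #     NullCPUCountLimit = 5
    #     TimeLeftLimit = 1500       # 150 * PollingTime
    #   }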
    def run(self):
        """The main watchdog execution method"""
        result = self.initialize()
        if not result["OK"]:
            self.log.always("Cannot start watchdog for the following reason")
            self.log.always(result["Message"])
            return result

        try:
            while True:
                self.log.debug("Starting watchdog loop # %d" % self.count)
                start_cycle_time = time.time()
                result = self.execute()
                exec_cycle_time = time.time() - start_cycle_time
                if not result["OK"]:
                    self.log.error("Watchdog error during execution", result["Message"])
                    break
                elif result["Value"] == "Ended":
                    break
                self.count += 1
                if exec_cycle_time < self.pollingTime:
                    time.sleep(self.pollingTime - exec_cycle_time)
            return S_OK()
        except Exception:
            self.log.exception()
            return S_ERROR("Exception")

    #############################################################################
    def execute(self):
        """The main agent execution method of the Watchdog."""
        if not self.exeThread.is_alive():
            self.__getUsageSummary()
            self.log.info("Process to monitor has completed, Watchdog will exit.")
            return S_OK("Ended")

        # WallClock checks every self.wallClockCheckSeconds, but only if StopSigRegex is defined in the JDL
        if (
            not self.stopSigSent
            and self.stopSigRegex is not None
            and (time.time() - self.initialValues["StartTime"]) > self.wallClockCheckSeconds * self.wallClockCheckCount
        ):
            self.wallClockCheckCount += 1
            self._performWallClockChecks()

        if self.littleTimeLeft:
            # if we have gone over enough iterations query again
            if self.littleTimeLeftCount == 0 and self.__timeLeft() == -1:
                self.checkError = JobMinorStatus.JOB_EXCEEDED_CPU
                self.log.error(self.checkError, self.timeLeft)
                self.__killRunningThread()
                return S_OK()
            else:
                self.littleTimeLeftCount -= 1

        # Note: need to poll regularly to see if the thread is alive
        # but only perform checks with a certain frequency
        if (time.time() - self.initialValues["StartTime"]) > self.checkingTime * self.checkCount:
            self.checkCount += 1
            result = self._performChecks()
            if not result["OK"]:
                self.log.warn("Problem during recent checks")
                self.log.warn(result["Message"])
            return S_OK()
        else:
            # self.log.debug('Application thread is alive: checking count is %s' % self.checkCount)
            return S_OK()
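    # Timing of the loop above, with the default configuration (illustrative numbers):
    #   - run() calls execute() roughly every PollingTime = 10 s to see whether the
    #     payload thread is still alive;
    #   - the full resource checks in _performChecks() only run once the elapsed wall
    #     clock exceeds CheckingTime * checkCount, i.e. about every 30 minutes;
    #   - the MJF wall-clock / stop-signal check runs every wallClockCheckSeconds = 300 s,
    #     and only when StopSigRegex was provided in the job arguments.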
    #############################################################################
    def _performWallClockChecks(self):
        """Watchdog performs the wall clock checks based on MJF.
        Signals are sent to processes if we need to stop, but the function always returns S_OK()
        """
        mjf = MJF.MJF()

        try:
            wallClockSecondsLeft = mjf.getWallClockSecondsLeft()
        except Exception:
            # Just stop if we can't get the wall clock seconds left
            return S_OK()

        jobstartSeconds = mjf.getIntJobFeature("jobstart_secs")
        if jobstartSeconds is None:
            # Just stop if we don't know when the job started
            return S_OK()

        if (int(time.time()) > jobstartSeconds + self.stopSigStartSeconds) and (
            wallClockSecondsLeft < self.stopSigFinishSeconds + self.wallClockCheckSeconds
        ):
            # Need to send the signal! Assume it works to avoid sending the signal more than once
            self.log.info("Sending signal to JobWrapper children", "(%s)" % self.stopSigNumber)
            self.stopSigSent = True

            try:
                for childPid in getChildrenPIDs(self.wrapperPID):
                    try:
                        cmdline = open("/proc/%d/cmdline" % childPid, "r").read().replace("\0", " ").strip()
                    except IOError:
                        # Process gone away? Not running on Linux? Skip anyway
                        continue

                    if re.search(self.stopSigRegex, cmdline) is not None:
                        self.log.info(
                            'Sending signal %d to process ID %d, cmdline = "%s"'
                            % (self.stopSigNumber, childPid, cmdline)
                        )
                        os.kill(childPid, self.stopSigNumber)

            except Exception as e:
                self.log.error("Failed to send signals to JobWrapper children!", repr(e))

        return S_OK()

    #############################################################################
    def _performChecks(self):
        """The Watchdog checks are performed at a different period to the checking of the
        application thread and correspond to the checkingTime.
        """
        self.log.verbose("------------------------------------")
        self.log.verbose("Checking loop starts for Watchdog")
        heartBeatDict = {}

        msg = ""

        loadAvg = float(os.getloadavg()[0])
        msg += "LoadAvg: %d " % loadAvg
        heartBeatDict["LoadAverage"] = loadAvg
        if "LoadAverage" not in self.parameters:
            self.parameters["LoadAverage"] = []
        self.parameters["LoadAverage"].append(loadAvg)

        memoryUsed = self.getMemoryUsed()
        msg += "MemUsed: %.1f kb " % (memoryUsed)
        heartBeatDict["MemoryUsed"] = memoryUsed
        if "MemoryUsed" not in self.parameters:
            self.parameters["MemoryUsed"] = []
        self.parameters["MemoryUsed"].append(memoryUsed)

        result = self.profiler.vSizeUsage(withChildren=True)
        if not result["OK"]:
            self.log.warn("Could not get vSize info from profiler", result["Message"])
        else:
            vsize = result["Value"] * 1024.0
            heartBeatDict["Vsize"] = vsize
            self.parameters.setdefault("Vsize", []).append(vsize)
            msg += "Job Vsize: %.1f kb " % vsize

        result = self.profiler.memoryUsage(withChildren=True)
        if not result["OK"]:
            self.log.warn("Could not get rss info from profiler", result["Message"])
        else:
            rss = result["Value"] * 1024.0
            heartBeatDict["RSS"] = rss
            self.parameters.setdefault("RSS", []).append(rss)
            msg += "Job RSS: %.1f kb " % rss

        if "DiskSpace" not in self.parameters:
            self.parameters["DiskSpace"] = []

        # We exclude fuse so that mountpoints can be cleaned up by automount after a period unused
        # (specific request from CERN batch service).
        result = self.getDiskSpace(exclude="fuse")
        if not result["OK"]:
            self.log.warn("Could not establish DiskSpace", result["Message"])
        else:
            msg += "DiskSpace: %.1f MB " % (result["Value"])
            self.parameters["DiskSpace"].append(result["Value"])
            heartBeatDict["AvailableDiskSpace"] = result["Value"]

        cpu = self.__getCPU()
        if not cpu["OK"]:
            msg += "CPU: ERROR "
            hmsCPU = 0
        else:
            cpu = cpu["Value"]
            msg += "CPU: %s (h:m:s) " % (cpu)
            if "CPUConsumed" not in self.parameters:
                self.parameters["CPUConsumed"] = []
            self.parameters["CPUConsumed"].append(cpu)
            hmsCPU = cpu
            rawCPU = self.__convertCPUTime(hmsCPU)
            if rawCPU["OK"]:
                heartBeatDict["CPUConsumed"] = rawCPU["Value"]

        result = self.__getWallClockTime()
        if not result["OK"]:
            self.log.warn("Failed determining wall clock time", result["Message"])
        else:
            msg += "WallClock: %.2f s " % (result["Value"])
            self.parameters.setdefault("WallClockTime", list()).append(result["Value"])
            heartBeatDict["WallClockTime"] = result["Value"] * self.processors

        self.log.info(msg)

        result = self._checkProgress()
        if not result["OK"]:
            self.checkError = result["Message"]
            self.log.warn(self.checkError)

            if self.jobPeekFlag:
                result = self.__peek()
                if result["OK"]:
                    outputList = result["Value"]
                    self.log.info("Last lines of available application output:")
                    self.log.info("================START================")
                    for line in outputList:
                        self.log.info(line)
                    self.log.info("=================END=================")

            self.__killRunningThread()
            return S_OK()

        recentStdOut = "None"
        if self.jobPeekFlag:
            result = self.__peek()
            if result["OK"]:
                outputList = result["Value"]
                size = len(outputList)
                recentStdOut = "Last %s lines of application output from Watchdog on %s [UTC]:" % (
                    size,
                    Time.dateTime(),
                )
                border = "=" * len(recentStdOut)
                cpuTotal = "Last reported CPU consumed for job is %s (h:m:s)" % (hmsCPU)
                if self.timeLeft:
                    cpuTotal += ", Batch Queue Time Left %s (s @ HS06)" % self.timeLeft

                recentStdOut = "\n%s\n%s\n%s\n%s\n" % (border, recentStdOut, cpuTotal, border)
                self.log.info(recentStdOut)
                for line in outputList:
                    self.log.info(line)
                    recentStdOut += line + "\n"
            else:
                recentStdOut = (
                    "Watchdog is initializing and will attempt to obtain standard output from application thread"
                )
                self.log.info(recentStdOut)
                self.peekFailCount += 1
                if self.peekFailCount > self.peekRetry:
                    self.jobPeekFlag = 0
                    self.log.warn("Turning off job peeking for remainder of execution")

        if "JOBID" not in os.environ:
            self.log.info("Running without JOBID so parameters will not be reported")
            return S_OK()

        jobID = os.environ["JOBID"]
        staticParamDict = {"StandardOutput": recentStdOut}
        self.__sendSignOfLife(int(jobID), heartBeatDict, staticParamDict)
        return S_OK("Watchdog checking cycle complete")
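    # Example of a heartbeat payload assembled by _performChecks() (the values are
    # hypothetical; the keys match what is filled in above):
    #
    #   heartBeatDict = {
    #       "LoadAverage": 1.7,
    #       "MemoryUsed": 523416.0,        # kB
    #       "Vsize": 1048576.0,            # kB
    #       "RSS": 786432.0,               # kB
    #       "AvailableDiskSpace": 2048.0,  # MB
    #       "CPUConsumed": 3725.0,         # s, converted from the "01:02:05" string
    #       "WallClockTime": 7200.0,       # s, already multiplied by self.processors
    #   }
    #   staticParamDict = {"StandardOutput": recentStdOut}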
This might be correct.") return result cpuUsageUser = result["Value"] result = self.profiler.cpuUsageSystem(withChildren=True, withTerminatedChildren=True) if not result["OK"]: self.log.warn("Issue while checking consumed CPU for system", result["Message"]) if result["Errno"] == errno.ESRCH: self.log.warn("The main process does not exist (anymore). This might be correct.") return result cpuUsageSystem = result["Value"] cpuTimeTotal = cpuUsageUser + cpuUsageSystem if cpuTimeTotal: self.log.verbose("Raw CPU time consumed (s) =", cpuTimeTotal) return self.__getCPUHMS(cpuTimeTotal) self.log.error("CPU time consumed found to be 0") return S_ERROR() ############################################################################# def __getCPUHMS(self, cpuTime): mins, secs = divmod(cpuTime, 60) hours, mins = divmod(mins, 60) humanTime = "%02d:%02d:%02d" % (hours, mins, secs) self.log.verbose("Human readable CPU time is: %s" % humanTime) return S_OK(humanTime) ############################################################################# def __interpretControlSignal(self, signalDict): """This method is called whenever a signal is sent via the result of sending a sign of life. """ self.log.info("Received control signal") if isinstance(signalDict, dict): if "Kill" in signalDict: self.log.info("Received Kill signal, stopping job via control signal") self.checkError = JobMinorStatus.RECEIVED_KILL_SIGNAL self.__killRunningThread() else: self.log.info("The following control signal was sent but not understood by the watchdog:") self.log.info(signalDict) else: self.log.info("Expected dictionary for control signal", "received:\n%s" % (signalDict)) return S_OK() ############################################################################# def _checkProgress(self): """This method calls specific tests to determine whether the job execution is proceeding normally. CS flags can easily be added to add or remove tests via central configuration. """ report = "" if self.testWallClock: result = self.__checkWallClockTime() if not result["OK"]: self.log.warn(result["Message"]) return result report += "WallClock: OK, " else: report += "WallClock: NA," if self.testDiskSpace: result = self.__checkDiskSpace() if not result["OK"]: self.log.warn(result["Message"]) return result report += "DiskSpace: OK, " else: report += "DiskSpace: NA," if self.testLoadAvg: result = self.__checkLoadAverage() if not result["OK"]: self.log.warn( "Check of load average failed, but won't fail because of that", ": %s" % result["Message"] ) report += "LoadAverage: ERROR, " return S_OK() report += "LoadAverage: OK, " else: report += "LoadAverage: NA," if self.testCPUConsumed: result = self.__checkCPUConsumed() if not result["OK"]: return result report += "CPUConsumed: OK, " else: report += "CPUConsumed: NA, " if self.testCPULimit: result = self.__checkCPULimit() if not result["OK"]: self.log.warn(result["Message"]) return result report += "CPULimit OK, " else: report += "CPULimit: NA, " if self.testTimeLeft: self.__timeLeft() if self.timeLeft: report += "TimeLeft: OK" else: report += "TimeLeft: NA" if self.testMemoryLimit: result = self.__checkMemoryLimit() if not result["OK"]: self.log.warn(result["Message"]) return result report += "MemoryLimit OK, " else: report += "MemoryLimit: NA, " self.log.info(report) return S_OK("All enabled checks passed") ############################################################################# def __checkCPUConsumed(self): """Checks whether the CPU consumed by application process is reasonable. 
    #############################################################################
    def __checkCPUConsumed(self):
        """Checks whether the CPU consumed by the application process is reasonable.
        This method will report stalled jobs to be killed.
        """
        self.log.info("Checking CPU Consumed")

        if "WallClockTime" not in self.parameters:
            return S_ERROR("Missing WallClockTime info")
        if "CPUConsumed" not in self.parameters:
            return S_ERROR("Missing CPUConsumed info")

        wallClockTime = self.parameters["WallClockTime"][-1]
        if wallClockTime < self.sampleCPUTime:
            self.log.info(
                "Stopping check, wallclock time is still smaller than sample time",
                "(%s) < (%s)" % (wallClockTime, self.sampleCPUTime),
            )
            return S_OK()

        intervals = max(1, int(self.sampleCPUTime / self.checkingTime))
        if len(self.parameters["CPUConsumed"]) < intervals + 1:
            self.log.info(
                "Not enough snapshots to calculate",
                "there are %s and we need %s" % (len(self.parameters["CPUConsumed"]), intervals + 1),
            )
            return S_OK()

        wallClockTime = self.parameters["WallClockTime"][-1] - self.parameters["WallClockTime"][-1 - intervals]
        try:
            cpuTime = self.__convertCPUTime(self.parameters["CPUConsumed"][-1])["Value"]
            # For some reason, sometimes the CPU consumed estimation returns 0
            # if cpuTime == 0:
            #   return S_OK()
            cpuTime -= self.__convertCPUTime(self.parameters["CPUConsumed"][-1 - intervals])["Value"]
            if cpuTime < 0:
                self.log.warn("Consumed CPU time negative, something wrong may have happened, ignore")
                return S_OK()
            if wallClockTime <= 0:
                self.log.warn("Wallclock time should not be negative or zero, ignore")
                return S_OK()

            ratio = (cpuTime / wallClockTime) * 100

            self.log.info("CPU/Wallclock ratio is %.2f%%" % ratio)
            # in case of error cpuTime might be 0, exclude this
            if ratio < self.minCPUWallClockRatio:
                if (
                    os.path.exists("DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK")
                    or "DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK" in os.environ
                ):
                    self.log.warn(
                        "N.B. job would be declared as stalled but CPU / WallClock check is disabled by payload"
                    )
                    return S_OK()
                self.log.info("Job is stalled!")
                return S_ERROR(JobMinorStatus.WATCHDOG_STALLED)
        except Exception as e:
            self.log.error("Cannot convert CPU consumed from string to int", str(e))

        return S_OK()

    #############################################################################
    def __convertCPUTime(self, cputime):
        """Method to convert the CPU time as returned from the Watchdog
        instances to the equivalent DIRAC normalized CPU time to be compared
        to the Job CPU requirement.
        """
        cpuValue = 0
        # for i in range(len(cpuHMS)):
        #   cpuHMS[i] = cpuHMS[i].replace('00', '0')

        try:
            cpuHMS = cputime.split(":")
            hours = float(cpuHMS[0]) * 60 * 60
            mins = float(cpuHMS[1]) * 60
            secs = float(cpuHMS[2])
            cpuValue = float(hours + mins + secs)
        except Exception as x:
            self.log.warn(str(x))
            return S_ERROR("Could not calculate CPU time")

        # Normalization to be implemented
        normalizedCPUValue = cpuValue

        result = S_OK()
        result["Value"] = normalizedCPUValue
        self.log.debug("CPU value %s converted to %s" % (cputime, normalizedCPUValue))
        return result
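    # Illustrative conversion, with no normalization applied yet (see "Still to implement"):
    #   __convertCPUTime("01:02:05") -> S_OK() with Value = 1*3600 + 2*60 + 5 = 3725.0 s
    #   __convertCPUTime("bad")      -> S_ERROR("Could not calculate CPU time")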
""" consumedCPU = 0 if "CPUConsumed" in self.parameters: consumedCPU = self.parameters["CPUConsumed"][-1] consumedCPUDict = self.__convertCPUTime(consumedCPU) if consumedCPUDict["OK"]: currentCPU = consumedCPUDict["Value"] else: return S_OK("Not possible to determine current CPU consumed") if consumedCPU: limit = int(self.jobCPUTime + self.jobCPUTime * (self.jobCPUMargin / 100)) cpuConsumed = float(currentCPU) if cpuConsumed > limit: self.log.info( "Job has consumed more than the specified CPU limit", "with an additional %s%% margin" % (self.jobCPUMargin), ) return S_ERROR("Job has exceeded maximum CPU time limit") return S_OK("Job within CPU limit") if not currentCPU: self.log.verbose("Both initial and current CPU consumed are null") return S_OK("CPU consumed is not measurable yet") return S_OK("Not possible to determine CPU consumed") def __checkMemoryLimit(self): """Checks that the job memory consumption is within a limit""" vsize = 0 if "Vsize" in self.parameters: vsize = self.parameters["Vsize"][-1] if vsize and self.memoryLimit: if vsize > self.memoryLimit: vsize = vsize # Just a warning for the moment self.log.warn( "Job has consumed %f.2 KB of memory with the limit of %f.2 KB" % (vsize, self.memoryLimit) ) return S_OK() ############################################################################# def __checkDiskSpace(self): """Checks whether the CS defined minimum disk space is available.""" if "DiskSpace" in self.parameters: availSpace = self.parameters["DiskSpace"][-1] if availSpace >= 0 and availSpace < self.minDiskSpace: self.log.info( "Not enough local disk space for job to continue, defined in CS as %s MB" % (self.minDiskSpace) ) return S_ERROR(JobMinorStatus.JOB_INSUFFICIENT_DISK) else: return S_OK("Job has enough disk space available") else: return S_ERROR("Available disk space could not be established") ############################################################################# def __checkWallClockTime(self): """Checks whether the job has been running for the CS defined maximum wall clock time. """ if "StartTime" in self.initialValues: startTime = self.initialValues["StartTime"] if time.time() - startTime > self.maxWallClockTime: self.log.info("Job has exceeded maximum wall clock time of %s seconds" % (self.maxWallClockTime)) return S_ERROR(JobMinorStatus.JOB_EXCEEDED_WALL_CLOCK) else: return S_OK("Job within maximum wall clock time") else: return S_ERROR("Job start time could not be established") ############################################################################# def __checkLoadAverage(self): """Checks whether the CS defined maximum load average is exceeded.""" if "LoadAverage" in self.parameters: loadAvg = self.parameters["LoadAverage"][-1] if loadAvg > float(self.loadAvgLimit): self.log.info("Maximum load average exceeded, defined in CS as %s " % (self.loadAvgLimit)) return S_ERROR("Job exceeded maximum load average") return S_OK("Job running with normal load average") return S_ERROR("Job load average not established") ############################################################################# def __peek(self): """Uses ExecutionThread.getOutput() method to obtain standard output from running thread via subprocess callback function. 
""" result = self.exeThread.getOutput() if not result["OK"]: self.log.warn("Could not obtain output from running application thread") self.log.warn(result["Message"]) return result ############################################################################# def calibrate(self): """The calibrate method obtains the initial values for system memory and load and calculates the margin for error for the rest of the Watchdog cycle. """ self.__getWallClockTime() self.parameters["WallClockTime"] = [] cpuConsumed = self.__getCPU() if not cpuConsumed["OK"]: self.log.warn("Could not establish CPU consumed, setting to 0.0") cpuConsumed = 0.0 else: cpuConsumed = cpuConsumed["Value"] self.initialValues["CPUConsumed"] = cpuConsumed self.parameters["CPUConsumed"] = [] self.initialValues["LoadAverage"] = float(os.getloadavg()[0]) self.parameters["LoadAverage"] = [] memUsed = self.getMemoryUsed() self.initialValues["MemoryUsed"] = memUsed self.parameters["MemoryUsed"] = [] result = self.profiler.vSizeUsage(withChildren=True) if not result["OK"]: self.log.warn("Could not get vSize info from profiler", result["Message"]) else: vsize = result["Value"] * 1024.0 self.initialValues["Vsize"] = vsize self.log.verbose("Vsize(kb)", "%.1f" % vsize) self.parameters["Vsize"] = [] result = self.profiler.memoryUsage(withChildren=True) if not result["OK"]: self.log.warn("Could not get rss info from profiler", result["Message"]) else: rss = result["Value"] * 1024.0 self.initialValues["RSS"] = rss self.log.verbose("RSS(kb)", "%.1f" % rss) self.parameters["RSS"] = [] # We exclude fuse so that mountpoints can be cleaned up by automount after a period unused # (specific request from CERN batch service). result = self.getDiskSpace(exclude="fuse") self.log.verbose("DiskSpace: %s" % (result)) if not result["OK"]: self.log.warn("Could not establish DiskSpace") else: self.initialValues["DiskSpace"] = result["Value"] self.parameters["DiskSpace"] = [] result = self.getNodeInformation() self.log.verbose("NodeInfo", result) if "LSB_JOBID" in os.environ: result["LocalJobID"] = os.environ["LSB_JOBID"] if "PBS_JOBID" in os.environ: result["LocalJobID"] = os.environ["PBS_JOBID"] if "QSUB_REQNAME" in os.environ: result["LocalJobID"] = os.environ["QSUB_REQNAME"] if "JOB_ID" in os.environ: result["LocalJobID"] = os.environ["JOB_ID"] self.__reportParameters(result, "NodeInformation", True) self.__reportParameters(self.initialValues, "InitialValues") return S_OK() def __timeLeft(self): """ return Normalized CPU time left in the batch system 0 if not available update self.timeLeft and self.littleTimeLeft """ # Get CPU time left in the batch system result = self.timeLeftUtil.getTimeLeft(0.0) if not result["OK"]: # Could not get CPU time left, we might need to wait for the first loop # or the Utility is not working properly for this batch system # or we are in a batch system timeLeft = 0 else: timeLeft = result["Value"] self.timeLeft = timeLeft if not self.littleTimeLeft: if timeLeft and timeLeft < self.grossTimeLeftLimit: self.log.info("TimeLeft bellow %s, now checking with higher frequency" % timeLeft) self.littleTimeLeft = True # TODO: better configurable way of doing this to be coded self.littleTimeLeftCount = 15 else: if self.timeLeft and self.timeLeft < self.fineTimeLeftLimit: timeLeft = -1 return timeLeft ############################################################################# def __getUsageSummary(self): """Returns average load, memory etc. 
    #############################################################################
    def __getUsageSummary(self):
        """Returns average load, memory, etc. over the execution of the job thread"""
        summary = {}

        # CPUConsumed
        if "CPUConsumed" in self.parameters:
            cpuList = self.parameters["CPUConsumed"]
            if cpuList:
                hmsCPU = cpuList[-1]
                rawCPU = self.__convertCPUTime(hmsCPU)
                if rawCPU["OK"]:
                    summary["LastUpdateCPU(s)"] = rawCPU["Value"]
            else:
                summary["LastUpdateCPU(s)"] = "Could not be estimated"

        # DiskSpace
        if "DiskSpace" in self.parameters:
            space = self.parameters["DiskSpace"]
            if space:
                value = abs(float(space[-1]) - float(self.initialValues["DiskSpace"]))
                if value < 0:
                    value = 0
                summary["DiskSpace(MB)"] = value
            else:
                summary["DiskSpace(MB)"] = "Could not be estimated"

        # MemoryUsed
        if "MemoryUsed" in self.parameters:
            memory = self.parameters["MemoryUsed"]
            if memory:
                summary["MemoryUsed(kb)"] = abs(float(memory[-1]) - float(self.initialValues["MemoryUsed"]))
            else:
                summary["MemoryUsed(kb)"] = "Could not be estimated"

        # LoadAverage
        if "LoadAverage" in self.parameters:
            laList = self.parameters["LoadAverage"]
            if laList:
                summary["LoadAverage"] = sum(laList) / len(laList)
            else:
                summary["LoadAverage"] = "Could not be estimated"

        result = self.__getWallClockTime()
        if not result["OK"]:
            self.log.warn("Failed determining wall clock time", result["Message"])
            summary["WallClockTime(s)"] = 0
            summary["ScaledCPUTime(s)"] = 0
        else:
            wallClock = result["Value"]
            summary["WallClockTime(s)"] = wallClock * self.processors
            summary["ScaledCPUTime(s)"] = wallClock * self.cpuPower * self.processors

        self.__reportParameters(summary, "UsageSummary", True)
        self.currentStats = summary

    #############################################################################
    def __reportParameters(self, params, title=None, report=False):
        """Will report parameters for the job."""
        try:
            parameters = []
            self.log.info("", "==========================================================")
            if title:
                self.log.info("Watchdog will report", title)
            else:
                self.log.info("Watchdog will report parameters")
            self.log.info("", "==========================================================")
            vals = params
            if "Value" in params:
                if vals["Value"]:
                    vals = params["Value"]
            for k, v in vals.items():
                if v:
                    self.log.info(str(k) + " = " + str(v))
                    parameters.append([k, v])
            if report:
                self.__setJobParamList(parameters)

            self.log.info("", "==========================================================")
        except Exception as x:
            self.log.warn("Problem while reporting parameters")
            self.log.warn(repr(x))

    #############################################################################
    def __getWallClockTime(self):
        """Establishes the Wall Clock time spent since the Watchdog initialization"""
        result = S_OK()
        if "StartTime" in self.initialValues:
            currentTime = time.time()
            wallClock = currentTime - self.initialValues["StartTime"]
            result["Value"] = wallClock
        else:
            self.initialValues["StartTime"] = time.time()
            result["Value"] = 0.0

        return result

    #############################################################################
    def __killRunningThread(self):
        """Will kill the running thread process and any child processes."""
        self.log.info("Sending kill signal to application PID %s" % (self.spObject.getChildPID()))
        result = self.spObject.killChild()
        self.applicationKilled = True
        self.log.info("Subprocess.killChild() returned: %s" % (result))
        return S_OK("Thread killed")
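    # Control-signal round trip (illustrative): __sendSignOfLife() below forwards the
    # heartbeat to the JobStateUpdate service; if the WMS replies with a value such as
    # {"Kill": True}, __interpretControlSignal() records RECEIVED_KILL_SIGNAL and calls
    # __killRunningThread() above to stop the payload.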
""" result = JobStateUpdateClient().sendHeartBeat(jobID, heartBeatDict, staticParamDict) if not result["OK"]: self.log.warn("Problem sending sign of life") self.log.warn(result) if result["OK"] and result["Value"]: self.__interpretControlSignal(result["Value"]) return result ############################################################################# def __setJobParamList(self, value): """Wraps around setJobParameters of state update client""" # job wrapper template sets the jobID variable if "JOBID" not in os.environ: self.log.info("Running without JOBID so parameters will not be reported") return S_OK() jobID = os.environ["JOBID"] jobParam = JobStateUpdateClient().setJobParameters(int(jobID), value) self.log.verbose("setJobParameters(%s,%s)" % (jobID, value)) if not jobParam["OK"]: self.log.warn(jobParam["Message"]) return jobParam ############################################################################# def getNodeInformation(self): """Retrieves all static system information""" result = {} result["HostName"] = socket.gethostname() result["CPU(MHz)"] = psutil.cpu_freq()[0] result["Memory(kB)"] = int(psutil.virtual_memory()[1] / 1024) result["LocalAccount"] = getpass.getuser() if os.path.exists("/proc/cpuinfo"): with open("/proc/cpuinfo", "r") as cpuinfo: info = cpuinfo.readlines() try: result["ModelName"] = info[4].split(":")[1].replace(" ", "").replace("\n", "") except IndexError: pass try: result["CacheSize(kB)"] = [x.strip().split(":")[1] for x in info if "cache size" in x][0].strip() except IndexError: pass return result ############################################################################# def getMemoryUsed(self): """Obtains the memory used.""" mem = ( resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss ) return float(mem) ############################################################################# def getDiskSpace(self, exclude=None): """Obtains the available disk space.""" result = S_OK() diskSpace = getDiskSpace(exclude=exclude) if diskSpace == -1: result = S_ERROR("Could not obtain disk usage") self.log.warn(" Could not obtain disk usage") result["Value"] = float(-1) return result result["Value"] = float(diskSpace) return result # EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#