#!/bin/sh # set -xv # # scan-sge-job does not use log-files, it only uses qacct. # # usage: scan_sge_job control_dir ... # ARC1 passes first the config file. if [ "$1" = "--config" ]; then shift; ARC_CONFIG=$1; shift; fi if [ -z "$1" ] ; then exit 1 ; fi # first control_dir is used for storing own files control_dir=$1 control_dirs= while [ $# -gt 0 ] ; do control_dirs="${control_dirs} $1" shift done joboption_lrms="sge" lrms_options="sge_root sge_cell sge_qmaster_port sge_execd_port sge_bin_path sge_jobopts" queue_options="sge_jobopts" # define paths and config parser basedir=`dirname $0` basedir=`cd $basedir > /dev/null && pwd` || exit $? . "${basedir}/lrms_common.sh" # include common scan functions . "${pkgdatadir}/scan_common.sh" || exit $? # run common init # * parse config # * load LRMS-specific env # * set common variables common_init # Assume that gm-kick and scan_common are installed in the same directory GMKICK=${pkglibexecdir}/gm-kick olog () { echo "[`date +%Y-%m-%d\ %T`] scan-sge-job: $*" 1>&2; } umask 022 my_id=`id -u` # Log system performance if [ ! -z "$perflogdir" ]; then perflog_common "$perflogdir" "$CONFIG_controldir" fi # Takes diagfile_acct, SGE qacct command, id and extraargs # writes output to diagfile_acct (temporary file for accounting info) write_diagfile_acct () { diagfile_acct=$1 qacct_command=$2 id=$3 extraargs=$4 #Check that qacct is available. if ! `which $qacct_command >dev/null` ; then olog "WARNING: qacct was not found. Accounting data will not be recorded." fi $qacct_command -j $id $extraargs 2> /dev/null \ | /usr/bin/perl -e 'while(<>) { $nodename=$1 if /^hostname\s+(\S+)/; $id=$1 if /^jobnumber\s+(\S+)/; $exitcode=$1 if /^exit_status\s+(\S+)/; $failed=$1 if /^failed\s+(.*\S)/; $CPUTime=$1 if /^cpu\s+(\d+)/; $Processors=$1 if /^slots\s+(\d+)/; $KernelTime=$1 if /^ru_stime\s+(\d+)/; $WallTime=$1 if /^ru_wallclock\s+(\d+)/; $UsedMemory=$1 if /^maxvmem\s+(\S+)M/; $UsedMemory=$1*1024 if /^maxvmem\s+(\S+)G/; $start_time=`date -d "$1" +%s` if /^start_time\s+(.+)/; $end_time =`date -d "$1" +%s` if /^end_time\s+(.+)/; } # converts seconds since epoch in local time into Mds time format (UTC) sub mds_date { my ($seconds) = @_; my @t = gmtime $seconds; my ($Y,$M,$D,$h,$m,$s) = (1900+$t[5],1+$t[4],$t[3],$t[2],$t[1],$t[0]); return sprintf "%04i%02i%02i%02i%02i%02iZ",$Y,$M,$D,$h,$m,$s; } END { exit unless $id; print "LRMSStartTime=".mds_date($1)."\n" if $start_time =~ m/^(\d+)$/; print "LRMSEndTime=" .mds_date($1)."\n" if $end_time =~ m/^(\d+)$/; print "nodename=${nodename}\n"; print "CPUTime=${CPUTime}.0s\n"; print "Processors=${Processors}\n"; print "WallTime=${WallTime}.0s\n"; print "KernelTime=${KernelTime}.0s\n"; print "UserTime=".int($CPUTime-$KernelTime).".0s\n"; print "AverageTotalMemory=".int($UsedMemory*1024)."kB\n"; print "failed=$failed\n"; print "\nsgeexitcode=$exitcode\n"; }' \ > "$diagfile_acct" } # Parse GRAMI-file and see what was requested check_exceeded_resources_grami () { gramifile=$1 errorsfile=$2 used_cputime=$3 used_walltime=$4 used_memory=$5 req_walltime=`sed -n "s/^joboption_walltime=//p" "$gramifile" | tail -n 1` req_cputime=`sed -n "s/^joboption_cputime=//p" "$gramifile" | tail -n 1` req_memory=`sed -n "s/^joboption_memory=//p" "$gramifile" | tail -n 1` if [ ! -z "$used_memory" ] && [ ! -z "$req_memory" ] \ && [ "$req_memory" != "" ] && [ "$req_memory" -gt 0 ] \ && [ $(( 100*$used_memory/1024/$req_memory )) -gt 95 ]; then overlimit="memory" fi if [ ! -z "$used_cputime" ] && [ ! -z "$req_cputime" ] \ && [ "$req_cputime" != "" ] && [ "$req_cputime" -gt 0 ] \ && [ $(( 100*$used_cputime/$req_cputime )) -gt 95 ]; then overlimit="cputime" fi if [ ! -z "$used_walltime" ] && [ ! -z "$req_walltime" ] \ && [ "$req_walltime" != "" ] && [ "$req_walltime" -gt 0 ] \ && [ $(( 100*$used_walltime/$req_walltime )) -gt 95 ]; then overlimit="walltime" fi echo ++++++++++++++++++++++++++ >> "$errorsfile" echo Resources: >> "$errorsfile" echo ++++++++++++++++++++++++++ >> "$errorsfile" echo req_memory=$req_memory MB >> "$errorsfile" echo req_cputime=$req_cputime >> "$errorsfile" echo req_walltime=$req_walltime >> "$errorsfile" echo used_memory=$used_memory kB >> "$errorsfile" echo used_cputime=$used_cputime >> "$errorsfile" echo used_walltime=$used_walltime >> "$errorsfile" if [ ! -z "$overlimit" ]; then echo overlimit=$overlimit >> "$errorsfile" fi echo ++++++++++++++++++++++++++ >> "$errorsfile" } #Handle failedcode handle_failedcode () { failedcode=$1 donefile=$2 exitcode=$3 sgeexitcode=$4 overlimit=$5 if [ -z "$failedcode" ]; then # Should never happen olog "SGE job $id failed: SGE accouting record is incomplete" echo "-1 SGE accouting record is incomplete" > "$donefile" elif [ "$failedcode" = "0" ]; then if [ -z "$exitcode" ]; then olog "SGE job $id failed with unknown exit code" if [ -z "$sgeexitcode" ] || [ "$sgeexitcode" = 0 ]; then sgeexitcode="-1"; fi echo "$sgeexitcode Job failed with unknown exit code" > "$donefile" elif [ "$exitcode" = "0" ]; then #olog "SGE job $id finished successfully" echo "0" > "$donefile" else #olog "SGE job $id failed with exit code $exitcode" if [ -z "$sgeexitcode" ] || [ "$sgeexitcode" = 0 ]; then sgeexitcode="-1"; fi echo "$sgeexitcode Job failed with exit code $exitcode" > "$donefile" fi else # SGE reports a problem if [ "$failedcode" = "25" ]; then failedreason="SGE error $failedcode: Job will be rescheduled" elif [ "$failedcode" = "24" ]; then failedreason="SGE error $failedcode: Job will be migrated" elif [ "$failedcode" = "100" ]; then # This happens when SGE signals the job, as in the case when a # resource limit is exceeded. We don't know for sure whether # they were enforced or not but if a job is killed by SGE, this # might be the likely cause. if [ -z "$overlimit" ]; then failedreason="SGE error $failedreason" elif [ $overlimit = "memory" ]; then failedreason="job killed: vmem" elif [ $overlimit = "cputime" ]; then failedreason="job killed: cput" elif [ $overlimit = "walltime" ]; then failedreason="job killed: wall" fi else failedreason="SGE error $failedreason" fi if [ ! -z "$eqwmessage" ]; then failedreason="$eqwmessage" fi olog "SGE job $id failed: $failedreason" if [ -z "$sgeexitcode" ] || [ "$sgeexitcode" = 0 ]; then sgeexitcode="-1"; fi echo "271 $failedreason" > "$donefile" fi # failedcode } # Add accounting info to $diagfile add_accounting_to_diag () { diagfile=$1 diagfile_acct=$2 diagfile_tmp=$3 errorsfile=$4 uid=$5 commentfile=$6 ctr_dir=$7 gramifile="${ctr_dir}/job.${gridid}.grami" donefile="${ctr_dir}/job.${gridid}.lrms_done" countfile="${ctr_dir}/job.${gridid}.lrms_job" errorsfile="${ctr_dir}/job.${gridid}.errors" localfile="${ctr_dir}/job.${gridid}.local" used_walltime=`sed -n 's/^WallTime=\(.*\).0s/\1/p' "$diagfile_acct" | tail -n 1` used_cputime=`sed -n 's/^CPUTime=\(.*\).0s/\1/p' "$diagfile_acct" | tail -n 1` used_memory=`sed -n 's/^AverageTotalMemory=\(.*\)kB/\1/p' "$diagfile_acct" | tail -n 1` sgeexitcode=`sed -n 's/^sgeexitcode=//p' "$diagfile_acct" | tail -n 1` failedreason=`sed -n 's/^failed=//p' "$diagfile_acct" | tail -n 1` failedcode=`echo $failedreason | awk '{print $1}'` if [ -s "$diagfile_acct" ]; then # Remove attributes from existing diagfile that we should have # gotten from qacct this time, otherwise we will get duplicates do_as_uid "$uid" "cat '$diagfile'" \ | grep -v "^nodename=" \ | grep -v "^WallTime=" \ | grep -v "^KernelTime=" \ | grep -v "^UserTime=" \ | grep -v "^CPUTime=" \ | grep -v "^Processors=" \ | grep -v "^LRMSStartTime=" \ | grep -v "^LRMSEndTime=" \ | grep -v "^MaxResidentMemory=" \ | grep -v "^AverageTotalMemory=" \ > "$diagfile_tmp" cat "$diagfile_tmp" "$diagfile_acct" \ | grep -v '^sgeexitcode=' \ | do_as_uid "$uid" "cat > '$diagfile'" # Check for exceeded resources limits overlimit= if [ -s "$gramifile" ]; then check_exceeded_resources_grami "$gramifile" "$errorsfile" "$used_cputime" "$used_walltime" "$used_memory" fi # grami file save_commentfile "$uid" "$commentfile" "$errorsfile" handle_failedcode "$failedcode" "$donefile" "$exitcode" "$sgeexitcode" "$overlimit" # wake up GM $GMKICK -j "${gridid}" "$localfile" >> "$errorsfile" rm -f "$countfile" rm -f "$diagfile_tmp" "$diagfile_acct" # we're done, go to next job id return 0 fi # accounting info ok rm -f "$diagfile_tmp" "$diagfile_acct" return 1 } # Handle missing accounting info handle_missing_accounting () { countfile=$1 uid=$2 commentfile=$3 errorsfile=$4 donefile=$5 exitcode=$6 localfile=$7 GMKICK=$8 SGE_BIN_PATH=$9 id=$10 if [ -n "$noaccounting" ]; then # Accounting file is not accessible on this host. echo "scan-sge-job: WARNING: SGE's accounting file is not accessible on the Grid frontend node" >> "$errorsfile" echo "scan-sge-job: WARNING: Resource usage reported for this job may be inaccurate or incomplete" >> "$errorsfile" if [ -z "$exitcode" ]; then echo "-1 Job failed with unknown exit code" > "$donefile" elif [ "$exitcode" = "0" ]; then echo "0" > "$donefile" else echo "$exitcode Job failed with exit code $exitcode" > "$donefile" fi $GMKICK -j "${gridid}" "$localfile" >> "$errorsfile" return fi # There is a certain lag between the end of the job # and the time when accouting information becomes available. # We do 5 retries, keeping the count in $countfile counter=0 if [ -f "$countfile" ] ; then counter=`cat "$countfile"` counter=$(( $counter + 1 )) fi if [ "$counter" -gt 5 ]; then # Cannot wait more for accounting info. echo "scan-sge-job: WARNING: No SGE accounting record found for this job" >> "$errorsfile" echo "scan-sge-job: WARNING: Resource usage reported for this job may be inaccurate or incomplete" >> "$errorsfile" save_commentfile "$uid" "$commentfile" "$errorsfile" if [ -z "$exitcode" ]; then olog "No SGE accounting record found for job $id. No exitcode in diag file" echo "-1 Job failed with unknown exit code" > "$donefile" elif [ "$exitcode" = "0" ]; then olog "No SGE accounting record found for job $id. exitcode=$exitcode in diag file" echo "0" > "$donefile" else olog "No SGE accounting record found for job $id. exitcode=$exitcode in diag file" echo "$exitcode Job failed with exit code $exitcode" > "$donefile" fi rm -f "$countfile" # wake up GM $GMKICK -j "${gridid}" "$localfile" >> "$errorsfile" else # test again for job existence, only count if not known ${SGE_BIN_PATH}/qstat -j $id > /dev/null 2>&1 if [ $? -ne 0 ]; then echo "$counter" > "$countfile" else olog "SGE job $id disappeared and then reappeared!" rm -f "$countfile" fi fi } # GD: no attempt to look for SGE Manager logfiles, restrict to job logs. if [ ! -z "$perflogdir" ]; then start_ts=`date +%s.%N` fi # Get all jobs pids=`${SGE_BIN_PATH}/qstat -u '*' 2>/dev/null | sed -n 's/^ *\([0-9][0-9]*\) .*/\1/p'` if [ $? != 0 ]; then olog "Failed running ${SGE_BIN_PATH}/qstat" sleep 60 exit 1 fi if [ ! -z "$perflogdir" ]; then stop_ts=`date +%s.%N` t=`awk "BEGIN { printf \"%.3f\", ${stop_ts}-${start_ts} }"` echo "[`date +%Y-%m-%d\ %T`] scan-sge-job, qstat -u '*': $t" >> $perflogfile fi # Go through directories for ctr_dir in $control_dirs ; do # Obtain ids of pending/running jobs stored in job.*.local if [ ! -z "$perflogdir" ]; then start_ts=`date +%s.%N` fi rjobs=`find "$ctr_dir/processing" -name 'job.*.status' 2>/dev/null \ | sed 's/processing\/job\.\([^\.]*\)\.status$/job.\1.local/'` if [ ! -z "$perflogdir" ]; then stop_ts=`date +%s.%N` t=`awk "BEGIN { printf \"%.3f\", ${stop_ts}-${start_ts} }"` echo "[`date +%Y-%m-%d\ %T`] scan-sge-job, control dir traversal: $t" >> $perflogfile fi if [ -z "$rjobs" ] ; then continue ; fi ids=`echo $rjobs | xargs grep -h '^localid=' 2>/dev/null | sed 's/^localid=\([^ ]*\)/\1/'` if [ -z "$ids" ] ; then continue ; fi # compare them to running jobs and find missing bids= for id in $ids ; do found=`echo "$pids" | grep "^$id"` if [ -z "$found" ] ; then bids="$bids $id" fi done # go through missing ids for id in $bids ; do # find grid job corresponding to current local id jobfile=`find "$ctr_dir/processing" -name 'job.*.status' 2>/dev/null \ | sed 's/processing\/job\.\([^\.]*\)\.status$/job.\1.local/' \ | xargs grep -l "localid=$id\$" 2>/dev/null` if [ -z "$jobfile" ] ; then continue ; fi # unless running as superuser, skip jobs not belonging to the current user if [ "$my_id" != '0' ] ; then if [ ! -O "$jobfile" ] ; then continue ; fi fi # find who runs this job uid=$(get_owner_uid "$jobfile") [ -z "$uid" ] && { log "Failed to stat $jobfile"; continue; } # extract grid id gridid=`basename "$jobfile" '.local' | sed 's/^job\.//'` donefile="${ctr_dir}/job.${gridid}.lrms_done" countfile="${ctr_dir}/job.${gridid}.lrms_job" failedfile="${ctr_dir}/job.${gridid}.failed" errorsfile="${ctr_dir}/job.${gridid}.errors" localfile="${ctr_dir}/job.${gridid}.local" xmlfile="${ctr_dir}/job.${gridid}.xml" if [ -f "$donefile" ] ; then continue ; fi statusfile="${ctr_dir}/processing/job.${gridid}.status" if [ ! -f "$statusfile" ] ; then continue ; fi status=`cat "$statusfile"` if [ "$status" != "INLRMS" ] && [ "$status" != "CANCELING" ] ; then continue ; fi # get session directory of this job session=`grep -h '^sessiondir=' "$jobfile" | sed 's/^sessiondir=\(.*\)/\1/'` commentfile="${session}.comment" commentfile2="${session}/.comment" do_as_uid "$uid" "cat '$commentfile2' >> '$commentfile' 2> /dev/null; rm -f '$commentfile2'"; if [ -d "$session" ] ; then diagfile="${session}.diag" diagfile2="${session}/.diag" do_as_uid "$uid" "cat '$diagfile2' >> '$diagfile' 2> /dev/null; rm -f '$diagfile2'"; # try to obtain the exit code # $diagfile is owned by the user running the job. Switch user to access it. exitcode=`do_as_uid "$uid" "sed -n 's/^exitcode=//p' '$diagfile'" | tail -n 1` diagfile_tmp=`mktemp "$TMPDIR/diag_tmp.XXXXXX"` || { sleep 60; exit 1; } diagfile_acct=`mktemp "$TMPDIR/diag_acct.XXXXXX"` || { sleep 60; exit 1; } noaccounting= # qacct can take quite long. Here is a workaround. # Find the accounting file, and copy the last 50000 # records to a temp file. acctfile=$SGE_ROOT/$SGE_CELL/common/accounting if [ -f "$acctfile" ]; then briefacct=`mktemp "$TMPDIR/accounting.XXXXXX"` || { sleep 60; exit 1; } tail -n 50000 "$acctfile" > "$briefacct" if [ $? = 0 ]; then extraargs="-f $briefacct"; fi else # Accounting file is not accessible on this host. noaccounting=1 fi eqwmessage=`grep "SGE job state was Eqw" $xmlfile | sed -r 's/<[/]?OtherMessages>//g'` # get accounting info. write diag file write_diagfile_acct "$diagfile_acct" "${SGE_BIN_PATH}/qacct" "$id" "$extraargs" if [ "x$briefacct" != "x" ]; then rm -f "$briefacct"; fi # If the last qacct record is about migration, # we should wait for the next qacct record to appear # Delete file, like there was no accounting present at all! if grep -q "^failed=24 *: migrating" "$diagfile_acct" \ || grep -q "^failed=25 *: rescheduling" "$diagfile_acct"; then rm -f "$diagfile_acct" olog "SGE job $id: the last record in qacct is about migration. Waiting for next record to appear" fi # Add accounting info to $diagfile add_accounting_to_diag "$diagfile" "$diagfile_acct" "$diagfile_tmp" "$errorsfile" "$uid" "$commentfile" "$ctr_dir" if [ $? = 0 ]; then continue fi fi # session directory exists # This section is only reached when accounting info is not present handle_missing_accounting "$countfile" "$uid" "$commentfile" "$errorsfile" "$donefile" "$exitcode" "$localfile" "$GMKICK" "$SGE_BIN_PATH" "$id" done # loop over bids # Detect the unlikely situation when a job reappears in the qstat listing # after being absent in the previous run of the scan-*-job (which updated # the counter file) for countfile in `find $ctr_dir -name 'job.*.lrms_job'`; do localfile=${countfile%.lrms_job}.local pid=`sed -n 's/^localid=\([^ ]*\)/\1/p' "$localfile" 2>/dev/null` if [ -z "$pid" ]; then continue; fi if echo "$pids" | grep "^$pid\$" >/dev/null; then olog "SGE job $id disappeared and then reappeared!" rm -f "$countfile" fi done done # loop over control_dirs sleep 60 exit 0