#!/bin/bash
#
# Helper script to flag done LoadLeveler jobs.
# The script is called periodically by the arex.
#
# Usage: <this script> [--config <arc config file>] <control directory>
#
# Overview:
#   1. Walk <control dir>/processing and collect the LRMS local id of every
#      job whose grid-manager status is INLRMS or CANCELING and which has no
#      .lrms_done file yet.
#   2. Ask LoadLeveler (llq) about all of those ids in one shot and remember
#      the ids that are still known and not completed.
#   3. For every collected job that llq did not report as still active, write
#      its .lrms_done file (with the exit code from the session .diag file
#      when available) and kick the grid manager.

# Space separated list of local ids that llq reports as known and not
# completed. Filled in by get_bunch_jobs_status.
outLocalIdsString=""

# ARC1 passes the config file first.
if [ "$1" = "--config" ]; then
  shift
  ARC_CONFIG=$1
  shift
fi

# Does the control directory exist?
control_dir="$1"
test -d "$control_dir" || exit 1

joboption_lrms="ll"
lrms_options="ll_bin_path ll_consumable_resources ll_parallel_single_jobs"

# define paths and config parser
basedir=$(dirname "$0")
basedir=$(cd "$basedir" > /dev/null && pwd) || exit $?
. "${basedir}/lrms_common.sh"

# include common scan functions
. "${pkgdatadir}/scan_common.sh" || exit $?

# run common init
#  * parse config
#  * load LRMS-specific env
#  * set common variables
common_init

# Assume that gm-kick is installed in the same directory
GMKICK=${pkglibexecdir}/gm-kick

#######################################
# Start a performance timer. Does nothing unless $perflogdir is set.
# Globals: perflogdir (read), start_ts (written)
#######################################
scan_ll_perf_start() {
  [ -n "$perflogdir" ] || return 0
  start_ts=$(date +%s.%N)
}

#######################################
# Append the time elapsed since scan_ll_perf_start to $perflogfile.
# Does nothing unless $perflogdir is set.
# Globals:   perflogdir, perflogfile, start_ts (read)
# Arguments: $1 - label describing the measured phase
#######################################
scan_ll_perf_stop() {
  [ -n "$perflogdir" ] || return 0
  local stop_ts elapsed
  stop_ts=$(date +%s.%N)
  elapsed=$(awk -v begin="$start_ts" -v end="$stop_ts" \
    'BEGIN { printf "%.3f", end-begin }')
  echo "[$(date '+%Y-%m-%d %T')] scan-ll-job, $1: $elapsed" >> "$perflogfile"
}

#######################################
# Retrieve status and id of a bunch of jobs in one llq call, look for jobs
# which have a known status but are not completed (!= C) and append the
# local id of these jobs to $outLocalIdsString.
# Globals:   LL_BIN_PATH (read), outLocalIdsString (appended to)
# Arguments: $1 - space separated list of local ids to check
# Example:   get_bunch_jobs_status "$inLocalIdsString"
#######################################
get_bunch_jobs_status() {
  local llq_rows row
  # Rows of interest look like "<1-2 uppercase letters>!<id>".
  local known_status_re='^[A-Z]{1,2}!.+$'

  # $1 is deliberately unquoted: every id must become its own llq argument.
  # NOTE(review): when llq fails nothing is recorded as running, so the
  # caller treats every queried job as finished. Kept as in the original;
  # confirm llq exit code semantics before changing this.
  # shellcheck disable=SC2086
  if llq_rows=$("$LL_BIN_PATH/llq" -r %st %id $1); then
    # Word splitting of the llq output is intended (one token per field row).
    for row in $llq_rows; do
      [[ $row =~ $known_status_re ]] || continue
      # Completed jobs are not "still running".
      [[ $row == 'C!'* ]] && continue
      # Keep only the last '!'-separated field, i.e. the id.
      outLocalIdsString="$outLocalIdsString ${row##*!}"
    done
  fi
}

# Log system performance
if [ -n "$perflogdir" ]; then
  perflog_common "$perflogdir" "$CONFIG_controldir"
fi

scan_ll_perf_start

my_id=$(id -u)

# mergedlist: array where each element is made of
# key:value, where key is the arc jobid and value is the
# localid
mergedlist=()

# inLocalIdsString: in this string we save the local ids retrieved from
# the arc .local files, separated by spaces
inLocalIdsString=""

# Map every processing/job.<id>.status file to the matching job.<id>.local
# file in the control directory.
findoutput=$(find "$control_dir/processing" -maxdepth 1 -type f -name 'job.*.status' \
  | sed 's/processing\/job\.\([^\.]*\)\.status$/job.\1.local/')

while IFS= read -r local_file; do
  # Continue if nothing was found or other problems
  test -f "$local_file" || continue

  jobid=$(basename "$local_file" .local | sed 's/^job.//')
  donefile="${control_dir}/job.${jobid}.lrms_done"
  statusfile="${control_dir}/processing/job.${jobid}.status"

  # Continue if the job is already flagged as done
  test -f "$donefile" && continue
  test -f "$statusfile" || continue

  # Only jobs that are in the LRMS (or being cancelled there) are of interest.
  gmstatus=$(cat "$statusfile")
  case "$gmstatus" in
    INLRMS | CANCELING) ;;
    *) continue ;;
  esac

  # Get local LRMS id of job by evaluating the line with localid.
  # SECURITY NOTE(review): eval executes the "localid=..." line of the .local
  # file as shell code. This is only safe while the control directory is
  # writable by trusted services only.
  localid=$(grep '^localid=' "$local_file" | head -n 1)
  eval "$localid"

  # Did we get a local id?
  test "$localid" = "" && continue

  # HACK: save the localid to be queried into inLocalIdsString and
  # associate the localid with its jobid in mergedlist
  inLocalIdsString="$inLocalIdsString $localid"
  mergedlist+=("$jobid:$localid")
done <<< "$findoutput"

scan_ll_perf_stop "ControldirTraversal"

scan_ll_perf_start

# Query LoadLeveler for the collected jobs and save the not completed ones
# into $outLocalIdsString.
# Call the function only if there is something in the string: an empty
# argument list would make llq report on every job in the system.
if [[ $inLocalIdsString =~ [[:alnum:]] ]]; then
  get_bunch_jobs_status "$inLocalIdsString"
fi

scan_ll_perf_stop "llq -r %st %id"

scan_ll_perf_start

numelem=0
# Start the loop based on the elements of mergedlist
for element in "${mergedlist[@]}"; do
  # Divide the jobid from the localid (split at the first ':')
  jobid=${element%%:*}
  localid=${element#*:}

  # Exclude the not completed jobs stored in $outLocalIdsString.
  # NOTE(review): this is a substring match, so a local id that is a prefix
  # of a running job's id is also treated as running. Kept as in the
  # original; confirm whether llq ids carry a suffix the local id lacks.
  if [[ $outLocalIdsString == *"$localid"* ]]; then
    continue
  fi

  numelem=$((numelem + 1))

  donefile="${control_dir}/job.${jobid}.lrms_done"
  statusfile="${control_dir}/processing/job.${jobid}.status"
  jobfile="${control_dir}/job.${jobid}.local"
  errorsfile="${control_dir}/job.${jobid}.errors"

  # Continue if the job is already flagged as done
  test -f "$donefile" && continue
  test -f "$statusfile" || continue

  # gmstatus and commentfile are not used below in this script; they are
  # kept as globals in case the sourced helper functions read them.
  gmstatus=$(cat "$statusfile")
  exitcode=''

  # get session directory of this job
  sessiondir=$(grep -h '^sessiondir=' "$control_dir/job.${jobid}.local" \
    | sed 's/^sessiondir=\(.*\)/\1/')
  diagfile="${sessiondir}.diag"
  commentfile="${sessiondir}.comment"

  # Unless running as root, only handle jobs whose .local file we own.
  if [ "$my_id" != '0' ] && [ ! -O "$jobfile" ]; then
    continue
  fi

  uid=$(get_owner_uid "$jobfile")
  if [ -z "$uid" ]; then
    log "Failed to stat $jobfile"
    continue
  fi

  # Without a session directory there is no diag file to look at.
  [ -n "$sessiondir" ] || continue

  # have chance to obtain exit code
  exitcode=$(do_as_uid "$uid" "grep '^exitcode=' '$diagfile'" | sed 's/^exitcode=//')

  if [ -n "$exitcode" ]; then
    if [ "$exitcode" = "152" ] || [ "$exitcode" = "24" ]; then
      exitcode="24"
      save_commentfile "$uid" "${sessiondir}.comment" "$errorsfile"
      echo "$exitcode Job exceeded time limit." > "$donefile"

      # If job exceeded time, then it will have been killed and no
      # cputime/walltime has been written. Take the limits and timestamps
      # from a single "llq -l" call instead.
      llq_details=$("$LL_BIN_PATH/llq" -l "$localid")
      walltime=$(sed -n 's/^ *Wall Clk Hard Limit:.*(\([0-9]*\) seconds.*/\1/p' <<< "$llq_details")
      usertime=$(sed -n 's/^ *Step Cpu Hard Limit:.*(\([0-9]*\) seconds.*/\1/p' <<< "$llq_details")
      starttime=$(sed -n 's/^ *Dispatch Time: \(.*\)/\1/p' <<< "$llq_details")
      endtime=$(sed -n 's/^ *Completion Date: \(.*\)/\1/p' <<< "$llq_details")

      if [ -n "$starttime" ]; then
        date_to_utc_seconds "$starttime"
        seconds_to_mds_date "$return_date_seconds"
        starttime=$return_mds_date
      fi
      if [ -n "$endtime" ]; then
        date_to_utc_seconds "$endtime"
        seconds_to_mds_date "$return_date_seconds"
        endtime=$return_mds_date
      fi

      # Update the diag data with whatever llq could tell us.
      job_read_diag
      [ -n "$walltime" ] && WallTime=${walltime}
      [ -n "$usertime" ] && UserTime=${usertime}
      [ -n "$usertime" ] && KernelTime=0
      [ -n "$starttime" ] && LRMSStartTime=${starttime}
      [ -n "$endtime" ] && LRMSEndTime=${endtime}
      # This needs investigating, might be user program exit code
      [ -n "$exitcode" ] && LRMSExitcode=$exitcode
      job_write_diag

      "${GMKICK}" -j "${jobid}" "$jobfile"
      continue
    fi

    # job finished and exit code is known
    save_commentfile "$uid" "${sessiondir}.comment" "$errorsfile"
    echo "$exitcode Executable finished with exit code $exitcode" >> "$donefile"
    "${GMKICK}" -j "${jobid}" "$jobfile"
    continue
  fi

  # job is gone from the LRMS but left no exit code behind
  exitcode=-1
  save_commentfile "$uid" "${sessiondir}.comment" "$errorsfile"
  echo "$exitcode Job finished with unknown exit code" >> "$donefile"
  "${GMKICK}" -j "${jobid}" "$jobfile"
done

scan_ll_perf_stop "JobHandling, Handled= $numelem"

sleep 60
exit 0