#!/bin/sh
#
# Scan for finished LSF jobs, using bjobs.
#
# usage: scan_lsf_job [--config <file>] control_dir ...
#
# Required variables (expected to be provided by the sourced helpers / config):
#   LSF_BIN_PATH   directory containing the LSF client binaries (bjobs)
#
# For every job found under <control_dir>/processing that is no longer
# reported as pending/running/suspended by bjobs, the script tries to
# determine the exit code and writes <control_dir>/job.<gridid>.lrms_done,
# then notifies the grid manager through gm-kick.
#

# ARC1 passes first the config file.
if [ "$1" = "--config" ]; then
  shift
  ARC_CONFIG=$1
  shift
fi

if [ -z "$1" ]; then
  echo "Missing control directory path" 1>&2
  exit 1
fi

# first control_dir is used for storing own files
echo "$(date) : control_dir=$1" 1>&2
control_dir=$1

# All positional parameters are consumed here; they are restored with
# 'eval "set -- ..."' further below, after the helper scripts have been
# sourced and common_init has run.
control_dirs=
while [ $# -gt 0 ]; do
  control_dirs="${control_dirs} $1"
  shift
done

joboption_lrms="lsf"
lrms_options="lsf_architecture lsf_bin_path"

# define paths and config parser
basedir=$(dirname "$0")
basedir=$(cd "$basedir" > /dev/null && pwd) || exit $?
. "${basedir}/lrms_common.sh"

# include common scan functions
. "${pkgdatadir}/scan_common.sh" || exit $?

# run common init
#  * parse config
#  * load LRMS-specific env
#  * set common variables
common_init

# Assume that gm-kick is installed in the same directory
GMKICK=${pkglibexecdir}/gm-kick

umask 022

# Append the time elapsed since $start_ts to the performance log.
# Does nothing when performance logging is disabled (empty $perflogdir).
#   $1 - label describing the step that was measured
# Sets the globals stop_ts and t, reads start_ts and perflogfile.
lsf_perflog_elapsed() {
  [ -n "$perflogdir" ] || return 0
  stop_ts=$(date +%s.%N)
  t=$(awk "BEGIN { printf \"%.3f\", ${stop_ts}-${start_ts} }")
  echo "[$(date +%Y-%m-%d\ %T)] scan-lsf-job, $1: $t" >> "$perflogfile"
}

# Log system performance
if [ -n "$perflogdir" ]; then
  perflog_common "$perflogdir" "$CONFIG_controldir"
fi

if [ -n "$perflogdir" ]; then
  start_ts=$(date +%s.%N)
fi

my_id=$(id -u)

# The variable must be quoted for the test to be reliable, and the message
# must name the variable rather than expand its (empty) value.
if [ -z "${LSF_BIN_PATH}" ]; then
  echo "LSF_BIN_PATH not set" 1>&2
  exit 1
fi

# Get all jobs known to LSF (all users, including recently finished ones)
lsf_stat=$("${LSF_BIN_PATH}/bjobs" -a -u all 2>/dev/null)

lsf_perflog_elapsed "bjobs -a -u all"

if [ -z "${lsf_stat}" ]; then
  echo "bjobs returned empty result" 1>&2
fi

# First column (job id) of every job that is still pending, running or
# suspended.
pids=$(echo "${lsf_stat}" \
  | grep -E 'PSUSP|USUSP|SSUSP|RUN|PEND' \
  | sed -e 's/^\([^ ]*\).*/\1/')

# Restore the positional parameters consumed above.
eval "set -- $control_dirs"

# Go through directories
# NOTE(review): only the first control directory is scanned here although
# all arguments were collected in control_dirs -- confirm this is intended.
for ctr_dir in $control_dir; do
  if [ -n "$perflogdir" ]; then
    start_ts=$(date +%s.%N)
  fi

  # Obtain ids stored in job.*.local
  # (each processing/job.<id>.status path is rewritten to the matching
  # job.<id>.local path before being handed to grep)
  ids=$(find "${ctr_dir}/processing" -name 'job.*.status' -print0 \
    | sed 's/processing\/job\.\([^\.]*\)\.status/job.\1.local/g' \
    | xargs -0 grep -h "^localid=" 2>/dev/null \
    | sed 's/^localid=\([0-9]*\).*/\1/')

  lsf_perflog_elapsed "ControlDirTraversal"

  if [ -z "$ids" ]; then continue; fi

  # compare them to running jobs and find missing
  bids=
  for id in $ids; do
    if ! echo "$pids" | grep -q -x "$id"; then
      bids="$bids $id"
    fi
  done

  if [ -n "$perflogdir" ]; then
    start_ts=$(date +%s.%N)
  fi

  done_count=0
  total_count=0

  # go through missing ids
  for id in $bids; do
    # find grid job corresponding to current local id
    # The id must match as a whole number: a plain substring match would
    # make id 12 also select a job whose local id is 123.
    jobfile=$(find "${ctr_dir}/processing" -name 'job.*.status' -print0 \
      | sed 's/processing\/job\.\([^\.]*\)\.status/job.\1.local/g' \
      | xargs -0 grep -l -E "^localid=${id}([^0-9]|\$)" 2>/dev/null)
    if [ -z "$jobfile" ]; then continue; fi

    total_count=$(( total_count + 1 ))

    # extract grid id
    gridid=$(basename "$jobfile" '.local' | sed 's/^job\.//')

    donefile="${ctr_dir}/job.${gridid}.lrms_done"
    if [ -f "$donefile" ]; then continue; fi

    statusfile="${ctr_dir}/processing/job.${gridid}.status"
    if [ ! -f "$statusfile" ]; then continue; fi

    status=$(cat "$statusfile")
    if [ "$status" != "INLRMS" ] && [ "$status" != "CANCELING" ]; then
      continue
    fi

    # When not running as root, only handle job files owned by this user
    if [ "$my_id" != '0' ]; then
      if [ ! -O "$jobfile" ]; then continue; fi
    fi

    uid=$(get_owner_uid "$jobfile")
    [ -z "$uid" ] && { log "Failed to stat $jobfile"; continue; }

    # get session directory of this job
    sessiondir=$(grep -h '^sessiondir=' "$jobfile" | sed 's/^sessiondir=\(.*\)/\1/')

    # get job specific output and remove header
    bjobs_output=$("${LSF_BIN_PATH}/bjobs" -W -w "$id" 2>/dev/null | sed -e '1,1d')

    # $bjobs_output is deliberately left unquoted below: word splitting
    # collapses the output into a single whitespace-separated record.
    # shellcheck disable=SC2086
    job_status=$(echo $bjobs_output | awk '{print $3}')

    # DONE if exit_code is 0, EXIT if non zero
    if [ "${job_status}" = "DONE" ] || [ "${job_status}" = "EXIT" ]; then
      job_read_diag

      # Fields 14 and 15 of the wide bjobs listing are taken as start and
      # finish time; dashes are turned into spaces before date parsing.
      # shellcheck disable=SC2086
      starttime=$(echo $bjobs_output | awk '{print $14}' | sed 's/-/ /g')
      # shellcheck disable=SC2086
      endtime=$(echo $bjobs_output | awk '{print $15}' | sed 's/-/ /g')

      date_to_utc_seconds "$starttime"
      starttime_seconds="$return_date_seconds"
      seconds_to_mds_date "$return_date_seconds"
      LRMSStartTime=$return_mds_date

      date_to_utc_seconds "$endtime"
      endtime_seconds="$return_date_seconds"
      seconds_to_mds_date "$return_date_seconds"
      LRMSEndTime=$return_mds_date

      #TODO handle cputime (walltime * count?) etc.
      # Only compute the wall time when both timestamps are available;
      # an empty operand would be an arithmetic syntax error.
      walltime=
      if [ -n "$starttime_seconds" ] && [ -n "$endtime_seconds" ]; then
        walltime=$(( $endtime_seconds - $starttime_seconds ))
      fi
      #cputime=$(( $walltime * $count))

      # Values to write to diag. These will override values already written.
      [ -n "$walltime" ] && WallTime=$walltime
      #[ -n "$cputime" ] && UserTime=$cputime
      #[ -n "$cputime" ] && KernelTime=0

      job_write_diag
      done_count=$(( done_count + 1 ))
    fi

    if [ -n "$sessiondir" ]; then
      # have chance to obtain exit code
      diagfile="${sessiondir}.diag"
      exitcode=$(do_as_uid "$uid" "grep '^exitcode=' '$diagfile'" | sed 's/^exitcode=//')

      if [ -n "$exitcode" ]; then
        # job finished and exit code is known
        save_commentfile "$uid" "${sessiondir}.comment" "${ctr_dir}/job.${gridid}.errors"
        echo "$exitcode Executable finished with exit code $exitcode" > "$donefile"
        "${GMKICK}" -j "${gridid}" "$jobfile"
        continue
      fi
    fi

    # job has probably finished and exit code is not known
    exitcode='-1'
    countfile="${ctr_dir}/job.${gridid}.lrms_job"
    counter=0
    if [ -f "$countfile" ]; then
      counter=$(cat "$countfile")
      counter=$(( $counter + 1 ))
    fi

    # Give the exit code a few more scan passes to show up before
    # declaring the job lost.
    if [ "$counter" -gt 5 ]; then
      rm -f "$countfile"
      save_commentfile "$uid" "${sessiondir}.comment" "${ctr_dir}/job.${gridid}.errors"
      echo "$exitcode Job was lost with unknown exit code" > "$donefile"
      "${GMKICK}" -j "${gridid}" "$jobfile"
    else
      echo "$counter" > "$countfile"
    fi
  done

  lsf_perflog_elapsed "JobProcessing, T=$total_count D=$done_count"

  # go through existing ids
  for id in $pids; do
    # find grid job corresponding to current local id
    # LSF ids are plain numbers, so the id is matched as a whole number
    # (a literal "localid=<id>." pattern never matches them).
    jobfile=$(find "${ctr_dir}" -name 'job.*.local' -print0 \
      | xargs -0 grep -l -E "^localid=${id}([^0-9]|\$)" 2>/dev/null)
    if [ -z "$jobfile" ]; then continue; fi

    gridid=$(basename "$jobfile" '.local' | sed 's/^job\.//')
    countfile="${ctr_dir}/job.${gridid}.lrms_job"

    # reset failure counter
    rm -f "$countfile"
  done
done

sleep 60
exit 0