#!/bin/sh
#
#
#   Scan for finished LSF jobs, using bjobs
#
# usage: scan_lsf_job control_dir ...

# Set variables:
#   LSF_BIN_PATH

# ARC1 passes the config file first, as "--config <file>"; consume it here.
case "$1" in
  --config)
    shift
    ARC_CONFIG=$1
    shift
    ;;
esac

# At least one control directory has to follow.
if [ -z "$1" ] ; then
    echo "Missing control directory path" 1>&2
    exit 1
fi

# The first control directory is the one used for storing this script's
# own files.
echo $(date)" : control_dir=$1" 1>&2
control_dir=$1

# Fold every remaining argument into a space-separated list; the positional
# parameters are fully consumed (shifted away) in the process.
control_dirs=
until [ $# -eq 0 ] ; do
  control_dirs="${control_dirs} $1"
  shift
done

# LRMS identity and the LRMS-specific option names.
# NOTE(review): presumably consumed by the sourced helpers / common_init -
# confirm against lrms_common.sh.
joboption_lrms="lsf"
lrms_options="lsf_architecture lsf_bin_path"

# Resolve the absolute directory this script lives in. "$0" and the
# intermediate result are quoted so that an installation path containing
# whitespace or glob characters is not split or expanded.
basedir=$(dirname -- "$0")
basedir=$(cd -- "$basedir" > /dev/null && pwd) || exit $?

# define paths and config parser
# NOTE(review): pkgdatadir and pkglibexecdir used below are presumably
# defined by this file - confirm.
. "${basedir}/lrms_common.sh"

# include common scan functions
. "${pkgdatadir}/scan_common.sh" || exit $?

# run common init
#  * parse config
#  * load LRMS-specific env
#  * set common variables
common_init

# Assume that gm-kick is installed in the same directory
GMKICK=${pkglibexecdir}/gm-kick

# Files created from here on are writable by the owner only and readable
# by everyone.
umask 022

# Log system performance
if [ -n "$perflogdir" ]; then
   perflog_common "$perflogdir" "$CONFIG_controldir"
fi

# Start timestamp for the bjobs timing that is written to the performance
# log further down. Tested separately from the call above, in case
# perflog_common changes perflogdir.
if [ -n "$perflogdir" ]; then
   start_ts=$(date +%s.%N)
fi

# Numeric uid this scan runs as; the job loop compares it against '0' (root).
my_id=$(id -u)

# The LSF client commands are invoked as ${LSF_BIN_PATH}/bjobs, so the
# variable is mandatory. The expansion is quoted so the test is well-formed
# for any value, and the message names the variable instead of expanding
# its (empty) value.
if [ -z "${LSF_BIN_PATH}" ]; then
    echo "LSF_BIN_PATH not set" 1>&2
    exit 1
fi

# Ask LSF for the job list with "bjobs -a -u all". Errors from bjobs are
# discarded; an empty result is reported below but is not treated as fatal.
lsf_stat=$("${LSF_BIN_PATH}/bjobs" -a -u all 2>/dev/null)

# Record how long the bjobs call took (start_ts was taken before the call).
# NOTE(review): perflogfile is presumably set by perflog_common - confirm.
if [ -n "$perflogdir" ]; then
   stop_ts=$(date +%s.%N)
   t=$(awk -v t0="$start_ts" -v t1="$stop_ts" 'BEGIN { printf "%.3f", t1 - t0 }')
   echo "[$(date '+%Y-%m-%d %T')] scan-lsf-job, bjobs -a -u all: $t" >> "$perflogfile"
fi

if [ -z "${lsf_stat}" ] ; then
    echo "bjobs returned empty result" 1>&2
fi

# Ids (first column) of the jobs LSF still reports as suspended, running or
# pending. The state is matched against the STAT column only (third column
# of the default bjobs listing; the per-job query later in this script reads
# the state from the same column), so a finished job whose name, queue or
# host merely contains e.g. "RUN" is not mistaken for an active one.
pids=$(printf '%s\n' "${lsf_stat}" \
       | awk '$3 ~ /^(PSUSP|USUSP|SSUSP|RUN|PEND)$/ { print $1 }')

# Put the collected control directories back into the positional parameters.
eval "set -- $control_dirs"

# Go through all control directories. They were put back into the positional
# parameters by the `eval "set -- $control_dirs"` just before this loop, so
# iterate over "$@" rather than over the first directory only.
for ctr_dir in "$@" ; do
    if [ -n "$perflogdir" ]; then
      start_ts=$(date +%s.%N)
    fi

    # Obtain ids stored in job.*.local: every processing/job.<id>.status path
    # found is rewritten to the matching job.<id>.local path, and the digits
    # following "localid=" are extracted from those files.
    ids=$(find "${ctr_dir}/processing" -name 'job.*.status' -print0 \
          | sed 's/processing\/job\.\([^\.]*\)\.status/job.\1.local/g' \
          | xargs -0 grep -h "^localid=" 2>/dev/null \
          | sed 's/^localid=\([0-9]*\).*/\1/')

    if [ -n "$perflogdir" ]; then
      stop_ts=$(date +%s.%N)
      t=$(awk -v t0="$start_ts" -v t1="$stop_ts" 'BEGIN { printf "%.3f", t1 - t0 }')
      echo "[$(date '+%Y-%m-%d %T')] scan-lsf-job, ControlDirTraversal: $t" >> "$perflogfile"
    fi

    if [ -z "$ids" ] ; then continue ; fi

    # compare them to the active jobs and collect the ones LSF no longer lists
    bids=
    for id in $ids ; do
        if ! printf '%s\n' "$pids" | grep -q -x -F -- "$id" ; then
            bids="$bids $id"
        fi
    done

    if [ -n "$perflogdir" ]; then
      start_ts=$(date +%s.%N)
    fi
    done_count=0
    total_count=0

    # go through missing ids
    for id in $bids ; do

        # Find the grid job corresponding to the current local id. The match
        # is anchored on both sides so that id 12 does not also select a job
        # recorded as localid=123.
        jobfile=$(find "${ctr_dir}/processing" -name 'job.*.status' -print0 \
                  | sed 's/processing\/job\.\([^\.]*\)\.status/job.\1.local/g' \
                  | xargs -0 grep -l -E "^localid=${id}([^0-9]|\$)" 2>/dev/null)
        if [ -z "$jobfile" ] ; then continue ; fi
        total_count=$(( total_count + 1 ))

        # extract grid id from the file name job.<gridid>.local
        gridid=$(basename "$jobfile" '.local' | sed 's/^job\.//')

        # Nothing to do if the job was already marked as finished.
        donefile="${ctr_dir}/job.${gridid}.lrms_done"
        if [ -f "$donefile" ] ; then continue ; fi

        statusfile="${ctr_dir}/processing/job.${gridid}.status"
        if [ ! -f "$statusfile" ] ; then continue ; fi

        # Only jobs in state INLRMS or CANCELING are handled here.
        status=$(cat "$statusfile")
        if [ "$status" != "INLRMS" ] && [ "$status" != "CANCELING" ] ; then continue ; fi

        # Unless running as root, only handle jobs whose .local file we own.
        if [ "$my_id" != '0' ] ; then
          if [ ! -O "$jobfile" ] ; then continue ; fi
        fi
        uid=$(get_owner_uid "$jobfile")
        [ -z "$uid" ] && { log "Failed to stat $jobfile"; continue; }

        # get session directory of this job
        sessiondir=$(grep -h '^sessiondir=' "$jobfile" | sed 's/^sessiondir=\(.*\)/\1/')

        # get job specific output and remove header
        bjobs_output=$("${LSF_BIN_PATH}/bjobs" -W -w "$id" 2>/dev/null | sed -e '1,1d')
        # $bjobs_output is deliberately left unquoted here and below: the
        # shell collapses it onto one line so awk can address fields by number.
        job_status=$(echo $bjobs_output | awk '{print $3}')

        # DONE if exit_code is 0, EXIT if non zero
        if [ "${job_status}" = "DONE" ] || [ "${job_status}" = "EXIT" ]; then

            job_read_diag

            # Fields 14 and 15 carry the start and end time; dashes are
            # turned into spaces before the date conversion.
            starttime=$(echo $bjobs_output | awk '{print $14}' | sed 's/-/ /g')
            endtime=$(echo $bjobs_output | awk '{print $15}' | sed 's/-/ /g')
            date_to_utc_seconds "$starttime"
            starttime_seconds="$return_date_seconds"
            seconds_to_mds_date "$return_date_seconds"
            LRMSStartTime=$return_mds_date
            date_to_utc_seconds "$endtime"
            endtime_seconds="$return_date_seconds"
            seconds_to_mds_date "$return_date_seconds"
            LRMSEndTime=$return_mds_date

            #TODO handle cputime (walltime * count?) etc.
            # Compute the walltime only when both timestamps are available:
            # an empty operand would be an arithmetic syntax error, which
            # aborts the whole scan under some /bin/sh implementations.
            # NOTE(review): assumes date_to_utc_seconds may leave
            # return_date_seconds empty for an unparsable date - confirm.
            walltime=
            if [ -n "$starttime_seconds" ] && [ -n "$endtime_seconds" ]; then
                walltime=$(( $endtime_seconds - $starttime_seconds ))
            fi
            #cputime=$(( $walltime * $count))
            # Values to write to diag. These will override values already written.
            [ -n "$walltime" ] && WallTime=$walltime
            #[ -n "$cputime" ] && UserTime=$cputime
            #[ -n "$cputime" ] && KernelTime=0

            job_write_diag
            done_count=$(( done_count + 1 ))
        fi

        if [ -n "$sessiondir" ] ; then
            # have chance to obtain exit code from <sessiondir>.diag,
            # read with the uid of the job owner
            diagfile="${sessiondir}.diag"
            exitcode=$(do_as_uid "$uid" "grep '^exitcode=' '$diagfile'" | sed 's/^exitcode=//')

            if [ -n "$exitcode" ] ; then
                # job finished and exit code is known
                save_commentfile "$uid" "${sessiondir}.comment" "${ctr_dir}/job.${gridid}.errors"
                echo "$exitcode Executable finished with exit code $exitcode" > "$donefile"
                ${GMKICK} -j "${gridid}" "$jobfile"
                continue
            fi
        fi

        # Job has probably finished and the exit code is not known. Count in
        # job.<gridid>.lrms_job how many scans in a row ended up here, and
        # declare the job lost once the counter exceeds 5.
        exitcode='-1'
        countfile="${ctr_dir}/job.${gridid}.lrms_job"
        counter=0
        if [ -f "$countfile" ] ; then
            counter=$(cat "$countfile")
            counter=$(( $counter + 1 ))
        fi

        if [ "$counter" -gt 5 ] ; then
            rm -f "$countfile"
            save_commentfile "$uid" "${sessiondir}.comment" "${ctr_dir}/job.${gridid}.errors"
            echo "$exitcode Job was lost with unknown exit code" > "$donefile"
            ${GMKICK} -j "${gridid}" "$jobfile"
        else
            echo "$counter" > "$countfile"
        fi
    done

    if [ -n "$perflogdir" ]; then
       stop_ts=$(date +%s.%N)
       t=$(awk -v t0="$start_ts" -v t1="$stop_ts" 'BEGIN { printf "%.3f", t1 - t0 }')
       echo "[$(date '+%Y-%m-%d %T')] scan-lsf-job, JobProcessing, T=$total_count D=$done_count: $t" >> "$perflogfile"
    fi

    # go through existing ids
    for id in $pids ; do
        # Find the grid job corresponding to the current local id. The ids
        # handled by this script are plain numbers, so the id is matched
        # exactly; requiring a literal "." after it would never match and
        # the failure counter would never be reset.
        jobfile=$(find "${ctr_dir}" -name 'job.*.local' -print0 \
                  | xargs -0 grep -l -E "^localid=${id}([^0-9]|\$)" 2>/dev/null)
        if [ -z "$jobfile" ] ; then continue ; fi
        gridid=$(basename "$jobfile" '.local' | sed 's/^job\.//')
        countfile="${ctr_dir}/job.${gridid}.lrms_job"
        # reset failure counter
        rm -f "$countfile"
    done

done

# NOTE(review): the pause presumably throttles how often the caller re-runs
# this scan - confirm.
sleep 60
exit 0