#!/bin/bash
#
#  Periodically monitor for jobs which have finished or failed but not
#  reported an exitcode
#
#  NOTE: bash is required. The scan loop in this script uses associative
#  arrays (declare -A) and [[ ]], which a strictly POSIX /bin/sh rejects.
#
#set -x
id=$(id -u)

#debug=:
# Write a timestamped diagnostic line to stderr.
# Arguments: $@ - message words, joined with single spaces into one line
# Outputs:   "<date> <message>" on stderr; nothing on stdout
debug () {
    # One printf with quoted expansions keeps the message verbatim (no word
    # splitting, no glob expansion), avoids the non-portable "echo -n" and
    # emits the whole line in a single write.
    printf '%s %s\n' "$(date)" "$*" 1>&2
}

debug "starting"
# "$*" joins all arguments into one word; "$@" embedded in a longer string
# would split the message across several arguments.
debug "options = $*"

# ARC1 passes the config file first.
if [ "$1" = "--config" ]; then shift; ARC_CONFIG=$1; shift; fi
if [ -z "$1" ] ; then echo "Controldir argument missing" 1>&2 ; exit 1 ; fi

joboption_lrms="boinc"
lrms_options="boinc_app_id boinc_db_host boinc_db_port boinc_db_user boinc_db_pass boinc_db_name boinc_project_dir"

# define paths and config parser
# (expansions are quoted so an install path containing whitespace still works)
basedir=$(dirname "$0")
basedir=$(cd "$basedir" > /dev/null && pwd) || exit $?
. "${basedir}/lrms_common.sh"

# include common scan functions
. "${pkgdatadir}/scan_common.sh" || exit $?

# run common init
#  * parse config
#  * load LRMS-specific env
#  * set common variables
common_init

GMKICK=${pkglibexecdir}/gm-kick

# Print the numeric uid that owns the file named by $1.
# Nothing is printed when stat fails or when the owner is uid 0, because any
# false value is replaced by an empty string.
# Perl is used rather than the stat command for portability.
printuid () {
  /usr/bin/perl -we 'my $owner = (stat($ARGV[0]))[4]; print $owner if $owner' "$1"
}

#
# Runs a shell command, first trying to switch to another uid.
#
# Arguments: $1   - uid to switch to; the switch is attempted only when the
#                   real uid of the process is 0 and $1 is non-empty and
#                   non-zero
#            $2   - command string handed to "/bin/sh -c"
#            $3.. - extra arguments for that shell ($3 becomes $0, $4 is $1, ...)
# Outputs:   whatever the command prints; a diagnostic on stderr when the uid
#            switch did not take effect (the command is run regardless)
# Returns:   the exit status of the command, or 128+N when it was killed by
#            signal N; 1 when fewer than two arguments are given
#
do_as_uid () {
    test $# -ge 2 || { log "do_as_uid requires 2 arguments"; return 1; }

    script='use English;
            my ($uid, @args) = @ARGV;
            if ( $UID == 0 && $uid ) {
                eval { $UID = $uid };
                print STDERR "Cannot switch to uid($uid): $@\n" if $UID != $uid;
            }
            system("/bin/sh","-c",@args);
            # Test the signal bits first so that a successful command yields 0;
            # an "exit-status || signal" expression would turn success into 128.
            exit($? & 127 ? 128 + ($? & 127) : $? >> 8);
    '
    /usr/bin/perl -we "$script" "$@"
}

# Append .comment (containing STDOUT & STDERR of the job wrapper) to .errors
# Arguments: $1 - uid (not used by the current implementation; kept so that
#                 existing callers keep working)
#            $2 - path of the .comment file to copy from
#            $3 - path of the .errors file to append to
# A missing or unreadable comment file is skipped silently; the header and
# footer marker lines are written in any case.
save_commentfile () {
  local commentfile="$2"
  local errorsfile="$3"
  # The paths are quoted so that names containing whitespace work and so that
  # an empty $2 can never become a bare "cat" reading the caller's stdin.
  {
    echo '---------- Output of the job wrapper script -----------'
    cat "$commentfile" 2> /dev/null
    echo '------------------------- End of output -------------------------'
  } >> "$errorsfile"

  #do_as_uid "$uid" "$action"
}

# Scan every control directory given on the command line. For each one, ask
# the BOINC database for the workunits with assimilate_state=2, then, for every
# job whose status file says INLRMS or CANCELING and whose local job id is in
# that list, write job.<id>.lrms_done and kick the grid manager.
for control_dir in "$@" ; do

    if [ ! -d "${control_dir}" ]; then
        echo "No control dir $control_dir" 1>&2
        continue
    fi

    # Bash specific, but this script will be rewritten in python soon...
    # Start from an empty map so entries from an earlier control dir's query
    # are not carried over.
    unset finished_jobs
    declare -A finished_jobs
    appidclause=""
    if [ -n "$CONFIG_boinc_app_id" ]; then
        appidclause="and appid=$CONFIG_boinc_app_id"
    fi
    # -N drops the column-header row, which would otherwise be recorded as a
    # finished workunit called "name". Arguments are quoted so that values
    # containing whitespace or glob characters reach mysql intact.
    # NOTE(review): the password is passed on the command line and is thus
    # visible in the process list; consider a client option file instead.
    finished=$(mysql -N -h "$CONFIG_boinc_db_host" -P "$CONFIG_boinc_db_port" \
                     -u "$CONFIG_boinc_db_user" \
                     --password="$CONFIG_boinc_db_pass" \
                     "$CONFIG_boinc_db_name" \
                     -e "select name from workunit where assimilate_state=2 $appidclause") \
        || debug "mysql query for finished workunits failed"

    # Word splitting is intentional here: one workunit name per token.
    for wu in $finished; do
        finished_jobs[$wu]=1
    done

    # iterate over all jobs known in the control directory
    # (find -exec ... + copes with whitespace in paths and with an empty
    # result set; grep -E replaces the obsolescent egrep)
    find "${control_dir}/processing" -name 'job.*.status' \
        -exec grep -El 'INLRMS|CANCELING' {} + \
    | sed -e 's/.*job\.//' -e 's/\.status$//' \
    | while read -r job; do
        #debug "scanning job = $job"
        # Reset everything this iteration reads from the grami file so that
        # values of the previous job cannot leak into this one.
        unset joboption_jobid
        unset joboption_directory
        unset joboption_arg_code

        # this job was already completed, nothing remains to be done
        [ -f "${control_dir}/job.${job}.lrms_done" ] && continue

        # a grami file exists for all jobs that GM thinks are running.
        # proceed to next job if this file is missing.
        [ -f "${control_dir}/job.${job}.grami" ] || continue

        # load the joboption_* settings of this job from the grami file
        . "${control_dir}/job.${job}.grami"

        # local job id could not be learned, proceeding to next
        [ -z "$joboption_jobid" ] && continue

        #debug "local jobid = $joboption_jobid"

        # checking if the job is still running
        if [[ -z ${finished_jobs[$joboption_jobid]} ]]; then
            #debug "$joboption_jobid is still running, continuing to next"
            continue
        fi
        debug "$joboption_jobid is finished"

        uid=$(printuid "${control_dir}/job.${job}.local")
        debug "local user id = $uid"
        diagfile=${joboption_directory}.diag
        debug "checking $diagfile"
        # The path is handed over as a positional argument instead of being
        # spliced into the command string, so quotes in it cannot break out.
        exitcode=$(do_as_uid "$uid" 'cat "$1"' sh "$diagfile" | sed -n 's/^exitcode=\([0-9]*\).*/\1/p')
        debug "exitcode = [$exitcode] extracted from $diagfile"
        # NOTE(review): the value extracted above is unconditionally replaced
        # by 0, which makes the "died unexpectedly" branch below unreachable.
        # This looks deliberate for BOINC jobs - confirm before removing.
        exitcode=0

        comment=""
        if [ -z "$joboption_arg_code" ] ; then joboption_arg_code='0' ; fi
        if [ -z "$exitcode" ]; then
            echo "Job $job with PID $joboption_jobid died unexpectedly" 1>&2
            comment="Job died unexpectedly"
            exitcode=-1
        elif [ "$exitcode" -ne "$joboption_arg_code" ]; then
            comment="Job finished with wrong exit code - $exitcode != $joboption_arg_code"
        fi
        debug "got exitcode=$exitcode"
        save_commentfile "$uid" "${joboption_directory}.comment" "${control_dir}/job.${job}.errors"
        printf '%s\n' "$exitcode $comment" > "${control_dir}/job.${job}.lrms_done"
        "${GMKICK}" -j "${job}" "${control_dir}/job.${job}.local"
    done

done

# The scan pass is over: log it, pause for two minutes, then exit successfully.
# NOTE(review): presumably the caller starts the script again after it exits,
# so the pause sets the polling interval - confirm.
debug "done, going to sleep"

sleep $((2 * 60))
exit 0