#!/bin/bash
#
# Periodically check state of grid jobs in SLURM, and put mark files
# for finished jobs.
#
# usage: scan_slurm_job control_dir ...

# ARC1 passes first the config file.
if [ "$1" = "--config" ]; then shift; ARC_CONFIG=$1; shift; fi
# Validate control directories supplied on command-line
if [ -z "$1" ] ; then
    echo "no control_dir specified" 1>&2; exit 1
fi
for ctr_dir in "$@"; do
    if [ ! -d "$ctr_dir" ]; then
        echo "called with erronous control dir: $ctr_dir"
        exit 1
    fi
done

joboption_lrms="SLURM"
lrms_options="slurm_wakeupperiod slurm_use_sacct slurm_bin_path slurm_query_retries"

# define paths and config parser
basedir=`dirname $0`
basedir=`cd $basedir > /dev/null && pwd` || exit $?
. "${basedir}/lrms_common.sh"

# include common scan functions
. "${pkgdatadir}/scan_common.sh" || exit $?

# run common init 
#  * parse config
#  * load LRMS-specific env
#  * set common variables
common_init

# Prevent multiple instances of scan-slurm-job to run concurrently
lockfile="${TMPDIR:-/tmp}/scan-slurm-job.lock"

#Check if lockfile exist, if not, create it.
(set -C; : > "$lockfile") 2> /dev/null
if [ "$?" != "0" ]; then
    if ps -p $(< "$lockfile") 2>/dev/null;then
        echo "lockfile exists and PID $(< $lockfile) is running"
        exit 1
    fi
    echo "old lockfile found, was scan-slurm-job killed?"
    
    # sleep, and if no other have removed and recreated the lockfile we remove it.
    # there are still races possible, but this will have to do
    sleep $((${RANDOM}%30+10))
    if ps -p $(< $lockfile) &>/dev/null;then
        echo "lockfile exists and $(< $lockfile) is running"
        exit 1
    else
        echo "still not running, removing lockfile"
        rm $lockfile
        exit 1
    fi
fi
echo "$$" > "$lockfile"
#If killed, remove lockfile
trap 'rm $lockfile' EXIT KILL TERM
#Default sleep-time is 30 seconds
sleep ${CONFIG_slurm_wakeupperiod:-30}

# Log system performance
if [ ! -z "$perflogdir" ]; then
    perflog_common "$perflogdir" "$CONFIG_controldir"
fi

### use sacct
unset use_sacct

if [ ! -z "${CONFIG_slurm_use_sacct}" ]; then
    if [ "${CONFIG_slurm_use_sacct}" = "yes" ]; then
        use_sacct="true"
    fi
fi


### slurm_query_retries
unset slurm_query_retries

if [ ! -z "${CONFIG_slurm_query_retries}" ]; then
    slurm_query_retries=${CONFIG_slurm_query_retries}
fi


my_id=`id -u`

if [ ! -z "$perflogdir" ]; then
    #start time stamp
    start_ts=`date +%s.%N`
fi

# List of SLURM jobids for grid-jobs with state INLRMS
declare -a localids
# Array with basenames of grid-job files in ctrl_dir, indexed by localid
# example /some/path/job.XXXXX /some/other/parh/job.YYYYY
declare -a basenames
# Array with states of the jobs in SLURM, indexed by localid
declare -a jobstates
# Array to store localids of jobs that are determined to have finished, which are sent to gm-kick
declare -a kicklist
# Array with jobid blocks 
declare -a lidblocks

# Find list of grid jobs with status INLRMS, store localid and
# basename for those jobs
for ctr_dir in "$@"; do
    for basename in $(find "$ctr_dir/processing" -name 'job.*.status' -print0 \
        | xargs -0 egrep -l "INLRMS|CANCELING" \
        | sed 's/processing\/job\.\([^\.]*\)\.status$/job.\1/')
    do
        localid=$(grep ^localid= "${basename}.local" | cut -d= -f2)
        
        verify_jobid "$localid" || continue
        
        localids[${#localids[@]}]="$localid"
        basenames[$localid]="$basename"
    done
done

# No need to continue further if no jobs have status INLRMS
if [ ${#localids[@]} -eq 0 ]; then
    exit 0
fi


# Distribute localids into block so that we don't exceed max command line length
for jids in `echo "${localids[@]}" | xargs -n 4000 | tr ' ' ,`; do
    lidblocks[${#lidblocks[@]}]=$jids
done

if [ ! -z "$perflogdir" ]; then
    stop_ts=`date +%s.%N`
    #   t=`perl -e "printf '%.2f',$stop_ts-$start_ts;"`
    t=`awk "BEGIN { printf \"%.3f\", ${stop_ts}-${start_ts} }"`
    echo "[`date +%Y-%m-%d\ %T`] scan-slurm-job, ControldirTraversal: $t" >> $perflogfile
fi


if [ ! -z "$perflogdir" ]; then
    start_ts=`date +%s.%N`
fi

# Get JobStates from SLURM
jobstate_squeue=$(echo "${lidblocks[@]}" | xargs -n 1 $squeue -a -h -o "%i:%T" -t all -j )\
    || { echo "[$(date  +%Y-%m-%d\ %T)] squeue failed" 1>&2; exit 1; }

for record in $jobstate_squeue; do
    localid=$(echo "$record"|cut -d: -f1)
    state=$(echo "$record"|cut -d: -f2)
    jobstates[$localid]=$state;
done
unset jobstate_squeue

if [ ! -z "$perflogdir" ]; then
    stop_ts=`date +%s.%N`
    t=`awk "BEGIN { printf \"%.3f\", ${stop_ts}-${start_ts} }"`
    echo "[`date +%Y-%m-%d\ %T`] scan-slurm-job, squeue -a -h -o %i:%T -t all -j: $t" >> $perflogfile
fi

# A special version of interval_to_seconds for Slurm v20.02
# This function takes a time interval formatted as 789:12:34:56 (with days) or
# 12:34:56 (without days) and transforms it to seconds. It returns the result in
# the return_interval_seconds variable.
# Slurm format: [dd-][hh:][mm:][ss][.uuu].
# [.uuu] will always be removed.
# There can be years and months in front of the days, like [yy-][mm-]?
slurm_interval_to_seconds () {
    return_interval_seconds=0
    _interval_dhms=`echo $1 | sed -e 's|-|:|' -e 's|\.[0-9]\+||'`
    _interval_good=`echo $_interval_dhms | grep -E '[^:0-9]'`
    _interval_size=`echo $_interval_dhms | grep -o : | wc -l`
    if [ X"$_interval_good" = "X" ] ; then
        if [ "$_interval_size" -eq 0 ]; then
            return_interval_seconds=$_interval_dhms
        elif [ "$_interval_size" -eq 1 ]; then
            return_interval_seconds=`echo $_interval_dhms | tr : ' ' | awk '{print $1*60+$2;}'`
        elif [ "$_interval_size" -eq 2 ]; then
            return_interval_seconds=`echo $_interval_dhms | tr : ' ' | awk '{print $1*60*60+$2*60+$3;}'`
        elif [ "$_interval_size" -eq 3 ]; then
            return_interval_seconds=`echo $_interval_dhms | tr : ' ' | awk '{print $1*24*60*60+$2*60*60+$3*60+$4;}'`
        else
            echo "Bad formatting of time interval: $_interval_dhms" 1>&2
        fi
    else
        echo "Bad formatting of time interval: $_interval_dhms" 1>&2
    fi
    unset _interval_dhms _interval_size _interval_good
}

handle_commentfile () {
    localid=$1
    sessiondir=`grep -h '^sessiondir=' $jobfile | sed 's/^sessiondir=\(.*\)/\1/'`
    if [ "$my_id" != '0' ] ; then
      if [ ! -O "$jobfile" ] ; then continue ; fi
    fi
    uid=$(get_owner_uid "$jobfile")
    [ -z "$uid" ] && { log "Failed to stat $jobfile"; continue; }
    save_commentfile "$uid" "${sessiondir}.comment" "${basenames[$localid]}.errors"
}

# This function is called after a successfull call to handle_diag_file.
# It fetches the exitcode from SLURM and inserts the code into the job.$localid.lrms_done file
# The kicklist is updated to include the $localid of this job
#
# Input variables:
#  * localid
#  * tmpexitcode (hardcoded exitcode)
#  * reason (hardcoded reason)
#
# The following variables are initialized and updated, then written to .lrms_done
#  * exitcode (either hardcoded tmpexitcode or 
#  * reason
#
# In slurm the exitcode is returned as <num>:<num> where the first <num> is the exit code,
# and the second <num> is the signal number responsible for the job termination
#
function handle_exitcode {
    localid="$1"
    tmpexitcode="$2"
    reason="$3"
    exitcode_retries=$(( ${slurm_query_retries} + 1  ))
    
    while [ "$exitcode_retries" -gt 0 ]; do

        if [ "$use_sacct" ]; then 
            jobinfostring=$("$sacct" -j $localid -o ExitCode -P -n | head -n 1)
            exitcode1=$(echo $jobinfostring|awk -F':' '{print $1}')
            exitcode2=$(echo $jobinfostring|awk -F':' '{print $2}')
        else
            jobinfostring=$("$scontrol" -o show job $localid)
            exitcode1=$(echo $jobinfostring|sed -n 's/.*ExitCode=\([0-9]*\):\([0-9]*\).*/\1/p')
            exitcode2=$(echo $jobinfostring|sed -n 's/.*ExitCode=\([0-9]*\):\([0-9]*\).*/\2/p')
        fi
        
        if [ -z "$jobinfostring" ]; then
            exitcode_retries=$(( $exitcode_retries - 1 ))
            echo "scan-SLURM-job - [$(date  +%Y-%m-%d\ %T)] sacct/scontrol failed for job: $localid - could not fetch jobinfostring for exit code handling. Retries left: $exitcode_retries" 1>&2;
            jobinfo_exitcode_failed=1
        else
            ## all ok, break out of loop
            unset jobinfo_exitcode_failed
            break
        fi
    done

    ## If all retries failed hence jobinfo_exitcode_failed set, skip this step and try again at next scan
    if [ -z "$jobinfo_exitcode_failed" ]; then
        if [ -z "$exitcode1" ] && [ -z "$exitcode2" ] ; then
            exitcode=$tmpexitcode
        elif [ -n "$exitcode2" ] && [ "$exitcode2" -ne 0 ]; then
            exitcode=$(( $exitcode2 + 256 ))
        elif [ -n "$exitcode1" ] && [ "$exitcode1" -ne 0 ]; then
            exitcode=$exitcode1
        else
            exitcode=0
        fi

        # Special handling of cancelled jobs - as SLURM can return exitcode 0:0 for cancelled jobs
        if [ "$exitcode" -eq 0 ] && [ "${reason}" != "${reason/cancelled/}" ]; then
            exitcode=15
        fi

        echo "$exitcode $reason" > "${basenames[$localid]}.lrms_done"
        kicklist=(${kicklist[@]} $localid)
    fi
        
}


#
# Collects accounting info from LRMS for a job by using sacct or scontrol SLURM commands
# depending on ARC configuration. The job's LRMS id is stored in the "localid" variable.
# It first reads the jobs diag file. The job_read_diag function initializes the following variables:
#  * nodename
#  * WallTime
#  * UserTime
#  * KernelTime
#  * TotalMemory
#  * ResidentMemory
#  * LRMSStartTime
#  * LRMSEndTime
#  * exitcode
#
# Next, information from LRMS is fetched.
# If sacct is used, the following info is fetched:
#   * cpus (NCPUS)
#   * starttime (Start)
#   * endtime (End)
#   * usercputime (UserCPU)
#   * kernelcputime (SystemCPU)
#
# If scontrol is used instead of sacct, no usercputime is available, only walltime.
# The following info is fetched from scontrol:
#   * cpus
#   * starttime (Start)
#   * endtime (End)
#
# Once the values have been fetched, the diag file values updated are
#   * WallTime - in seconds
#   * Processors
#   * UserTime - in seconds
#   * KernelTime - in seconds 
# Note again that in the case where scontrol is used instead of sacct UserTime=WallTime
#
# If for some reason sacct or scontrol fails (the former due to e.g. the slurm database being overloade)
# a retry functionality is included. 3 retries is attempted for sacct/scontrol call. If still no 
# success, the handle_errorcode or handle_errorcode_cancelled is not called, avoiding the lrms_done mark.
# This results in the job being picked up in the next scan for a new attempt.
#
# The STDOUT and STDERR are redirected to the job-helper.errors file.
#
function handle_diag_file {
    localid="$1"
    ctr_diag="$2"

    handle_diag_tries=$(( ${slurm_query_retries} + 1  ))

    job_read_diag
    
    ## This while loop is an attempt to reduce the cases where the job info
    ## is not successfully fetched from slurm due to slurm connection/timeout issues
    while [ "$handle_diag_tries" -gt 0 ] ; do
        
        
        unset jobinfo_collect_failed
        
        if [ "$use_sacct" ]; then 
            jobinfostring=$("$sacct" -j $localid.batch -o NCPUS,Start,End,UserCPU,SystemCPU -P -n | tail -n 1) 
            
            cpus=$(echo "$jobinfostring" | awk -F'|' '{print $1}')
            starttime=$(echo "$jobinfostring"|awk -F'|' '{print $2}'| sed 's,\([0-9]\+/[0-9]\+\)-\([0-9:]\+\),\1 \2,g' | sed 's/T/ /g')
            endtime=$(echo "$jobinfostring"|awk -F'|' '{print $3}'| sed 's,\([0-9]\+/[0-9]\+\)-\([0-9:]\+\),\1 \2,g' | sed 's/T/ /g')
            
            # UserCPU,SystemCPU format is [dd-]hh:mm:ss[.uuu]
            usercputime=$(echo "$jobinfostring" | awk -F'|' '{print $4}')
            kernelcputime=$(echo "$jobinfostring" | awk -F'|' '{print $5}')
            [ -z "$usercputime" ] && usercputime="00:00:00"
            [ -z "$kernelcputime" ] && kernelcputime="00:00:00"
        else
            jobinfostring=$("$scontrol" -o show job $localid) 
            
            #Slurm can report StartTime and EndTime in at least these two formats:
            #2010-02-15T15:30:29
            #02/15-15:25:15
            #For our code to be able to manage both, the first needs to keep its hyphens,
            #the second needs them removed
            starttime=$(echo "$jobinfostring"|sed -n 's/.*StartTime=\([^ ]*\) .*/\1/p' | \
                            sed 's,\([0-9]\+/[0-9]\+\)-\([0-9:]\+\),\1 \2,g' | sed 's/T/ /g')
            endtime=$(echo "$jobinfostring"|sed -n 's/.*EndTime=\([^ ]*\) .*/\1/p' | \
                          sed 's,\([0-9]\+/[0-9]\+\)-\([0-9:]\+\),\1 \2,g' | sed 's/T/ /g')
            cpus=$(echo "$jobinfostring"|sed -n 's/.*NumCPUs=\([^ ]*\) .*/\1/p')
        fi
        
        if [ -z "$jobinfostring" ]; then
            jobinfo_collect_failed=1;
        fi

        
        ## Do not try again if cpus correctly filled with a number. Or if cpus filled with header of sacct, which means cpus=NCPUS. 
        ## The latter is handled already in handle_diag_file function
        ## If (not empty cpus variable, and cpus variable is a number) or cpus variable is NCPUS then we are done, so break out of the retry loop  
        if ( [   -n "$cpus" ]  &&  [ "$cpus" -eq "$cpus" ]  2>/dev/null ) ||   [ z"$cpus" = "zNCPUS" ] ; then break ; fi

        handle_diag_tries=$(( $handle_diag_tries - 1 ))
        if [ -n "$jobinfo_collect_failed"  ] ; then
            echo "scan-SLURM-job - [$(date  +%Y-%m-%d\ %T)] sacct/scontrol failed for job: $localid - could not fetch jobinfostring to update the diag file. Retries left: $handle_diag_tries." 1>&2;
        fi
        
        sleep 2
    done

    # if "sacct -j $localid.batch" return string "NCPUS|NNodes..." only, the job has no batch stage, it was killed before start on WN
    if [ ! z"$cpus" = "zNCPUS" ] && [ -z "$jobinfo_collect_failed" ]; then
        date_to_utc_seconds "$starttime"
        starttime_seconds="$return_date_seconds"
        seconds_to_mds_date "$return_date_seconds"
        LRMSStartTime=$return_mds_date
        date_to_utc_seconds "$endtime"
        endtime_seconds="$return_date_seconds"
        seconds_to_mds_date "$return_date_seconds"
        LRMSEndTime=$return_mds_date
        
        #TODO handle exitcode etc.
        walltime=$(( $endtime_seconds - $starttime_seconds))
        slurm_interval_to_seconds "$usercputime"
        cputime="$return_interval_seconds"
        slurm_interval_to_seconds "$kernelcputime"
        kernel="$return_interval_seconds"
        
        # Values to write to diag. These will override values already written.
        [ -n "$walltime" ] && WallTime=$walltime
        [ -n "$cpus" ] && Processors=$cpus
        [ -n "$cputime" ] && UserTime=$cputime
        [ -n "$kernel" ] && KernelTime=$kernel
        
        job_write_diag
    fi
}

if [ ! -z "$perflogdir" ]; then
   start_ts=`date +%s.%N`
fi

run=0
completed=0
zombie=0
failed=0
# Look at the list of jobstates and determine which jobs that have
# finished. Write job.XXXX.lrms_done according to this
for localid in ${localids[@]}; do
    # Initialize jobfile variable since it's used below
    jobfile="${basenames[$localid]}.local"
    case "${jobstates[$localid]}" in
        "")
            # Job is missing (no state) from slurm but INLRMS.
            zombie=$(($zombie + 1))
            exitcode=''
            # get session directory of this job
            sessiondir=`grep -h '^sessiondir=' $jobfile | sed 's/^sessiondir=\(.*\)/\1/'`
            diagfile="${sessiondir}.diag"
            commentfile="${sessiondir}.comment"
            if [ "$my_id" != '0' ] ; then
                if [ ! -O "$jobfile" ] ; then continue ; fi
            fi
            uid=$(get_owner_uid "$jobfile")
            [ -z "$uid" ] && { log "Failed to stat $jobfile"; continue; }

            if [ ! -z "$sessiondir" ] ; then
                # have chance to obtain exit code
                if [ -z "${RUNTIME_NODE_SEES_FRONTEND}" ] ; then
                    # In case of non-NFS setup it may take some time till
                    # diagnostics file is delivered. Wait for it max 2 minutes.
                    diag_tries=20
                    while [ "$diag_tries" -gt 0 ] ; do
                        if [ -z "$uid" ] ; then
                            exitcode=`grep '^exitcode=' "$diagfile" 2>/dev/null | sed 's/^exitcode=//'`
                        else
                            exitcode=$(do_as_uid "$uid" "grep '^exitcode=' '$diagfile'" | sed 's/^exitcode=//')
                        fi
                        if [ ! -z "$exitcode" ] ; then break ; fi
                        sleep 10
                        diag_tries=$(( $diag_tries - 1 ))
                    done
                else
                    if [ -z "$uid" ] ; then
                        exitcode=`grep '^exitcode=' "$diagfile" 2>/dev/null | sed 's/^exitcode=//'`
                    else
                        exitcode=$(do_as_uid "$uid" "grep '^exitcode=' '$diagfile'" | sed 's/^exitcode=//')
                    fi
                fi
            fi

            jobstatus="$exitcode Job missing from SLURM, exitcode recovered from session directory"
            if [ -z $exitcode ];then
                exitcode="-1"
                jobstatus="$exitcode Job missing from SLURM"
            fi
            
            save_commentfile "$uid" "$commentfile" "${basenames[$localid]}.errors"
            echo  "$jobstatus" > "${basenames[$localid]}.lrms_done"
            kicklist=(${kicklist[@]} $localid)

            ;;
            PENDING|RUNNING|SUSPENDE|COMPLETING)
                #Job is running, nothing to do.
                run=$(($run + 1))
                ;;
            CANCELLED)
                failed=$(($failed + 1))
                handle_commentfile $localid
                kicklist=(${kicklist[@]} $localid)
                handle_diag_file "$localid" "${basenames[$localid]}.diag"
                [ -z "$jobinfo_collect_failed" ] && handle_exitcode $localid "-1" "Job was cancelled" || echo "scan-SLURM-job - [$(date  +%Y-%m-%d\ %T)] Job:$localid CANCELLED, but jobinfo_collect_failed - not setting exit code, will try again in next scan" 1>&2;
                ;;
            COMPLETED)
                completed=$(($completed + 1))
                handle_commentfile $localid
                handle_diag_file "$localid" "${basenames[$localid]}.diag"
                [ -z "$jobinfo_collect_failed" ] && handle_exitcode $localid "0" "" || echo "scan-SLURM-job - [$(date  +%Y-%m-%d\ %T)] Job:$localid COMPLETED, but jobinfo_collect_failed - not setting exit code, will try again in next scan" 1>&2;
                ;;
            FAILED)
                failed=$(($failed + 1))
                handle_commentfile $localid
                handle_diag_file "$localid" "${basenames[$localid]}.diag"
                [ -z "$jobinfo_collect_failed" ] && handle_exitcode $localid "-1" "Job failed" || echo "scan-SLURM-job - [$(date  +%Y-%m-%d\ %T)] Job:$localid FAILED, but jobinfo_collect_failed - not setting exit code, will try again in next scan" 1>&2;
                ;;
            TIMEOUT)
                failed=$(($failed + 1))
                handle_commentfile $localid
                handle_diag_file "$localid" "${basenames[$localid]}.diag"
                [ -z "$jobinfo_collect_failed" ] && handle_exitcode $localid "-1" "Job timeout" || echo "scan-SLURM-job - [$(date  +%Y-%m-%d\ %T)] Job:$localid TIMEOUT, but jobinfo_collect_failed - not setting exit code, will try again in next scan" 1>&2;
                ;;
            NODE_FAIL)
                failed=$(($failed + 1))
                handle_commentfile $localid
                handle_diag_file "$localid" "${basenames[$localid]}.diag"
                [ -z "$jobinfo_collect_failed" ] && handle_exitcode $localid "-1" "Node fail" || echo "scan-SLURM-job - [$(date  +%Y-%m-%d\ %T)] Job:$localid NODE_FAIL, but jobinfo_collect_failed - not setting exit code, will try again in next scan" 1>&2;
                ;;
    esac
    unset jobinfo_collect_failed
done
    
if [ ! -z "$perflogdir" ]; then
    stop_ts=`date +%s.%N`
    t=`awk "BEGIN { printf \"%.3f\", ${stop_ts}-${start_ts} }"`
    echo "[`date +%Y-%m-%d\ %T`] scan-slurm-job, JobHandling, R= $run, D= $completed, Z= $zombie, F= $failed:  $t" >> $perflogfile
fi

# Kick the GM
if [ -n "${kicklist[*]}" ];then
    for localid in "${kicklist[@]}";do
        gridid=`echo "${basenames[$localid]}" | sed 's/.*\.\([^\.]*\)$/\1/'`
        "${pkglibexecdir}/gm-kick" -j "${gridid}" "${basenames[$localid]}.local"
    done
fi

exit 0