#!/bin/sh
# set -xv
#
# scan-sge-job does not use log-files, it only uses qacct.
#
# usage: scan_sge_job control_dir ...

# ARC1 passes first the config file.
if [ "$1" = "--config" ]; then shift; ARC_CONFIG=$1; shift; fi
if [ -z "$1" ] ; then exit 1 ; fi
# first control_dir is used for storing own files
control_dir=$1
control_dirs=
while [ $# -gt 0 ] ; do
  control_dirs="${control_dirs} $1"
  shift
done

joboption_lrms="sge"
lrms_options="sge_root sge_cell sge_qmaster_port sge_execd_port sge_bin_path sge_jobopts"
queue_options="sge_jobopts"

# define paths and config parser
basedir=`dirname $0`
basedir=`cd $basedir > /dev/null && pwd` || exit $?
. "${basedir}/lrms_common.sh"

# include common scan functions
. "${pkgdatadir}/scan_common.sh" || exit $?

# run common init 
#  * parse config
#  * load LRMS-specific env
#  * set common variables
common_init

# Assume that gm-kick and scan_common are installed in the same directory
GMKICK=${pkglibexecdir}/gm-kick

olog () { echo "[`date +%Y-%m-%d\ %T`] scan-sge-job: $*" 1>&2; }

umask 022

my_id=`id -u`

# Log system performance
if [ ! -z "$perflogdir" ]; then
   perflog_common "$perflogdir" "$CONFIG_controldir"
fi

# Takes diagfile_acct, SGE qacct command, id and extraargs
# writes output to diagfile_acct (temporary file for accounting info)
write_diagfile_acct () {
  diagfile_acct=$1
  qacct_command=$2
  id=$3
  extraargs=$4
  
  #Check that qacct is available.
  if ! `which $qacct_command >dev/null` ; then
    olog "WARNING: qacct was not found. Accounting data will not be recorded."
  fi

  $qacct_command -j $id $extraargs 2> /dev/null \
        | /usr/bin/perl -e 'while(<>) {
                     $nodename=$1         if /^hostname\s+(\S+)/;
                     $id=$1               if /^jobnumber\s+(\S+)/;
                     $exitcode=$1         if /^exit_status\s+(\S+)/;
                     $failed=$1           if /^failed\s+(.*\S)/;
                     $CPUTime=$1          if /^cpu\s+(\d+)/;
                     $Processors=$1       if /^slots\s+(\d+)/;
                     $KernelTime=$1       if /^ru_stime\s+(\d+)/;
                     $WallTime=$1         if /^ru_wallclock\s+(\d+)/;
                     $UsedMemory=$1       if /^maxvmem\s+(\S+)M/;
                     $UsedMemory=$1*1024  if /^maxvmem\s+(\S+)G/;
                     $start_time=`date -d "$1" +%s` if /^start_time\s+(.+)/;
                     $end_time  =`date -d "$1" +%s` if /^end_time\s+(.+)/;
                   }
                   # converts seconds since epoch in local time into Mds time format (UTC)
                   sub mds_date {
                       my ($seconds) = @_;
                       my @t = gmtime $seconds;
                       my ($Y,$M,$D,$h,$m,$s) = (1900+$t[5],1+$t[4],$t[3],$t[2],$t[1],$t[0]);
                       return sprintf "%04i%02i%02i%02i%02i%02iZ",$Y,$M,$D,$h,$m,$s;
                   }
                   END {
                     exit unless $id;
                     print "LRMSStartTime=".mds_date($1)."\n" if $start_time =~ m/^(\d+)$/;
                     print "LRMSEndTime="  .mds_date($1)."\n" if $end_time   =~ m/^(\d+)$/;
                     print "nodename=${nodename}\n";
                     print "CPUTime=${CPUTime}.0s\n";
                     print "Processors=${Processors}\n";
                     print "WallTime=${WallTime}.0s\n";
                     print "KernelTime=${KernelTime}.0s\n";
                     print "UserTime=".int($CPUTime-$KernelTime).".0s\n";
                     print "AverageTotalMemory=".int($UsedMemory*1024)."kB\n";
                     print "failed=$failed\n";
                     print "\nsgeexitcode=$exitcode\n";
                   }' \
  > "$diagfile_acct"

}

# Parse GRAMI-file and see what was requested
check_exceeded_resources_grami () {
  gramifile=$1
  errorsfile=$2
  used_cputime=$3
  used_walltime=$4
  used_memory=$5

  req_walltime=`sed -n "s/^joboption_walltime=//p" "$gramifile" | tail -n 1`
  req_cputime=`sed -n "s/^joboption_cputime=//p" "$gramifile" | tail -n 1`
  req_memory=`sed -n "s/^joboption_memory=//p" "$gramifile" | tail -n 1`
    
  if [ ! -z "$used_memory" ] && [ ! -z "$req_memory" ] \
    && [ "$req_memory" != "" ] && [ "$req_memory" -gt 0 ] \
    && [ $(( 100*$used_memory/1024/$req_memory )) -gt 95 ]; then
    overlimit="memory"
  fi
  if [ ! -z "$used_cputime" ] && [ ! -z "$req_cputime" ] \
    && [ "$req_cputime" != "" ] && [ "$req_cputime" -gt 0 ] \
    && [ $(( 100*$used_cputime/$req_cputime )) -gt 95 ]; then
    overlimit="cputime"
  fi
  if [ ! -z "$used_walltime" ] && [ ! -z "$req_walltime" ] \
    && [ "$req_walltime" != "" ] && [ "$req_walltime" -gt 0 ] \
    && [ $(( 100*$used_walltime/$req_walltime )) -gt 95 ]; then
    overlimit="walltime"
  fi
    
  echo ++++++++++++++++++++++++++   >> "$errorsfile"
  echo Resources:                   >> "$errorsfile"
  echo ++++++++++++++++++++++++++   >> "$errorsfile"
  echo req_memory=$req_memory MB    >> "$errorsfile"
  echo req_cputime=$req_cputime     >> "$errorsfile"
  echo req_walltime=$req_walltime   >> "$errorsfile"
  echo used_memory=$used_memory kB  >> "$errorsfile"
  echo used_cputime=$used_cputime   >> "$errorsfile"
  echo used_walltime=$used_walltime >> "$errorsfile"
  if [ ! -z "$overlimit" ]; then
    echo overlimit=$overlimit       >> "$errorsfile"
  fi
  echo ++++++++++++++++++++++++++   >> "$errorsfile"
}

#Handle failedcode
handle_failedcode () {
  failedcode=$1
  donefile=$2
  exitcode=$3
  sgeexitcode=$4
  overlimit=$5

  if [ -z "$failedcode" ]; then
    # Should never happen
    olog "SGE job $id failed: SGE accouting record is incomplete"
    echo "-1 SGE accouting record is incomplete" > "$donefile"
	
  elif [ "$failedcode" = "0" ]; then
	
    if [ -z "$exitcode" ]; then
      olog "SGE job $id failed with unknown exit code"
      if [ -z "$sgeexitcode" ] || [ "$sgeexitcode" = 0 ]; then sgeexitcode="-1"; fi
      echo "$sgeexitcode Job failed with unknown exit code" > "$donefile"
      
    elif [ "$exitcode" = "0" ]; then
      #olog "SGE job $id finished successfully"
      echo "0" > "$donefile"
	    
    else
      #olog "SGE job $id failed with exit code $exitcode"
      if [ -z "$sgeexitcode" ] || [ "$sgeexitcode" = 0 ]; then sgeexitcode="-1"; fi
      echo "$sgeexitcode Job failed with exit code $exitcode" > "$donefile"
    fi

  else  # SGE reports a problem
	
    if [ "$failedcode" = "25" ]; then
      failedreason="SGE error $failedcode: Job will be rescheduled"
    elif [ "$failedcode" = "24" ]; then
      failedreason="SGE error $failedcode: Job will be migrated"
    elif [ "$failedcode" = "100" ]; then
      # This happens when SGE signals the job, as in the case when a
      # resource limit is exceeded.  We don't know for sure whether
      # they were enforced or not but if a job is killed by SGE, this
      # might be the likely cause.
      if [ -z "$overlimit" ]; then
        failedreason="SGE error $failedreason"
      elif [ $overlimit = "memory" ]; then
        failedreason="job killed: vmem"
      elif [ $overlimit = "cputime" ]; then
        failedreason="job killed: cput"
      elif [ $overlimit = "walltime" ]; then
        failedreason="job killed: wall"
      fi
    else
      failedreason="SGE error $failedreason"
    fi

    if [ ! -z "$eqwmessage" ]; then
       failedreason="$eqwmessage"
    fi
	
    olog "SGE job $id failed: $failedreason"
    if [ -z "$sgeexitcode" ] || [ "$sgeexitcode" = 0 ]; then sgeexitcode="-1"; fi
    echo "271 $failedreason" > "$donefile"

  fi # failedcode
}

# Add accounting info to $diagfile
add_accounting_to_diag () {
  diagfile=$1 
  diagfile_acct=$2
  diagfile_tmp=$3
  errorsfile=$4
  uid=$5
  commentfile=$6
  ctr_dir=$7
  
  gramifile="${ctr_dir}/job.${gridid}.grami"
  donefile="${ctr_dir}/job.${gridid}.lrms_done"
  countfile="${ctr_dir}/job.${gridid}.lrms_job"
  errorsfile="${ctr_dir}/job.${gridid}.errors"
  localfile="${ctr_dir}/job.${gridid}.local" 
  
  used_walltime=`sed -n 's/^WallTime=\(.*\).0s/\1/p' "$diagfile_acct" | tail -n 1`
  used_cputime=`sed -n 's/^CPUTime=\(.*\).0s/\1/p' "$diagfile_acct" | tail -n 1`
  used_memory=`sed -n 's/^AverageTotalMemory=\(.*\)kB/\1/p' "$diagfile_acct" | tail -n 1`
  sgeexitcode=`sed -n 's/^sgeexitcode=//p' "$diagfile_acct" | tail -n 1`
  failedreason=`sed -n 's/^failed=//p' "$diagfile_acct" | tail -n 1`
  failedcode=`echo $failedreason | awk '{print $1}'`
    
  if [ -s "$diagfile_acct" ]; then

    # Remove attributes from existing diagfile that we should have
    # gotten from qacct this time, otherwise we will get duplicates
    do_as_uid "$uid" "cat '$diagfile'" \
      | grep -v "^nodename=" \
      | grep -v "^WallTime=" \
      | grep -v "^KernelTime=" \
      | grep -v "^UserTime=" \
      | grep -v "^CPUTime=" \
      | grep -v "^Processors=" \
      | grep -v "^LRMSStartTime=" \
      | grep -v "^LRMSEndTime=" \
      | grep -v "^MaxResidentMemory=" \
      | grep -v "^AverageTotalMemory=" \
      > "$diagfile_tmp"

    cat "$diagfile_tmp" "$diagfile_acct" \
      | grep -v '^sgeexitcode=' \
      | do_as_uid "$uid" "cat > '$diagfile'"
	
    # Check for exceeded resources limits
    overlimit=
    if [ -s "$gramifile" ]; then
      check_exceeded_resources_grami "$gramifile" "$errorsfile" "$used_cputime" "$used_walltime" "$used_memory"
    fi # grami file
	
    save_commentfile "$uid" "$commentfile" "$errorsfile"
	
    handle_failedcode "$failedcode" "$donefile" "$exitcode" "$sgeexitcode" "$overlimit"
	
    # wake up GM
    $GMKICK -j "${gridid}" "$localfile" >> "$errorsfile"

    rm -f "$countfile"
    rm -f "$diagfile_tmp" "$diagfile_acct"

    # we're done, go to next job id
    return 0

  fi # accounting info ok

  rm -f "$diagfile_tmp" "$diagfile_acct"
  return 1
}

# Handle missing accounting info
handle_missing_accounting () {
  countfile=$1
  uid=$2
  commentfile=$3
  errorsfile=$4
  donefile=$5
  exitcode=$6
  localfile=$7
  GMKICK=$8
  SGE_BIN_PATH=$9
  id=$10

  if [ -n "$noaccounting" ]; then

    # Accounting file is not accessible on this host.
    echo "scan-sge-job: WARNING: SGE's accounting file is not accessible on the Grid frontend node" >> "$errorsfile"
    echo "scan-sge-job: WARNING: Resource usage reported for this job may be inaccurate or incomplete" >> "$errorsfile"

    if [ -z "$exitcode" ]; then
      echo "-1 Job failed with unknown exit code" > "$donefile"
    elif [ "$exitcode" = "0" ]; then
      echo "0" > "$donefile" 
    else
      echo "$exitcode Job failed with exit code $exitcode" > "$donefile"
    fi

    $GMKICK -j "${gridid}" "$localfile" >> "$errorsfile"

    return
  fi

  # There is a certain lag between the end of the job
  # and the time when accouting information becomes available.
  # We do 5 retries, keeping the count in $countfile

  counter=0
  if [ -f "$countfile" ] ; then
    counter=`cat "$countfile"`
    counter=$(( $counter + 1 ))
  fi

  if [ "$counter" -gt 5 ]; then
    # Cannot wait more for accounting info.
    echo "scan-sge-job: WARNING: No SGE accounting record found for this job" >> "$errorsfile"
    echo "scan-sge-job: WARNING: Resource usage reported for this job may be inaccurate or incomplete" >> "$errorsfile"

    save_commentfile "$uid" "$commentfile" "$errorsfile"

    if [ -z "$exitcode" ]; then
      olog "No SGE accounting record found for job $id. No exitcode in diag file"
      echo "-1 Job failed with unknown exit code" > "$donefile"
    elif [ "$exitcode" = "0" ]; then
      olog "No SGE accounting record found for job $id. exitcode=$exitcode in diag file"
      echo "0" > "$donefile"
    else
      olog "No SGE accounting record found for job $id. exitcode=$exitcode in diag file"
      echo "$exitcode Job failed with exit code $exitcode" > "$donefile"
    fi
    rm -f "$countfile"

    # wake up GM
    $GMKICK -j "${gridid}" "$localfile" >> "$errorsfile"

  else
    # test again for job existence, only count if not known
    ${SGE_BIN_PATH}/qstat -j $id > /dev/null 2>&1
    if [ $? -ne 0 ]; then
      echo "$counter" > "$countfile"
    else
      olog "SGE job $id disappeared and then reappeared!"
      rm -f "$countfile"
    fi
  fi
}

# GD: no attempt to look for SGE Manager logfiles, restrict to job logs.

if [ ! -z "$perflogdir" ]; then
   start_ts=`date +%s.%N`
fi

# Get all jobs
pids=`${SGE_BIN_PATH}/qstat -u '*' 2>/dev/null | sed -n 's/^ *\([0-9][0-9]*\) .*/\1/p'`
if [ $? != 0 ]; then
  olog "Failed running ${SGE_BIN_PATH}/qstat"
  sleep 60
  exit 1
fi

if [ ! -z "$perflogdir" ]; then
   stop_ts=`date +%s.%N`
   t=`awk "BEGIN { printf \"%.3f\", ${stop_ts}-${start_ts} }"`
   echo "[`date +%Y-%m-%d\ %T`] scan-sge-job, qstat -u '*': $t" >> $perflogfile
fi

# Go through directories
for ctr_dir in $control_dirs ; do
  # Obtain ids of pending/running jobs stored in job.*.local
  if [ ! -z "$perflogdir" ]; then
     start_ts=`date +%s.%N`
  fi
  rjobs=`find "$ctr_dir/processing" -name 'job.*.status' 2>/dev/null \
         | sed 's/processing\/job\.\([^\.]*\)\.status$/job.\1.local/'`
  if [ ! -z "$perflogdir" ]; then
     stop_ts=`date +%s.%N`
     t=`awk "BEGIN { printf \"%.3f\", ${stop_ts}-${start_ts} }"`
     echo "[`date +%Y-%m-%d\ %T`] scan-sge-job, control dir traversal: $t" >> $perflogfile
  fi

  if [ -z "$rjobs" ] ; then continue ; fi
  ids=`echo $rjobs | xargs grep -h '^localid=' 2>/dev/null | sed 's/^localid=\([^ ]*\)/\1/'`
  if [ -z "$ids" ] ; then continue ; fi
  # compare them to running jobs and find missing
  bids=
  for id in $ids ; do
    found=`echo "$pids" | grep "^$id"`
    if [ -z "$found" ] ; then
      bids="$bids $id"
    fi
  done
  # go through missing ids
  for id in $bids ; do
    # find grid job corresponding to current local id
    jobfile=`find "$ctr_dir/processing" -name 'job.*.status' 2>/dev/null \
             | sed 's/processing\/job\.\([^\.]*\)\.status$/job.\1.local/' \
             | xargs grep -l "localid=$id\$" 2>/dev/null`
    if [ -z "$jobfile" ] ; then continue ; fi
    # unless running as superuser, skip jobs not belonging to the current user
    if [ "$my_id" != '0' ] ; then
      if [ ! -O "$jobfile" ] ; then continue ; fi
    fi
    # find who runs this job
    uid=$(get_owner_uid "$jobfile")
    [ -z "$uid" ] && { log "Failed to stat $jobfile"; continue; }
    # extract grid id
    gridid=`basename "$jobfile" '.local' | sed 's/^job\.//'`
    donefile="${ctr_dir}/job.${gridid}.lrms_done"
    countfile="${ctr_dir}/job.${gridid}.lrms_job"
    failedfile="${ctr_dir}/job.${gridid}.failed"
    errorsfile="${ctr_dir}/job.${gridid}.errors"
    localfile="${ctr_dir}/job.${gridid}.local"
    xmlfile="${ctr_dir}/job.${gridid}.xml"
    if [ -f "$donefile" ] ; then continue ; fi
    statusfile="${ctr_dir}/processing/job.${gridid}.status"
    if [ ! -f "$statusfile" ] ; then continue ; fi
    status=`cat "$statusfile"`
    if [ "$status" != "INLRMS" ] && [ "$status" != "CANCELING" ] ; then continue ; fi
    # get session directory of this job
    session=`grep -h '^sessiondir=' "$jobfile" | sed 's/^sessiondir=\(.*\)/\1/'`
    commentfile="${session}.comment"
    commentfile2="${session}/.comment"
    do_as_uid "$uid" "cat '$commentfile2' >> '$commentfile' 2> /dev/null; rm -f '$commentfile2'";
    if [  -d "$session" ] ; then
      diagfile="${session}.diag"
      diagfile2="${session}/.diag"
      do_as_uid "$uid" "cat '$diagfile2' >> '$diagfile' 2> /dev/null; rm -f '$diagfile2'";
      # try to obtain the exit code
      # $diagfile is owned by the user running the job. Switch user to access it.
      exitcode=`do_as_uid "$uid" "sed -n 's/^exitcode=//p' '$diagfile'" | tail -n 1`

      diagfile_tmp=`mktemp "$TMPDIR/diag_tmp.XXXXXX"` || { sleep 60; exit 1; }
      diagfile_acct=`mktemp "$TMPDIR/diag_acct.XXXXXX"` || { sleep 60; exit 1; }
      noaccounting=

      # qacct can take quite long. Here is a workaround.
      # Find the accounting file, and copy the last 50000
      # records to a temp file.
      acctfile=$SGE_ROOT/$SGE_CELL/common/accounting
      if [ -f  "$acctfile" ]; then
        briefacct=`mktemp "$TMPDIR/accounting.XXXXXX"` || { sleep 60; exit 1; }
        tail -n 50000 "$acctfile" > "$briefacct"
        if [ $? = 0 ]; then extraargs="-f $briefacct"; fi
      else
        # Accounting file is not accessible on this host.
        noaccounting=1
      fi

      eqwmessage=`grep "SGE job state was Eqw" $xmlfile | sed -r 's/<[/]?OtherMessages>//g'`

      # get accounting info. write diag file
      write_diagfile_acct "$diagfile_acct" "${SGE_BIN_PATH}/qacct" "$id" "$extraargs"

      if [ "x$briefacct" != "x" ]; then rm -f "$briefacct"; fi

      # If the last qacct record is about migration,
      # we should wait for the next qacct record to appear
      # Delete file, like there was no accounting present at all!
      if grep -q "^failed=24 *: migrating"    "$diagfile_acct" \
      || grep -q "^failed=25 *: rescheduling" "$diagfile_acct"; then
          rm -f "$diagfile_acct"
          olog "SGE job $id: the last record in qacct is about migration. Waiting for next record to appear"
      fi

      # Add accounting info to $diagfile
      add_accounting_to_diag "$diagfile" "$diagfile_acct" "$diagfile_tmp" "$errorsfile" "$uid" "$commentfile" "$ctr_dir"
      if [ $? = 0 ]; then
	  continue
      fi
    fi # session directory exists

    # This section is only reached when accounting info is not present

    handle_missing_accounting "$countfile" "$uid" "$commentfile" "$errorsfile" "$donefile" "$exitcode" "$localfile" "$GMKICK" "$SGE_BIN_PATH" "$id"

  done # loop over bids

  # Detect the unlikely situation when a job reappears in the qstat listing
  # after being absent in the previous run of the scan-*-job (which updated
  # the counter file)
  for countfile in `find $ctr_dir -name 'job.*.lrms_job'`; do
    localfile=${countfile%.lrms_job}.local
    pid=`sed -n 's/^localid=\([^ ]*\)/\1/p' "$localfile" 2>/dev/null`
    if [ -z "$pid" ]; then continue; fi
    if echo "$pids" | grep "^$pid\$" >/dev/null; then
      olog "SGE job $id disappeared and then reappeared!"
      rm -f "$countfile"
    fi
  done

done # loop over control_dirs
sleep 60
exit 0