#!/bin/sh
# set -x
#
#  Based on globus submission script for pbs
#
#  Submits job to SLURM.
#  Input: path to grami file (same as Globus).
#
# The temporary job script is created for the submission and then removed 
# at the end of this script. 

echo "----- starting submit_slurm_job -----" 1>&2
joboption_lrms="SLURM"
lrms_options="slurm_requirements slurm_wakeupperiod slurm_use_sacct slurm_bin_path"
queue_options="slurm_requirements"
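# lrms_options/queue_options list the arc.conf options that the config
# parser (presumably invoked via common_init below) makes available to
# this script as CONFIG_* variables.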

# ARC1 passes the config file first.
if [ "$1" = "--config" ]; then shift; ARC_CONFIG=$1; shift; fi
GRAMI_FILE=$1

# define paths and config parser
basedir=`dirname $0`
basedir=`cd $basedir > /dev/null && pwd` || exit $?
. "${basedir}/lrms_common.sh"

# include common submit functions
. "${pkgdatadir}/submit_common.sh" || exit $?

# run common init 
#  * parse grami
#  * parse config
#  * load LRMS-specific env
#  * set common variables
common_init

# perflog
perflogfilesub="${perflogdir}/submission.perflog"

if [ ! -z "$perflogdir" ]; then
   start_ts=`date +%s.%N`
fi

# check that remote or local scratch is configured
check_any_scratch

##############################################################
# Zero stage of runtime environments
##############################################################
RTE_stage0

##############################################################
# create job script
##############################################################
mktempscript
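# mktempscript (from submit_common.sh) is expected to create the temporary
# files $LRMS_JOB_SCRIPT, $LRMS_JOB_OUT and $LRMS_JOB_ERR used below; they
# are removed again before this script exits.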

is_cluster=true
##############################################################
# Start job script
##############################################################
echo "#!/bin/bash -l" > $LRMS_JOB_SCRIPT
echo "# SLURM batch job script built by arex" >> $LRMS_JOB_SCRIPT

# rerun is handled by the GM; do not let SLURM requeue jobs itself.
echo "#SBATCH --no-requeue" >> $LRMS_JOB_SCRIPT

# no environment exporting from ARC CE to WN
echo "#SBATCH --export=NONE" >> $LRMS_JOB_SCRIPT

# write SLURM output to 'comment' file
echo "#SBATCH -e ${joboption_directory}.comment">> $LRMS_JOB_SCRIPT
echo "#SBATCH -o ${joboption_directory}.comment">> $LRMS_JOB_SCRIPT
echo "" >> $LRMS_JOB_SCRIPT
# choose queue
if [ ! -z "${joboption_queue}" ] ; then
  echo "#SBATCH -p $joboption_queue" >> $LRMS_JOB_SCRIPT
fi

##############################################################
# priority
##############################################################
if [ ! -z "$joboption_priority" ]; then
  #SLURM nice values range from -10000 to 10000; lower is better.
  #The default is 0, and only superusers can assign values
  #less than 0.
  #We set the nice value to 100 - ARC priority.
  #This has the desired effect for all grid jobs.
  #Local jobs will unfortunately get a default priority equivalent
  #to ARC priority 100, but there is no way around that.
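  #For example, an ARC priority of 90 becomes --nice=10, and an ARC
  #priority of 50 becomes --nice=50.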
  priority=$((100-joboption_priority))
  echo "#SBATCH --nice=${priority}" >> $LRMS_JOB_SCRIPT
else
  #If priority is not set, use 50 to match
  #the default in the documentation.
  priority=50
  echo "#SBATCH --nice=${priority}" >> $LRMS_JOB_SCRIPT
fi

# project name for accounting
if [ ! -z "${joboption_rsl_project}" ] ; then
  echo "#SBATCH -A $joboption_rsl_project" >> $LRMS_JOB_SCRIPT
fi
# job name for convenience
if [ ! -z "${joboption_jobname}" ] ; then
  #TODO is this necessary? do parts of the infosys need these limitations?
  # Sanitize the job name: prefix a leading non-letter with 'N', replace
  # non-alphanumeric characters with '_' and truncate to 15 characters.
  jobname=`echo "$joboption_jobname" | \
           sed 's/^\([^[:alpha:]]\)/N\1/' | \
           sed 's/[^[:alnum:]]/_/g' | \
           sed 's/\(...............\).*/\1/'`
  echo "#SBATCH -J '$jobname'" >> $LRMS_JOB_SCRIPT
else
  jobname="gridjob"
  echo "#SBATCH -J '$jobname'" >> $LRMS_JOB_SCRIPT
fi
echo "SLURM jobname: $jobname" 1>&2
# Set up the user's environment on the compute node where the script
# is executed.
echo "#SBATCH --get-user-env=10L" >> $LRMS_JOB_SCRIPT

##############################################################
# (non-)parallel jobs
##############################################################

set_count
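# set_count (from submit_common.sh) is expected to have normalized
# joboption_count and joboption_countpernode for use below.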

nodes_string="#SBATCH -n ${joboption_count}"
echo "$nodes_string" >> $LRMS_JOB_SCRIPT

if [ ! -z "$joboption_countpernode" ] && [ "$joboption_countpernode" -gt 0 ] ; then
  echo "#SBATCH --ntasks-per-node $joboption_countpernode" >> $LRMS_JOB_SCRIPT
fi


nodes_string="#SBATCH "
i=0
eval "var_is_set=\${joboption_nodeproperty_$i+yes}"
while [ ! -z "${var_is_set}" ] ; do
  eval "var_value=\${joboption_nodeproperty_$i}"
  nodes_string="${nodes_string} ${var_value}"
  i=$(( $i + 1 ))
  eval "var_is_set=\${joboption_nodeproperty_$i+yes}"
done
echo "$nodes_string" >> $LRMS_JOB_SCRIPT


if [ "$joboption_exclusivenode" = "true" ]; then
  echo "#SBATCH --exclusive" >> $LRMS_JOB_SCRIPT
fi


##############################################################
# Execution times (minutes)
##############################################################
if [ ! -z "$joboption_cputime" ] ; then
  if [ "$joboption_cputime" -lt 0 ] ; then
    echo "WARNING: Negative CPU time requested: $joboption_cputime" 1>&2
    joboption_cputime=0
    echo "WARNING: CPU time set to 0" 1>&2
  fi
  # this is actually walltime deduced from cputime !
  maxcputime=$(( $joboption_cputime / $joboption_count ))
  cputime_min=$(( $maxcputime / 60 ))
  cputime_sec=$(( $maxcputime - $cputime_min * 60 ))
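  # sbatch -t accepts minutes:seconds (among other formats)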
  echo "#SBATCH -t ${cputime_min}:${cputime_sec}" >> $LRMS_JOB_SCRIPT
fi  
  
if [ -z "$joboption_walltime" ] ; then
  if [ ! -z "$joboption_cputime" ] ; then
    # Set walltime for backward compatibility or incomplete requests
    joboption_walltime=$(( $maxcputime * $walltime_ratio ))
  fi
fi

if [ ! -z "$joboption_walltime" ] ; then
  if [ "$joboption_walltime" -lt 0 ] ; then
    echo "WARNING: Negative wall time requested: $joboption_walltime" 1>&2
    joboption_walltime=0
    echo "WARNING: wall time set to 0" 1>&2
  fi
  maxwalltime="$joboption_walltime"
  walltime_min=$(( $maxwalltime / 60 ))
  walltime_sec=$(( $maxwalltime - $walltime_min * 60 ))
  echo "#SBATCH -t ${walltime_min}:${walltime_sec}" >> $LRMS_JOB_SCRIPT
fi

##############################################################
# Requested memory (mb)
##############################################################

set_req_mem

if [ ! -z "$joboption_memory" ] ; then
  echo "#SBATCH --mem-per-cpu=${joboption_memory}" >> $LRMS_JOB_SCRIPT
fi

##############################################################
# Additional SLURM requirements
##############################################################

requirements_string=""
i=0
eval "var_is_set=\${CONFIG_slurm_requirements_$i+yes}"
while [ ! -z "${var_is_set}" ] ; do
  eval "var_value=\${CONFIG_slurm_requirements_$i}"
  requirements_string="${requirements_string} ${var_value}"
  i=$(( $i + 1 ))
  eval "var_is_set=\${CONFIG_slurm_requirements_$i+yes}"
done

if [ ! -z "$requirements_string" ]; then
  echo "#SBATCH $requirements_string" >> $LRMS_JOB_SCRIPT
fi

if [ ! -z "$CONFIG_slurm_requirements" ]  && [  "x$CONFIG_slurm_requirements" != "x__array__" ] ; then
  echo "#SBATCH $CONFIG_slurm_requirements" >> $LRMS_JOB_SCRIPT
fi
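
# For illustration only (hypothetical values), the preamble assembled above
# ends up looking roughly like:
#   #!/bin/bash -l
#   #SBATCH --no-requeue
#   #SBATCH --export=NONE
#   #SBATCH -e <session_dir>.comment
#   #SBATCH -o <session_dir>.comment
#   #SBATCH -p gridlong
#   #SBATCH --nice=50
#   #SBATCH -J 'gridjob'
#   #SBATCH --get-user-env=10L
#   #SBATCH -n 1
#   #SBATCH -t 60:0
#   #SBATCH --mem-per-cpu=2000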


echo "" >> $LRMS_JOB_SCRIPT
echo "# Overide umask of execution node (sometime values are really strange)" >> $LRMS_JOB_SCRIPT
echo "umask 077" >> $LRMS_JOB_SCRIPT
echo " " >> $LRMS_JOB_SCRIPT

sourcewithargs_jobscript
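# sourcewithargs_jobscript (from submit_common.sh) is expected to append the
# sourcewithargs() helper used when sourcing RTE scripts with arguments.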

##############################################################
# Init accounting
##############################################################
accounting_init

##############################################################
# Add environment variables
##############################################################
add_user_env

##############################################################
# Check for existence of the executable.
# There is no point in checking for the executable if files are
# downloaded directly to the compute node.
##############################################################
if [ -z "${joboption_arg_0}" ] ; then
  echo 'Executable is not specified' 1>&2
  rm -f "$LRMS_JOB_SCRIPT" "$LRMS_JOB_OUT" "$LRMS_JOB_ERR"
  echo "Submission: Job description error."
  exit 1
fi

######################################################################
# Adjust working directory for tweaky nodes
# RUNTIME_GRIDAREA_DIR should be defined by external means on nodes
######################################################################
if [ ! -z "${RUNTIME_NODE_SEES_FRONTEND}" ] ; then
  setup_runtime_env
else
  echo "RUNTIME_JOB_DIR=$RUNTIME_LOCAL_SCRATCH_DIR/$joboption_gridid" >> $LRMS_JOB_SCRIPT
  echo "RUNTIME_JOB_DIAG=$RUNTIME_LOCAL_SCRATCH_DIR/${joboption_gridid}.diag" >> $LRMS_JOB_SCRIPT
  RUNTIME_STDIN_REL=`echo "${joboption_stdin}" | sed "s#^${joboption_directory}/*##"`
  RUNTIME_STDOUT_REL=`echo "${joboption_stdout}" | sed "s#^${joboption_directory}/*##"`
  RUNTIME_STDERR_REL=`echo "${joboption_stderr}" | sed "s#^${joboption_directory}/*##"`
  if [ "$RUNTIME_STDIN_REL" = "${joboption_stdin}" ] ; then
    echo "RUNTIME_JOB_STDIN=\"${joboption_stdin}\"" >> $LRMS_JOB_SCRIPT
  else
    echo "RUNTIME_JOB_STDIN=\"$RUNTIME_LOCAL_SCRATCH_DIR/$joboption_gridid/$RUNTIME_STDIN_REL\"" >> $LRMS_JOB_SCRIPT
  fi
  if [ "$RUNTIME_STDOUT_REL" = "${joboption_stdout}" ] ; then
    echo "RUNTIME_JOB_STDOUT=\"${joboption_stdout}\"" >> $LRMS_JOB_SCRIPT
  else
    echo "RUNTIME_JOB_STDOUT=\"$RUNTIME_LOCAL_SCRATCH_DIR/$joboption_gridid/$RUNTIME_STDOUT_REL\"" >> $LRMS_JOB_SCRIPT
  fi
  if [ "$RUNTIME_STDERR_REL" = "${joboption_stderr}" ] ; then
    echo "RUNTIME_JOB_STDERR=\"${joboption_stderr}\"" >> $LRMS_JOB_SCRIPT
  else
    echo "RUNTIME_JOB_STDERR=\"$RUNTIME_LOCAL_SCRATCH_DIR/$joboption_gridid/$RUNTIME_STDERR_REL\"" >> $LRMS_JOB_SCRIPT
  fi
fi

##############################################################
# Add std... to job arguments
##############################################################
include_std_streams

##############################################################
#  Move files to local working directory (job is done on node only)
#  RUNTIME_JOB_DIR -> RUNTIME_LOCAL_SCRATCH_DIR/job_id
##############################################################
move_files_to_node

echo "" >> $LRMS_JOB_SCRIPT
echo "RESULT=0" >> $LRMS_JOB_SCRIPT
echo "" >> $LRMS_JOB_SCRIPT



##############################################################
#  Skip execution if something already failed
##############################################################
echo "if [ \"\$RESULT\" = '0' ] ; then" >> $LRMS_JOB_SCRIPT

##############################################################
#  Runtime configuration at computing node
##############################################################
RTE_stage1

##############################################################
#  Diagnostics
##############################################################
echo "echo \"runtimeenvironments=\$runtimeenvironments\" >> \"\$RUNTIME_JOB_DIAG\"" >> $LRMS_JOB_SCRIPT
cat >> $LRMS_JOB_SCRIPT <<'EOSCR'
if [ ! "X$SLURM_NODEFILE" = 'X' ] ; then
  if [ -r "$SLURM_NODEFILE" ] ; then
    cat "$SLURM_NODEFILE" | sed 's/\(.*\)/nodename=\1/' >> "$RUNTIME_JOB_DIAG"
    NODENAME_WRITTEN="1"
  else
    SLURM_NODEFILE=
  fi
fi
EOSCR
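# The heredoc above is quoted, so it is copied verbatim into the job script:
# $SLURM_NODEFILE and $RUNTIME_JOB_DIAG are expanded on the worker node at
# run time, turning each line of the node file into a 'nodename=' entry in
# the .diag file.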

##############################################################
# Accounting (WN OS Detection)
##############################################################
detect_wn_systemsoftware

##############################################################
#  Check intermediate result again
##############################################################
echo "if [ \"\$RESULT\" = '0' ] ; then" >> $LRMS_JOB_SCRIPT

##############################################################
#  Execution
##############################################################
cd_and_run

##############################################################
#  End of RESULT checks
##############################################################
echo "fi" >> $LRMS_JOB_SCRIPT
echo "fi" >> $LRMS_JOB_SCRIPT

##############################################################
#  Runtime (post)configuration at computing node
##############################################################
RTE_stage2

#####################################################
#  Upload output files
####################################################
clean_local_scratch_dir_output

##############################################################
#  Move files back to session directory (job is done on node only)
#  RUNTIME_LOCAL_SCRATCH_DIR/job_id -> RUNTIME_JOB_DIR
# !!!!!!!!!!!!!!!!!!! would be better to know the names of files !!!!!!!!!!!
##############################################################
move_files_to_frontend

##############################################################
# Finish accounting and exit job
##############################################################
accounting_end

#######################################
#  Submit the job
#######################################
echo "SLURM job script built" 1>&2
# Execute sbatch command
cd "$joboption_directory"
echo "SLURM script follows:" 1>&2
echo "-------------------------------------------------------------------" 1>&2
cat "$LRMS_JOB_SCRIPT" 1>&2
echo "-------------------------------------------------------------------" 1>&2
echo "" 1>&2
SLURM_RESULT=1
SLURM_TRIES=0
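# SLURM_TRIES counts consecutive failed submission attempts; the loop below
# gives up after 10, while the known transient errors handled inside reset
# the counter and wait 60 seconds before retrying.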

#job creation finished
if [ ! -z "$perflogdir" ]; then
   stop_ts=`date +%s.%N`
   t=`awk "BEGIN { printf \"%.3f\", ${stop_ts}-${start_ts} }"`
   echo "[`date +%Y-%m-%d\ %T`] submit-slurm-job, JobScriptCreation: $t" >> $perflogfilesub
fi

while [ "$SLURM_TRIES" -lt '10' ] ; do

  # Unset all environment variables before calling sbatch; otherwise
  # SLURM would forward them to the job and leak information about A-REX.
  # Only lines that look like assignments are unset.
  # This risks unsetting variables used in sub-assignments, but that is likely harmless.
  # TODO: maybe we should only unset $ARC_*, $CONFIG_*, $GLOBUS_* etc.?
  if [ ! -z "$perflogdir" ]; then
     start_ts=`date +%s.%N`
  fi
  (for i in $(env|grep '^[A-Za-z_][A-Za-z0-9_]*='|grep -v "LRMS_JOB_SCRIPT"|cut -d= -f1);do unset $i;done; \
       ${sbatch} $LRMS_JOB_SCRIPT) 1>$LRMS_JOB_OUT 2>$LRMS_JOB_ERR
  SLURM_RESULT="$?"
  if [ ! -z "$perflogdir" ]; then
    stop_ts=`date +%s.%N`
    t=`awk "BEGIN { printf \"%.3f\", ${stop_ts}-${start_ts} }"`
    echo "[`date +%Y-%m-%d\ %T`] submit-slurm-job, JobSubmission: $t" >> $perflogfilesub
  fi
  if [ "$SLURM_RESULT" -eq '0' ] ; then break ; fi 
  if [ "$SLURM_RESULT" -eq '198' ] ; then 
    echo "Waiting for queue to decrease" 1>&2
    sleep 60
    SLURM_TRIES=0
    continue
  fi
  if grep -q 'maximum number of jobs' "$LRMS_JOB_OUT" "$LRMS_JOB_ERR" ; then
    echo "Waiting for queue to decrease" 1>&2
    sleep 60
    SLURM_TRIES=0
    continue
  fi
  # A rare SLURM error, but it may cause chaos in the information/accounting system
  if grep -q 'unable to accept job' "$LRMS_JOB_OUT" "$LRMS_JOB_ERR" ; then
    echo "Waiting for queue to decrease" 1>&2
    sleep 60
    SLURM_TRIES=0
    continue
  fi
  SLURM_TRIES=$(( $SLURM_TRIES + 1 ))
  sleep 2
done
if [ $SLURM_RESULT -eq '0' ] ; then

   #TODO: test what happens when the job queue is full or when the slurm ctld is not responding
   # SLURM 1.x and 2.2.x output the jobid to STDERR and STDOUT respectively. Concatenate them
   # and let sed sort it out. From the exit code we know that the job was submitted, so this
   # is safe. Ulf Tigerstedt <tigerste@csc.fi> 1.5.2011
   # This is unfortunately not safe: Cray's SLURM sbatch returns 0 when it fails to submit a job. <soettrup@nbi.dk> 22.1.2015
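   # On success the concatenated output is expected to contain a line like
   # 'Submitted batch job 123456' (possibly prefixed with 'sbatch: '); the sed
   # expression below extracts the numeric job id.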

   job_id=`cat $LRMS_JOB_OUT $LRMS_JOB_ERR |sed -e 's/^\(sbatch: \)\{0,1\}Submitted batch job \([0-9]*\)$/\2/'`
   if expr "${job_id}" : '[0-9][0-9]*' >/dev/null; then
      echo "joboption_jobid=$job_id" >> $GRAMI_FILE
      echo "job submitted successfully!" 1>&2
      echo "local job id: $job_id" 1>&2
      # Remove temporary job script file
      rm -f $LRMS_JOB_SCRIPT $LRMS_JOB_OUT $LRMS_JOB_ERR
      echo "----- exiting submit_slurm_job -----" 1>&2
      echo "" 1>&2
      exit 0
   else
      echo "job *NOT* submitted successfully!" 1>&2
      echo "failed getting the slurm jobid for the job!" 1>&2
      echo "Instead got: ${job_id}" 1>&2
      echo "Submission: Local submission client behaved unexpectedly."
   fi
else
  echo "job *NOT* submitted successfully!" 1>&2
  echo "got error code from sbatch: $SLURM_RESULT !" 1>&2
  echo "Submission: Local submission client failed."
fi
echo "Output is:" 1>&2
cat $LRMS_JOB_OUT 1>&2
echo "Error output is:"
cat $LRMS_JOB_ERR 1>&2
rm -f "$LRMS_JOB_SCRIPT" "$LRMS_JOB_OUT" "$LRMS_JOB_ERR"
echo "----- exiting submit_slurm_job -----" 1>&2
echo "" 1>&2
exit 1