#!/bin/sh # set -x # # Based on globus submission script for pbs # # Submits job to SLURM. # Input: path to grami file (same as Globus). # # The temporary job script is created for the submission and then removed # at the end of this script. echo "----- starting submit_slurm_job -----" 1>&2 joboption_lrms="SLURM" lrms_options="slurm_requirements slurm_wakeupperiod slurm_use_sacct slurm_bin_path" queue_options="slurm_requirements" # ARC1 passes first the config file. if [ "$1" = "--config" ]; then shift; ARC_CONFIG=$1; shift; fi GRAMI_FILE=$1 # define paths and config parser basedir=`dirname $0` basedir=`cd $basedir > /dev/null && pwd` || exit $? . "${basedir}/lrms_common.sh" # include common submit functions . "${pkgdatadir}/submit_common.sh" || exit $? # run common init # * parse grami # * parse config # * load LRMS-specific env # * set common variables common_init # perflog perflogfilesub="${perflogdir}/submission.perflog" if [ ! -z "$perflogdir" ]; then start_ts=`date +%s.%N` fi # check remote or local scratch is configured check_any_scratch ############################################################## # Zero stage of runtime environments ############################################################## RTE_stage0 ############################################################## # create job script ############################################################## mktempscript is_cluster=true ############################################################## # Start job script ############################################################## echo "#!/bin/bash -l" > $LRMS_JOB_SCRIPT echo "# SLURM batch job script built by arex" >> $LRMS_JOB_SCRIPT # rerun is handled by GM, do not let SLURM requeue jobs itself. 
echo "#SBATCH --no-requeue" >> $LRMS_JOB_SCRIPT # no environment exporting from ARC CE to WN echo "#SBATCH --export=NONE" >> $LRMS_JOB_SCRIPT # write SLURM output to 'comment' file echo "#SBATCH -e ${joboption_directory}.comment">> $LRMS_JOB_SCRIPT echo "#SBATCH -o ${joboption_directory}.comment">> $LRMS_JOB_SCRIPT echo "" >> $LRMS_JOB_SCRIPT # choose queue if [ ! -z "${joboption_queue}" ] ; then echo "#SBATCH -p $joboption_queue" >> $LRMS_JOB_SCRIPT fi ############################################################## # priority ############################################################## if [ ! -z "$joboption_priority" ]; then #Slurm priority is -10000 to 10000. Lower is better. #Default is 0, and only superusers can assign priorities #less than 0. #We set the priority as 100 - arc priority. #This will have the desired effect for all grid jobs #Local jobs will unfortunatly have a default priority equal #to ARC priority 100, but there is no way around that. priority=$((100-joboption_priority)) echo "#SBATCH --nice=${priority}" >> $LRMS_JOB_SCRIPT else #If priority is not set we should set it to #50 to match the default in the documentation priority=50 echo "#SBATCH --nice=${priority}" >> $LRMS_JOB_SCRIPT fi # project name for accounting if [ ! -z "${joboption_rsl_project}" ] ; then echo "#SBATCH -A $joboption_rsl_project" >> $LRMS_JOB_SCRIPT fi # job name for convenience if [ ! -z "${joboption_jobname}" ] ; then #TODO is this necessary? do parts of the infosys need these limitations? jobname=`echo "$joboption_jobname" | \ sed 's/^\([^[:alpha:]]\)/N\1/' | \ sed 's/[^[:alnum:]]/_/g' | \ sed 's/\(...............\).*/\1/'` echo "#SBATCH -J '$jobname'" >> $LRMS_JOB_SCRIPT else jobname="gridjob" echo "#SBATCH -J '$jobname'" >> $LRMS_JOB_SCRIPT fi echo "SLURM jobname: $jobname" 1>&2 # Set up the user's environment on the compute node where the script # is executed. 
echo "#SBATCH --get-user-env=10L" >> $LRMS_JOB_SCRIPT

##############################################################
# (non-)parallel jobs
##############################################################
# set_count (from submit_common.sh) normalizes joboption_count & friends.
set_count

nodes_string="#SBATCH -n ${joboption_count}"
echo "$nodes_string" >> $LRMS_JOB_SCRIPT

# FIX: quote the variable — unquoted, an unset joboption_countpernode made
# '[ ! -z ]' evaluate true and the following '-gt' test raise an error.
if [ ! -z "$joboption_countpernode" ] && [ "$joboption_countpernode" -gt 0 ] ; then
  echo "#SBATCH --ntasks-per-node $joboption_countpernode" >> $LRMS_JOB_SCRIPT
fi

# Collect the numbered joboption_nodeproperty_N values into one #SBATCH line.
nodes_string="#SBATCH "
i=0
eval "var_is_set=\${joboption_nodeproperty_$i+yes}"
while [ ! -z "${var_is_set}" ] ; do
  eval "var_value=\${joboption_nodeproperty_$i}"
  nodes_string="${nodes_string} ${var_value}"
  i=$(( $i + 1 ))
  eval "var_is_set=\${joboption_nodeproperty_$i+yes}"
done
echo "$nodes_string" >> $LRMS_JOB_SCRIPT

if [ "$joboption_exclusivenode" = "true" ]; then
  echo "#SBATCH --exclusive" >> $LRMS_JOB_SCRIPT
fi

##############################################################
# Execution times (minutes)
##############################################################
if [ ! -z "$joboption_cputime" ] ; then
  if [ "$joboption_cputime" -lt 0 ] ; then
    # FIX: was single-quoted, which printed the literal text
    # '$joboption_cputime' instead of the offending value.
    echo "WARNING: Less than 0 cpu time requested: $joboption_cputime" 1>&2
    joboption_cputime=0
    echo 'WARNING: cpu time set to 0' 1>&2
  fi
  # this is actually walltime deduced from cputime !
  maxcputime=$(( $joboption_cputime / $joboption_count ))
  cputime_min=$(( $maxcputime / 60 ))
  cputime_sec=$(( $maxcputime - $cputime_min * 60 ))
  echo "#SBATCH -t ${cputime_min}:${cputime_sec}" >> $LRMS_JOB_SCRIPT
fi

if [ -z "$joboption_walltime" ] ; then
  if [ ! -z "$joboption_cputime" ] ; then
    # Set walltime for backward compatibility or incomplete requests
    joboption_walltime=$(( $maxcputime * $walltime_ratio ))
  fi
fi

if [ ! -z "$joboption_walltime" ] ; then
  if [ "$joboption_walltime" -lt 0 ] ; then
    # FIX: was single-quoted — variable was never expanded in the message.
    echo "WARNING: Less than 0 wall time requested: $joboption_walltime" 1>&2
    joboption_walltime=0
    echo 'WARNING: wall time set to 0' 1>&2
  fi
  maxwalltime="$joboption_walltime"
  walltime_min=$(( $maxwalltime / 60 ))
  walltime_sec=$(( $maxwalltime - $walltime_min * 60 ))
  echo "#SBATCH -t ${walltime_min}:${walltime_sec}" >> $LRMS_JOB_SCRIPT
fi

##############################################################
# Requested memory (mb)
##############################################################
# set_req_mem (from submit_common.sh) computes joboption_memory.
set_req_mem
if [ ! -z "$joboption_memory" ] ; then
  echo "#SBATCH --mem-per-cpu=${joboption_memory}" >> $LRMS_JOB_SCRIPT
fi

##############################################################
# Additional SLURM requirements
##############################################################
# slurm_requirements may come as numbered array entries...
requirements_string=""
i=0
eval "var_is_set=\${CONFIG_slurm_requirements_$i+yes}"
while [ ! -z "${var_is_set}" ] ; do
  eval "var_value=\${CONFIG_slurm_requirements_$i}"
  requirements_string="${requirements_string} ${var_value}"
  i=$(( $i + 1 ))
  eval "var_is_set=\${CONFIG_slurm_requirements_$i+yes}"
done
if [ ! -z "$requirements_string" ]; then
  echo "#SBATCH $requirements_string" >> $LRMS_JOB_SCRIPT
fi
# ...or as a single scalar value ('__array__' marks the array form above).
if [ ! -z "$CONFIG_slurm_requirements" ] && [ "x$CONFIG_slurm_requirements" != "x__array__" ] ; then
  echo "#SBATCH $CONFIG_slurm_requirements" >> $LRMS_JOB_SCRIPT
fi

echo "" >> $LRMS_JOB_SCRIPT
echo "# Overide umask of execution node (sometime values are really strange)" >> $LRMS_JOB_SCRIPT
echo "umask 077" >> $LRMS_JOB_SCRIPT
echo " " >> $LRMS_JOB_SCRIPT

sourcewithargs_jobscript

##############################################################
# Init accounting
##############################################################
accounting_init

##############################################################
# Add environment variables
##############################################################
add_user_env

##############################################################
# Check for existance of executable,
# there is no sense to check for executable if files are
# downloaded directly to computing node
##############################################################
if [ -z "${joboption_arg_0}" ] ; then
  echo 'Executable is not specified' 1>&2
  rm -f "$LRMS_JOB_SCRIPT" "$LRMS_JOB_OUT" "$LRMS_JOB_ERR"
  # NOTE: this line goes to stdout on purpose — it is the failure reason
  # reported back to the grid manager.
  echo "Submission: Job description error."
  exit 1
fi

######################################################################
# Adjust working directory for tweaky nodes
# RUNTIME_GRIDAREA_DIR should be defined by external means on nodes
######################################################################
if [ ! -z "${RUNTIME_NODE_SEES_FRONTEND}" ] ; then
  setup_runtime_env
else
  # Job runs on node-local scratch; point the runtime paths there and
  # rewrite std stream paths that live inside the session directory.
  echo "RUNTIME_JOB_DIR=$RUNTIME_LOCAL_SCRATCH_DIR/$joboption_gridid" >> $LRMS_JOB_SCRIPT
  echo "RUNTIME_JOB_DIAG=$RUNTIME_LOCAL_SCRATCH_DIR/${joboption_gridid}.diag" >> $LRMS_JOB_SCRIPT
  RUNTIME_STDIN_REL=`echo "${joboption_stdin}" | sed "s#^${joboption_directory}/*##"`
  RUNTIME_STDOUT_REL=`echo "${joboption_stdout}" | sed "s#^${joboption_directory}/*##"`
  RUNTIME_STDERR_REL=`echo "${joboption_stderr}" | sed "s#^${joboption_directory}/*##"`
  # If the sed substitution changed nothing, the path is outside the
  # session directory and is kept absolute; otherwise it is relocated
  # under the node-local scratch job directory.
  if [ "$RUNTIME_STDIN_REL" = "${joboption_stdin}" ] ; then
    echo "RUNTIME_JOB_STDIN=\"${joboption_stdin}\"" >> $LRMS_JOB_SCRIPT
  else
    echo "RUNTIME_JOB_STDIN=\"$RUNTIME_LOCAL_SCRATCH_DIR/$joboption_gridid/$RUNTIME_STDIN_REL\"" >> $LRMS_JOB_SCRIPT
  fi
  if [ "$RUNTIME_STDOUT_REL" = "${joboption_stdout}" ] ; then
    echo "RUNTIME_JOB_STDOUT=\"${joboption_stdout}\"" >> $LRMS_JOB_SCRIPT
  else
    echo "RUNTIME_JOB_STDOUT=\"$RUNTIME_LOCAL_SCRATCH_DIR/$joboption_gridid/$RUNTIME_STDOUT_REL\"" >> $LRMS_JOB_SCRIPT
  fi
  if [ "$RUNTIME_STDERR_REL" = "${joboption_stderr}" ] ; then
    echo "RUNTIME_JOB_STDERR=\"${joboption_stderr}\"" >> $LRMS_JOB_SCRIPT
  else
    echo "RUNTIME_JOB_STDERR=\"$RUNTIME_LOCAL_SCRATCH_DIR/$joboption_gridid/$RUNTIME_STDERR_REL\"" >> $LRMS_JOB_SCRIPT
  fi
fi

##############################################################
# Add std... to job arguments
##############################################################
include_std_streams

##############################################################
# Move files to local working directory (job is done on node only)
# RUNTIME_JOB_DIR -> RUNTIME_LOCAL_SCRATCH_DIR/job_id
##############################################################
move_files_to_node

echo "" >> $LRMS_JOB_SCRIPT
echo "RESULT=0" >> $LRMS_JOB_SCRIPT
echo "" >> $LRMS_JOB_SCRIPT

##############################################################
# Skip execution if something already failed
##############################################################
echo "if [ \"\$RESULT\" = '0' ] ; then" >> $LRMS_JOB_SCRIPT

##############################################################
# Runtime configuration at computing node
##############################################################
RTE_stage1

##############################################################
# Diagnostics
##############################################################
echo "echo \"runtimeenvironments=\$runtimeenvironments\" >> \"\$RUNTIME_JOB_DIAG\"" >> $LRMS_JOB_SCRIPT
# Quoted delimiter: the fragment below is copied verbatim into the job
# script and expands on the worker node, not here.
cat >> $LRMS_JOB_SCRIPT <<'EOSCR'
if [ ! "X$SLURM_NODEFILE" = 'X' ] ; then
  if [ -r "$SLURM_NODEFILE" ] ; then
    cat "$SLURM_NODEFILE" | sed 's/\(.*\)/nodename=\1/' >> "$RUNTIME_JOB_DIAG"
    NODENAME_WRITTEN="1"
  else
    SLURM_NODEFILE=
  fi
fi
EOSCR

##############################################################
# Accounting (WN OS Detection)
##############################################################
detect_wn_systemsoftware

##############################################################
# Check intermediate result again
##############################################################
echo "if [ \"\$RESULT\" = '0' ] ; then" >> $LRMS_JOB_SCRIPT

##############################################################
# Execution
##############################################################
cd_and_run

##############################################################
# End of RESULT checks
##############################################################
echo "fi" >> $LRMS_JOB_SCRIPT
echo "fi" >> $LRMS_JOB_SCRIPT

##############################################################
# Runtime (post)configuration at computing node
##############################################################
RTE_stage2

#####################################################
#  Upload output files
####################################################
clean_local_scratch_dir_output

##############################################################
# Move files back to session directory (job is done on node only)
# RUNTIME_JOB_DIR -> RUNTIME_LOCAL_SCRATCH_DIR/job_id
# !!!!!!!!!!!!!!!!!!! would be better to know the names of files !!!!!!!!!!!
##############################################################
move_files_to_frontend

##############################################################
# Finish accounting and exit job
##############################################################
accounting_end

#######################################
#  Submit the job
#######################################
echo "SLURM job script built" 1>&2
# Execute sbatch command
cd "$joboption_directory"
echo "SLURM script follows:" 1>&2
echo "-------------------------------------------------------------------" 1>&2
cat "$LRMS_JOB_SCRIPT" 1>&2
echo "-------------------------------------------------------------------" 1>&2
echo "" 1>&2
SLURM_RESULT=1
SLURM_TRIES=0
#job creation finished
if [ ! -z "$perflogdir" ]; then
   stop_ts=`date +%s.%N`
   t=`awk "BEGIN { printf \"%.3f\", ${stop_ts}-${start_ts} }"`
   echo "[`date +%Y-%m-%d\ %T`] submit-slurm-job, JobScriptCreation: $t" >> $perflogfilesub
fi
# Retry loop: transient queue-full conditions reset the counter and back
# off for a minute; other failures are retried up to 10 times.
while [ "$SLURM_TRIES" -lt '10' ] ; do
  # Unset all environment variables before calling sbatch. Otherwise
  # SLURM will forward them to the job and leak information about arex.
  # Only unset lines with assignments.
  # Risks unsetting variables in sub assignments, but this is likely harmless.
  # TODO: Maybe we only should unset $ARC_*, $CONFIG_*, $GLOBUS_* etc?
  if [ ! -z "$perflogdir" ]; then
     start_ts=`date +%s.%N`
  fi
  (for i in $(env|grep '^[A-Za-z][A-Za-z0-9]*='|grep -v "LRMS_JOB_SCRIPT"|cut -d= -f1);do unset $i;done; \
   ${sbatch} $LRMS_JOB_SCRIPT) 1>$LRMS_JOB_OUT 2>$LRMS_JOB_ERR
  SLURM_RESULT="$?"
  if [ ! -z "$perflogdir" ]; then
     stop_ts=`date +%s.%N`
     t=`awk "BEGIN { printf \"%.3f\", ${stop_ts}-${start_ts} }"`
     echo "[`date +%Y-%m-%d\ %T`] submit-slurm-job, JobSubmission: $t" >> $perflogfilesub
  fi
  if [ "$SLURM_RESULT" -eq '0' ] ; then break ; fi
  # 198: sbatch's "queue limit reached" exit code — wait, don't count the try.
  if [ "$SLURM_RESULT" -eq '198' ] ; then
    echo "Waiting for queue to decrease" 1>&2
    sleep 60
    SLURM_TRIES=0
    continue
  fi
  grep 'maximum number of jobs' "$LRMS_JOB_OUT" "$LRMS_JOB_ERR"
  if [ $? -eq '0' ] ; then
    echo "Waiting for queue to decrease" 1>&2
    sleep 60
    SLURM_TRIES=0
    continue
  fi
  # A rare SLURM error, but may cause chaos in the information/accounting system
  grep 'unable to accept job' "$LRMS_JOB_OUT" "$LRMS_JOB_ERR"
  if [ $? -eq '0' ] ; then
    echo "Waiting for queue to decrease" 1>&2
    sleep 60
    SLURM_TRIES=0
    continue
  fi
  SLURM_TRIES=$(( $SLURM_TRIES + 1 ))
  sleep 2
done
if [ $SLURM_RESULT -eq '0' ] ; then
  #TODO test what happens when the jobqueue is full or when the slurm ctld is not responding
  # SLURM 1.x and 2.2.x outputs the jobid into STDERR and STDOUT respectively. Concat them,
  # and let sed sort it out. From the exit code we know that the job was submitted, so this
  # is safe. Ulf Tigerstedt 1.5.2011
  # This is unfortunately not safe. Cray's SLURM sbatch returns 0, when it fails to submit a job. 22.1.2015
  job_id=`cat $LRMS_JOB_OUT $LRMS_JOB_ERR |sed -e 's/^\(sbatch: \)\{0,1\}Submitted batch job \([0-9]*\)$/\2/'`
  if expr match "${job_id}" '[0-9][0-9]*' >/dev/null; then
     # Hand the LRMS job id back to the grid manager via the grami file.
     echo "joboption_jobid=$job_id" >> $GRAMI_FILE
     echo "job submitted successfully!" 1>&2
     echo "local job id: $job_id" 1>&2
     # Remove temporary job script file
     rm -f $LRMS_JOB_SCRIPT $LRMS_JOB_OUT $LRMS_JOB_ERR
     echo "----- exiting submit_slurm_job -----" 1>&2
     echo "" 1>&2
     exit 0
  else
     echo "job *NOT* submitted successfully!" 1>&2
     echo "failed getting the slurm jobid for the job!" 1>&2
     echo "Instead got: ${job_id}" 1>&2
     # Failure reason on stdout for the grid manager.
     echo "Submission: Local submission client behaved unexpectedly."
  fi
else
  echo "job *NOT* submitted successfully!" 1>&2
  echo "got error code from sbatch: $SLURM_RESULT !" 1>&2
  # Failure reason on stdout for the grid manager.
  echo "Submission: Local submission client failed."
fi
echo "Output is:" 1>&2
cat $LRMS_JOB_OUT 1>&2
# FIX: this header lacked the 1>&2 redirect that every sibling diagnostic
# uses, leaking a diagnostic line onto stdout (which the GM parses).
echo "Error output is:" 1>&2
cat $LRMS_JOB_ERR 1>&2
rm -f "$LRMS_JOB_SCRIPT" "$LRMS_JOB_OUT" "$LRMS_JOB_ERR"
echo "----- exiting submit_slurm_job -----" 1>&2
echo "" 1>&2
exit 1