#!/bin/sh
# set -x
#
#  Based on globus submission script for pbs
#
#  Submits job to HTCondor.
#
#  Usage:  submit-condor-job [--config <arc.conf>] <grami file>
#  Input:  path to grami file (same as Globus).
#
#  The script writes an HTCondor submit description (condorjob.jdl) and a job
#  wrapper script (condorjob.sh) into the job's session directory, runs
#  condor_submit on the description (retrying on failure), and on success
#  appends the local job id and the condor log location to the grami file.
#
#  Output: diagnostics go to stderr; on failure a single
#          "Submission: ..." line is printed to stdout.
#  Exit:   0 when the job was submitted and its id was recorded, 1 otherwise.
#
#  NOTE(review): most helper functions called below (common_init, set_count,
#  mktempscript, RTE_stage*, ...) are not defined in this file; presumably they
#  come from the sourced lrms_common.sh / submit_common.sh - confirm there.
#  Variables assigned here may be read by those helpers, so their names are
#  part of this script's interface.

echo "----- starting submit_condor_job -----" 1>&2
joboption_lrms="condor"
lrms_options="condor_requirements condor_rank condor_bin_path condor_config"
queue_options="condor_requirements"

# ARC1 passes first the config file.
if [ "$1" = "--config" ] ; then
  shift
  ARC_CONFIG=$1
  shift
fi
GRAMI_FILE=$1

# define paths and config parser
basedir=$(dirname "$0")
basedir=$(cd "$basedir" > /dev/null && pwd) || exit $?
. "${basedir}/lrms_common.sh"

# include common submit functions
. "${pkgdatadir}/submit_common.sh" || exit $?

# run common init
#  * parse grami
#  * parse config
#  * load LRMS-specific env
#  * set common variables
common_init

# Performance logging is active only when perflogdir is non-empty.
perflogfilesub="${perflogdir}/submission.perflog"

if [ -n "$perflogdir" ] ; then
  start_ts=$(date +%s.%N)
fi

# Without a shared session directory the job runs in HTCondor's scratch dir.
# The backslash keeps the reference unexpanded here so that it is expanded
# only where the generated text is later evaluated.
if [ -z "${RUNTIME_NODE_SEES_FRONTEND}" ] ; then
  RUNTIME_LOCAL_SCRATCH_DIR="\${_CONDOR_SCRATCH_DIR}"
fi

# check remote or local scratch is configured
check_any_scratch

##############################################################
# Zero stage of runtime environments
##############################################################
RTE_stage0

##############################################################
# create job script
##############################################################
mktempscript

is_cluster=true

##############################################################
# Start job description file
##############################################################

CONDOR_SUBMIT='condor_submit'
if [ -n "$CONDOR_BIN_PATH" ] ; then
  CONDOR_SUBMIT=${CONDOR_BIN_PATH}/${CONDOR_SUBMIT}
fi

# HTCondor job script and submit description file.
# The script path left in LRMS_JOB_SCRIPT at this point is discarded; both
# files live in the session directory instead.
rm -f "$LRMS_JOB_SCRIPT"
LRMS_JOB_SCRIPT="${joboption_directory}/condorjob.sh"
LRMS_JOB_DESCRIPT="${joboption_directory}/condorjob.jdl"

{
  echo "# HTCondor job description built by arex"
  echo "Executable = condorjob.sh"
  echo "Input = $joboption_stdin"
  echo "Log = ${joboption_directory}/log"
} > "$LRMS_JOB_DESCRIPT"

# write HTCondor output to .comment file if possible, but handle the situation when
# jobs are submitted by HTCondor-G < 8.0.5
condor_stdout="${joboption_directory}.comment"
condor_stderr="${joboption_directory}.comment"
if [ -z "${RUNTIME_NODE_SEES_FRONTEND}" ] ; then
  # POSIX replacement for: [[ $joboption_stdout =~ _condor_stdout$ ]]
  case "$joboption_stdout" in
    *_condor_stdout)
      condor_stdout=$joboption_stdout
      condor_stderr=$joboption_stderr
      ;;
  esac
fi
{
  echo "Output = $condor_stdout"
  echo "Error = $condor_stderr"
} >> "$LRMS_JOB_DESCRIPT"

# queue
if [ -n "${joboption_queue}" ] ; then
  echo "+NordugridQueue = \"$joboption_queue\"" >> "$LRMS_JOB_DESCRIPT"
fi

# job name for convenience
if [ -n "${joboption_jobname}" ] ; then
  #TODO is this necessary? do parts of the infosys need these limitations?
  # Three rewrites, applied in order: prefix "N" when the first character is
  # not a letter, replace every non-alphanumeric character with "_", and
  # truncate to 15 characters.
  jobname=$(echo "$joboption_jobname" \
    | sed -e 's/^\([^[:alpha:]]\)/N\1/' \
          -e 's/[^[:alnum:]]/_/g' \
          -e 's/\(...............\).*/\1/')
else
  jobname="gridjob"
fi
echo "Description = $jobname" >> "$LRMS_JOB_DESCRIPT"

# universe
if [ -n "$DOCKER_UNIVERSE" ] ; then
  {
    echo "Universe = $DOCKER_UNIVERSE"
    echo "should_transfer_files = YES"
    echo "when_to_transfer_output = ON_EXIT"
  } >> "$LRMS_JOB_DESCRIPT"
else
  echo "Universe = vanilla" >> "$LRMS_JOB_DESCRIPT"
fi

if [ -n "$DOCKER_IMAGE" ] ; then
  echo "docker_image = $DOCKER_IMAGE" >> "$LRMS_JOB_DESCRIPT"
fi

# notification
echo "Notification = Never" >> "$LRMS_JOB_DESCRIPT"

# no job restart
REQUIREMENTS="(NumJobStarts == 0)"
PERIODIC_REMOVE="(JobStatus == 1 && NumJobStarts > 0)"

# custom requirements
if [ -n "$CONFIG_condor_requirements" ] ; then
  # custom requirement from arc.conf
  REQUIREMENTS="${REQUIREMENTS} && ( $CONFIG_condor_requirements )"
fi
echo "Requirements = ${REQUIREMENTS}" >> "$LRMS_JOB_DESCRIPT"

#####################################################
# priority
#####################################################
if [ -n "$joboption_priority" ] ; then
  #Condor uses any integer as priority. 0 being default. Only per user basis.
  #We assume that only grid jobs are relevant.
  #In that case we can use ARC 0-100 but translated so default is 0.
  priority=$((joboption_priority-50))
  echo "Priority = $priority" >> "$LRMS_JOB_DESCRIPT"
fi

# rank
if [ -n "$CONFIG_condor_rank" ] ; then
  echo "Rank = $CONFIG_condor_rank" >> "$LRMS_JOB_DESCRIPT"
fi

# proxy
if [ -f "${joboption_directory}/user.proxy" ] ; then
  echo "x509userproxy = ${joboption_directory}/user.proxy" >> "$LRMS_JOB_DESCRIPT"
fi

##############################################################
# (non-)parallel jobs
##############################################################

set_count

if [ -n "$joboption_count" ] && [ "$joboption_count" -gt 0 ] ; then
  echo "request_cpus = $joboption_count" >> "$LRMS_JOB_DESCRIPT"
fi

if [ "$joboption_exclusivenode" = "true" ] ; then
  echo "+RequiresWholeMachine=True" >> "$LRMS_JOB_DESCRIPT"
fi

##############################################################
# Execution times (minutes)
##############################################################

if [ -n "$joboption_cputime" ] ; then
  if [ "$joboption_cputime" -lt 0 ] ; then
    # Double quotes so the offending value is actually printed.
    echo "WARNING: Less than 0 cpu time requested: $joboption_cputime" 1>&2
    joboption_cputime=0
    echo 'WARNING: cpu time set to 0' 1>&2
  fi
  # Per-slot CPU time. The count is tested for empty/zero elsewhere in this
  # script, so guard the division the same way instead of failing on it.
  cputime_divisor=$joboption_count
  if [ -z "$cputime_divisor" ] || [ "$cputime_divisor" -le 0 ] ; then
    cputime_divisor=1
  fi
  maxcputime=$(( joboption_cputime / cputime_divisor ))
  echo "+JobCpuLimit = $joboption_cputime" >> "$LRMS_JOB_DESCRIPT"
  PERIODIC_REMOVE="${PERIODIC_REMOVE} || RemoteUserCpu + RemoteSysCpu > JobCpuLimit"
fi

if [ -z "$joboption_walltime" ] && [ -n "$joboption_cputime" ] ; then
  # Set walltime for backward compatibility or incomplete requests
  # NOTE(review): walltime_ratio is not set in this file; presumably provided
  # by common_init - confirm.
  joboption_walltime=$(( $maxcputime * $walltime_ratio ))
fi

if [ -n "$joboption_walltime" ] ; then
  if [ "$joboption_walltime" -lt 0 ] ; then
    # Double quotes so the offending value is actually printed.
    echo "WARNING: Less than 0 wall time requested: $joboption_walltime" 1>&2
    joboption_walltime=0
    echo 'WARNING: wall time set to 0' 1>&2
  fi
  echo "+JobTimeLimit = $joboption_walltime" >> "$LRMS_JOB_DESCRIPT"
  PERIODIC_REMOVE="${PERIODIC_REMOVE} || RemoteWallClockTime > JobTimeLimit"
fi

##############################################################
# Requested memory (mb)
##############################################################

set_req_mem

if [ -n "$joboption_memory" ] ; then
  # NOTE(review): despite its name, memory_bytes is joboption_memory * 1024;
  # presumably KiB to match the unit of ResidentSetSize - confirm. The name is
  # kept because sourced helpers may read it.
  memory_bytes=$(( $joboption_memory * 1024 ))
  memory_req=$(( $joboption_memory ))
  # HTCondor needs to know the total memory for the job, not memory per core
  if [ -n "$joboption_count" ] && [ "$joboption_count" -gt 0 ] ; then
    memory_bytes=$(( $joboption_count * $memory_bytes ))
    memory_req=$(( $joboption_count * $memory_req ))
  fi
  echo "request_memory=$memory_req" >> "$LRMS_JOB_DESCRIPT"
  echo "+JobMemoryLimit = $memory_bytes" >> "$LRMS_JOB_DESCRIPT"
  # it is important to protect evaluation from undefined ResidentSetSize
  PERIODIC_REMOVE="${PERIODIC_REMOVE} || ((ResidentSetSize isnt undefined ? ResidentSetSize : 0) > JobMemoryLimit)"
fi

##############################################################
#  HTCondor stage in/out
##############################################################
if [ -z "${RUNTIME_NODE_SEES_FRONTEND}" ] ; then
  # The subshell keeps the directory change local. An "exit" inside it ends
  # only the subshell, so its status is propagated explicitly below.
  (
    if ! cd "$joboption_directory" ; then
      echo "Can't change to session directory: $joboption_directory" 1>&2
      rm -f "$LRMS_JOB_SCRIPT" "$LRMS_JOB_DESCRIPT" "$LRMS_JOB_OUT" "$LRMS_JOB_ERR"
      echo "Submission: Configuration error."
      exit 1
    fi
    # transfer all session directory if not shared between ARC CE and worknodes
    echo "should_transfer_files = YES" >> "$LRMS_JOB_DESCRIPT"
    echo "When_to_transfer_output = ON_EXIT_OR_EVICT" >> "$LRMS_JOB_DESCRIPT"
    echo "Transfer_input_files = $joboption_directory" >> "$LRMS_JOB_DESCRIPT"
    exit 0
  ) || exit 1
fi

echo "Periodic_remove = ${PERIODIC_REMOVE}" >> "$LRMS_JOB_DESCRIPT"
echo "Queue" >> "$LRMS_JOB_DESCRIPT"

# Start of the generated job script. The echoed lines are job-script content
# and are emitted verbatim.
echo "#!/bin/bash -l" > "$LRMS_JOB_SCRIPT"
echo "" >> "$LRMS_JOB_SCRIPT"
echo "# Overide umask of execution node (sometime values are really strange)" >> "$LRMS_JOB_SCRIPT"
echo "umask 077" >> "$LRMS_JOB_SCRIPT"
echo " " >> "$LRMS_JOB_SCRIPT"

# Script must have execute permission
chmod 0755 "$LRMS_JOB_SCRIPT"

sourcewithargs_jobscript

##############################################################
# Init accounting
##############################################################
accounting_init

##############################################################
# Add environment variables
##############################################################
add_user_env

##############################################################
# Check for existence of executable,
# there is no sense to check for executable if files are
# downloaded directly to computing node
##############################################################
if [ -z "${joboption_arg_0}" ] ; then
  echo 'Executable is not specified' 1>&2
  rm -f "$LRMS_JOB_SCRIPT" "$LRMS_JOB_DESCRIPT" "$LRMS_JOB_OUT" "$LRMS_JOB_ERR"
  echo "Submission: Job description error."
  exit 1
fi

######################################################################
# Adjust working directory for tweaky nodes
# RUNTIME_GRIDAREA_DIR should be defined by external means on nodes
######################################################################
if [ -n "${RUNTIME_NODE_SEES_FRONTEND}" ] ; then
  setup_runtime_env
else
  echo "RUNTIME_JOB_DIR=$RUNTIME_LOCAL_SCRATCH_DIR/$joboption_gridid" >> "$LRMS_JOB_SCRIPT"
  echo "RUNTIME_JOB_DIAG=$RUNTIME_LOCAL_SCRATCH_DIR/${joboption_gridid}.diag" >> "$LRMS_JOB_SCRIPT"

  # Strip the session directory prefix from each std stream path. A path that
  # comes back unchanged lies outside the session directory and is used as is;
  # otherwise it is re-rooted under the job's scratch directory.
  RUNTIME_STDIN_REL=$(echo "${joboption_stdin}" | sed "s#^${joboption_directory}/*##")
  RUNTIME_STDOUT_REL=$(echo "${joboption_stdout}" | sed "s#^${joboption_directory}/*##")
  RUNTIME_STDERR_REL=$(echo "${joboption_stderr}" | sed "s#^${joboption_directory}/*##")

  if [ "$RUNTIME_STDIN_REL" = "${joboption_stdin}" ] ; then
    echo "RUNTIME_JOB_STDIN=\"${joboption_stdin}\"" >> "$LRMS_JOB_SCRIPT"
  else
    echo "RUNTIME_JOB_STDIN=\"$RUNTIME_LOCAL_SCRATCH_DIR/$joboption_gridid/$RUNTIME_STDIN_REL\"" >> "$LRMS_JOB_SCRIPT"
  fi

  if [ "$RUNTIME_STDOUT_REL" = "${joboption_stdout}" ] ; then
    echo "RUNTIME_JOB_STDOUT=\"${joboption_stdout}\"" >> "$LRMS_JOB_SCRIPT"
  else
    echo "RUNTIME_JOB_STDOUT=\"$RUNTIME_LOCAL_SCRATCH_DIR/$joboption_gridid/$RUNTIME_STDOUT_REL\"" >> "$LRMS_JOB_SCRIPT"
  fi

  if [ "$RUNTIME_STDERR_REL" = "${joboption_stderr}" ] ; then
    echo "RUNTIME_JOB_STDERR=\"${joboption_stderr}\"" >> "$LRMS_JOB_SCRIPT"
  else
    echo "RUNTIME_JOB_STDERR=\"$RUNTIME_LOCAL_SCRATCH_DIR/$joboption_gridid/$RUNTIME_STDERR_REL\"" >> "$LRMS_JOB_SCRIPT"
  fi
fi

##############################################################
# Add std... to job arguments
##############################################################
include_std_streams

##############################################################
#  Move files to local working directory (job is done on node only)
#  RUNTIME_JOB_DIR -> RUNTIME_LOCAL_SCRATCH_DIR/job_id
##############################################################
move_files_to_node

echo "" >> "$LRMS_JOB_SCRIPT"
echo "RESULT=0" >> "$LRMS_JOB_SCRIPT"
echo "" >> "$LRMS_JOB_SCRIPT"

##############################################################
#  Skip execution if something already failed
##############################################################
# Opens the outer "if" in the job script; closed by the second "fi" below.
echo "if [ \"\$RESULT\" = '0' ] ; then" >> "$LRMS_JOB_SCRIPT"

##############################################################
#  Runtime configuration at computing node
##############################################################
RTE_stage1

##############################################################
#  Diagnostics
##############################################################
echo "echo \"runtimeenvironments=\$runtimeenvironments\" >> \"\$RUNTIME_JOB_DIAG\"" >> "$LRMS_JOB_SCRIPT"
# The here-document is empty in this script, so this appends nothing; it is
# kept as the place for verbatim job-script text.
cat >> "$LRMS_JOB_SCRIPT" <<'EOSCR'
EOSCR

##############################################################
#  Accounting (WN OS Detection)
##############################################################
detect_wn_systemsoftware

##############################################################
#  Check intermediate result again
##############################################################
# Opens the inner "if" in the job script; closed by the first "fi" below.
echo "if [ \"\$RESULT\" = '0' ] ; then" >> "$LRMS_JOB_SCRIPT"

##############################################################
#  Execution
##############################################################
cd_and_run

##############################################################
#  End of RESULT checks
##############################################################
echo "fi" >> "$LRMS_JOB_SCRIPT"
echo "fi" >> "$LRMS_JOB_SCRIPT"

##############################################################
#  Runtime (post)configuration at computing node
##############################################################
RTE_stage2

#####################################################
#  Clean up output files in the local scratch dir
#####################################################
clean_local_scratch_dir_output "moveup"

##############################################################
#  Move files back to session directory (job is done on node only)
#  RUNTIME_JOB_DIR -> RUNTIME_LOCAL_SCRATCH_DIR/job_id
##############################################################
move_files_to_frontend

##############################################################
#  Finish accounting and exit job
##############################################################
accounting_end

if [ -n "$perflogdir" ] ; then
  stop_ts=$(date +%s.%N)
  # Timestamps are passed as awk variables rather than spliced into the program.
  t=$(awk -v start="$start_ts" -v stop="$stop_ts" 'BEGIN { printf "%.3f", stop-start }')
  echo "[$(date '+%Y-%m-%d %T')] submit-condor-job, JobScriptCreation: $t" >> "$perflogfilesub"
fi

#######################################
#  Submit the job
#######################################
echo "HTCondor job script built" 1>&2

# Execute condor_submit command from the session directory: the description
# file names its executable with the relative path "condorjob.sh".
if ! cd "$joboption_directory" ; then
  echo "Can't change to session directory: $joboption_directory" 1>&2
  rm -f "$LRMS_JOB_SCRIPT" "$LRMS_JOB_DESCRIPT" "$LRMS_JOB_OUT" "$LRMS_JOB_ERR"
  echo "Submission: Configuration error."
  exit 1
fi

echo "HTCondor script follows:" 1>&2
echo "-------------------------------------------------------------------" 1>&2
cat "$LRMS_JOB_SCRIPT" 1>&2
echo "-------------------------------------------------------------------" 1>&2
echo "" 1>&2

# Try condor_submit up to 10 times, pausing 2 seconds after each failure.
CONDOR_RESULT=1
CONDOR_TRIES=0
while [ "$CONDOR_TRIES" -lt 10 ] ; do
  if [ -n "$perflogdir" ] ; then
    start_ts=$(date +%s.%N)
  fi

  "${CONDOR_SUBMIT}" "$LRMS_JOB_DESCRIPT" 1>"$LRMS_JOB_OUT" 2>"$LRMS_JOB_ERR"
  CONDOR_RESULT="$?"

  if [ -n "$perflogdir" ] ; then
    stop_ts=$(date +%s.%N)
    t=$(awk -v start="$start_ts" -v stop="$stop_ts" 'BEGIN { printf "%.3f", stop-start }')
    echo "[$(date '+%Y-%m-%d %T')] submit-condor-job, JobSubmission: $t" >> "$perflogfilesub"
  fi

  if [ "$CONDOR_RESULT" -eq 0 ] ; then
    break
  fi
  CONDOR_TRIES=$(( CONDOR_TRIES + 1 ))
  sleep 2
done

if [ "$CONDOR_RESULT" -eq 0 ] ; then
  job_out=$(cat "$LRMS_JOB_OUT")
  if [ -z "${job_out}" ] ; then
    echo "job *NOT* submitted successfully!" 1>&2
    echo "failed getting the condor jobid for the job!" 1>&2
    echo "Submission: Local submission client behaved unexpectedly."
  elif [ "$(echo "${job_out}" | grep -Ec 'submitted to cluster[[:space:]][0-9]+')" != "1" ] ; then
    echo "job *NOT* submitted successfully!" 1>&2
    echo "badly formatted condor jobid for the job !" 1>&2
    echo "Submission: Local submission client behaved unexpectedly."
  else
    # Exactly one line matched above; take the cluster number from that line
    # rather than relying on its position among whitespace-separated words.
    job_id=$(echo "${job_out}" \
      | sed -n 's/.*submitted to cluster[[:space:]]\([0-9][0-9]*\).*/\1/p')
    hostname=$(hostname -f)
    echo "joboption_jobid=${job_id}.${hostname}" >> "$GRAMI_FILE"
    echo "condor_log=${joboption_directory}/log" >> "$GRAMI_FILE"
    echo "job submitted successfully!" 1>&2
    echo "local job id: $job_id" 1>&2
    # Remove temporary files
    rm -f "$LRMS_JOB_OUT" "$LRMS_JOB_ERR"
    echo "----- exiting submit_condor_job -----" 1>&2
    echo "" 1>&2
    exit 0
  fi
else
  echo "job *NOT* submitted successfully!" 1>&2
  echo "got error code from condor_submit: $CONDOR_RESULT !" 1>&2
  echo "Submission: Local submission client failed."
fi

# Common failure tail: show what condor_submit printed, then clean up.
echo "Output is:" 1>&2
cat "$LRMS_JOB_OUT" 1>&2
echo "Error output is:" 1>&2
cat "$LRMS_JOB_ERR" 1>&2
rm -f "$LRMS_JOB_OUT" "$LRMS_JOB_ERR"
echo "----- exiting submit_condor_job -----" 1>&2
echo "" 1>&2
exit 1