#!/usr/bin/env bash
##**************************************************************
##
## Copyright (C) 1990-2018, Condor Team, Computer Sciences Department,
## University of Wisconsin-Madison, WI.
##
## Licensed under the Apache License, Version 2.0 (the "License"); you
## may not use this file except in compliance with the License. You may
## obtain a copy of the License at
##
##    http://www.apache.org/licenses/LICENSE-2.0
##
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.
##
##**************************************************************

# This is a script to run OpenMPI jobs under the HTCondor parallel universe.
# OpenMPI assumes that a full install is available on all execute nodes.

## sample submit script
#universe = parallel
#executable = openmpiscript
#arguments = actual_mpi_job arg1 arg2 arg3
#getenv = true
#
#should_transfer_files = yes
#transfer_input_files = actual_mpi_job
#when_to_transfer_output = on_exit_or_evict
#
#output = out.$(NODE)
#error = err.$(NODE)
#log = log
#
#machine_count = 8
#queue
##

##
## configuration options

# $USE_OPENMP should be set to true if using OpenMP with your OpenMPI executable (not typical).
USE_OPENMP=false

# Set the paths to the helper scripts.
# Get them from the HTCondor libexec directory
ORTED_LAUNCHER=$(condor_config_val libexec)/orted_launcher.sh
GET_ORTED_CMD=$(condor_config_val libexec)/get_orted_cmd.sh
# Or set a custom path (e.g. the local directory if transferring the scripts)
#ORTED_LAUNCHER=./orted_launcher.sh
#GET_ORTED_CMD=./get_orted_cmd.sh

# $MPDIR points to the location of the OpenMPI install.
# The pool admin may set it via OPENMPI_INSTALL_PATH in the condor_config (recommended)
MPDIR=$(condor_config_val OPENMPI_INSTALL_PATH)
# Or set it manually
#MPDIR=/usr/lib64/openmpi

# $EXINT is a comma-delimited list of excluded network interfaces.
# If your mpi jobs are hanging, OpenMPI may be trying to use too many
# network interfaces to communicate between nodes.
# The pool admin may set it via OPENMPI_EXCLUDE_NETWORK_INTERFACES in the condor_config (recommended)
EXINT=$(condor_config_val OPENMPI_EXCLUDE_NETWORK_INTERFACES)
# Or set it manually
#EXINT="docker0,virbr0"

##
## configuration check

# We recommend that your pool admin use MOUNT_UNDER_SCRATCH = /tmp
# so that OpenMPI caches all data under the user's scratch directory.
# Not having /tmp mounted under scratch may hang mpi jobs.
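# For illustration only, a pool admin might set the knobs referenced above in
# condor_config roughly like this (example values, matching the defaults used
# later in this script; adjust for your pool):
#
#   MOUNT_UNDER_SCRATCH = /tmp
#   OPENMPI_INSTALL_PATH = /usr/lib64/openmpi
#   OPENMPI_EXCLUDE_NETWORK_INTERFACES = docker0,virbr0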
_USE_SCRATCH=$(condor_config_val MOUNT_UNDER_SCRATCH)
if [ -z "$_USE_SCRATCH" ]; then
    >&2 echo "WARNING: MOUNT_UNDER_SCRATCH not set in condor_config"
elif test "${_USE_SCRATCH#*/tmp}" == "$_USE_SCRATCH"; then
    >&2 echo "WARNING: /tmp not included in MOUNT_UNDER_SCRATCH"
fi

# If MPDIR is not set, then use a default value
if [ -z "$MPDIR" ]; then
    >&2 echo "WARNING: Using default value for \$MPDIR in openmpiscript"
    MPDIR=/usr/lib64/openmpi
fi
PATH=$MPDIR/bin:.:$PATH
export PATH

# If EXINT is not set, then use some default values
if [ -z "$EXINT" ]; then
    >&2 echo "WARNING: Using default values for \$EXINT in openmpiscript"
    EXINT="docker0,virbr0"
fi

##
## cleanup function
_orted_launcher_pid=0
_mpirun_pid=0
CONDOR_CHIRP=$(condor_config_val libexec)/condor_chirp
force_cleanup() {
    # Forward SIGTERM to the orted launcher
    if [ $_orted_launcher_pid -ne 0 ]; then
        kill -s SIGTERM $_orted_launcher_pid
    fi

    # Clean up mpirun (only node 0 runs it)
    if [ $_CONDOR_PROCNO -eq 0 ] && [ $_mpirun_pid -ne 0 ]; then
        $CONDOR_CHIRP ulog "Node $_CONDOR_PROCNO caught SIGTERM, cleaning up mpirun"
        rm $HOSTFILE

        # Send SIGTERM to mpirun and the orted launcher
        kill -s SIGTERM $_mpirun_pid

        # Give mpirun 30 seconds to terminate nicely
        for i in {1..30}; do
            kill -0 $_mpirun_pid 2> /dev/null # returns 0 if still running
            _mpirun_killed=$?
            if [ $_mpirun_killed -ne 0 ]; then
                break
            fi
            sleep 1
        done

        # If mpirun is still running, send SIGKILL
        if [ $_mpirun_killed -eq 0 ]; then
            $CONDOR_CHIRP ulog "mpirun hung on Node ${_CONDOR_PROCNO}, sending SIGKILL!"
            kill -s SIGKILL $_mpirun_pid
        fi
    fi
    exit 1
}
trap force_cleanup SIGTERM

##
## execute node setup
export PATH=$MPDIR/bin:$PATH

# Run the orted launcher (gets the orted command from condor_chirp)
$ORTED_LAUNCHER &
_orted_launcher_pid=$!
if [ $_CONDOR_PROCNO -ne 0 ]; then
    # If not on node 0, just wait for orted to finish and exit with its status
    wait $_orted_launcher_pid
    exit $?
fi

##
## head node (node 0) setup

# Build the hostfile (one "<node> slots=<N>" line per node)
HOSTFILE=hosts
while [ -f $_CONDOR_SCRATCH_DIR/$HOSTFILE ]; do
    HOSTFILE=x$HOSTFILE
done
HOSTFILE=$_CONDOR_SCRATCH_DIR/$HOSTFILE

REQUEST_CPUS=$(condor_q -jobads $_CONDOR_JOB_AD -af RequestCpus)
for node in $(seq 0 $(( $_CONDOR_NPROCS - 1 ))); do
    if $USE_OPENMP; then
        # OpenMP will do the threading on the execute node
        echo "$node slots=1" >> $HOSTFILE
    else
        # OpenMPI will do the threading on the execute node
        echo "$node slots=$REQUEST_CPUS" >> $HOSTFILE
    fi
done

# Make sure the executable is executable
EXECUTABLE=$1
shift
chmod +x $EXECUTABLE

##
## run mpirun

# Set MCA values for running on HTCondor
export OMPI_MCA_plm_rsh_agent=$GET_ORTED_CMD        # use the helper script instead of ssh
export OMPI_MCA_plm_rsh_no_tree_spawn=1             # disable ssh tree spawn
export OMPI_MCA_orte_hetero_nodes=1                 # do not assume same hardware on each node
export OMPI_MCA_orte_startup_timeout=120            # allow two minutes before failing
export OMPI_MCA_hwloc_base_binding_policy="none"    # do not bind to cpu cores
export OMPI_MCA_btl_tcp_if_exclude="lo,$EXINT"      # exclude unused tcp network interfaces

# Optional MCA values to set for firewalled setups
#export OMPI_MCA_btl_tcp_port_min_v4=1024       # lowest port number that can be used
#export OMPI_MCA_btl_tcp_port_range_v4=64511    # range of ports above lowest that can be used

# Optionally set MCA values for increasing mpirun verbosity per component
# (see ompi_info for more components)
#export OMPI_MCA_plm_base_verbose=30
#export OMPI_MCA_orte_base_verbose=30
#export OMPI_MCA_hwloc_base_verbose=30
#export OMPI_MCA_btl_base_verbose=30

# Run mpirun in the background and wait for it to exit
mpirun -v --prefix $MPDIR -hostfile $HOSTFILE $EXECUTABLE "$@" &
_mpirun_pid=$!
wait $_mpirun_pid
_mpirun_exit=$?

## clean up
# Wait for orted to finish
wait $_orted_launcher_pid

rm $HOSTFILE
exit $_mpirun_exit
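
# Usage sketch (illustrative; never executed since it follows the exit above):
# save the sample submit description from the top of this file as, e.g.,
# "openmpi.sub" with actual_mpi_job replaced by your MPI executable, then
# submit it with:
#
#   condor_submit openmpi.sub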