#!/bin/sh

##**************************************************************
##
## Copyright (C) 1990-2017, Condor Team, Computer Sciences Department,
## University of Wisconsin-Madison, WI.
## 
## Licensed under the Apache License, Version 2.0 (the "License"); you
## may not use this file except in compliance with the License.  You may
## obtain a copy of the License at
## 
##    http://www.apache.org/licenses/LICENSE-2.0
## 
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.
##
##**************************************************************


# Set this to the bin directory of MPICH installation
MPDIR=/usr/local/mpich2/bin
PATH=$MPDIR:.:$PATH
export PATH

# These two values are expected to arrive through the environment;
# the self-assignments leave them unchanged.
_CONDOR_PROCNO=$_CONDOR_PROCNO
_CONDOR_NPROCS=$_CONDOR_NPROCS

# Remove the contact file, so if we are held and released
# it can be recreated anew.  Skipped when the variable is empty
# so rm is never handed an empty operand.
if [ -n "$CONDOR_CONTACT_FILE" ]; then
	rm -f -- "$CONDOR_CONTACT_FILE"
fi

# Put the directory reported by "condor_config_val libexec" at the
# front of PATH.
PATH="$(condor_config_val libexec)/:$PATH"

# mpd needs a conf file, and it must be
# permissions 0700.
# -p: the directory may already exist when this script is re-run
# in the same working directory (e.g. after hold/release).
mkdir -p tmp
MPD_CONF_FILE="$(pwd)/tmp/mpd_conf_file"
export MPD_CONF_FILE

# Disable core dumps for everything started from this script.
ulimit -c 0

# If you have a shared file system, maybe you
# want to put the mpd.conf file in your home
# directory

# Create the file under a restrictive umask (in a subshell, so the
# umask of the rest of the script is untouched) so the password is
# never readable by other users, not even briefly before the chmod.
(
	umask 077
	echo "password=somepassword" > "$MPD_CONF_FILE"
)
chmod 0700 "$MPD_CONF_FILE"

# If on the head node, start mpd, get the port and host,
# and condor_chirp it back into the ClassAd
# so the non-head nodes can find the head node.

if [ "$_CONDOR_PROCNO" -eq 0 ]
then
	mpd > "mpd.out.$_CONDOR_PROCNO" 2>&1 &
	sleep 1

	# Take the first line of "mpdtrace -l" and turn '_' into blanks,
	# so that field 1 is used as the host and field 2 as the port.
	# Queried once so both values come from the same snapshot.
	ring_entry=$(mpdtrace -l | sed 1q | tr '_' ' ')
	host=$(printf '%s\n' "$ring_entry" | awk '{print $1}')
	port=$(printf '%s\n' "$ring_entry" | awk '{print $2}')

	# Publish the port BEFORE the host: the other nodes poll for
	# MPICH_HOST and read MPICH_PORT right after it shows up.
	condor_chirp set_job_attr MPICH_PORT "$port"
	condor_chirp set_job_attr MPICH_HOST "\"$host\""

	# Wait until mpdtrace lists as many lines as there are nodes.
	num_hosts=1
	retries=0
	while [ "$num_hosts" -ne "$_CONDOR_NPROCS" ]
	do
		# Some wc implementations pad the count with blanks; strip
		# them so the numeric comparison above stays valid.
		num_hosts=$(mpdtrace | wc -l | tr -d '[:space:]')
		sleep 2
		retries=$((retries + 1))
		if [ "$retries" -gt 100 ]
		then
			echo "Too many retries, could not start all $_CONDOR_NPROCS nodes, only started $num_hosts, giving up.  Here are the hosts I could start "
			mpdtrace
			exit 1
		fi
	done

	## run the actual mpi job, which was the command line argument
	## to the invocation of this shell script
	# "$@" keeps every argument intact, including ones with blanks.
	mpiexec -n "$_CONDOR_NPROCS" "$@"
	e=$?

	mpdallexit
	sleep 20
	# The exit status of mpiexec is printed, not used as this
	# script's own exit status.
	echo "$e"
else
	# If NOT the head node, acquire the host and port of
	# the head node
	retries=0
	host=UNDEFINED
	# Keep polling while the attribute still reads UNDEFINED or
	# condor_chirp returned nothing at all.  '=' (not '==') is the
	# string comparison operator of POSIX test.
	while [ -z "$host" ] || [ "$host" = "UNDEFINED" ]
	do
		host=$(condor_chirp get_job_attr MPICH_HOST)
		sleep 2
		retries=$((retries + 1))
		if [ "$retries" -gt 100 ]; then
			echo "Too many retries, could not get mpd host from condor_chirp, giving up."
			exit 1
		fi
	done

	port=$(condor_chirp get_job_attr MPICH_PORT)
	# The host was stored with surrounding double quotes; drop them.
	host=$(echo "$host" | tr -d '"')
	mpd --host="$host" --port="$port" --noconsole > "mpd.out.$_CONDOR_PROCNO" 2>&1
fi