#!/bin/bash
#
# Wrapper that starts several PhyloBayes-MPI (pb_mpi) chains in parallel inside
# one Slurm allocation and runs a convergence checker (PBterminator.bash) in
# the background. When a chain exits, or the wrapper receives SIGTERM, all
# chains and the checker are shut down.
#
# Usage: see usage() below, or run this script with -h.
#
# Environment:
#   SLURM_NTASKS_PER_NODE  (required) half of it is given to each chain.
#   SEED, NUM_CHAINS, CHECKINTERVAL, BURNIN, ACCEPTDIFF, ACCEPTSIZE, GIVEUP
#                          (optional) override the defaults set below.
#
# NOTE(review): the copy of this script that was reviewed had lost its line
# breaks and some text. The sections marked "NOTE(review)" below were rebuilt
# from the usage text and from how the variables are used later; please
# compare them with the original script before relying on them.

export MODULEPATH=/expanse/projects/ngbt/opt/expanse/modules:$MODULEPATH
module load phylobayes/1.8c

export THEPROGRAM="pb_mpi"

################ Manu #####################
# Assumes number of chains (-N) = 2

####### for debugging ##########
#module list
#scontrol show job -d $SLURM_JOBID
#numactl --show
#echo ${SLURM_NTASKS_PER_NODE}
####### for debugging ##########

# Defaults; each can be overridden from the environment or the command line.
# NOTE(review): the defaults for SEED, NUM_CHAINS, CHECKINTERVAL and BURNIN
# were lost. 600 and 500 come from the usage text, 2 from the "Assumes number
# of chains (-N) = 2" remark above. The SEED default is a guess -- TODO confirm.
export SEED=${SEED-${RANDOM}}               #: <int>   : base seed, chain i uses SEED + i
export NUM_CHAINS=${NUM_CHAINS-'2'}         #: <int>   : number of chains to run
export CHECKINTERVAL=${CHECKINTERVAL-'600'} #: <int>   : seconds between convergence checks
export BURNIN=${BURNIN-'500'}               #: <int>   : number of cycles to exclude from comparison
export ACCEPTDIFF=${ACCEPTDIFF-'0.1'}       #: <float> : maxdiff must be less than this.
export ACCEPTSIZE=${ACCEPTSIZE-'50'}        #: <int>   : The effective number of samples must be greater than this.
export GIVEUP=${GIVEUP-'10000'}             #: <int>   : The number of cycles that must occur before we start to check if we are stuck.
export CHAINNAME="chain"
export RESTARTING=0

#Function to print help to the user:
usage() {
  cat 1>&2 << EOF
Usage: $0 [Wrapper/Convergence Options] -- [PhyloBayes options EXCEPT <chainname> or -rnd <seed>]

$0 is a BASH script that starts and runs several chains of PhyloBayesMPI in
parallel. You submit this script as an MPI job to your PBS cluster, and it
divides the available hosts and slots evenly amongst the chains. On the head
node, it also starts a convergence checker that runs in the background,
terminating the chains if they fulfill the convergence parameters.

WRAPPER OPTIONS:
  -h --help             Print this help message.
  --RESTART             Restart the existing chains. Totally ignore options
                        after the '--'
  --seed <int>
  --SEED <int>          Set the base seed, each chain increments this by 1.
                        (Which is sufficiently random, 1 and 2 produce
                        completely different pseudorandom chains.)
                        (This uses the undocumented '-rnd' option of PhyloBayes)
  -N <int>
  --N <int>             The number of different chains to start/run. Each will
                        get 1/N cores.
  --CHECKINTERVAL <int> The interval, in seconds, between convergence checks.
                        Defaults to 600 = 10 minutes.
  --BURNIN <int>        The number of cycles at the beginning of the chain to be
                        ignored by the convergence checker. Defaults to 500.
  --ACCEPTDIFF <float>  The 'maxdiff' between all chains must be less than this
                        for convergence. Defaults to 0.1, the PB manual says
                        that 0.3 is 'acceptable' and 0.1 is 'good'.
  --ACCEPTSIZE <int>    The effective number of samples must be greater than
                        this for convergence to be accepted. Defaults to 50, the
                        PB manual says that 50 is 'acceptable' and 300 is 'good'.
  --GIVEUP <int>        The number of cycles to run before deciding that a set
                        of chains that still have 'maxdiff' = 1 have failed.
                        Defaults to 10000.

PHYLOBAYES OPTIONS:
  Please DO NOT provide the following options after the '--':
  -rnd <seed>           The wrapper chooses the seed for each chain, they must
                        have different seeds. PLEASE USE the wrapper '--SEED'
                        option.
  <chainname>           The wrapper chooses the names of the chains, 'chain0',
                        'chain1', etc.
EOF
}

#Function to print to both stdout and stderr
echoerr() {
  printf '%s\n' "$*"
  printf '%s\n' "$*" 1>&2
}

# Abort unless the option in $1 is followed by a value.
#   $1 - option name (for the message), $2 - number of remaining arguments
require_value() {
  if (( $2 < 2 )); then
    echoerr "Option '${1}' requires a value, terminating."
    usage
    exit 1
  fi
}

# Abort unless $2 is a decimal integer >= $3.
#   $1 - setting name (for the message), $2 - value, $3 - smallest allowed value
require_int() {
  local name=$1 value=$2 min=$3
  # 10# forces base 10 so that values such as "08" are not read as octal.
  if [[ ! "${value}" =~ ^[0-9]+$ ]] || (( 10#${value} < min )); then
    echoerr "Invalid value for ${name}: '${value}' (expected an integer >= ${min}), terminating."
    exit 1
  fi
}

# Stop the convergence checker and every chain: SIGTERM first, SIGKILL after
# a grace period. A negative PID addresses the whole process group; job
# control ('set -m') gives each background job its own group.
shutdown_all() {
  echo 'Got SIGCHLD or SIGTERM, attempting graceful return. '
  # Errors are discarded on purpose: some of the targets may already be gone.
  kill -s SIGTERM -- "-${TERMPID}" 2> /dev/null
  kill -s SIGTERM "${run_pids[@]}" 2> /dev/null
  sleep 5
  kill -9 -- "${run_pids[@]}" "-${TERMPID}" 2> /dev/null
}

#Parse the command line
PARSINGARGS=1
while [[ "${PARSINGARGS}" == "1" && $# -gt 0 ]]; do
  case "$1" in
    -h|--help)
      usage
      exit 0
      ;;
    --seed|--SEED)
      require_value "$1" "$#"; shift; SEED=$1
      ;;
    -N|--N)
      require_value "$1" "$#"; shift; NUM_CHAINS=$1
      ;;
    --CHECKINTERVAL)
      require_value "$1" "$#"; shift; CHECKINTERVAL=$1
      ;;
    --BURNIN)
      require_value "$1" "$#"; shift; BURNIN=$1
      ;;
    --ACCEPTDIFF)
      # float in (0,1); passed through to the checker without validation.
      require_value "$1" "$#"; shift; ACCEPTDIFF=$1
      ;;
    --ACCEPTSIZE)
      require_value "$1" "$#"; shift; ACCEPTSIZE=$1
      ;;
    --GIVEUP)
      require_value "$1" "$#"; shift; GIVEUP=$1
      ;;
    --RESTART)
      RESTARTING=1
      ;;
    --)
      PARSINGARGS=0
      ;;
    *)
      echoerr "Unknown parameter: '${1}', terminating."
      usage
      exit 1
      ;;
  esac
  shift
done

if [[ "${PARSINGARGS}" == "1" ]]; then
  echoerr "No '--' encountered while parsing arguments. Please verify the command line."
  exit 1
fi

# Everything after '--' is handed to PhyloBayes as one space-joined string.
export ALLARGS=$*

if [[ "${RESTARTING}" == "1" ]]; then
  #Clear the arguments that we are going to ignore
  ALLARGS=""
  #Set the number of chains according to the files here.
  shopt -s nullglob
  existing_chains=("${CHAINNAME}"*.chain)
  shopt -u nullglob
  NUM_CHAINS=${#existing_chains[@]}
  if (( NUM_CHAINS == 0 )); then
    echoerr "--RESTART given, but no '${CHAINNAME}*.chain' files were found here, terminating."
    exit 1
  fi
fi
#Done Parsing WRAPPER OPTIONS

require_int "SEED" "${SEED}" 0
require_int "N" "${NUM_CHAINS}" 1
require_int "CHECKINTERVAL" "${CHECKINTERVAL}" 1
require_int "BURNIN" "${BURNIN}" 0
require_int "ACCEPTSIZE" "${ACCEPTSIZE}" 1
require_int "GIVEUP" "${GIVEUP}" 1

#Do a basic sanity check on the PhyloBayes options.
# The leading space lets the pattern also match '-rnd' as the first option.
if [[ " ${ALLARGS}" == *" -rnd"* ]]; then
  #Found -rnd in options
  echoerr "Please do not provide the '-rnd' option to PhyloBayes. Use the -h option of this script for more details."
  exit 1
fi

: "${SLURM_NTASKS_PER_NODE:?SLURM_NTASKS_PER_NODE must be set (run this script inside a Slurm job)}"

# get the list of cores allocated to this job
# shellcheck disable=SC2207 # word splitting into one core id per element is intended
allcpus=($(numactl --show | grep "physcpubind:" | awk -F: '{print $2}'))

# partition the list into two equal cpu sets for the two MPI jobs
h=$(( SLURM_NTASKS_PER_NODE / 2 ))
cpuset1=''
cpuset2=''
# NOTE(review): the loop body and the 'cpuset' array were lost. The ':'
# separator (one core per MPI rank) is an assumption about the format
# MV2_CPU_MAPPING expects -- TODO confirm against the original script.
for ((i = 0; i < h; i++)); do
  cpuset1+="${cpuset1:+:}${allcpus[i]}"
  cpuset2+="${cpuset2:+:}${allcpus[i + h]}"
done
cpuset=("${cpuset1}" "${cpuset2}")

export NTHREADS=$h

#begin setup
set -m # job control: every background job becomes its own process group
set -b # report terminated background jobs immediately
export STARTUPERROR=0

#BTW, I don't convert to 0-index because I like it, but because the 'split' command uses 0-indexed output files.
chain_names=()
for ((i = 0; i < NUM_CHAINS; i++)); do
  chain_names+=("${CHAINNAME}${i}")
done

# From here until shutdown_all is installed, any child that exits is treated
# as a startup failure. Do NOT add command substitutions or external commands
# in this region: their exit would raise SIGCHLD and abort the script.
# shellcheck disable=SC2064 # expanding $$ when the trap is defined is fine here
trap "pkill -INT -P $$" EXIT SIGINT
trap "STARTUPERROR=1; echo 'got sigchild during startup'; jobs; exit 1" SIGCHLD

run_pids=()
for ((i = 0; i < NUM_CHAINS; i++)); do
  export LSEED=$(( 10#${SEED} + i ))
  export MV2_CPU_MAPPING="${cpuset[i]}"
  bash -c "srun --mpi=pmi2 --overlap -n ${NTHREADS} ${THEPROGRAM} -rnd ${LSEED} ${ALLARGS} ${CHAINNAME}${i}" &
  run_pids+=("$!")
  echo "started chain ${i} with PID : ${!}"
done

#Setup the convergence tester
/expanse/projects/ngbt/home/cipres/ngbw/contrib/scripts/PBterminator.bash \
  "${CHECKINTERVAL}" "${BURNIN}" "${ACCEPTDIFF}" "${ACCEPTSIZE}" "${GIVEUP}" \
  "${chain_names[@]}" > __terminator.txt &
TERMPID="$!"
echo "Terminator running as PID : ${TERMPID}"

# Manu -- changes June 17: errors from the kill commands are now silenced.
trap shutdown_all SIGCHLD SIGTERM

if [[ "${STARTUPERROR}" == "0" ]]; then
  wait "${run_pids[@]}"
fi

kill -s SIGTERM "${TERMPID}" 2> /dev/null # Manu -- changes June 17
sleep 2
kill -9 -- "-${TERMPID}" 2> /dev/null # Manu -- changes June 17
exit 0