256 lines
6.8 KiB
Bash
Executable File
256 lines
6.8 KiB
Bash
Executable File
#!/usr/bin/env bash
#
# Benchmark driver configuration.
#
# Selects which implementation ("folder") is being benchmarked and which
# Slurm nodes the jobs may use, then derives the thread counts, node counts
# and the skip-sync flag that the sweeps further down iterate over.

# Choose folder: uncomment exactly one folder (and its nodelist, if any).
#folder=omp
#nodelist=compute08

folder=mpitasking
nodelist=compute01,compute04,compute07,compute10,compute11,compute12

#folder=mpisubgraphing
#nodelist=compute01,compute04,compute07,compute10,compute11,compute12

#folder=serial
#nodelist=compute05

#folder=serialnosync
#folder=ompnosync

# Unique per-invocation name handed to every submitted job.
TIMESTR=$(date +"%s")
DBNAME="${folder}_${TIMESTR}"

# Directory the driver was started from (not referenced in the visible script).
pwd=$(pwd)

# Recomputed per thread count inside the sweeps.
QSIZE=1

#######################################
# Derive the sweep parameters from the selected folder.
# Globals:
#   folder   (read)    - implementation under test
#   thrds    (written) - thread counts to sweep
#   NODES    (written) - node counts to sweep
#   SKIPSYNC (written) - "true" when the synchronous sweep must be skipped
# Arguments: none
#######################################
configure_sweep() {
  # Serial and MPI variants are run with a single thread; the remaining
  # folders sweep 4, 8, 16 and 32 threads.
  case "$folder" in
    serial | serialnosync | mpitasking | mpisubgraphing)
      thrds=(1)
      ;;
    *)
      thrds=(4 8 16 32)
      ;;
  esac

  # Only the two MPI folders sweep over several node counts. The original
  # test joined the two "!=" comparisons with "||", which is true for every
  # folder and therefore always collapsed NODES to (1).
  case "$folder" in
    mpitasking | mpisubgraphing)
      NODES=(2 3 4 5 6 7 8 9 10 11 12)
      ;;
    *)
      NODES=(1)
      ;;
  esac

  # The *nosync folders have no synchronous variant to run.
  case "$folder" in
    serialnosync | ompnosync)
      SKIPSYNC=true
      ;;
    *)
      SKIPSYNC=false
      ;;
  esac
}

configure_sweep

# Car and service counts swept for every thread/node combination.
CAR_COUNTS=(2 3 4 5 6)
SERVICE_COUNTS=(1 2 3 4 5 6)

# Upper bound, in seconds, on the wait for a job's output file to appear
# once the job is no longer listed by squeue. Overridable from the environment.
OUTPUT_WAIT_SECS=${OUTPUT_WAIT_SECS:-120}

#######################################
# Submit one full benchmark sweep and record the results.
#
# For every thread count, car count, service count and node count, submits
# ag_run.sh through sbatch, waits for the job to leave the queue, then pulls
# the "AG TOOK" and "Total States:" values out of the job's output file and
# appends them to ./slurm_reports/<folder>/<mode>_data.txt and .csv.
#
# Globals:
#   folder, DBNAME, thrds, NODES  (read)
#   CAR_COUNTS, SERVICE_COUNTS    (read)
#   OUTPUT_WAIT_SECS              (read)
#   nodelist                      (read; overwritten for serial, serialnosync
#                                  and ompnosync)
#   QSIZE                         (written; not passed to the job here)
# Arguments:
#   $1 - sweep mode, "sync" or "non-sync"; forwarded to ag_run.sh and used
#        as the report file prefix
# Returns:
#   0 when the sweep ran, 1 if the report directory cannot be created,
#   2 on an unknown mode or when folder is unset
#######################################
run_sweep() {
  local mode=$1
  local prefix
  case "$mode" in
    sync) prefix="" ;;
    non-sync) prefix="Non-" ;;
    *)
      printf 'run_sweep: unknown mode: %s\n' "$mode" >&2
      return 2
      ;;
  esac

  if [[ -z "${folder:-}" ]]; then
    printf 'run_sweep: folder is not set\n' >&2
    return 2
  fi

  local report_dir="./slurm_reports/${folder}"
  local log="${report_dir}/${mode}_data.txt"
  local csv="${report_dir}/${mode}_data.csv"
  mkdir -p -- "$report_dir" || return 1

  local thrd qmult cars svc node
  local job_id out_file err_file dispatched_at hung waited runtime states row

  for thrd in "${thrds[@]}"; do
    # Some folders are pinned to one thread and/or a dedicated node.
    case "$folder" in
      serial)
        thrd=1
        nodelist=compute05
        ;;
      serialnosync)
        thrd=1
        nodelist=compute07
        ;;
      ompnosync)
        nodelist=compute08
        ;;
    esac

    # Queue-size multiplier knob; currently a single pass.
    for qmult in {1..1}; do
      if [[ "$qmult" -eq 3 ]]; then
        QSIZE=$((thrd * 4))
      else
        QSIZE=$((thrd * qmult))
      fi
      if [[ "$folder" == serial ]]; then
        QSIZE=1
      fi

      for cars in "${CAR_COUNTS[@]}"; do
        echo "----------------------------------- ${cars} cars -----------------------------------" >> "$log"

        for svc in "${SERVICE_COUNTS[@]}"; do
          for node in "${NODES[@]}"; do
            hung=0

            # sbatch prints "Submitted batch job <jobid>"; keep only the id.
            job_id=$(sbatch --nodelist="$nodelist" ag_run.sh "$svc" "$mode" "$cars" "$DBNAME" "$thrd" "$node" "tasking" \
              | grep -oP '(?<=job )[^ ]*')

            # Without a job id every wait below would spin forever
            # (an empty grep pattern matches every squeue line).
            if [[ -z "$job_id" ]]; then
              printf 'Job submission failed for %s services %s omp threads %s nodes and %s cars; skipped. \n\n' \
                "$svc" "$thrd" "$node" "$cars" >> "$log"
              continue
            fi

            # Paths the job's stdout / stderr are expected under.
            out_file="./slurm_reports/job.${job_id}.out"
            err_file="./slurm_reports/job.${job_id}.err"

            dispatched_at=$(date +"%H:%M")
            printf '%sSynchronous test with %s services %s omp threads %s nodes and %s cars dispatched at %s with jobid %s\n' \
              "$prefix" "$svc" "$thrd" "$node" "$cars" "$dispatched_at" "$job_id" >> "$log"

            # Wait until the job is no longer listed. -F -w matches the id
            # as a whole word so job 123 is not mistaken for job 1234.
            while squeue | grep -qFw -- "$job_id"; do
              # Slurm sometimes keeps a finished program in the queue. If
              # the output already reports completion, give it 5 seconds
              # for safety and then cancel the job ourselves.
              if [[ -f "$out_file" ]] && grep -q "total run time" "$out_file"; then
                sleep 5
                hung=1
                echo "Job done, but slurm hung." >> "$log"
                scancel "$job_id" > /dev/null
                sleep 3
                break
              fi
              sleep 1
            done

            # The output file can show up slightly after the job leaves the
            # queue; wait for it, but not indefinitely.
            waited=0
            while [[ ! -f "$out_file" ]] && ((waited < OUTPUT_WAIT_SECS)); do
              sleep 1
              waited=$((waited + 1))
            done
            if [[ ! -f "$out_file" ]]; then
              printf 'No output file for jobid %s after %s seconds; skipped. \n\n' \
                "$job_id" "$OUTPUT_WAIT_SECS" >> "$log"
              continue
            fi

            # Pull runtime and state count out of the job output.
            runtime=$(grep -oP '(?<=AG TOOK )[^ ]*' "$out_file")
            states=$(grep -oP '(?<=Total States: )[^ ]*' "$out_file")

            # An empty or missing err file means success; a job we had to
            # cancel after it finished is recorded as a success as well.
            if [[ ! -s "$err_file" ]] || ((hung == 1)); then
              printf '%s Services %sSynchronized: %s states and %s runtime \n\n' \
                "$svc" "$prefix" "$states" "$runtime" >> "$log"
              row="${svc},${states},${runtime}"
              # Keep each CSV record on a single line even if a marker
              # matched more than once in the output.
              printf '%s\n' "${row//$'\n'/ }" >> "$csv"
            else
              printf 'Errors occurred. Please see err file for more details. \n\n' >> "$log"
            fi
          done
        done
      done

      printf '\n\n' >> "$log"
    done
  done

  return 0
}

# The synchronous sweep is skipped for the *nosync folders.
if [[ "$SKIPSYNC" == false ]]; then
  run_sweep sync
fi

# The non-synchronous sweep applies to every folder. It used to be gated on
# the same SKIPSYNC test as the synchronous one, so the *nosync folders
# never submitted a single job.
run_sweep non-sync