Skip to content

Commit

Permalink
Test mpmd in rt
Browse files Browse the repository at this point in the history
  • Loading branch information
DusanJovic-NOAA committed Nov 20, 2024
1 parent c0367fd commit f970da0
Show file tree
Hide file tree
Showing 7 changed files with 213 additions and 20 deletions.
3 changes: 3 additions & 0 deletions tests/default_vars.sh
Original file line number Diff line number Diff line change
Expand Up @@ -422,6 +422,9 @@ export esmf_logkind="ESMF_LOGKIND_MULTI"
export DumpFields="false"
export MED_history_n=1000000

export ESMF_THREADING=true
export MPMD=false

export_fv3_v16 ()
{
# Add support for v16 test cases. This section
Expand Down
8 changes: 4 additions & 4 deletions tests/fv3_conf/fv3_qsub.IN_wcoss2
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@
#PBS -N @[JBNME]
#PBS -A @[ACCNR]
#PBS -q @[QUEUE]
#PBS -l place=vscatter,select=@[NODES]:ncpus=@[TPN]:mpiprocs=@[TPN]:mem=500G
#PBS -l place=excl
#PBS -l place=vscatter:excl,select=@[NODES]:ncpus=128:mem=500G
#PBS -l walltime=00:@[WLCLK]:00

set -eux
echo -n " $( date +%s )," > job_timestamp.txt

cd $PBS_O_WORKDIR

echo -n " $( date +%s )," > job_timestamp.txt

set +x
module use $PWD/modulefiles
module load modules.fv3
Expand All @@ -37,7 +37,7 @@ if [ "${JOB_SHOULD_FAIL:-NO}" = WHEN_RUNNING ] ; then
false
fi

mpiexec -n @[TASKS] -ppn @[TPN] -depth @[THRD] ./fv3.exe
mpiexec @[MPIEXEC_CMD_ARGS] # -n @[TASKS] -ppn @[TPN] --cpu-bind core --depth @[THRD] ./fv3.exe

echo "Model ended: " `date`
echo -n " $( date +%s )," >> job_timestamp.txt
2 changes: 1 addition & 1 deletion tests/parm/ufs.configure.s2swa_fast_esmf.IN
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

# ESMF #
logKindFlag: ESMF_LOGKIND_MULTI
globalResourceControl: true
globalResourceControl: @[ESMF_THREADING]

# EARTH #
EARTH_component_list: MED ATM CHM OCN ICE WAV
Expand Down
6 changes: 5 additions & 1 deletion tests/rt.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1266,7 +1266,11 @@ EOF
(
source "${PATHRT}/tests/${TEST_NAME}"

compute_petbounds_and_tasks
if [[ ${ESMF_THREADING} == true ]]; then
compute_petbounds_and_tasks_esmf_threading
else
compute_petbounds_and_tasks_traditional_threading
fi

TPN=$(( TPN / THRD ))
NODES=$(( TASKS / TPN ))
Expand Down
128 changes: 125 additions & 3 deletions tests/rt_utils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,129 @@ redirect_out_err() {
# redirect_out_err command will return non-zero if "$@" or tee return non-zero.
}

function compute_petbounds_and_tasks() {
function compute_petbounds_and_tasks_traditional_threading() {
  # Lay out the petlist bounds for every active component when threading is
  # NOT managed by ESMF: each component occupies exactly ${COMPONENT}_tasks
  # consecutive entries (thread counts do not inflate the range), and the
  # number of nodes an MPMD launch would need is accumulated separately.
  #
  # Globals read:
  #   DATM_CDEPS, INPES, JNPES, NTILES, QUILTING, WRITE_GROUP,
  #   WRTTASK_PER_GROUP, TPN, lnd_model, RTVERBOSE,
  #   {ATM,OCN,ICE,WAV,LND,FBH}_tasks, {atm,ocn,ice,wav,lnd,fbh}_omp_num_threads
  # Globals written:
  #   ATM_compute_tasks, ATM_io_tasks, ATM_tasks (only when DATM_CDEPS is 'false'),
  #   {atm,ocn,ice,wav,chm,med,aqm,lnd,fbh}_petlist_bounds,
  #   mpmd_nodes, UFS_tasks, TASKS (exported)

  # each test MUST define ${COMPONENT}_tasks variable for all components it is using
  # and MUST NOT define those that it's not using or set the value to 0.

  # ATM is a special case since it is running on the sum of compute and io tasks.
  # CHM component and mediator are running on ATM compute tasks only.

  if [[ ${DATM_CDEPS} = 'false' ]]; then
    if [[ ${ATM_compute_tasks:-0} -eq 0 ]]; then
      ATM_compute_tasks=$((INPES * JNPES * NTILES))
    fi
    if [[ ${QUILTING} = '.true.' ]]; then
      ATM_io_tasks=$((WRITE_GROUP * WRTTASK_PER_GROUP))
    else
      ATM_io_tasks=0
    fi
    ATM_tasks=$((ATM_compute_tasks + ATM_io_tasks))
  fi

  # n is the first free index of the next component's range.
  local n=0
  unset atm_petlist_bounds ocn_petlist_bounds ice_petlist_bounds wav_petlist_bounds chm_petlist_bounds med_petlist_bounds aqm_petlist_bounds fbh_petlist_bounds

  # Node count for an MPMD launch. Every component with its own range is
  # rounded up to whole nodes on its own:
  #   nodes = ceil(tasks * threads / TPN) = (tasks * threads + TPN - 1) / TPN
  mpmd_nodes=0

  # ATM
  if [[ ${ATM_tasks:-0} -gt 0 ]]; then
    atm_petlist_bounds="${n} $((n + ATM_tasks - 1))"
    n=$((n + ATM_tasks))
    mpmd_nodes=$(( mpmd_nodes + (ATM_tasks * atm_omp_num_threads + TPN - 1) / TPN ))
  fi

  # OCN
  if [[ ${OCN_tasks:-0} -gt 0 ]]; then
    ocn_petlist_bounds="${n} $((n + OCN_tasks - 1))"
    n=$((n + OCN_tasks))
    mpmd_nodes=$(( mpmd_nodes + (OCN_tasks * ocn_omp_num_threads + TPN - 1) / TPN ))
  fi

  # ICE
  if [[ ${ICE_tasks:-0} -gt 0 ]]; then
    ice_petlist_bounds="${n} $((n + ICE_tasks - 1))"
    n=$((n + ICE_tasks))
    mpmd_nodes=$(( mpmd_nodes + (ICE_tasks * ice_omp_num_threads + TPN - 1) / TPN ))
  fi

  # WAV
  if [[ ${WAV_tasks:-0} -gt 0 ]]; then
    wav_petlist_bounds="${n} $((n + WAV_tasks - 1))"
    n=$((n + WAV_tasks))
    mpmd_nodes=$(( mpmd_nodes + (WAV_tasks * wav_omp_num_threads + TPN - 1) / TPN ))
  fi

  # CHM, MED and AQM share the range of the ATM compute tasks, so they add
  # neither new entries nor new nodes.
  chm_petlist_bounds="0 $((ATM_compute_tasks - 1))"
  med_petlist_bounds="0 $((ATM_compute_tasks - 1))"
  aqm_petlist_bounds="0 $((ATM_compute_tasks - 1))"

  # LND
  if [[ ${lnd_model:-} = "lm4" ]]; then
    # set lnd_petlist_bounds to be same as ATM_compute_tasks
    lnd_petlist_bounds="0 $((ATM_compute_tasks - 1))"
  elif [[ ${LND_tasks:-0} -gt 0 ]]; then # noahmp component or other
    lnd_petlist_bounds="${n} $((n + LND_tasks - 1))"
    n=$((n + LND_tasks))
    # A land component with its own range needs its own nodes as well;
    # the thread count falls back to 1 when the variable is not defined.
    mpmd_nodes=$(( mpmd_nodes + (LND_tasks * ${lnd_omp_num_threads:-1} + TPN - 1) / TPN ))
  fi

  # FBH
  if [[ ${FBH_tasks:-0} -gt 0 ]]; then
    fbh_petlist_bounds="${n} $((n + FBH_tasks - 1))"
    n=$((n + FBH_tasks))
    # Same accounting as LND: own range, own nodes.
    mpmd_nodes=$(( mpmd_nodes + (FBH_tasks * ${fbh_omp_num_threads:-1} + TPN - 1) / TPN ))
  fi

  UFS_tasks=${n}

  # Defaulting RTVERBOSE keeps the function usable under `set -u`.
  if [[ ${RTVERBOSE:-false} == true ]]; then
    echo "ATM_petlist_bounds: ${atm_petlist_bounds:-}"
    echo "OCN_petlist_bounds: ${ocn_petlist_bounds:-}"
    echo "ICE_petlist_bounds: ${ice_petlist_bounds:-}"
    echo "WAV_petlist_bounds: ${wav_petlist_bounds:-}"
    echo "CHM_petlist_bounds: ${chm_petlist_bounds:-}"
    echo "MED_petlist_bounds: ${med_petlist_bounds:-}"
    echo "AQM_petlist_bounds: ${aqm_petlist_bounds:-}"
    echo "LND_petlist_bounds: ${lnd_petlist_bounds:-}"
    echo "FBH_petlist_bounds: ${fbh_petlist_bounds:-}"
    echo "UFS_tasks         : ${UFS_tasks:-}"
    echo "mpmd_nodes        : ${mpmd_nodes:-}"
  fi

  # TASKS is now set to UFS_TASKS
  export TASKS=${UFS_tasks}
}

function compute_petbounds_and_tasks_esmf_threading() {

# each test MUST define ${COMPONENT}_tasks variable for all components it is using
# and MUST NOT define those that it's not using or set the value to 0.
Expand Down Expand Up @@ -82,8 +204,8 @@ function compute_petbounds_and_tasks() {
if [[ ${lnd_model:-} = "lm4" ]]; then
# set lnd_petlist_bounds to be same as ATM_compute_tasks
lnd_petlist_bounds="0 $((ATM_compute_tasks - 1))"
elif [[ ${LND_tasks:-0} -gt 0 ]]; then # noahmp component or other
LND_tasks=$((LND_tasks * lnd_omp_num_threads))
elif [[ ${LND_tasks:-0} -gt 0 ]]; then # noahmp component or other
LND_tasks=$((LND_tasks * lnd_omp_num_threads))
lnd_petlist_bounds="${n} $((n + LND_tasks - 1))"
n=$((n + LND_tasks))
fi
Expand Down
82 changes: 71 additions & 11 deletions tests/run_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -163,10 +163,28 @@ else
exit 1
fi

compute_petbounds_and_tasks
if [[ ${ESMF_THREADING} == true ]]; then
compute_petbounds_and_tasks_esmf_threading
else
compute_petbounds_and_tasks_traditional_threading
fi

if [[ -f ${PATHRT}/parm/${UFS_CONFIGURE} ]]; then
atparse < "${PATHRT}/parm/${UFS_CONFIGURE}" > ufs.configure
(
# atm_omp_num_threads=-1
# ocn_omp_num_threads=-1
# ice_omp_num_threads=-1
# wav_omp_num_threads=-1
# chm_omp_num_threads=-1
# med_omp_num_threads=-1
# aqm_omp_num_threads=-1
# lnd_omp_num_threads=-1
# fbh_omp_num_threads=-1
atparse < "${PATHRT}/parm/${UFS_CONFIGURE}" > ufs.configure
if [[ ${ESMF_THREADING} != true ]]; then
sed -i -e "/_omp_num_threads:/d" ufs.configure
fi
)
else
echo "Cannot find file ${UFS_CONFIGURE} set by variable UFS_CONFIGURE"
exit 1
Expand Down Expand Up @@ -240,7 +258,7 @@ fi
if [[ "Q${FIELD_TABLE:-}" != Q ]]; then
cp "${PATHRT}/parm/field_table/${FIELD_TABLE}" field_table
fi

# fix files
if [[ ${FV3} == true ]]; then
cp "${INPUTDATA_ROOT}"/FV3_fix/*.txt .
Expand Down Expand Up @@ -367,18 +385,60 @@ if (( NODES * TPN < TASKS )); then
fi
export NODES

UFS_TASKS=${TASKS}
TASKS=$(( NODES * TPN ))
export TASKS
if [[ ${ESMF_THREADING} == true ]]; then
UFS_TASKS=${TASKS}
TASKS=$(( NODES * TPN ))
export TASKS

PPN=$(( UFS_TASKS / NODES ))
if (( UFS_TASKS - ( PPN * NODES ) > 0 )); then
PPN=$((PPN + 1))
PPN=$(( UFS_TASKS / NODES ))
if (( UFS_TASKS - ( PPN * NODES ) > 0 )); then
PPN=$((PPN + 1))
fi
export PPN
export UFS_TASKS
else
PPN=${TPN}
fi
export PPN
export UFS_TASKS

if [[ ${SCHEDULER} = 'pbs' ]]; then

# Assemble the argument string that the job-card template substitutes for
# @[MPIEXEC_CMD_ARGS].
mpiexec_cmd=""
if [[ ${ESMF_THREADING} != false || ${MPMD} != true ]]; then
  # Single launch of fv3.exe across all tasks with one common thread depth.
  mpiexec_cmd=" -n ${TASKS} -ppn ${TPN} --cpu-bind core --depth ${THRD} ./fv3.exe"
else
  # MPMD launch: one ':'-terminated segment per component that has tasks,
  # each with its own rank count, ranks-per-node, depth and OMP_NUM_THREADS.
  # NOTE(review): 128 is presumably the core count of one node - confirm.
  if (( ${ATM_tasks:-0} > 0 )); then
    mpiexec_cmd+=" -n ${ATM_tasks} -ppn $((128/atm_omp_num_threads))"
    mpiexec_cmd+=" --cpu-bind verbose,depth --depth ${atm_omp_num_threads}"
    mpiexec_cmd+=" --env OMP_NUM_THREADS=${atm_omp_num_threads} ./fv3.exe :"
  fi

  if (( ${OCN_tasks:-0} > 0 )); then
    mpiexec_cmd+=" -n ${OCN_tasks} -ppn $((128/ocn_omp_num_threads))"
    mpiexec_cmd+=" --cpu-bind verbose,depth --depth ${ocn_omp_num_threads}"
    mpiexec_cmd+=" --env OMP_NUM_THREADS=${ocn_omp_num_threads} ./fv3.exe :"
  fi

  if (( ${ICE_tasks:-0} > 0 )); then
    mpiexec_cmd+=" -n ${ICE_tasks} -ppn $((128/ice_omp_num_threads))"
    mpiexec_cmd+=" --cpu-bind verbose,depth --depth ${ice_omp_num_threads}"
    mpiexec_cmd+=" --env OMP_NUM_THREADS=${ice_omp_num_threads} ./fv3.exe :"
  fi

  if (( ${WAV_tasks:-0} > 0 )); then
    mpiexec_cmd+=" -n ${WAV_tasks} -ppn $((128/wav_omp_num_threads))"
    mpiexec_cmd+=" --cpu-bind verbose,depth --depth ${wav_omp_num_threads}"
    mpiexec_cmd+=" --env OMP_NUM_THREADS=${wav_omp_num_threads} ./fv3.exe :"
  fi

  if (( ${LND_tasks:-0} > 0 )); then
    mpiexec_cmd+=" -n ${LND_tasks} -ppn $((128/lnd_omp_num_threads))"
    mpiexec_cmd+=" --cpu-bind verbose,depth --depth ${lnd_omp_num_threads}"
    mpiexec_cmd+=" --env OMP_NUM_THREADS=${lnd_omp_num_threads} ./fv3.exe :"
  fi

  if (( ${FBH_tasks:-0} > 0 )); then
    mpiexec_cmd+=" -n ${FBH_tasks} -ppn $((128/fbh_omp_num_threads))"
    mpiexec_cmd+=" --cpu-bind verbose,depth --depth ${fbh_omp_num_threads}"
    mpiexec_cmd+=" --env OMP_NUM_THREADS=${fbh_omp_num_threads} ./fv3.exe :"
  fi

  # Drop the ':' left behind by the last segment (no-op when none was added).
  mpiexec_cmd="${mpiexec_cmd%:}"

  # The MPMD layout needs the per-component node total, not the SPMD estimate.
  NODES=${mpmd_nodes}
fi

echo "mpiexec_cmd = ${mpiexec_cmd}"
MPIEXEC_CMD_ARGS=${mpiexec_cmd}

if [[ -e ${PATHRT}/fv3_conf/fv3_qsub.IN_${MACHINE_ID} ]]; then
atparse < "${PATHRT}/fv3_conf/fv3_qsub.IN_${MACHINE_ID}" > job_card
else
Expand Down
4 changes: 4 additions & 0 deletions tests/tests/cpld_2threads_p8
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,7 @@ export N_SPLIT=5
# HR4 GFSv17 GWD update
export DO_GWD_OPT_PSL=.true.
export DO_GSL_DRAG_SS=.false.

ESMF_THREADING=false
MPMD=true
THRD=$THRD_cpl_thrd

0 comments on commit f970da0

Please sign in to comment.