Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/develop' into random_rt_fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
DusanJovic-NOAA committed Jul 26, 2024
2 parents 192ec74 + e6ec3d1 commit 079fb65
Show file tree
Hide file tree
Showing 42 changed files with 2,969 additions and 2,405 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,8 @@ tests/fv3_conf/compile_qsub.IN
tests/fv3_conf/fv3_slurm.IN
tests/fv3_conf/fv3_qsub.IN
build*.log*
rocoto_workflow*
fail_compile_*
fail_test_*
tests/run_dir
tests/logs/log_*
2 changes: 1 addition & 1 deletion CMEPS-interface/CMEPS
2 changes: 1 addition & 1 deletion FV3
Submodule FV3 updated 1 files
+1 −1 ccpp/physics
2 changes: 1 addition & 1 deletion NOAHMP-interface/noahmp
2 changes: 1 addition & 1 deletion doc/UsersGuide/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ alabaster==0.7.16
# via sphinx
babel==2.14.0
# via sphinx
certifi==2024.2.2
certifi==2024.7.4
# via requests
charset-normalizer==3.3.2
# via requests
Expand Down
33 changes: 33 additions & 0 deletions modulefiles/ufs_frontera.intel.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
help([[
loads UFS Model prerequisites for Frontera/Intel
]])

prepend_path("MODULEPATH", "/work2/06146/tg854455/frontera/spack-stack/modulefiles")
load("ecflow/5.8.4")

prepend_path("MODULEPATH", "/work2/01118/tg803972/frontera/spack-stack/spack-stack-1.6.0/envs/unified-env/install/modulefiles/Core")

stack_intel_ver=os.getenv("stack_intel_ver") or "19.1.1.217"
load(pathJoin("stack-intel", stack_intel_ver))

stack_impi_ver=os.getenv("stack_impi_ver") or "2020.4.304"
load(pathJoin("stack-intel-mpi", stack_impi_ver))

cmake_ver=os.getenv("cmake_ver") or "3.24.2"
load(pathJoin("cmake", cmake_ver))
--load("cmake/3.24.2")

load("ufs_common")

stack_python_ver=os.getenv("stack_python_ver") or "3.10.13"
load(pathJoin("stack-python", stack_python_ver))

nccmp_ver=os.getenv("nccmp_ver") or "1.9.0.1"
load(pathJoin("nccmp", nccmp_ver))

setenv("CC", "mpiicc")
setenv("CXX", "mpiicpc")
setenv("FC", "mpiifort")
setenv("CMAKE_Platform", "frontera.intel")

whatis("Description: UFS build environment")
2 changes: 1 addition & 1 deletion tests/bl_date.conf
Original file line number Diff line number Diff line change
@@ -1 +1 @@
export BL_DATE=20240624
export BL_DATE=20240724
3 changes: 3 additions & 0 deletions tests/detect_machine.sh
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ case $(hostname -f) in

login[1-4].stampede2.tacc.utexas.edu) MACHINE_ID=stampede ;; ### stampede1-4

login[1-4].frontera.tacc.utexas.edu) MACHINE_ID=frontera ;; ### frontera1-4
c*.frontera.tacc.utexas.edu) MACHINE_ID=frontera ;; ### frontera compute

login0[1-2].expanse.sdsc.edu) MACHINE_ID=expanse ;; ### expanse1-2

discover3[1-5].prv.cube) MACHINE_ID=discover ;; ### discover31-35
Expand Down
27 changes: 27 additions & 0 deletions tests/error-test.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# This file is an alternative to rt.conf that tests whether the regression test system rt.sh can detect failure conditions.
#
# ./rt.sh [options] -l error-test.conf
#
# If the rt.sh detects errors correctly, the workflow shouldn't finish. Some jobs should be failed or not submitted, and some should succeed.
# See details below.

# This should succeed
COMPILE | atm_dyn32 | intel | -DAPP=ATM -DCCPP_SUITES=FV3_GFS_v16,FV3_GFS_v16_flake,FV3_GFS_v17_p8,FV3_GFS_v17_p8_rrtmgp,FV3_GFS_v15_thompson_mynn_lam3km,FV3_WoFS_v0,FV3_GFS_v17_p8_mynn,FV3_GFS_v17_p8_ugwpv1 -D32BIT=ON | | fv3 |

# This should succeed
RUN | control_c48.v2.sfc | | baseline |

# These tests should always fail, and prevent the workflow from completing.
RUN | fail_to_copy | | baseline |
RUN | fail_to_run | | baseline |

# Using 64-bit dynamics ensures results change, but the test runs. The workflow jobs should complete
# for the COMPILE and RUN, but the results should change.
COMPILE | atm_dyn64 | intel | -DAPP=ATM -DCCPP_SUITES=FV3_GFS_v16,FV3_GFS_v16_flake,FV3_GFS_v17_p8,FV3_GFS_v17_p8_rrtmgp,FV3_GFS_v15_thompson_mynn_lam3km,FV3_WoFS_v0,FV3_GFS_v17_p8_mynn,FV3_GFS_v17_p8_ugwpv1 | | fv3 |
RUN | control_c48 | | baseline |

# This compile job should fail, and prevent the workflow from completing.
COMPILE | fail_to_compile | intel | --invalid-argument -DAPP=ATM -DCCPP_SUITES=whatever | | fv3 |

# This test should not be submitted, because its compile job has failed.
RUN | dependency_unmet | | baseline |
7 changes: 7 additions & 0 deletions tests/fv3_conf/fv3_qsub.IN_acorn
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,13 @@ export ESMF_RUNTIME_COMPLIANCECHECK=OFF:depth=4
export ESMF_RUNTIME_PROFILE=ON
export ESMF_RUNTIME_PROFILE_OUTPUT="SUMMARY"

# This "if" block is part of the rt.sh self-tests in error-test.conf. It emulates the model failing to run.
if [ "${JOB_SHOULD_FAIL:-NO}" = WHEN_RUNNING ] ; then
echo "The job should abort now, with exit status 1." 1>&2
echo "If error checking is working, the metascheduler should mark the job as failed." 1>&2
false
fi

mpiexec -n @[TASKS] -ppn @[TPN] -depth @[THRD] ./fv3.exe

echo "Model ended: " `date`
Expand Down
7 changes: 7 additions & 0 deletions tests/fv3_conf/fv3_qsub.IN_derecho
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,13 @@ export MPICH_COLL_OPT_OFF=1
# Avoid job errors because of filesystem synchronization delays
sync && sleep 1

# This "if" block is part of the rt.sh self-tests in error-test.conf. It emulates the model failing to run.
if [ "${JOB_SHOULD_FAIL:-NO}" = WHEN_RUNNING ] ; then
echo "The job should abort now, with exit status 1." 1>&2
echo "If error checking is working, the metascheduler should mark the job as failed." 1>&2
false
fi

mpiexec -n @[UFS_TASKS] -ppn @[PPN] --hostfile $PBS_NODEFILE ./fv3.exe

echo "Model ended: " `date`
Expand Down
7 changes: 7 additions & 0 deletions tests/fv3_conf/fv3_qsub.IN_wcoss2
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,13 @@ export ESMF_RUNTIME_COMPLIANCECHECK=OFF:depth=4
export ESMF_RUNTIME_PROFILE=ON
export ESMF_RUNTIME_PROFILE_OUTPUT="SUMMARY"

# This "if" block is part of the rt.sh self-tests in error-test.conf. It emulates the model failing to run.
if [ "${JOB_SHOULD_FAIL:-NO}" = WHEN_RUNNING ] ; then
echo "The job should abort now, with exit status 1." 1>&2
echo "If error checking is working, the metascheduler should mark the job as failed." 1>&2
false
fi

mpiexec -n @[TASKS] -ppn @[TPN] -depth @[THRD] ./fv3.exe

echo "Model ended: " `date`
Expand Down
8 changes: 8 additions & 0 deletions tests/fv3_conf/fv3_slurm.IN_expanse
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,14 @@ echo "Model started: "`date`
export OMP_STACK_SIZE=512M
export OMP_NUM_THREADS=@[THRD]
export I_MPI_PMI_LIBRARY=/cm/shared/apps/slurm/current/lib64/libpmi.so

# This "if" block is part of the rt.sh self-tests in error-test.conf. It emulates the model failing to run.
if [ "${JOB_SHOULD_FAIL:-NO}" = WHEN_RUNNING ] ; then
echo "The job should abort now, with exit status 1." 1>&2
echo "If error checking is working, the metascheduler should mark the job as failed." 1>&2
false
fi

srun -n @[TASKS] ./fv3.exe

echo "Model ended: " `date`
Expand Down
7 changes: 7 additions & 0 deletions tests/fv3_conf/fv3_slurm.IN_gaea
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,13 @@ export ESMF_RUNTIME_PROFILE_OUTPUT="SUMMARY"
# Avoid job errors because of filesystem synchronization delays
sync && sleep 1

# This "if" block is part of the rt.sh self-tests in error-test.conf. It emulates the model failing to run.
if [ "${JOB_SHOULD_FAIL:-NO}" = WHEN_RUNNING ] ; then
echo "The job should abort now, with exit status 1." 1>&2
echo "If error checking is working, the metascheduler should mark the job as failed." 1>&2
false
fi

srun --label -n @[TASKS] ./fv3.exe

echo "Model ended: " `date`
Expand Down
7 changes: 7 additions & 0 deletions tests/fv3_conf/fv3_slurm.IN_hera
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,13 @@ export PSM_SHAREDCONTEXTS=1
# Avoid job errors because of filesystem synchronization delays
sync && sleep 1

# This "if" block is part of the rt.sh self-tests in error-test.conf. It emulates the model failing to run.
if [ "${JOB_SHOULD_FAIL:-NO}" = WHEN_RUNNING ] ; then
echo "The job should abort now, with exit status 1." 1>&2
echo "If error checking is working, the metascheduler should mark the job as failed." 1>&2
false
fi

# shellcheck disable=SC2102
srun --label -n @[TASKS] ./fv3.exe

Expand Down
7 changes: 7 additions & 0 deletions tests/fv3_conf/fv3_slurm.IN_hercules
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,13 @@ fi
# Avoid job errors because of filesystem synchronization delays
sync && sleep 1

# This "if" block is part of the rt.sh self-tests in error-test.conf. It emulates the model failing to run.
if [ "${JOB_SHOULD_FAIL:-NO}" = WHEN_RUNNING ] ; then
echo "The job should abort now, with exit status 1." 1>&2
echo "If error checking is working, the metascheduler should mark the job as failed." 1>&2
false
fi

srun --label -n @[TASKS] ./fv3.exe

echo "Model ended: " `date`
Expand Down
7 changes: 7 additions & 0 deletions tests/fv3_conf/fv3_slurm.IN_jet
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,13 @@ export ESMF_RUNTIME_PROFILE_OUTPUT="SUMMARY"
# Avoid job errors because of filesystem synchronization delays
sync && sleep 1

# This "if" block is part of the rt.sh self-tests in error-test.conf. It emulates the model failing to run.
if [ "${JOB_SHOULD_FAIL:-NO}" = WHEN_RUNNING ] ; then
echo "The job should abort now, with exit status 1." 1>&2
echo "If error checking is working, the metascheduler should mark the job as failed." 1>&2
false
fi

srun --label -n @[TASKS] --cpus-per-task=@[THRD] ./fv3.exe

echo "Model ended: " `date`
Expand Down
7 changes: 7 additions & 0 deletions tests/fv3_conf/fv3_slurm.IN_noaacloud
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,13 @@ export OMP_NUM_THREADS=1
# Avoid job errors because of filesystem synchronization delays
sync && sleep 1

# This "if" block is part of the rt.sh self-tests in error-test.conf. It emulates the model failing to run.
if [ "${JOB_SHOULD_FAIL:-NO}" = WHEN_RUNNING ] ; then
echo "The job should abort now, with exit status 1." 1>&2
echo "If error checking is working, the metascheduler should mark the job as failed." 1>&2
false
fi

srun --mpi=pmi2 --label -n @[TASKS] ./fv3.exe

echo "Model ended: " `date`
Expand Down
7 changes: 7 additions & 0 deletions tests/fv3_conf/fv3_slurm.IN_orion
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,13 @@ export ESMF_RUNTIME_PROFILE_OUTPUT="SUMMARY"
# Avoid job errors because of filesystem synchronization delays
sync && sleep 1

# This "if" block is part of the rt.sh self-tests in error-test.conf. It emulates the model failing to run.
if [ "${JOB_SHOULD_FAIL:-NO}" = WHEN_RUNNING ] ; then
echo "The job should abort now, with exit status 1." 1>&2
echo "If error checking is working, the metascheduler should mark the job as failed." 1>&2
false
fi

srun --label -n @[TASKS] ./fv3.exe

echo "Model ended: " `date`
Expand Down
7 changes: 7 additions & 0 deletions tests/fv3_conf/fv3_slurm.IN_s4
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,13 @@ export PSM_SHAREDCONTEXTS=1
# Avoid job errors because of filesystem synchronization delays
sync && sleep 1

# This "if" block is part of the rt.sh self-tests in error-test.conf. It emulates the model failing to run.
if [ "${JOB_SHOULD_FAIL:-NO}" = WHEN_RUNNING ] ; then
echo "The job should abort now, with exit status 1." 1>&2
echo "If error checking is working, the metascheduler should mark the job as failed." 1>&2
false
fi

srun --label -n @[TASKS] ./fv3.exe

echo "Model ended: " `date`
Expand Down
7 changes: 7 additions & 0 deletions tests/fv3_conf/fv3_slurm.IN_stampede
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,13 @@ export LD_BIND_NOW=1
# Avoid job errors because of filesystem synchronization delays
#sync && sleep 1

# This "if" block is part of the rt.sh self-tests in error-test.conf. It emulates the model failing to run.
if [ "${JOB_SHOULD_FAIL:-NO}" = WHEN_RUNNING ] ; then
echo "The job should abort now, with exit status 1." 1>&2
echo "If error checking is working, the metascheduler should mark the job as failed." 1>&2
false
fi

#mpirun -prepend-rank -np $SBATCH_NP ./fv3.exe
ibrun -n @[TASKS] ./fv3.exe

Expand Down
Loading

0 comments on commit 079fb65

Please sign in to comment.