Skip to content

Commit

Permalink
Merge branch 'ndk/machinefiles/pm-cpu-update-gcc-nvidia-amd-compilers…
Browse files Browse the repository at this point in the history
…' into next (PR #6702)

On pm-cpu we were already using an updated Intel compiler along with compatible module versions, but had not yet updated the other compilers due to #6516. After #6687, I think those issues are resolved and we can now update.

The main change here is updating the gcc compiler, but other module versions are also updated at the same time.
This PR also tries to clean up the machine config settings across all NERSC machines so they are more consistent.

module                      current        machine defaults (in this PR)
gcc                         12.2.0         12.3 (gcc-native)
PrgEnv-gnu                  8.3.3          8.5.0
cray-libsci                 23.02.1.1      23.12.5
cray-mpich                  8.1.25         8.1.28
craype                      2.7.20         2.7.30
cray-hdf5-parallel          1.12.2.3       1.12.2.9
cray-netcdf-hdf5parallel    4.9.0.3        4.9.0.9
cray-parallel-netcdf        1.12.3.3       1.12.3.9
On muller-cpu, we had already tried updating the compiler versions and were testing a work-around with special compiler flags for GNU. With this PR, that work-around can be removed.

This PR removes FI_CXI_RX_MATCH_MODE=software for machines other than the primary pm-cpu. The setting on pm-cpu will be removed in another PR, as the default FI_CXI_RX_MATCH_MODE=hybrid now seems fine.

We might see some cases that are not BFB (bit-for-bit) with the GNU compiler. For the e3sm-developer tests (what we test nightly), the only test that did not pass the baseline comparison was ERP_Ld3.ne4pg2_oQU480.F2010.pm-cpu_gnu
  • Loading branch information
ndkeen committed Oct 21, 2024
2 parents 4b0a4aa + bdcc2f5 commit d1e8163
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 57 deletions.
20 changes: 0 additions & 20 deletions cime_config/machines/Depends.muller-cpu.gnu.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -7,23 +7,3 @@ if (NOT DEBUG)
e3sm_deoptimize_file("${ITEM}")
endforeach()
endif()

# On pm-cpu (and muller-cpu), with gcc-native/12.3, we see hang with DEBUG runs of certain tests.
# https://github.com/E3SM-Project/E3SM/issues/6516
# Currently, we have pm-cpu using gcc/12.2.0 which does not have this issue, but using muller-cpu to test 12.3
# Turning off -O0 for these 2 files (by adding -O) at least avoids hang and will produce FPE in HOMME code
if (CMAKE_Fortran_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3)
if (DEBUG)

set(ADJUST
eam/src/dynamics/se/inidat.F90
eam/src/dynamics/se/dyn_comp.F90
)

foreach(ITEM IN LISTS ADJUST)
e3sm_add_flags("${ITEM}" "-O")
#e3sm_add_flags("${ITEM}" "-DNDEBUG -O")
endforeach()

endif()
endif()
112 changes: 75 additions & 37 deletions cime_config/machines/config_machines.xml
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,7 @@
<cmd_path lang="csh">module</cmd_path>

<modules>
<command name="unload">cpe</command>
<command name="unload">cray-hdf5-parallel</command>
<command name="unload">cray-netcdf-hdf5parallel</command>
<command name="unload">cray-parallel-netcdf</command>
Expand Down Expand Up @@ -218,8 +219,8 @@

<modules compiler="gnu">
<command name="load">PrgEnv-gnu/8.5.0</command>
<command name="load">gcc/12.2.0</command>
<command name="load">cray-libsci/23.02.1.1</command>
<command name="load">gcc-native/12.3</command>
<command name="load">cray-libsci/23.12.5</command>
</modules>

<modules compiler="intel">
Expand All @@ -229,35 +230,23 @@

<modules compiler="nvidia">
<command name="load">PrgEnv-nvidia</command>
<command name="load">nvidia/22.7</command>
<command name="load">cray-libsci/23.02.1.1</command>
<command name="load">nvidia/24.5</command>
<command name="load">cray-libsci/23.12.5</command>
</modules>

<modules compiler="amdclang">
<command name="load">PrgEnv-aocc</command>
<command name="load">aocc/4.0.0</command>
<command name="load">cray-libsci/23.02.1.1</command>
<command name="load">aocc/4.1.0</command>
<command name="load">cray-libsci/23.12.5</command>
</modules>

<modules compiler="intel">
<modules>
<command name="load">craype-accel-host</command>
<command name="load">craype/2.7.30</command>
<command name="load">cray-mpich/8.1.28</command>
<command name="load">cray-hdf5-parallel/1.12.2.9</command>
<command name="load">cray-netcdf-hdf5parallel/4.9.0.9</command>
<command name="load">cray-parallel-netcdf/1.12.3.9</command>
</modules>

<modules compiler="!intel">
<command name="load">craype-accel-host</command>
<command name="load">craype/2.7.20</command>
<command name="load">cray-mpich/8.1.25</command>
<command name="load">cray-hdf5-parallel/1.12.2.3</command>
<command name="load">cray-netcdf-hdf5parallel/4.9.0.3</command>
<command name="load">cray-parallel-netcdf/1.12.3.3</command>
</modules>

<modules>
<command name="load">cmake/3.24.3</command>
<command name="load">evp-patch</command>
</modules>
Expand Down Expand Up @@ -368,6 +357,7 @@
<cmd_path lang="csh">module</cmd_path>

<modules>
<command name="unload">cpe</command>
<command name="unload">cray-hdf5-parallel</command>
<command name="unload">cray-netcdf-hdf5parallel</command>
<command name="unload">cray-parallel-netcdf</command>
Expand All @@ -378,13 +368,14 @@
<command name="unload">PrgEnv-nvidia</command>
<command name="unload">PrgEnv-cray</command>
<command name="unload">PrgEnv-aocc</command>
<command name="unload">gcc-native</command>
<command name="unload">intel</command>
<command name="unload">intel-oneapi</command>
<command name="unload">nvidia</command>
<command name="unload">aocc</command>
<command name="unload">cudatoolkit</command>
<command name="unload">cray-libsci</command>
<command name="unload">climate-utils</command>
<command name="unload">cray-libsci</command>
<command name="unload">matlab</command>
<command name="unload">craype-accel-nvidia80</command>
<command name="unload">craype-accel-host</command>
Expand Down Expand Up @@ -436,6 +427,7 @@
<RUNDIR>$CIME_OUTPUT_ROOT/$CASE/run</RUNDIR>
<EXEROOT>$CIME_OUTPUT_ROOT/$CASE/bld</EXEROOT>
<TEST_TPUT_TOLERANCE>0.1</TEST_TPUT_TOLERANCE>
<TEST_MEMLEAK_TOLERANCE>0.20</TEST_MEMLEAK_TOLERANCE>

<environment_variables>
<env name="MPICH_ENV_DISPLAY">1</env>
Expand Down Expand Up @@ -595,17 +587,18 @@
<env name="FI_CXI_RX_MATCH_MODE">software</env>
<env name="FI_MR_CACHE_MONITOR">kdreg2</env>
<env name="MPICH_COLL_SYNC">MPI_Bcast</env>
<env name="Albany_ROOT">$SHELL{if [ -z "$Albany_ROOT" ]; then echo /global/common/software/e3sm/mali_tpls/albany-e3sm-serial-release-gcc; else echo "$Albany_ROOT"; fi}</env>
<env name="Trilinos_ROOT">$SHELL{if [ -z "$Trilinos_ROOT" ]; then echo /global/common/software/e3sm/mali_tpls/trilinos-e3sm-serial-release-gcc; else echo "$Trilinos_ROOT"; fi}</env>
<env name="NETCDF_PATH">$ENV{CRAY_NETCDF_HDF5PARALLEL_PREFIX}</env>
<env name="PNETCDF_PATH">$ENV{CRAY_PARALLEL_NETCDF_PREFIX}</env>
<env name="GATOR_INITIAL_MB">4000MB</env>
</environment_variables>
<environment_variables compiler="intel" mpilib="mpich">
<env name="ADIOS2_ROOT">$SHELL{if [ -z "$ADIOS2_ROOT" ]; then echo /global/cfs/cdirs/e3sm/3rdparty/adios2/2.9.1/cray-mpich-8.1.25/intel-2023.1.0; else echo "$ADIOS2_ROOT"; fi}</env>
</environment_variables>
<environment_variables compiler="gnu" mpilib="mpich">
<env name="ADIOS2_ROOT">$SHELL{if [ -z "$ADIOS2_ROOT" ]; then echo /global/cfs/cdirs/e3sm/3rdparty/adios2/2.9.1/cray-mpich-8.1.25/gcc-11.2.0; else echo "$ADIOS2_ROOT"; fi}</env>
<env name="BLA_VENDOR">Generic</env>
<env name="Albany_ROOT">$SHELL{if [ -z "$Albany_ROOT" ]; then echo /global/common/software/e3sm/albany/2024.03.26/gcc/11.2.0; else echo "$Albany_ROOT"; fi}</env>
<env name="Trilinos_ROOT">$SHELL{if [ -z "$Trilinos_ROOT" ]; then echo /global/common/software/e3sm/trilinos/15.1.1/gcc/11.2.0; else echo "$Trilinos_ROOT"; fi}</env>
</environment_variables>
<environment_variables compiler="nvidia" mpilib="mpich">
<env name="ADIOS2_ROOT">$SHELL{if [ -z "$ADIOS2_ROOT" ]; then echo /global/cfs/cdirs/e3sm/3rdparty/adios2/2.9.1/cray-mpich-8.1.25/nvidia-22.7; else echo "$ADIOS2_ROOT"; fi}</env>
Expand All @@ -621,6 +614,13 @@
<environment_variables compiler="amdclang" mpilib="mpich">
<env name="ADIOS2_ROOT">$SHELL{if [ -z "$ADIOS2_ROOT" ]; then echo /global/cfs/cdirs/e3sm/3rdparty/adios2/2.9.1/cray-mpich-8.1.25/aocc-4.0.0; else echo "$ADIOS2_ROOT"; fi}</env>
</environment_variables>
<environment_variables compiler="intel">
<env name="MOAB_ROOT">$SHELL{if [ -z "$MOAB_ROOT" ]; then echo /global/cfs/cdirs/e3sm/software/moab/intel; else echo "$MOAB_ROOT"; fi}</env>
</environment_variables>
<environment_variables compiler="gnu">
<env name="MOAB_ROOT">$SHELL{if [ -z "$MOAB_ROOT" ]; then echo /global/cfs/cdirs/e3sm/software/moab/gnu; else echo "$MOAB_ROOT"; fi}</env>
</environment_variables>

<resource_limits>
<resource name="RLIMIT_STACK">-1</resource>
</resource_limits>
Expand Down Expand Up @@ -687,13 +687,14 @@
<command name="unload">PrgEnv-nvidia</command>
<command name="unload">PrgEnv-cray</command>
<command name="unload">PrgEnv-aocc</command>
<command name="unload">gcc-native</command>
<command name="unload">intel</command>
<command name="unload">intel-oneapi</command>
<command name="unload">nvidia</command>
<command name="unload">aocc</command>
<command name="unload">cudatoolkit</command>
<command name="unload">cray-libsci</command>
<command name="unload">climate-utils</command>
<command name="unload">cray-libsci</command>
<command name="unload">matlab</command>
<command name="unload">craype-accel-nvidia80</command>
<command name="unload">craype-accel-host</command>
Expand Down Expand Up @@ -745,6 +746,7 @@
<RUNDIR>$CIME_OUTPUT_ROOT/$CASE/run</RUNDIR>
<EXEROOT>$CIME_OUTPUT_ROOT/$CASE/bld</EXEROOT>
<TEST_TPUT_TOLERANCE>0.1</TEST_TPUT_TOLERANCE>
<TEST_MEMLEAK_TOLERANCE>0.20</TEST_MEMLEAK_TOLERANCE>

<environment_variables>
<env name="MPICH_ENV_DISPLAY">1</env>
Expand All @@ -766,6 +768,9 @@
<environment_variables compiler="nvidiagpu">
<env name="MPICH_GPU_SUPPORT_ENABLED">1</env>
</environment_variables>
<environment_variables compiler="gnugpu">
<env name="MOAB_ROOT">$SHELL{if [ -z "$MOAB_ROOT" ]; then echo /global/cfs/cdirs/e3sm/software/moab/gnugpu ; else echo "$MOAB_ROOT"; fi}</env>
</environment_variables>
<environment_variables compiler="gnu.*" mpilib="mpich">
<env name="ADIOS2_ROOT">$SHELL{if [ -z "$ADIOS2_ROOT" ]; then echo /global/cfs/cdirs/e3sm/3rdparty/adios2/2.9.1/cray-mpich-8.1.25/gcc-11.2.0; else echo "$ADIOS2_ROOT"; fi}</env>
</environment_variables>
Expand Down Expand Up @@ -823,6 +828,7 @@
<cmd_path lang="csh">module</cmd_path>

<modules>
<command name="unload">cpe</command>
<command name="unload">cray-hdf5-parallel</command>
<command name="unload">cray-netcdf-hdf5parallel</command>
<command name="unload">cray-parallel-netcdf</command>
Expand Down Expand Up @@ -850,43 +856,43 @@
</modules>

<modules compiler="gnu">
<command name="load">PrgEnv-gnu</command>
<command name="load">gcc-native</command>
<command name="load">cray-libsci</command>
<command name="load">PrgEnv-gnu/8.5.0</command>
<command name="load">gcc-native/13.2</command>
<command name="load">cray-libsci/24.03.0</command>
</modules>

<modules compiler="intel">
<command name="load">PrgEnv-intel</command>
<command name="load">intel</command>
<command name="load">PrgEnv-intel/8.5.0</command>
<command name="load">intel/2024.1.0</command>
</modules>

<modules compiler="nvidia">
<command name="load">PrgEnv-nvidia</command>
<command name="load">nvidia/24.5</command>
<command name="load">cray-libsci</command>
<command name="load">cray-libsci/24.03.0</command>
</modules>

<modules compiler="amdclang">
<command name="load">PrgEnv-aocc</command>
<command name="load">aocc/4.0.1</command>
<command name="load">cray-libsci</command>
<command name="load">aocc/4.1.0</command>
<command name="load">cray-libsci/24.03.0</command>
</modules>

<modules>
<command name="load">craype-accel-host</command>
<command name="load">cray-libsci</command>
<command name="load">craype/2.7.30</command>
<command name="load">cray-mpich/8.1.28</command>
<command name="load">cray-hdf5-parallel/1.12.2.9</command>
<command name="load">cray-netcdf-hdf5parallel/4.9.0.9</command>
<command name="load">cray-parallel-netcdf/1.12.3.9</command>
<command name="load">craype/2.7.31.11</command>
<command name="load">cray-mpich/8.1.29</command>
<command name="load">cray-hdf5-parallel/1.12.2.11</command>
<command name="load">cray-netcdf-hdf5parallel/4.9.0.11</command>
<command name="load">cray-parallel-netcdf/1.12.3.11</command>
<command name="load">cmake/3.24.3</command>
</modules>
</module_system>

<RUNDIR>$CIME_OUTPUT_ROOT/$CASE/run</RUNDIR>
<EXEROOT>$CIME_OUTPUT_ROOT/$CASE/bld</EXEROOT>
<TEST_TPUT_TOLERANCE>0.1</TEST_TPUT_TOLERANCE>
<TEST_MEMLEAK_TOLERANCE>0.20</TEST_MEMLEAK_TOLERANCE>

<environment_variables>
<env name="MPICH_ENV_DISPLAY">1</env>
Expand All @@ -902,12 +908,44 @@
<env name="MPICH_COLL_SYNC">MPI_Bcast</env>
<env name="NETCDF_PATH">$ENV{CRAY_NETCDF_HDF5PARALLEL_PREFIX}</env>
<env name="PNETCDF_PATH">$ENV{CRAY_PARALLEL_NETCDF_PREFIX}</env>
<env name="GATOR_INITIAL_MB">4000MB</env>
</environment_variables>
<environment_variables compiler="intel" mpilib="mpich">
<env name="ADIOS2_ROOT">$SHELL{if [ -z "$ADIOS2_ROOT" ]; then echo /global/cfs/cdirs/e3sm/3rdparty/adios2/2.9.1/cray-mpich-8.1.25/intel-2023.1.0; else echo "$ADIOS2_ROOT"; fi}</env>
</environment_variables>
<environment_variables compiler="gnu" mpilib="mpich">
<env name="ADIOS2_ROOT">$SHELL{if [ -z "$ADIOS2_ROOT" ]; then echo /global/cfs/cdirs/e3sm/3rdparty/adios2/2.9.1/cray-mpich-8.1.25/gcc-11.2.0; else echo "$ADIOS2_ROOT"; fi}</env>
<env name="BLA_VENDOR">Generic</env>
<env name="Albany_ROOT">$SHELL{if [ -z "$Albany_ROOT" ]; then echo /global/common/software/e3sm/albany/2024.03.26/gcc/11.2.0; else echo "$Albany_ROOT"; fi}</env>
<env name="Trilinos_ROOT">$SHELL{if [ -z "$Trilinos_ROOT" ]; then echo /global/common/software/e3sm/trilinos/15.1.1/gcc/11.2.0; else echo "$Trilinos_ROOT"; fi}</env>
</environment_variables>
<environment_variables compiler="nvidia" mpilib="mpich">
<env name="ADIOS2_ROOT">$SHELL{if [ -z "$ADIOS2_ROOT" ]; then echo /global/cfs/cdirs/e3sm/3rdparty/adios2/2.9.1/cray-mpich-8.1.25/nvidia-22.7; else echo "$ADIOS2_ROOT"; fi}</env>
</environment_variables>
<environment_variables compiler="nvidia">
<env name="BLAS_ROOT">$SHELL{if [ -z "$BLAS_ROOT" ]; then echo $NVIDIA_PATH/compilers; else echo "$BLAS_ROOT"; fi}</env>
<env name="LAPACK_ROOT">$SHELL{if [ -z "$LAPACK_ROOT" ]; then echo $NVIDIA_PATH/compilers; else echo "$LAPACK_ROOT"; fi}</env>
<env name="BLA_VENDOR">NVHPC</env>
</environment_variables>
<environment_variables compiler="intel">
<env name="BLA_VENDOR">Intel10_64_dyn</env>
</environment_variables>
<environment_variables compiler="amdclang" mpilib="mpich">
<env name="ADIOS2_ROOT">$SHELL{if [ -z "$ADIOS2_ROOT" ]; then echo /global/cfs/cdirs/e3sm/3rdparty/adios2/2.9.1/cray-mpich-8.1.25/aocc-4.0.0; else echo "$ADIOS2_ROOT"; fi}</env>
</environment_variables>
<environment_variables compiler="intel">
<env name="MOAB_ROOT">$SHELL{if [ -z "$MOAB_ROOT" ]; then echo /global/cfs/cdirs/e3sm/software/moab/intel; else echo "$MOAB_ROOT"; fi}</env>
</environment_variables>
<environment_variables compiler="gnu">
<env name="MOAB_ROOT">$SHELL{if [ -z "$MOAB_ROOT" ]; then echo /global/cfs/cdirs/e3sm/software/moab/gnu; else echo "$MOAB_ROOT"; fi}</env>
</environment_variables>

<resource_limits>
<resource name="RLIMIT_STACK">-1</resource>
</resource_limits>
</machine>


<machine MACH="spock">
<DESC>Spock. NCCS moderate-security system that contains similar hardware and software as the upcoming Frontier system at ORNL.</DESC>
<NODENAME_REGEX>.*spock.*</NODENAME_REGEX>
Expand Down

0 comments on commit d1e8163

Please sign in to comment.