diff --git a/CIME/Tools/Makefile b/CIME/Tools/Makefile
index 556ce8aa937..ff8bb42ce53 100644
--- a/CIME/Tools/Makefile
+++ b/CIME/Tools/Makefile
@@ -613,6 +613,9 @@ endif
# Remove arch flag if it exists
F90_LDFLAGS := $(filter-out -arch%,$(LDFLAGS))
+ifdef GPUFLAGS
+ F90_LDFLAGS += $(GPUFLAGS)
+endif
# Machine stuff to appear last on the link step
ifndef MLIBS
diff --git a/CIME/XML/env_batch.py b/CIME/XML/env_batch.py
index 9cf4136b539..581048b263e 100644
--- a/CIME/XML/env_batch.py
+++ b/CIME/XML/env_batch.py
@@ -620,6 +620,9 @@ def _process_args(self, case, submit_arg_nodes, job, resolve=True):
if name:
if resolve and "$" in name:
rflag = self._resolve_argument(case, flag, name, job)
+ # Skip any resolved flag that ends in "=none" (e.g. -gpu_type=none) so it is not added to the qsub args
+ if rflag.endswith("=none"):
+ continue
if len(rflag) > len(flag):
submitargs += " {}".format(rflag)
else:
diff --git a/CIME/XML/env_mach_pes.py b/CIME/XML/env_mach_pes.py
index c7635573f95..76c6588901b 100644
--- a/CIME/XML/env_mach_pes.py
+++ b/CIME/XML/env_mach_pes.py
@@ -42,6 +42,8 @@ def get_value(
resolved=True,
subgroup=None,
max_mpitasks_per_node=None,
+ max_cputasks_per_gpu_node=None,
+ ngpus_per_node=None,
): # pylint: disable=arguments-differ
# Special variable NINST_MAX is used to determine the number of
# drivers in multi-driver mode.
@@ -58,7 +60,13 @@ def get_value(
if "NTASKS" in vid or "ROOTPE" in vid:
if max_mpitasks_per_node is None:
max_mpitasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE")
- if value is not None and value < 0:
+ if max_cputasks_per_gpu_node is None:
+ max_cputasks_per_gpu_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE")
+ if ngpus_per_node is None:
+ ngpus_per_node = self.get_value("NGPUS_PER_NODE")
+ if (ngpus_per_node and value) and value < 0:
+ value = -1 * value * max_cputasks_per_gpu_node
+ elif value and value < 0:
value = -1 * value * max_mpitasks_per_node
# in the nuopc driver there is only one NINST value
# so that NINST_{comp} = NINST
@@ -154,6 +162,7 @@ def get_total_tasks(self, comp_classes, async_interface=False):
tt = rootpe + nthrds * ((ntasks - 1) * pstrid + 1)
maxrootpe = max(maxrootpe, rootpe)
total_tasks = max(tt, total_tasks)
+
if asyncio_tasks:
total_tasks = total_tasks + len(asyncio_tasks)
if self.get_value("MULTI_DRIVER"):
@@ -167,13 +176,24 @@ def get_tasks_per_node(self, total_tasks, max_thread_count):
"totaltasks > 0 expected, totaltasks = {}".format(total_tasks),
)
if self._comp_interface == "nuopc" and self.get_value("ESMF_AWARE_THREADING"):
- tasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE")
+ if self.get_value("NGPUS_PER_NODE") > 0:
+ tasks_per_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE")
+ else:
+ tasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE")
else:
- tasks_per_node = min(
- self.get_value("MAX_TASKS_PER_NODE") // max_thread_count,
- self.get_value("MAX_MPITASKS_PER_NODE"),
- total_tasks,
- )
+ ngpus_per_node = self.get_value("NGPUS_PER_NODE")
+ if ngpus_per_node and ngpus_per_node > 0:
+ tasks_per_node = min(
+ self.get_value("MAX_TASKS_PER_NODE") // max_thread_count,
+ self.get_value("MAX_CPUTASKS_PER_GPU_NODE"),
+ total_tasks,
+ )
+ else:
+ tasks_per_node = min(
+ self.get_value("MAX_TASKS_PER_NODE") // max_thread_count,
+ self.get_value("MAX_MPITASKS_PER_NODE"),
+ total_tasks,
+ )
return tasks_per_node if tasks_per_node > 0 else 1
def get_total_nodes(self, total_tasks, max_thread_count):
diff --git a/CIME/XML/env_mach_specific.py b/CIME/XML/env_mach_specific.py
index 03e84f0faee..4652f2a7d0a 100644
--- a/CIME/XML/env_mach_specific.py
+++ b/CIME/XML/env_mach_specific.py
@@ -320,7 +320,8 @@ def _compute_resource_actions(self, resource_nodes, case, job=None):
def _compute_actions(self, nodes, child_tag, case, job=None):
result = [] # list of tuples ("name", "argument")
- compiler, mpilib = case.get_value("COMPILER"), case.get_value("MPILIB")
+ compiler = case.get_value("COMPILER")
+ mpilib = case.get_value("MPILIB")
for node in nodes:
if self._match_attribs(self.attrib(node), case, job=job):
diff --git a/CIME/build.py b/CIME/build.py
index 26702cd8b76..b8d481b80d8 100644
--- a/CIME/build.py
+++ b/CIME/build.py
@@ -246,6 +246,20 @@ def get_standard_cmake_args(case, sharedpath):
cmake_args += " -Dcompile_threaded={} ".format(
stringify_bool(case.get_build_threaded())
)
+ # Validate GPU settings: GPU_TYPE and GPU_OFFLOAD must both be "none" or both be set
+ gpu_type = case.get_value("GPU_TYPE")
+ gpu_offload = case.get_value("GPU_OFFLOAD")
+ if gpu_type != "none":
+ expect(
+ gpu_offload != "none",
+ "Both GPU_TYPE and GPU_OFFLOAD must be defined if either is",
+ )
+ cmake_args += f" -DGPU_TYPE={gpu_type} -DGPU_OFFLOAD={gpu_offload}"
+ else:
+ expect(
+ gpu_offload == "none",
+ "Both GPU_TYPE and GPU_OFFLOAD must be defined if either is",
+ )
ocn_model = case.get_value("COMP_OCN")
atm_dycore = case.get_value("CAM_DYCORE")
diff --git a/CIME/case/case.py b/CIME/case/case.py
index 9c2f87946ef..567eb223988 100644
--- a/CIME/case/case.py
+++ b/CIME/case/case.py
@@ -74,6 +74,7 @@ class Case(object):
This class extends across multiple files, class members external to this file
are listed in the following imports
+
"""
from CIME.case.case_setup import case_setup
@@ -123,6 +124,7 @@ def __init__(self, case_root=None, read_only=True, record=False, non_local=False
self._env_generic_files = []
self._files = []
self._comp_interface = None
+ self.gpu_enabled = False
self._non_local = non_local
self.read_xml()
@@ -275,6 +277,9 @@ def initialize_derived_attributes(self):
if max_gpus_per_node:
self.ngpus_per_node = self.get_value("NGPUS_PER_NODE")
+ # update the maximum MPI tasks for a GPU node (could differ from a pure-CPU node)
+ if self.ngpus_per_node > 0:
+ max_mpitasks_per_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE")
self.tasks_per_numa = int(math.ceil(self.tasks_per_node / 2.0))
smt_factor = max(
@@ -451,6 +456,12 @@ def get_values(self, item, attribute=None, resolved=True, subgroup=None):
return []
def get_value(self, item, attribute=None, resolved=True, subgroup=None):
+ if item == "GPU_ENABLED":
+ if not self.gpu_enabled:
+ if self.get_value("GPU_TYPE") != "none":
+ self.gpu_enabled = True
+ return "true" if self.gpu_enabled else "false"
+
result = None
for env_file in self._files:
# Wait and resolve in self rather than in env_file
@@ -1141,7 +1152,6 @@ def _setup_mach_pes(self, pecount, multi_driver, ninst, machine_name, mpilib):
comment = None
force_tasks = None
force_thrds = None
-
if match1:
opti_tasks = match1.group(1)
if opti_tasks.isdigit():
@@ -1211,7 +1221,6 @@ def _setup_mach_pes(self, pecount, multi_driver, ninst, machine_name, mpilib):
pstrid = pes_pstrid[pstrid_str] if pstrid_str in pes_pstrid else 1
totaltasks.append((ntasks + rootpe) * nthrds)
-
mach_pes_obj.set_value(ntasks_str, ntasks)
mach_pes_obj.set_value(nthrds_str, nthrds)
mach_pes_obj.set_value(rootpe_str, rootpe)
@@ -1262,6 +1271,8 @@ def configure(
extra_machines_dir=None,
case_group=None,
ngpus_per_node=0,
+ gpu_type=None,
+ gpu_offload=None,
):
expect(
@@ -1344,6 +1355,7 @@ def configure(
and "MPILIB" not in x
and "MAX_MPITASKS_PER_NODE" not in x
and "MAX_TASKS_PER_NODE" not in x
+ and "MAX_CPUTASKS_PER_GPU_NODE" not in x
and "MAX_GPUS_PER_NODE" not in x
]
@@ -1378,6 +1390,7 @@ def configure(
for name in (
"MAX_TASKS_PER_NODE",
"MAX_MPITASKS_PER_NODE",
+ "MAX_CPUTASKS_PER_GPU_NODE",
"MAX_GPUS_PER_NODE",
):
dmax = machobj.get_value(name, {"compiler": compiler})
@@ -1385,13 +1398,23 @@ def configure(
dmax = machobj.get_value(name)
if dmax:
self.set_value(name, dmax)
+ elif name == "MAX_CPUTASKS_PER_GPU_NODE":
+ logger.debug(
+ "Variable {} not defined for machine {} and compiler {}".format(
+ name, machine_name, compiler
+ )
+ )
elif name == "MAX_GPUS_PER_NODE":
logger.debug(
- "Variable {} not defined for machine {}".format(name, machine_name)
+ "Variable {} not defined for machine {} and compiler {}".format(
+ name, machine_name, compiler
+ )
)
else:
logger.warning(
- "Variable {} not defined for machine {}".format(name, machine_name)
+ "Variable {} not defined for machine {} and compiler {}".format(
+ name, machine_name, compiler
+ )
)
machdir = machobj.get_machines_dir()
@@ -1509,47 +1532,62 @@ def configure(
self.set_value("TEST", True)
# ----------------------------------------------------------------------------------------------------------
- # Sanity check:
- # 1. We assume that there is always a string "gpu" in the compiler name if we want to enable GPU
- # 2. For compilers without the string "gpu" in the name:
- # 2.1. the ngpus-per-node argument would not update the NGPUS_PER_NODE XML variable, as long as
- # the MAX_GPUS_PER_NODE XML variable is not defined (i.e., this argument is not in effect).
- # 2.2. if the MAX_GPUS_PER_NODE XML variable is defined, then the ngpus-per-node argument
- # must be set to 0. Otherwise, an error will be triggered.
- # 3. For compilers with the string "gpu" in the name:
- # 3.1. if ngpus-per-node argument is smaller than 0, an error will be triggered.
- # 3.2. if ngpus_per_node argument is larger than the value of MAX_GPUS_PER_NODE, the NGPUS_PER_NODE
+ # Sanity check for a GPU run:
+ # 1. GPU_TYPE and GPU_OFFLOAD must both be defined to use GPUS
+ # 2. if ngpus_per_node argument is larger than the value of MAX_GPUS_PER_NODE, the NGPUS_PER_NODE
# XML variable in the env_mach_pes.xml file would be set to MAX_GPUS_PER_NODE automatically.
- # 3.3. if ngpus-per-node argument is equal to 0, it will be updated to 1 automatically.
+ # 3. if ngpus-per-node argument is equal to 0, it will be updated to 1 automatically.
# ----------------------------------------------------------------------------------------------------------
max_gpus_per_node = self.get_value("MAX_GPUS_PER_NODE")
- if max_gpus_per_node:
- if "gpu" in compiler:
- if not ngpus_per_node:
- ngpus_per_node = 1
- logger.warning(
- "Setting ngpus_per_node to 1 for compiler {}".format(compiler)
- )
- expect(
- ngpus_per_node > 0,
- " ngpus_per_node is expected > 0 for compiler {}; current value is {}".format(
- compiler, ngpus_per_node
- ),
- )
- else:
- expect(
- ngpus_per_node == 0,
- " ngpus_per_node is expected = 0 for compiler {}; current value is {}".format(
- compiler, ngpus_per_node
- ),
- )
+ if gpu_type and str(gpu_type).lower() != "none":
+ expect(
+ max_gpus_per_node,
+ f"GPUS are not defined for machine={machine_name} and compiler={compiler}",
+ )
+ expect(
+ gpu_offload,
+ "Both gpu-type and gpu-offload must be defined if either is defined",
+ )
+ expect(
+ compiler in ["nvhpc", "cray"],
+ f"Only nvhpc and cray compilers are expected for a GPU run; the user given compiler is {compiler}, ",
+ )
+ valid_gpu_type = self.get_value("GPU_TYPE").split(",")
+ valid_gpu_type.remove("none")
+ expect(
+ gpu_type in valid_gpu_type,
+ f"Unsupported GPU type is given: {gpu_type} ; valid values are {valid_gpu_type}",
+ )
+ valid_gpu_offload = self.get_value("GPU_OFFLOAD").split(",")
+ valid_gpu_offload.remove("none")
+ expect(
+ gpu_offload in valid_gpu_offload,
+ f"Unsupported GPU programming model is given: {gpu_offload} ; valid values are {valid_gpu_offload}",
+ )
+ self.gpu_enabled = True
if ngpus_per_node >= 0:
self.set_value(
"NGPUS_PER_NODE",
- ngpus_per_node
+ max(1, ngpus_per_node)
if ngpus_per_node <= max_gpus_per_node
else max_gpus_per_node,
)
+ elif gpu_offload and str(gpu_offload).lower() != "none":
+ expect(
+ False,
+ "Both gpu-type and gpu-offload must be defined if either is defined",
+ )
+ elif ngpus_per_node != 0:
+ expect(
+ False,
+ f"ngpus_per_node is expected to be 0 for a pure CPU run ; {ngpus_per_node} is provided instead ;",
+ )
+
+ # Set these two GPU XML variables here to overwrite the default values
+ if gpu_type:
+ self.set_value("GPU_TYPE", str(gpu_type).lower())
+ if gpu_offload:
+ self.set_value("GPU_OFFLOAD", str(gpu_offload).lower())
self.initialize_derived_attributes()
@@ -2073,12 +2111,10 @@ def get_mpirun_cmd(self, job=None, allow_unresolved_envvars=True, overrides=None
mpi_arg_string += " : "
ngpus_per_node = self.get_value("NGPUS_PER_NODE")
- if ngpus_per_node and ngpus_per_node > 0 and config.gpus_use_set_device_rank:
- # 1. this setting is tested on Casper only and may not work on other machines
- # 2. need to be revisited in the future for a more adaptable implementation
- rundir = self.get_value("RUNDIR")
- output_name = rundir + "/set_device_rank.sh"
- mpi_arg_string = mpi_arg_string + " " + output_name + " "
+ if ngpus_per_node and ngpus_per_node > 0:
+ mpi_gpu_run_script = self.get_value("MPI_GPU_WRAPPER_SCRIPT")
+ if mpi_gpu_run_script:
+ mpi_arg_string = mpi_arg_string + " " + mpi_gpu_run_script
return self.get_resolved_value(
"{} {} {} {}".format(
@@ -2375,6 +2411,8 @@ def create(
extra_machines_dir=None,
case_group=None,
ngpus_per_node=0,
+ gpu_type=None,
+ gpu_offload=None,
):
try:
# Set values for env_case.xml
@@ -2448,6 +2486,8 @@ def create(
extra_machines_dir=extra_machines_dir,
case_group=case_group,
ngpus_per_node=ngpus_per_node,
+ gpu_type=gpu_type,
+ gpu_offload=gpu_offload,
)
self.create_caseroot()
diff --git a/CIME/case/case_setup.py b/CIME/case/case_setup.py
index aa8fb8b6b6c..363e8f4fab4 100644
--- a/CIME/case/case_setup.py
+++ b/CIME/case/case_setup.py
@@ -21,7 +21,6 @@
copy_local_macros_to_dir,
)
from CIME.utils import batch_jobid
-from CIME.utils import transform_vars
from CIME.test_status import *
from CIME.locked_files import unlock_file, lock_file
@@ -482,31 +481,3 @@ def case_setup(self, clean=False, test_mode=False, reset=False, keep=None):
caseroot=caseroot,
is_batch=is_batch,
)
-
- # put the following section here to make sure the rundir is generated first
- machdir = self.get_value("MACHDIR")
- mach = self.get_value("MACH")
- ngpus_per_node = self.get_value("NGPUS_PER_NODE")
- overrides = {}
- overrides["ngpus_per_node"] = ngpus_per_node
- input_template = os.path.join(machdir, "mpi_run_gpu.{}".format(mach))
- if os.path.isfile(input_template):
- # update the wrapper script that sets the device id for each MPI rank
- output_text = transform_vars(
- open(input_template, "r").read(), case=self, overrides=overrides
- )
-
- # write it out to the run dir
- rundir = self.get_value("RUNDIR")
- output_name = os.path.join(rundir, "set_device_rank.sh")
- logger.info("Creating file {}".format(output_name))
- with open(output_name, "w") as f:
- f.write(output_text)
-
- # make the wrapper script executable
- if os.path.isfile(output_name):
- os.system("chmod +x " + output_name)
- else:
- expect(
- False, "The file {} is not written out correctly.".format(output_name)
- )
diff --git a/CIME/config.py b/CIME/config.py
index 8491b2f3f2e..3cef6cc0530 100644
--- a/CIME/config.py
+++ b/CIME/config.py
@@ -177,11 +177,6 @@ def __init__(self):
False,
desc="If set to `True` then COMP_ROOT_DIR_CPL is set using UFS_DRIVER if defined.",
)
- self._set_attribute(
- "gpus_use_set_device_rank",
- True,
- desc="If set to `True` and NGPUS_PER_NODE > 0 then `$RUNDIR/set_device_rank.sh` is appended when the MPI run command is generated.",
- )
self._set_attribute(
"test_custom_project_machine",
"melvin",
diff --git a/CIME/data/config/xml_schemas/config_machines.xsd b/CIME/data/config/xml_schemas/config_machines.xsd
index d6e3c280a93..b025c4039e0 100644
--- a/CIME/data/config/xml_schemas/config_machines.xsd
+++ b/CIME/data/config/xml_schemas/config_machines.xsd
@@ -6,6 +6,8 @@
+
+
@@ -56,6 +58,10 @@
+
+
+
+
@@ -166,6 +172,16 @@
+
+
+
+
+
+
+
+