diff --git a/CIME/Tools/Makefile b/CIME/Tools/Makefile index 556ce8aa937..ff8bb42ce53 100644 --- a/CIME/Tools/Makefile +++ b/CIME/Tools/Makefile @@ -613,6 +613,9 @@ endif # Remove arch flag if it exists F90_LDFLAGS := $(filter-out -arch%,$(LDFLAGS)) +ifdef GPUFLAGS + F90_LDFLAGS += $(GPUFLAGS) +endif # Machine stuff to appear last on the link step ifndef MLIBS diff --git a/CIME/XML/env_batch.py b/CIME/XML/env_batch.py index 9cf4136b539..581048b263e 100644 --- a/CIME/XML/env_batch.py +++ b/CIME/XML/env_batch.py @@ -620,6 +620,9 @@ def _process_args(self, case, submit_arg_nodes, job, resolve=True): if name: if resolve and "$" in name: rflag = self._resolve_argument(case, flag, name, job) + # This is to prevent -gpu_type=none in qsub args + if rflag.endswith("=none"): + continue if len(rflag) > len(flag): submitargs += " {}".format(rflag) else: diff --git a/CIME/XML/env_mach_pes.py b/CIME/XML/env_mach_pes.py index c7635573f95..76c6588901b 100644 --- a/CIME/XML/env_mach_pes.py +++ b/CIME/XML/env_mach_pes.py @@ -42,6 +42,8 @@ def get_value( resolved=True, subgroup=None, max_mpitasks_per_node=None, + max_cputasks_per_gpu_node=None, + ngpus_per_node=None, ): # pylint: disable=arguments-differ # Special variable NINST_MAX is used to determine the number of # drivers in multi-driver mode. @@ -58,7 +60,13 @@ def get_value( if "NTASKS" in vid or "ROOTPE" in vid: if max_mpitasks_per_node is None: max_mpitasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE") - if value is not None and value < 0: + if max_cputasks_per_gpu_node is None: + max_cputasks_per_gpu_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE") + if ngpus_per_node is None: + ngpus_per_node = self.get_value("NGPUS_PER_NODE") + if (ngpus_per_node and value) and value < 0: + value = -1 * value * max_cputasks_per_gpu_node + elif value and value < 0: value = -1 * value * max_mpitasks_per_node # in the nuopc driver there is only one NINST value # so that NINST_{comp} = NINST @@ -154,6 +162,7 @@ def get_total_tasks(self, comp_classes, async_interface=False): tt = rootpe + nthrds * ((ntasks - 1) * pstrid + 1) maxrootpe = max(maxrootpe, rootpe) total_tasks = max(tt, total_tasks) + if asyncio_tasks: total_tasks = total_tasks + len(asyncio_tasks) if self.get_value("MULTI_DRIVER"): @@ -167,13 +176,24 @@ def get_tasks_per_node(self, total_tasks, max_thread_count): "totaltasks > 0 expected, totaltasks = {}".format(total_tasks), ) if self._comp_interface == "nuopc" and self.get_value("ESMF_AWARE_THREADING"): - tasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE") + if self.get_value("NGPUS_PER_NODE") > 0: + tasks_per_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE") + else: + tasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE") else: - tasks_per_node = min( - self.get_value("MAX_TASKS_PER_NODE") // max_thread_count, - self.get_value("MAX_MPITASKS_PER_NODE"), - total_tasks, - ) + ngpus_per_node = self.get_value("NGPUS_PER_NODE") + if ngpus_per_node and ngpus_per_node > 0: + tasks_per_node = min( + self.get_value("MAX_TASKS_PER_NODE") // max_thread_count, + self.get_value("MAX_CPUTASKS_PER_GPU_NODE"), + total_tasks, + ) + else: + tasks_per_node = min( + self.get_value("MAX_TASKS_PER_NODE") // max_thread_count, + self.get_value("MAX_MPITASKS_PER_NODE"), + total_tasks, + ) return tasks_per_node if tasks_per_node > 0 else 1 def get_total_nodes(self, total_tasks, max_thread_count): diff --git a/CIME/XML/env_mach_specific.py b/CIME/XML/env_mach_specific.py index 03e84f0faee..4652f2a7d0a 100644 --- a/CIME/XML/env_mach_specific.py +++ b/CIME/XML/env_mach_specific.py @@ -320,7 +320,8 @@ def _compute_resource_actions(self, resource_nodes, case, job=None): def _compute_actions(self, nodes, child_tag, case, job=None): result = [] # list of tuples ("name", "argument") - compiler, mpilib = case.get_value("COMPILER"), case.get_value("MPILIB") + compiler = case.get_value("COMPILER") + mpilib = case.get_value("MPILIB") for node in nodes: if self._match_attribs(self.attrib(node), case, job=job): diff --git a/CIME/build.py b/CIME/build.py index 26702cd8b76..b8d481b80d8 100644 --- a/CIME/build.py +++ b/CIME/build.py @@ -246,6 +246,20 @@ def get_standard_cmake_args(case, sharedpath): cmake_args += " -Dcompile_threaded={} ".format( stringify_bool(case.get_build_threaded()) ) + # check settings for GPU + gpu_type = case.get_value("GPU_TYPE") + gpu_offload = case.get_value("GPU_OFFLOAD") + if gpu_type != "none": + expect( + gpu_offload != "none", + "Both GPU_TYPE and GPU_OFFLOAD must be defined if either is", + ) + cmake_args += f" -DGPU_TYPE={gpu_type} -DGPU_OFFLOAD={gpu_offload}" + else: + expect( + gpu_offload == "none", + "Both GPU_TYPE and GPU_OFFLOAD must be defined if either is", + ) ocn_model = case.get_value("COMP_OCN") atm_dycore = case.get_value("CAM_DYCORE") diff --git a/CIME/case/case.py b/CIME/case/case.py index 9c2f87946ef..567eb223988 100644 --- a/CIME/case/case.py +++ b/CIME/case/case.py @@ -74,6 +74,7 @@ class Case(object): This class extends across multiple files, class members external to this file are listed in the following imports + """ from CIME.case.case_setup import case_setup @@ -123,6 +124,7 @@ def __init__(self, case_root=None, read_only=True, record=False, non_local=False self._env_generic_files = [] self._files = [] self._comp_interface = None + self.gpu_enabled = False self._non_local = non_local self.read_xml() @@ -275,6 +277,9 @@ def initialize_derived_attributes(self): if max_gpus_per_node: self.ngpus_per_node = self.get_value("NGPUS_PER_NODE") + # update the maximum MPI tasks for a GPU node (could differ from a pure-CPU node) + if self.ngpus_per_node > 0: + max_mpitasks_per_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE") self.tasks_per_numa = int(math.ceil(self.tasks_per_node / 2.0)) smt_factor = max( @@ -451,6 +456,12 @@ def get_values(self, item, attribute=None, resolved=True, subgroup=None): return [] def get_value(self, item, attribute=None, resolved=True, subgroup=None): + if item == "GPU_ENABLED": + if not self.gpu_enabled: + if self.get_value("GPU_TYPE") != "none": + self.gpu_enabled = True + return "true" if self.gpu_enabled else "false" + result = None for env_file in self._files: # Wait and resolve in self rather than in env_file @@ -1141,7 +1152,6 @@ def _setup_mach_pes(self, pecount, multi_driver, ninst, machine_name, mpilib): comment = None force_tasks = None force_thrds = None - if match1: opti_tasks = match1.group(1) if opti_tasks.isdigit(): @@ -1211,7 +1221,6 @@ def _setup_mach_pes(self, pecount, multi_driver, ninst, machine_name, mpilib): pstrid = pes_pstrid[pstrid_str] if pstrid_str in pes_pstrid else 1 totaltasks.append((ntasks + rootpe) * nthrds) - mach_pes_obj.set_value(ntasks_str, ntasks) mach_pes_obj.set_value(nthrds_str, nthrds) mach_pes_obj.set_value(rootpe_str, rootpe) @@ -1262,6 +1271,8 @@ def configure( extra_machines_dir=None, case_group=None, ngpus_per_node=0, + gpu_type=None, + gpu_offload=None, ): expect( @@ -1344,6 +1355,7 @@ def configure( and "MPILIB" not in x and "MAX_MPITASKS_PER_NODE" not in x and "MAX_TASKS_PER_NODE" not in x + and "MAX_CPUTASKS_PER_GPU_NODE" not in x and "MAX_GPUS_PER_NODE" not in x ] @@ -1378,6 +1390,7 @@ def configure( for name in ( "MAX_TASKS_PER_NODE", "MAX_MPITASKS_PER_NODE", + "MAX_CPUTASKS_PER_GPU_NODE", "MAX_GPUS_PER_NODE", ): dmax = machobj.get_value(name, {"compiler": compiler}) @@ -1385,13 +1398,23 @@ def configure( dmax = machobj.get_value(name) if dmax: self.set_value(name, dmax) + elif name == "MAX_CPUTASKS_PER_GPU_NODE": + logger.debug( + "Variable {} not defined for machine {} and compiler {}".format( + name, machine_name, compiler + ) + ) elif name == "MAX_GPUS_PER_NODE": logger.debug( - "Variable {} not defined for machine {}".format(name, machine_name) + "Variable {} not defined for machine {} and compiler {}".format( + name, machine_name, compiler + ) ) else: logger.warning( - "Variable {} not defined for machine {}".format(name, machine_name) + "Variable {} not defined for machine {} and compiler {}".format( + name, machine_name, compiler + ) ) machdir = machobj.get_machines_dir() @@ -1509,47 +1532,62 @@ def configure( self.set_value("TEST", True) # ---------------------------------------------------------------------------------------------------------- - # Sanity check: - # 1. We assume that there is always a string "gpu" in the compiler name if we want to enable GPU - # 2. For compilers without the string "gpu" in the name: - # 2.1. the ngpus-per-node argument would not update the NGPUS_PER_NODE XML variable, as long as - # the MAX_GPUS_PER_NODE XML variable is not defined (i.e., this argument is not in effect). - # 2.2. if the MAX_GPUS_PER_NODE XML variable is defined, then the ngpus-per-node argument - # must be set to 0. Otherwise, an error will be triggered. - # 3. For compilers with the string "gpu" in the name: - # 3.1. if ngpus-per-node argument is smaller than 0, an error will be triggered. - # 3.2. if ngpus_per_node argument is larger than the value of MAX_GPUS_PER_NODE, the NGPUS_PER_NODE + # Sanity check for a GPU run: + # 1. GPU_TYPE and GPU_OFFLOAD must both be defined to use GPUS + # 2. if ngpus_per_node argument is larger than the value of MAX_GPUS_PER_NODE, the NGPUS_PER_NODE # XML variable in the env_mach_pes.xml file would be set to MAX_GPUS_PER_NODE automatically. - # 3.3. if ngpus-per-node argument is equal to 0, it will be updated to 1 automatically. + # 3. if ngpus-per-node argument is equal to 0, it will be updated to 1 automatically. # ---------------------------------------------------------------------------------------------------------- max_gpus_per_node = self.get_value("MAX_GPUS_PER_NODE") - if max_gpus_per_node: - if "gpu" in compiler: - if not ngpus_per_node: - ngpus_per_node = 1 - logger.warning( - "Setting ngpus_per_node to 1 for compiler {}".format(compiler) - ) - expect( - ngpus_per_node > 0, - " ngpus_per_node is expected > 0 for compiler {}; current value is {}".format( - compiler, ngpus_per_node - ), - ) - else: - expect( - ngpus_per_node == 0, - " ngpus_per_node is expected = 0 for compiler {}; current value is {}".format( - compiler, ngpus_per_node - ), - ) + if gpu_type and str(gpu_type).lower() != "none": + expect( + max_gpus_per_node, + f"GPUS are not defined for machine={machine_name} and compiler={compiler}", + ) + expect( + gpu_offload, + "Both gpu-type and gpu-offload must be defined if either is defined", + ) + expect( + compiler in ["nvhpc", "cray"], + f"Only nvhpc and cray compilers are expected for a GPU run; the user given compiler is {compiler}, ", + ) + valid_gpu_type = self.get_value("GPU_TYPE").split(",") + valid_gpu_type.remove("none") + expect( + gpu_type in valid_gpu_type, + f"Unsupported GPU type is given: {gpu_type} ; valid values are {valid_gpu_type}", + ) + valid_gpu_offload = self.get_value("GPU_OFFLOAD").split(",") + valid_gpu_offload.remove("none") + expect( + gpu_offload in valid_gpu_offload, + f"Unsupported GPU programming model is given: {gpu_offload} ; valid values are {valid_gpu_offload}", + ) + self.gpu_enabled = True if ngpus_per_node >= 0: self.set_value( "NGPUS_PER_NODE", - ngpus_per_node + max(1, ngpus_per_node) if ngpus_per_node <= max_gpus_per_node else max_gpus_per_node, ) + elif gpu_offload and str(gpu_offload).lower() != "none": + expect( + False, + "Both gpu-type and gpu-offload must be defined if either is defined", + ) + elif ngpus_per_node != 0: + expect( + False, + f"ngpus_per_node is expected to be 0 for a pure CPU run ; {ngpus_per_node} is provided instead ;", + ) + + # Set these two GPU XML variables here to overwrite the default values + if gpu_type: + self.set_value("GPU_TYPE", str(gpu_type).lower()) + if gpu_offload: + self.set_value("GPU_OFFLOAD", str(gpu_offload).lower()) self.initialize_derived_attributes() @@ -2073,12 +2111,10 @@ def get_mpirun_cmd(self, job=None, allow_unresolved_envvars=True, overrides=None mpi_arg_string += " : " ngpus_per_node = self.get_value("NGPUS_PER_NODE") - if ngpus_per_node and ngpus_per_node > 0 and config.gpus_use_set_device_rank: - # 1. this setting is tested on Casper only and may not work on other machines - # 2. need to be revisited in the future for a more adaptable implementation - rundir = self.get_value("RUNDIR") - output_name = rundir + "/set_device_rank.sh" - mpi_arg_string = mpi_arg_string + " " + output_name + " " + if ngpus_per_node and ngpus_per_node > 0: + mpi_gpu_run_script = self.get_value("MPI_GPU_WRAPPER_SCRIPT") + if mpi_gpu_run_script: + mpi_arg_string = mpi_arg_string + " " + mpi_gpu_run_script return self.get_resolved_value( "{} {} {} {}".format( @@ -2375,6 +2411,8 @@ def create( extra_machines_dir=None, case_group=None, ngpus_per_node=0, + gpu_type=None, + gpu_offload=None, ): try: # Set values for env_case.xml @@ -2448,6 +2486,8 @@ def create( extra_machines_dir=extra_machines_dir, case_group=case_group, ngpus_per_node=ngpus_per_node, + gpu_type=gpu_type, + gpu_offload=gpu_offload, ) self.create_caseroot() diff --git a/CIME/case/case_setup.py b/CIME/case/case_setup.py index aa8fb8b6b6c..363e8f4fab4 100644 --- a/CIME/case/case_setup.py +++ b/CIME/case/case_setup.py @@ -21,7 +21,6 @@ copy_local_macros_to_dir, ) from CIME.utils import batch_jobid -from CIME.utils import transform_vars from CIME.test_status import * from CIME.locked_files import unlock_file, lock_file @@ -482,31 +481,3 @@ def case_setup(self, clean=False, test_mode=False, reset=False, keep=None): caseroot=caseroot, is_batch=is_batch, ) - - # put the following section here to make sure the rundir is generated first - machdir = self.get_value("MACHDIR") - mach = self.get_value("MACH") - ngpus_per_node = self.get_value("NGPUS_PER_NODE") - overrides = {} - overrides["ngpus_per_node"] = ngpus_per_node - input_template = os.path.join(machdir, "mpi_run_gpu.{}".format(mach)) - if os.path.isfile(input_template): - # update the wrapper script that sets the device id for each MPI rank - output_text = transform_vars( - open(input_template, "r").read(), case=self, overrides=overrides - ) - - # write it out to the run dir - rundir = self.get_value("RUNDIR") - output_name = os.path.join(rundir, "set_device_rank.sh") - logger.info("Creating file {}".format(output_name)) - with open(output_name, "w") as f: - f.write(output_text) - - # make the wrapper script executable - if os.path.isfile(output_name): - os.system("chmod +x " + output_name) - else: - expect( - False, "The file {} is not written out correctly.".format(output_name) - ) diff --git a/CIME/config.py b/CIME/config.py index 8491b2f3f2e..3cef6cc0530 100644 --- a/CIME/config.py +++ b/CIME/config.py @@ -177,11 +177,6 @@ def __init__(self): False, desc="If set to `True` then COMP_ROOT_DIR_CPL is set using UFS_DRIVER if defined.", ) - self._set_attribute( - "gpus_use_set_device_rank", - True, - desc="If set to `True` and NGPUS_PER_NODE > 0 then `$RUNDIR/set_device_rank.sh` is appended when the MPI run command is generated.", - ) self._set_attribute( "test_custom_project_machine", "melvin", diff --git a/CIME/data/config/xml_schemas/config_machines.xsd b/CIME/data/config/xml_schemas/config_machines.xsd index d6e3c280a93..b025c4039e0 100644 --- a/CIME/data/config/xml_schemas/config_machines.xsd +++ b/CIME/data/config/xml_schemas/config_machines.xsd @@ -6,6 +6,8 @@ + + @@ -56,6 +58,10 @@ + + + + @@ -166,6 +172,16 @@ + + + + + + + +