diff --git a/CIME/Tools/Makefile b/CIME/Tools/Makefile
index 556ce8aa937..ff8bb42ce53 100644
--- a/CIME/Tools/Makefile
+++ b/CIME/Tools/Makefile
@@ -613,6 +613,9 @@ endif
 
 # Remove arch flag if it exists
 F90_LDFLAGS := $(filter-out -arch%,$(LDFLAGS))
+ifdef GPUFLAGS
+  F90_LDFLAGS += $(GPUFLAGS)
+endif
 
 # Machine stuff to appear last on the link step
 ifndef MLIBS
diff --git a/CIME/XML/env_batch.py b/CIME/XML/env_batch.py
index 9cf4136b539..581048b263e 100644
--- a/CIME/XML/env_batch.py
+++ b/CIME/XML/env_batch.py
@@ -620,6 +620,9 @@ def _process_args(self, case, submit_arg_nodes, job, resolve=True):
                 if name:
                     if resolve and "$" in name:
                         rflag = self._resolve_argument(case, flag, name, job)
+                        # Keep "=none" flags (e.g. -gpu_type=none) out of the qsub args
+                        if rflag.endswith("=none"):
+                            continue
                         if len(rflag) > len(flag):
                             submitargs += " {}".format(rflag)
                     else:
diff --git a/CIME/XML/env_mach_pes.py b/CIME/XML/env_mach_pes.py
index c7635573f95..76c6588901b 100644
--- a/CIME/XML/env_mach_pes.py
+++ b/CIME/XML/env_mach_pes.py
@@ -42,6 +42,8 @@ def get_value(
         resolved=True,
         subgroup=None,
         max_mpitasks_per_node=None,
+        max_cputasks_per_gpu_node=None,
+        ngpus_per_node=None,
     ):  # pylint: disable=arguments-differ
         # Special variable NINST_MAX is used to determine the number of
         # drivers in multi-driver mode.
@@ -58,7 +60,13 @@ def get_value(
         if "NTASKS" in vid or "ROOTPE" in vid:
             if max_mpitasks_per_node is None:
                 max_mpitasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE")
-            if value is not None and value < 0:
+            if max_cputasks_per_gpu_node is None:
+                max_cputasks_per_gpu_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE")
+            if ngpus_per_node is None:
+                ngpus_per_node = self.get_value("NGPUS_PER_NODE")
+            if (ngpus_per_node and value) and value < 0:
+                value = -1 * value * max_cputasks_per_gpu_node
+            elif value and value < 0:
                 value = -1 * value * max_mpitasks_per_node
         # in the nuopc driver there is only one NINST value
         # so that NINST_{comp} = NINST
@@ -154,6 +162,7 @@ def get_total_tasks(self, comp_classes, async_interface=False):
             tt = rootpe + nthrds * ((ntasks - 1) * pstrid + 1)
             maxrootpe = max(maxrootpe, rootpe)
             total_tasks = max(tt, total_tasks)
+
         if asyncio_tasks:
             total_tasks = total_tasks + len(asyncio_tasks)
         if self.get_value("MULTI_DRIVER"):
@@ -167,13 +176,24 @@ def get_tasks_per_node(self, total_tasks, max_thread_count):
             "totaltasks > 0 expected, totaltasks = {}".format(total_tasks),
         )
         if self._comp_interface == "nuopc" and self.get_value("ESMF_AWARE_THREADING"):
-            tasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE")
+            if self.get_value("NGPUS_PER_NODE") > 0:
+                tasks_per_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE")
+            else:
+                tasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE")
         else:
-            tasks_per_node = min(
-                self.get_value("MAX_TASKS_PER_NODE") // max_thread_count,
-                self.get_value("MAX_MPITASKS_PER_NODE"),
-                total_tasks,
-            )
+            ngpus_per_node = self.get_value("NGPUS_PER_NODE")
+            if ngpus_per_node and ngpus_per_node > 0:
+                tasks_per_node = min(
+                    self.get_value("MAX_TASKS_PER_NODE") // max_thread_count,
+                    self.get_value("MAX_CPUTASKS_PER_GPU_NODE"),
+                    total_tasks,
+                )
+            else:
+                tasks_per_node = min(
+                    self.get_value("MAX_TASKS_PER_NODE") // max_thread_count,
+                    self.get_value("MAX_MPITASKS_PER_NODE"),
+                    total_tasks,
+                )
         return tasks_per_node if tasks_per_node > 0 else 1
 
     def get_total_nodes(self, total_tasks, max_thread_count):
diff --git a/CIME/XML/env_mach_specific.py b/CIME/XML/env_mach_specific.py
index 03e84f0faee..4652f2a7d0a 100644
--- a/CIME/XML/env_mach_specific.py
+++ b/CIME/XML/env_mach_specific.py
@@ -320,7 +320,8 @@ def _compute_resource_actions(self, resource_nodes, case, job=None):
 
     def _compute_actions(self, nodes, child_tag, case, job=None):
         result = []  # list of tuples ("name", "argument")
-        compiler, mpilib = case.get_value("COMPILER"), case.get_value("MPILIB")
+        compiler = case.get_value("COMPILER")
+        mpilib = case.get_value("MPILIB")
 
         for node in nodes:
             if self._match_attribs(self.attrib(node), case, job=job):
diff --git a/CIME/build.py b/CIME/build.py
index 26702cd8b76..b8d481b80d8 100644
--- a/CIME/build.py
+++ b/CIME/build.py
@@ -246,6 +246,20 @@ def get_standard_cmake_args(case, sharedpath):
     cmake_args += " -Dcompile_threaded={} ".format(
         stringify_bool(case.get_build_threaded())
     )
+    # check settings for GPU
+    gpu_type = case.get_value("GPU_TYPE")
+    gpu_offload = case.get_value("GPU_OFFLOAD")
+    if gpu_type != "none":
+        expect(
+            gpu_offload != "none",
+            "Both GPU_TYPE and GPU_OFFLOAD must be defined if either is",
+        )
+        cmake_args += f" -DGPU_TYPE={gpu_type} -DGPU_OFFLOAD={gpu_offload}"
+    else:
+        expect(
+            gpu_offload == "none",
+            "Both GPU_TYPE and GPU_OFFLOAD must be defined if either is",
+        )
 
     ocn_model = case.get_value("COMP_OCN")
     atm_dycore = case.get_value("CAM_DYCORE")
diff --git a/CIME/case/case.py b/CIME/case/case.py
index 9c2f87946ef..567eb223988 100644
--- a/CIME/case/case.py
+++ b/CIME/case/case.py
@@ -74,6 +74,7 @@ class Case(object):
 
     This class extends across multiple files, class members external to this file
     are listed in the following imports
+
     """
 
     from CIME.case.case_setup import case_setup
@@ -123,6 +124,7 @@ def __init__(self, case_root=None, read_only=True, record=False, non_local=False
         self._env_generic_files = []
         self._files = []
         self._comp_interface = None
+        self.gpu_enabled = False
         self._non_local = non_local
         self.read_xml()
 
@@ -275,6 +277,9 @@ def initialize_derived_attributes(self):
 
         if max_gpus_per_node:
             self.ngpus_per_node = self.get_value("NGPUS_PER_NODE")
+        # update the maximum MPI tasks for a GPU node (could differ from a pure-CPU node)
+        if self.ngpus_per_node > 0:
+            max_mpitasks_per_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE")
 
         self.tasks_per_numa = int(math.ceil(self.tasks_per_node / 2.0))
         smt_factor = max(
@@ -451,6 +456,12 @@ def get_values(self, item, attribute=None, resolved=True, subgroup=None):
         return []
 
     def get_value(self, item, attribute=None, resolved=True, subgroup=None):
+        if item == "GPU_ENABLED":
+            if not self.gpu_enabled:
+                if self.get_value("GPU_TYPE") != "none":
+                    self.gpu_enabled = True
+            return "true" if self.gpu_enabled else "false"
+
         result = None
         for env_file in self._files:
             # Wait and resolve in self rather than in env_file
@@ -1141,7 +1152,6 @@ def _setup_mach_pes(self, pecount, multi_driver, ninst, machine_name, mpilib):
         comment = None
         force_tasks = None
         force_thrds = None
-
         if match1:
             opti_tasks = match1.group(1)
             if opti_tasks.isdigit():
@@ -1211,7 +1221,6 @@ def _setup_mach_pes(self, pecount, multi_driver, ninst, machine_name, mpilib):
             pstrid = pes_pstrid[pstrid_str] if pstrid_str in pes_pstrid else 1
 
             totaltasks.append((ntasks + rootpe) * nthrds)
-
             mach_pes_obj.set_value(ntasks_str, ntasks)
             mach_pes_obj.set_value(nthrds_str, nthrds)
             mach_pes_obj.set_value(rootpe_str, rootpe)
@@ -1262,6 +1271,8 @@ def configure(
         extra_machines_dir=None,
         case_group=None,
         ngpus_per_node=0,
+        gpu_type=None,
+        gpu_offload=None,
     ):
 
         expect(
@@ -1344,6 +1355,7 @@ def configure(
             and "MPILIB" not in x
             and "MAX_MPITASKS_PER_NODE" not in x
             and "MAX_TASKS_PER_NODE" not in x
+            and "MAX_CPUTASKS_PER_GPU_NODE" not in x
             and "MAX_GPUS_PER_NODE" not in x
         ]
 
@@ -1378,6 +1390,7 @@ def configure(
         for name in (
             "MAX_TASKS_PER_NODE",
             "MAX_MPITASKS_PER_NODE",
+            "MAX_CPUTASKS_PER_GPU_NODE",
             "MAX_GPUS_PER_NODE",
         ):
             dmax = machobj.get_value(name, {"compiler": compiler})
@@ -1385,13 +1398,23 @@ def configure(
                 dmax = machobj.get_value(name)
             if dmax:
                 self.set_value(name, dmax)
+            elif name == "MAX_CPUTASKS_PER_GPU_NODE":
+                logger.debug(
+                    "Variable {} not defined for machine {} and compiler {}".format(
+                        name, machine_name, compiler
+                    )
+                )
             elif name == "MAX_GPUS_PER_NODE":
                 logger.debug(
-                    "Variable {} not defined for machine {}".format(name, machine_name)
+                    "Variable {} not defined for machine {} and compiler {}".format(
+                        name, machine_name, compiler
+                    )
                 )
             else:
                 logger.warning(
-                    "Variable {} not defined for machine {}".format(name, machine_name)
+                    "Variable {} not defined for machine {} and compiler {}".format(
+                        name, machine_name, compiler
+                    )
                 )
 
         machdir = machobj.get_machines_dir()
@@ -1509,47 +1532,62 @@ def configure(
             self.set_value("TEST", True)
 
         # ----------------------------------------------------------------------------------------------------------
-        # Sanity check:
-        #     1. We assume that there is always a string "gpu" in the compiler name if we want to enable GPU
-        #     2. For compilers without the string "gpu" in the name:
-        #        2.1. the ngpus-per-node argument would not update the NGPUS_PER_NODE XML variable, as long as
-        #             the MAX_GPUS_PER_NODE XML variable is not defined (i.e., this argument is not in effect).
-        #        2.2. if the MAX_GPUS_PER_NODE XML variable is defined, then the ngpus-per-node argument
-        #             must be set to 0. Otherwise, an error will be triggered.
-        #     3. For compilers with the string "gpu" in the name:
-        #        3.1. if ngpus-per-node argument is smaller than 0, an error will be triggered.
-        #        3.2. if ngpus_per_node argument is larger than the value of MAX_GPUS_PER_NODE, the NGPUS_PER_NODE
+        # Sanity check for a GPU run:
+        #        1. GPU_TYPE and GPU_OFFLOAD must both be defined to use GPUs
+        #        2. if ngpus_per_node argument is larger than the value of MAX_GPUS_PER_NODE, the NGPUS_PER_NODE
         #             XML variable in the env_mach_pes.xml file would be set to MAX_GPUS_PER_NODE automatically.
-        #        3.3. if ngpus-per-node argument is equal to 0, it will be updated to 1 automatically.
+        #        3. if the ngpus_per_node argument is equal to 0, NGPUS_PER_NODE will be set to 1 automatically.
         # ----------------------------------------------------------------------------------------------------------
         max_gpus_per_node = self.get_value("MAX_GPUS_PER_NODE")
-        if max_gpus_per_node:
-            if "gpu" in compiler:
-                if not ngpus_per_node:
-                    ngpus_per_node = 1
-                    logger.warning(
-                        "Setting ngpus_per_node to 1 for compiler {}".format(compiler)
-                    )
-                expect(
-                    ngpus_per_node > 0,
-                    " ngpus_per_node is expected > 0 for compiler {}; current value is {}".format(
-                        compiler, ngpus_per_node
-                    ),
-                )
-            else:
-                expect(
-                    ngpus_per_node == 0,
-                    " ngpus_per_node is expected = 0 for compiler {}; current value is {}".format(
-                        compiler, ngpus_per_node
-                    ),
-                )
+        if gpu_type and str(gpu_type).lower() != "none":
+            expect(
+                max_gpus_per_node,
+                f"GPUS are not defined for machine={machine_name} and compiler={compiler}",
+            )
+            expect(
+                gpu_offload,
+                "Both gpu-type and gpu-offload must be defined if either is defined",
+            )
+            expect(
+                compiler in ["nvhpc", "cray"],
+                f"Only nvhpc and cray compilers are expected for a GPU run; the user given compiler is {compiler}, ",
+            )
+            valid_gpu_type = self.get_value("GPU_TYPE").split(",")
+            valid_gpu_type.remove("none")
+            expect(
+                gpu_type in valid_gpu_type,
+                f"Unsupported GPU type is given: {gpu_type} ; valid values are {valid_gpu_type}",
+            )
+            valid_gpu_offload = self.get_value("GPU_OFFLOAD").split(",")
+            valid_gpu_offload.remove("none")
+            expect(
+                gpu_offload in valid_gpu_offload,
+                f"Unsupported GPU programming model is given: {gpu_offload} ; valid values are {valid_gpu_offload}",
+            )
+            self.gpu_enabled = True
             if ngpus_per_node >= 0:
                 self.set_value(
                     "NGPUS_PER_NODE",
-                    ngpus_per_node
+                    max(1, ngpus_per_node)
                     if ngpus_per_node <= max_gpus_per_node
                     else max_gpus_per_node,
                 )
+        elif gpu_offload and str(gpu_offload).lower() != "none":
+            expect(
+                False,
+                "Both gpu-type and gpu-offload must be defined if either is defined",
+            )
+        elif ngpus_per_node != 0:
+            expect(
+                False,
+                f"ngpus_per_node is expected to be 0 for a pure CPU run ; {ngpus_per_node} is provided instead ;",
+            )
+
+        # Set these two GPU XML variables here to overwrite the default values
+        if gpu_type:
+            self.set_value("GPU_TYPE", str(gpu_type).lower())
+        if gpu_offload:
+            self.set_value("GPU_OFFLOAD", str(gpu_offload).lower())
 
         self.initialize_derived_attributes()
 
@@ -2073,12 +2111,10 @@ def get_mpirun_cmd(self, job=None, allow_unresolved_envvars=True, overrides=None
             mpi_arg_string += " : "
 
         ngpus_per_node = self.get_value("NGPUS_PER_NODE")
-        if ngpus_per_node and ngpus_per_node > 0 and config.gpus_use_set_device_rank:
-            # 1. this setting is tested on Casper only and may not work on other machines
-            # 2. need to be revisited in the future for a more adaptable implementation
-            rundir = self.get_value("RUNDIR")
-            output_name = rundir + "/set_device_rank.sh"
-            mpi_arg_string = mpi_arg_string + " " + output_name + " "
+        if ngpus_per_node and ngpus_per_node > 0:
+            mpi_gpu_run_script = self.get_value("MPI_GPU_WRAPPER_SCRIPT")
+            if mpi_gpu_run_script:
+                mpi_arg_string = mpi_arg_string + " " + mpi_gpu_run_script
 
         return self.get_resolved_value(
             "{} {} {} {}".format(
@@ -2375,6 +2411,8 @@ def create(
         extra_machines_dir=None,
         case_group=None,
         ngpus_per_node=0,
+        gpu_type=None,
+        gpu_offload=None,
     ):
         try:
             # Set values for env_case.xml
@@ -2448,6 +2486,8 @@ def create(
                 extra_machines_dir=extra_machines_dir,
                 case_group=case_group,
                 ngpus_per_node=ngpus_per_node,
+                gpu_type=gpu_type,
+                gpu_offload=gpu_offload,
             )
 
             self.create_caseroot()
diff --git a/CIME/case/case_setup.py b/CIME/case/case_setup.py
index aa8fb8b6b6c..363e8f4fab4 100644
--- a/CIME/case/case_setup.py
+++ b/CIME/case/case_setup.py
@@ -21,7 +21,6 @@
     copy_local_macros_to_dir,
 )
 from CIME.utils import batch_jobid
-from CIME.utils import transform_vars
 from CIME.test_status import *
 from CIME.locked_files import unlock_file, lock_file
 
@@ -482,31 +481,3 @@ def case_setup(self, clean=False, test_mode=False, reset=False, keep=None):
             caseroot=caseroot,
             is_batch=is_batch,
         )
-
-    # put the following section here to make sure the rundir is generated first
-    machdir = self.get_value("MACHDIR")
-    mach = self.get_value("MACH")
-    ngpus_per_node = self.get_value("NGPUS_PER_NODE")
-    overrides = {}
-    overrides["ngpus_per_node"] = ngpus_per_node
-    input_template = os.path.join(machdir, "mpi_run_gpu.{}".format(mach))
-    if os.path.isfile(input_template):
-        # update the wrapper script that sets the device id for each MPI rank
-        output_text = transform_vars(
-            open(input_template, "r").read(), case=self, overrides=overrides
-        )
-
-        # write it out to the run dir
-        rundir = self.get_value("RUNDIR")
-        output_name = os.path.join(rundir, "set_device_rank.sh")
-        logger.info("Creating file {}".format(output_name))
-        with open(output_name, "w") as f:
-            f.write(output_text)
-
-        # make the wrapper script executable
-        if os.path.isfile(output_name):
-            os.system("chmod +x " + output_name)
-        else:
-            expect(
-                False, "The file {} is not written out correctly.".format(output_name)
-            )
diff --git a/CIME/config.py b/CIME/config.py
index 8491b2f3f2e..3cef6cc0530 100644
--- a/CIME/config.py
+++ b/CIME/config.py
@@ -177,11 +177,6 @@ def __init__(self):
             False,
             desc="If set to `True` then COMP_ROOT_DIR_CPL is set using UFS_DRIVER if defined.",
         )
-        self._set_attribute(
-            "gpus_use_set_device_rank",
-            True,
-            desc="If set to `True` and NGPUS_PER_NODE > 0 then `$RUNDIR/set_device_rank.sh` is appended when the MPI run command is generated.",
-        )
         self._set_attribute(
             "test_custom_project_machine",
             "melvin",
diff --git a/CIME/data/config/xml_schemas/config_machines.xsd b/CIME/data/config/xml_schemas/config_machines.xsd
index d6e3c280a93..b025c4039e0 100644
--- a/CIME/data/config/xml_schemas/config_machines.xsd
+++ b/CIME/data/config/xml_schemas/config_machines.xsd
@@ -6,6 +6,8 @@
   <xs:attribute name="compiler" type="xs:string"/>
   <xs:attribute name="mpilib" type="xs:string"/>
   <xs:attribute name="comp_interface" type="xs:string"/>
+  <xs:attribute name="gpu_type" type="xs:string"/>
+  <xs:attribute name="gpu_offload" type="xs:string"/>
   <xs:attribute name="queue" type="xs:string"/>
   <xs:attribute name="DEBUG" type="upperBoolean"/>
   <xs:attribute name="PIO_VERSION" type="xs:integer"/>
@@ -56,6 +58,10 @@
   <xs:element name="MAX_TASKS_PER_NODE" type="AttrElement"/>
   <xs:element name="MAX_GPUS_PER_NODE" type="AttrElement"/>
   <xs:element name="MAX_MPITASKS_PER_NODE" type="AttrElement"/>
+  <xs:element name="MAX_CPUTASKS_PER_GPU_NODE" type="AttrElement"/>
+  <xs:element name="GPU_TYPE" type="AttrElement"/>
+  <xs:element name="GPU_OFFLOAD" type="AttrElement"/>
+  <xs:element name="MPI_GPU_WRAPPER_SCRIPT" type="AttrElement"/>
   <xs:element name="COSTPES_PER_NODE" type="xs:integer"/>
   <xs:element name="PROJECT_REQUIRED" type="xs:NCName"/>
   <xs:element name="executable" type="xs:string"/>
@@ -166,6 +172,16 @@
         <!-- MAX_MPITASKS_PER_NODE: number of physical PES per shared node on
              this machine, in practice the MPI tasks per node will not exceed this value -->
         <xs:element ref="MAX_MPITASKS_PER_NODE" minOccurs="1" maxOccurs="unbounded"/>
+        <!-- MAX_CPUTASKS_PER_GPU_NODE: number of physical PES per GPU node on
+             this machine, in practice the MPI tasks per node will not exceed this value -->
+        <xs:element ref="MAX_CPUTASKS_PER_GPU_NODE" minOccurs="0" maxOccurs="unbounded"/>
+	<!-- GPU_TYPE: the type of GPU hardware available on this machine -->
+        <xs:element ref="GPU_TYPE" minOccurs="0" maxOccurs="unbounded"/>
+	<!-- GPU_OFFLOAD: the GPU programming model used for GPU porting -->
+        <xs:element ref="GPU_OFFLOAD" minOccurs="0" maxOccurs="unbounded"/>
+	<!-- MPI_GPU_WRAPPER_SCRIPT: a wrapper script that will be attached to the MPI run
+	     command and map different MPI ranks to different GPUs within the same node -->
+        <xs:element ref="MPI_GPU_WRAPPER_SCRIPT" minOccurs="0" maxOccurs="1"/>
         <!-- Optional cost factor per node unit -->
         <xs:element ref="COSTPES_PER_NODE" minOccurs="0" maxOccurs="1"/>
         <!-- PROJECT_REQUIRED: Does this machine require a project to be specified to
@@ -249,6 +265,8 @@
       <xs:attribute ref="PIO_VERSION"/>
       <xs:attribute ref="mpilib"/>
       <xs:attribute ref="comp_interface"/>
+      <xs:attribute ref="gpu_offload"/>
+      <xs:attribute ref="gpu_type"/>
     </xs:complexType>
   </xs:element>
   <xs:element name="command">
diff --git a/CIME/data/config/xml_schemas/env_mach_specific.xsd b/CIME/data/config/xml_schemas/env_mach_specific.xsd
index f86c6b9f6e1..7778635592b 100644
--- a/CIME/data/config/xml_schemas/env_mach_specific.xsd
+++ b/CIME/data/config/xml_schemas/env_mach_specific.xsd
@@ -9,6 +9,8 @@
 <xs:attribute name="PIO_VERSION" type="xs:integer"/>
 <xs:attribute name="mpilib" type="xs:string"/>
 <xs:attribute name="comp_interface" type="xs:string"/>
+<xs:attribute name="gpu_type" type="xs:string"/>
+<xs:attribute name="gpu_offload" type="xs:string"/>
 <xs:attribute name="SMP_PRESENT" type="xs:string"/>
 <xs:attribute name="value" type="xs:string"/>
 <xs:attribute name="unit_testing" type="xs:boolean"/>
@@ -102,6 +104,8 @@
       <xs:attribute ref="PIO_VERSION" />
       <xs:attribute ref="mpilib"/>
       <xs:attribute ref="comp_interface"/>
+      <xs:attribute ref="gpu_type"/>
+      <xs:attribute ref="gpu_offload"/>
     </xs:complexType>
   </xs:element>
 
diff --git a/CIME/scripts/create_newcase.py b/CIME/scripts/create_newcase.py
index eb82d392994..1e7b33ea315 100755
--- a/CIME/scripts/create_newcase.py
+++ b/CIME/scripts/create_newcase.py
@@ -271,6 +271,18 @@ def parse_command_line(args, cimeroot, description):
         help="Specify number of GPUs used for simulation. ",
     )
 
+    parser.add_argument(
+        "--gpu-type",
+        default=None,
+        help="Specify type of GPU hardware - currently supported are v100, a100, mi250",
+    )
+
+    parser.add_argument(
+        "--gpu-offload",
+        default=None,
+        help="Specify gpu offload method - currently supported are openacc, openmp, combined",
+    )
+
     args = CIME.utils.parse_args_and_handle_standard_logging_options(args, parser)
 
     if args.srcroot is not None:
@@ -347,6 +359,8 @@ def parse_command_line(args, cimeroot, description):
         args.extra_machines_dir,
         args.case_group,
         args.ngpus_per_node,
+        args.gpu_type,
+        args.gpu_offload,
     )
 
 
@@ -384,6 +398,8 @@ def _main_func(description=None):
         extra_machines_dir,
         case_group,
         ngpus_per_node,
+        gpu_type,
+        gpu_offload,
     ) = parse_command_line(sys.argv, cimeroot, description)
 
     if script_root is None:
@@ -449,6 +465,8 @@ def _main_func(description=None):
             extra_machines_dir=extra_machines_dir,
             case_group=case_group,
             ngpus_per_node=ngpus_per_node,
+            gpu_type=gpu_type,
+            gpu_offload=gpu_offload,
         )
 
         # Called after create since casedir does not exist yet
diff --git a/CIME/test_scheduler.py b/CIME/test_scheduler.py
index 1299efa8b14..ef3caedf102 100644
--- a/CIME/test_scheduler.py
+++ b/CIME/test_scheduler.py
@@ -292,6 +292,7 @@ def __init__(
         )
 
         self._clean = clean
+
         self._namelists_only = namelists_only
 
         self._walltime = walltime
@@ -669,8 +670,17 @@ def _create_newcase_phase(self, test):
                     pesize = case_opt[1:]
                     create_newcase_cmd += " --pecount {}".format(pesize)
                 elif case_opt.startswith("G"):
-                    ngpus_per_node = case_opt[1:]
-                    create_newcase_cmd += " --ngpus-per-node {}".format(ngpus_per_node)
+                    if "-" in case_opt:
+                        ngpus_per_node, gpu_type, gpu_offload = case_opt[1:].split("-")
+                    else:
+                        error = "GPU test argument format is ngpus_per_node-gpu_type-gpu_offload"
+                        self._log_output(test, error)
+                        return False, error
+                    create_newcase_cmd += (
+                        " --ngpus-per-node {} --gpu-type {} --gpu-offload {}".format(
+                            ngpus_per_node, gpu_type, gpu_offload
+                        )
+                    )
                 elif case_opt.startswith("V"):
                     self._cime_driver = case_opt[1:]
                     create_newcase_cmd += " --driver {}".format(self._cime_driver)
diff --git a/CIME/tests/test_unit_case.py b/CIME/tests/test_unit_case.py
index ed473cea21f..dd4d18edf66 100755
--- a/CIME/tests/test_unit_case.py
+++ b/CIME/tests/test_unit_case.py
@@ -251,6 +251,8 @@ def test_copy(
                     extra_machines_dir=None,
                     case_group=None,
                     ngpus_per_node=0,
+                    gpu_type=None,
+                    gpu_offload=None,
                 )
                 create_caseroot.assert_called()
                 apply_user_mods.assert_called()
@@ -326,6 +328,8 @@ def test_create(
                     extra_machines_dir=None,
                     case_group=None,
                     ngpus_per_node=0,
+                    gpu_type=None,
+                    gpu_offload=None,
                 )
                 create_caseroot.assert_called()
                 apply_user_mods.assert_called()
diff --git a/Externals.cfg b/Externals.cfg
index 9f4591dda86..01537b54197 100644
--- a/Externals.cfg
+++ b/Externals.cfg
@@ -1,19 +1,19 @@
 [ccs_config]
-tag = ccs_config_cesm0.0.63
+tag = ccs_config_cesm0.0.76
 protocol = git
 repo_url = https://github.com/ESMCI/ccs_config_cesm
 local_path = ccs_config
 required = True
 
 [cmeps]
-tag = cmeps0.14.16
+tag = cmeps0.14.38
 protocol = git
 repo_url = https://github.com/ESCOMP/CMEPS.git
 local_path = components/cmeps
 required = True
 
 [cdeps]
-tag = cdeps1.0.7
+tag = cdeps1.0.19
 protocol = git
 repo_url = https://github.com/ESCOMP/CDEPS.git
 local_path = components/cdeps
@@ -21,7 +21,7 @@ externals = Externals_CDEPS.cfg
 required = True
 
 [cpl7]
-tag = cpl77.0.5
+tag = cpl77.0.6
 protocol = git
 repo_url = https://github.com/ESCOMP/CESM_CPL7andDataComps
 local_path = components/cpl7
@@ -42,7 +42,7 @@ local_path = libraries/mct
 required = True
 
 [parallelio]
-tag = pio2_5_10
+tag = pio2_6_0
 protocol = git
 repo_url = https://github.com/NCAR/ParallelIO
 local_path = libraries/parallelio
diff --git a/doc/source/users_guide/cime-customize.rst b/doc/source/users_guide/cime-customize.rst
index ed90e21472a..6431f5c388a 100644
--- a/doc/source/users_guide/cime-customize.rst
+++ b/doc/source/users_guide/cime-customize.rst
@@ -44,7 +44,6 @@ default_short_term_archiving       True                     bool   If set to `Tr
 driver_choices                     ('mct', 'nuopc')         tuple  Sets the available driver choices for the model.
 driver_default                     nuopc                    str    Sets the default driver for the model.
 enable_smp                         True                     bool   If set to `True` then `SMP=` is added to model compile command.
-gpus_use_set_device_rank           True                     bool   If set to `True` and NGPUS_PER_NODE > 0 then `$RUNDIR/set_device_rank.sh` is appended when the MPI run command is generated.
 make_case_run_batch_script         False                    bool   If set to `True` and case is not a test then `case.run.sh` is created in case directory from `$MACHDIR/template.case.run.sh`.
 mct_path                           {srcroot}/libraries/mct  str    Sets the path to the mct library.
 serialize_sharedlib_builds         True                     bool   If set to `True` then the TestScheduler will use `proc_pool + 1` processors to build shared libraries otherwise a single processor is used.