Feature/sg 456 centralize ddp setup #544

Merged (38 commits, Jan 5, 2023)

Commits
9710cf4
first version
Louis-Dupont Dec 5, 2022
cebe631
breaking the code
Louis-Dupont Dec 5, 2022
158c0eb
fix env_helper
Louis-Dupont Dec 5, 2022
92ff519
improve doc
Louis-Dupont Dec 5, 2022
7fc6934
fix tests
Louis-Dupont Dec 5, 2022
b0d974e
Merge branch 'master' into feature/SG-456-centralise_ddp_setup
Louis-Dupont Dec 5, 2022
40554ed
remove multigpu.OFF on tests, and add better exception when using dev…
Louis-Dupont Dec 6, 2022
1d07cd9
improve error raising
Louis-Dupont Dec 6, 2022
132fae0
fix tests
Louis-Dupont Dec 6, 2022
e226196
wip
Louis-Dupont Dec 6, 2022
5107905
refacto of environment names wip
Louis-Dupont Dec 7, 2022
4b3ab3c
Merge branch 'master' into feature/SG-456-centralise_ddp_setup
Louis-Dupont Dec 12, 2022
8b37912
reorganise post merge
Louis-Dupont Dec 12, 2022
736133f
Merge branch 'master' into feature/SG-456-centralise_ddp_setup
Louis-Dupont Dec 12, 2022
14e6e80
fix
Louis-Dupont Dec 12, 2022
bdb6abc
wip
Louis-Dupont Dec 12, 2022
11d9ecd
wip
Louis-Dupont Dec 12, 2022
cc9676e
Merge branch 'master' into feature/SG-456-centralise_ddp_setup
Louis-Dupont Dec 13, 2022
c2e85de
add option to set device
Louis-Dupont Dec 13, 2022
f5369b0
done
Louis-Dupont Dec 13, 2022
27cad03
Merge branch 'hotgix/SG-000-fix_multigpu_OFF' into feature/SG-456-cen…
Louis-Dupont Dec 13, 2022
dc2d565
wip
Louis-Dupont Dec 13, 2022
b847222
wip
Louis-Dupont Dec 13, 2022
a200b71
support torch.distributed.launch
Louis-Dupont Dec 13, 2022
4598dfd
Merge branch 'master' into feature/SG-456-centralise_ddp_setup
Louis-Dupont Dec 13, 2022
2f06889
reorganise pop_local_rank
Louis-Dupont Dec 13, 2022
e170b1a
remove unused logger
Louis-Dupont Dec 13, 2022
b19b77a
undo useless change
Louis-Dupont Dec 13, 2022
907b174
run on CPU if no CUDA
Louis-Dupont Dec 13, 2022
98bbe9c
Merge branch 'master' into feature/SG-456-centralise_ddp_setup
Louis-Dupont Dec 13, 2022
43451ac
fix
Louis-Dupont Dec 13, 2022
33daa93
fix
Louis-Dupont Dec 13, 2022
fcff5b5
fix
Louis-Dupont Dec 13, 2022
77f07ff
fix kd trainer
Louis-Dupont Dec 13, 2022
cc94d8e
Merge branch 'master' into feature/SG-456-centralise_ddp_setup
Louis-Dupont Dec 26, 2022
99a61a2
Merge branch 'master' into feature/SG-456-centralise_ddp_setup
Louis-Dupont Jan 5, 2023
a9f61d8
remove unwanted change
Louis-Dupont Jan 5, 2023
0c064c3
remove unwanted change
Louis-Dupont Jan 5, 2023
2 changes: 2 additions & 0 deletions src/super_gradients/__init__.py
@@ -3,6 +3,7 @@
from super_gradients.examples.train_from_recipe_example import train_from_recipe
from super_gradients.examples.train_from_kd_recipe_example import train_from_kd_recipe
from super_gradients.sanity_check import env_sanity_check
from super_gradients.training.utils.distributed_training_utils import setup_device

__all__ = [
    "ARCHITECTURES",
@@ -18,6 +19,7 @@
    "train_from_recipe",
    "train_from_kd_recipe",
    "env_sanity_check",
    "setup_device",
]

__version__ = "3.0.5"
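
The newly exported setup_device is intended as the single entry point for device/DDP configuration, replacing the per-Trainer multi_gpu argument. A minimal usage sketch (the multi_gpu/num_gpus keyword names are assumptions based on this PR, not a documented contract):

# Hypothetical usage sketch: configure the device/DDP mode once, before building the Trainer.
from super_gradients import Trainer, setup_device

setup_device(multi_gpu="DDP", num_gpus=4)  # assumed keywords; e.g. multi_gpu="OFF" would force single-device training
trainer = Trainer("my_experiment")  # no multi_gpu argument needed anymore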
11 changes: 11 additions & 0 deletions src/super_gradients/common/environment/argparse_utils.py
@@ -1,8 +1,11 @@
import argparse
import sys
from typing import Any
from super_gradients.common.abstractions.abstract_logger import get_logger


logger = get_logger(__name__)

EXTRA_ARGS = []


@@ -18,3 +21,11 @@ def pop_arg(arg_name: str, default_value: Any = None) -> Any:
        EXTRA_ARGS.append(val)
        sys.argv.remove(val)
    return vars(args)[arg_name]


def pop_local_rank() -> int:
    """Pop the python arg "local_rank". If it exists, inform the user with a log; otherwise return -1."""
    local_rank = pop_arg("local_rank", default_value=-1)
    if local_rank != -1:
        logger.info("local_rank was automatically parsed from your config.")
    return local_rank
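
For context, this argument-popping pattern exists because torch.distributed.launch injects --local_rank=<n> into every subprocess, which would otherwise collide with Hydra/argparse parsing downstream. A standalone sketch of the same idea (illustrative only, not the library code):

import argparse
import sys


def pop_cli_arg(arg_name: str, default_value=None):
    """Parse --<arg_name> if present, then strip it from sys.argv so later parsers never see it."""
    parser = argparse.ArgumentParser()
    parser.add_argument(f"--{arg_name}", type=int, default=default_value)
    args, _ = parser.parse_known_args()
    # Drop every "--<arg_name>=<value>" token so downstream CLIs do not choke on it.
    sys.argv = [tok for tok in sys.argv if not tok.startswith(f"--{arg_name}")]
    return getattr(args, arg_name)


if __name__ == "__main__":
    sys.argv += ["--local_rank=2"]  # simulate what torch.distributed.launch would inject
    print(pop_cli_arg("local_rank", default_value=-1))  # -> 2
    print(sys.argv)  # the flag is gone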
35 changes: 10 additions & 25 deletions src/super_gradients/common/environment/ddp_utils.py
@@ -2,41 +2,24 @@
import socket
from functools import wraps

from super_gradients.common.environment.argparse_utils import pop_arg
from super_gradients.common.environment.device_utils import device_config
from super_gradients.common.environment.omegaconf_utils import register_hydra_resolvers


DDP_LOCAL_RANK = int(os.getenv("LOCAL_RANK", default=-1))
INIT_TRAINER = False
from super_gradients.common.environment.argparse_utils import pop_local_rank


def init_trainer():
    """
    Initialize the super_gradients environment.

    This function should be the first thing to be called by any code running super_gradients.
    It resolves conflicts between the different tools, packages and environments used and prepares the super_gradients environment.
    """
    global INIT_TRAINER, DDP_LOCAL_RANK

    if not INIT_TRAINER:
        register_hydra_resolvers()

        # We pop local_rank if it was specified in the args, because it would break
        args_local_rank = pop_arg("local_rank", default_value=-1)

        # Set local_rank with priority order (env variable > args.local_rank > args.default_value)
        DDP_LOCAL_RANK = int(os.getenv("LOCAL_RANK", default=args_local_rank))
        INIT_TRAINER = True
    register_hydra_resolvers()
    pop_local_rank()


def is_distributed() -> bool:
    return DDP_LOCAL_RANK >= 0


def is_rank_0() -> bool:
    """Check if the node was launched with torch.distributed.launch and if the node is of rank 0"""
    return os.getenv("LOCAL_RANK") == "0"
    """Check if current process is a DDP subprocess."""
    return device_config.assigned_rank >= 0


def is_launched_using_sg():
@@ -55,7 +38,9 @@ def is_main_process():
    """
    if not is_distributed():  # If no DDP, or DDP launching process
        return True
    elif is_rank_0() and not is_launched_using_sg():  # If DDP launched using torch.distributed.launch or torchrun, we need to run the check on rank 0
    elif (
        device_config.assigned_rank == 0 and not is_launched_using_sg()
    ):  # If DDP launched using torch.distributed.launch or torchrun, we need to run the check on rank 0
        return True
    else:
        return False
@@ -74,7 +59,7 @@ def do_nothing(*args, **kwargs):

    @wraps(func)
    def wrapper(*args, **kwargs):
        if DDP_LOCAL_RANK <= 0:
        if device_config.assigned_rank <= 0:
            return func(*args, **kwargs)
        else:
            return do_nothing(*args, **kwargs)
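
The wrapper above is the usual rank-zero guard: the decorated function runs only in the main process, since assigned_rank <= 0 covers both single-process runs (-1) and DDP rank 0. The decorator's name is not visible in this hunk, so the following self-contained sketch uses an illustrative name rather than the library's:

import os
from functools import wraps


def main_process_only(func):
    """Run func only when this process is rank 0, or not part of a DDP run at all."""
    rank = int(os.getenv("LOCAL_RANK", default=-1))

    @wraps(func)
    def wrapper(*args, **kwargs):
        if rank <= 0:  # -1 = no DDP, 0 = DDP rank 0
            return func(*args, **kwargs)
        return None  # silently skip on every other rank

    return wrapper


@main_process_only
def log_to_console(msg: str) -> None:
    print(msg)


log_to_console("only the main process prints this")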
28 changes: 28 additions & 0 deletions src/super_gradients/common/environment/device_utils.py
@@ -0,0 +1,28 @@
import os
import dataclasses

import torch

from super_gradients.common.environment.argparse_utils import pop_local_rank


__all__ = ["device_config"]


def _get_assigned_rank() -> int:
    """Get the rank assigned by DDP launcher. If not DDP subprocess, return -1."""
    if os.getenv("LOCAL_RANK") is not None:
        return int(os.getenv("LOCAL_RANK"))
    else:
        return pop_local_rank()


@dataclasses.dataclass
class DeviceConfig:
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    multi_gpu: str = None
    assigned_rank: str = dataclasses.field(default=_get_assigned_rank(), init=False)


# Singleton holding the device information
device_config = DeviceConfig()
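
Because device_config is a module-level singleton, any part of the code base can read the current device state without threading it through function arguments, and the setup utilities can mutate it in one place. A hedged usage sketch, assuming the package is installed with this file in place:

from super_gradients.common.environment.device_utils import device_config

print(device_config.device)         # "cuda" if a GPU is visible, else "cpu"
print(device_config.multi_gpu)      # None until a setup utility (e.g. setup_device) assigns a mode
print(device_config.assigned_rank)  # -1 outside DDP, otherwise the rank given by the launcher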
@@ -12,7 +12,6 @@

from super_gradients import Trainer
from super_gradients.training.metrics.classification_metrics import Accuracy, Top5
from super_gradients.training import MultiGPUMode
from torch.optim import ASGD
from torch.optim.lr_scheduler import MultiStepLR, ReduceLROnPlateau
from torch.nn import CrossEntropyLoss
@@ -49,7 +48,7 @@
]

# Bring everything together with Trainer and start training
trainer = Trainer("Cifar10_external_objects_example", multi_gpu=MultiGPUMode.OFF)
trainer = Trainer("Cifar10_external_objects_example")

train_params = {
"max_epochs": 300,
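
The change in this example reflects the point of the PR: the multi-GPU mode is no longer chosen per Trainer but configured globally before training starts. A hedged before/after sketch (keyword names assumed from this PR):

# Before: every Trainer received its own mode.
# trainer = Trainer("Cifar10_external_objects_example", multi_gpu=MultiGPUMode.OFF)

# After: configure the device once, then build the Trainer without a multi_gpu argument.
from super_gradients import Trainer, setup_device

setup_device(multi_gpu="OFF")  # assumed keyword; single-device training
trainer = Trainer("Cifar10_external_objects_example")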