Allow str and dict in Trainer init signature (mosaicml#277)

coryMosaicML · Feb 23, 2022 · 196cfc4 · 196cfc4
1 parent 2a63326
commit 196cfc4
Show file tree

Hide file tree

Showing 2 changed files with 30 additions and 16 deletions.
diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py
@@ -72,8 +72,8 @@ class Trainer:
             object because the scheduler needs an optimizer to be constructed and we construct the optimizer
             in `__init__`. (default:
             ``[CosineAnnealingLRHparams(T_max=f"{max_epochs}ep"), WarmUpLRHparams()]``).
-        device (Device, optional): The device to use for training. Either `DeviceCPU` or `DeviceGPU`.
-            (default ``DeviceCPU(n_cpus=1)``)
+        device (str or Device, optional): The device to use for training. Either ``'cpu'``, ``'gpu'``, or a
+            ``Device`` instance. (default: ``'cpu'``, or ``'gpu'`` when ``deepspeed_hparams`` is set)
         grad_accum (int, optional): The number of microbatches to split a per-device batch into. Gradients
             are summed over the microbatches per device. (default: ``1``)
         grad_clip_norm (float, optional): The norm to clip gradient magnitudes to. Set to None for no gradient
@@ -84,11 +84,12 @@ class Trainer:
             Set to -1 to never validate on a epochwise frequency. (default: ``1``)
         compute_training_metrics (bool, optional): True to compute metrics on training data and False to not.
             (default: ``False``)
-        precision (Precision, optional): Numerical precision to use for training. (default: ``Precision.FP32``).
+        precision (str or Precision, optional): Numerical precision to use for training, one of 'fp32', 'fp16'
+            or 'amp' (recommended). (default: ``Precision.FP32``).
         dist_timeout (float, optional): Timeout, in seconds, for initializing the distributed process group.
             (default: ``15.0``)
-        ddp_sync_strategy (DDPSyncStrategy, optional): The strategy to use for synchronizing gradients.
-            Leave unset to let the trainer auto-configure this.
+        ddp_sync_strategy (str or DDPSyncStrategy, optional): The strategy to use for synchronizing gradients.
+            Leave unset to let the trainer auto-configure this. For more details, see ``DDPSyncStrategy``.
         seed (int, optional): The seed used in randomization. When not provided a random seed
             will be created. (default: ``None``)
         deterministic_mode (bool, optional): Run the model deterministically. Experimental. Performance
@@ -128,15 +129,15 @@ def __init__(
             schedulers_hparams: Optional[Union[SchedulerHparams, List[SchedulerHparams]]] = None,
 
             # device
-            device: Optional[Device] = None,
+            device: Optional[Union[str, Device]] = None,
 
             # training hparams
             grad_accum: int = 1,
             grad_clip_norm: Optional[float] = None,
             validate_every_n_batches: int = -1,
             validate_every_n_epochs: int = 1,
             compute_training_metrics: bool = False,
-            precision: Precision = Precision.FP32,
+            precision: Union[str, Precision] = Precision.FP32,
 
             # dist hparams
             dist_timeout: float = 300.0,
@@ -162,7 +163,7 @@ def __init__(
             eval_subset_num_batches: Optional[int] = None,
 
             # DeepSpeed
-            deepspeed_hparams: Optional[DeepSpeedHparams] = None,
+            deepspeed_hparams: Optional[Union[dict, DeepSpeedHparams]] = None,
 
             # Optional config (ex. an hparams yaml file)
             config: Optional[Dict[str, Any]] = None):
@@ -172,11 +173,23 @@ def __init__(
 
         self.config = config
 
+        if isinstance(deepspeed_hparams, dict):
+            deepspeed_hparams = DeepSpeedHparams(**deepspeed_hparams)
         self.deepspeed_hparams = deepspeed_hparams
 
         if not device:
-            device = DeviceCPU() if not self.deepspeed_hparams is not None else DeviceGPU()
-        self.device = device
+            self.device = DeviceCPU() if not self.deepspeed_hparams is not None else DeviceGPU()
+        elif isinstance(device, str):
+            if device == 'cpu':
+                self.device = DeviceCPU()
+            elif device == 'gpu':
+                self.device = DeviceGPU()
+            else:
+                raise ValueError(f'device ({device}) must be one of (cpu, gpu).')
+        else:
+            if not isinstance(device, Device):
+                raise ValueError('device must be of class Device')
+            self.device = device
 
         if not seed:
             seed = reproducibility.get_random_seed()
@@ -203,7 +216,7 @@ def __init__(
             import deepspeed
             deepspeed.init_distributed()
         else:
-            dist.initialize_dist(device.dist_backend, datetime.timedelta(seconds=dist_timeout))
+            dist.initialize_dist(self.device.dist_backend, datetime.timedelta(seconds=dist_timeout))
             if ddp_sync_strategy is None:
                 self.ddp_sync_strategy = DDPSyncStrategy.SINGLE_AUTO_SYNC if not find_unused_parameters else DDPSyncStrategy.FORCED_SYNC
             else:
@@ -213,6 +226,8 @@ def __init__(
         # handle this with our version of Pytorch
         precision_context = self.device.precision_context if not self.deepspeed_enabled else cast(
             Callable[..., ContextManager], contextlib.nullcontext)
+        if isinstance(precision, str):
+            precision = Precision(precision)
 
         if not isinstance(train_dataloader, DataSpec):
             train_dataloader = DataSpec(train_dataloader)

diff --git a/composer/utils/dist.py b/composer/utils/dist.py
@@ -233,13 +233,12 @@ def initialize_dist(backend: str, timeout: datetime.timedelta):
                                "not available in your installation of PyTorch. Please install or build PyTorch "
                                "with distributed support.")
         return
-    if dist.is_initialized():
 
+    if dist.is_initialized():
         if not dist.get_backend() == backend.lower():
-            raise RuntimeError(
-                f"The requested backend ({backend}) differs from the backend "
-                "of the current process group ({torch.distributed.get_backend()}). If you wish to change backends, "
-                "please restart the python process.")
+            warnings.warn(f"The requested backend ({backend}) differs from the backend "
+                          f"of the current process group ({dist.get_backend()})."
+                          "If you wish to change backends, please restart the python process.")
         return
 
     if "RANK" in os.environ and "WORLD_SIZE" in os.environ: