Lightning-AI · williamFalcon · May 17, 2020 · Apr 19, 2020 · Apr 26, 2020 · May 3, 2020
@@ -398,6 +398,7 @@ class TrainerDPMixin(ABC):
     data_parallel_device_ids: ...
     logger: Union[LightningLoggerBase, bool]
     progress_bar_callback: ...
+    tpu_id: int
 
     @property
     @abstractmethod
@@ -443,7 +444,7 @@ def __transfer_data_to_device(self, batch, device, gpu_id=None):
         if device == 'tpu' and XLA_AVAILABLE:
             # base case: object can be directly moved using `to`
             if callable(getattr(batch, 'to', None)):
-                return batch.to(xm.xla_device())
+                return batch.to(xm.xla_device(self.tpu_id))
 
         if device == 'gpu':
             # base case: object can be directly moved using `cuda` or `to`
@@ -498,7 +499,7 @@ def single_gpu_train(self, model):
 
     def tpu_train(self, tpu_core_idx, model):
         # put model on tpu
-        model.to(xm.xla_device())
+        model.to(xm.xla_device(self.tpu_id))
 
         # get the appropriate tpu ranks
         self.tpu_local_core_rank = xm.get_local_ordinal()

@@ -175,6 +175,7 @@ class TrainerEvaluationLoopMixin(ABC):
     val_dataloaders: DataLoader
     use_tpu: bool
     reload_dataloaders_every_epoch: ...
+    tpu_id: int
 
     # Callback system
     on_validation_batch_start: Callable
@@ -249,8 +250,8 @@ def _evaluate(self, model: LightningModule, dataloaders, max_batches: int, test_
             dl_outputs = []
 
             # on TPU we have to wrap it under the ParallelLoader
-            if self.use_tpu:
-                device = xm.xla_device()
+            if self.use_tpu and self.tpu_id is None:
+                device = xm.xla_device(self.tpu_id)
                 dataloader = xla_pl.ParallelLoader(dataloader, [device])
                 dataloader = dataloader.per_device_loader(device)
 

@@ -90,6 +90,7 @@ def __init__(
             gpus: Optional[Union[List[int], str, int]] = None,
             auto_select_gpus: bool = False,
             num_tpu_cores: Optional[int] = None,
+            tpu_id: Optional[int] = None,
             log_gpu_memory: Optional[str] = None,
             progress_bar_refresh_rate: int = 1,
             overfit_pct: float = 0.0,
@@ -321,6 +322,8 @@ def __init__(
         self.num_tpu_cores = num_tpu_cores
         assert num_tpu_cores in [1, 8, None], 'num_tpu_cores can only be 1 or 8'
 
+        self.tpu_id = tpu_id
+
         if num_processes != 1 and distributed_backend != "ddp_cpu":
             rank_zero_warn("num_processes is only used for distributed_backend=\"ddp_cpu\". Ignoring it.")
         self.num_processes = num_processes
@@ -775,7 +778,10 @@ def fit(
             self.model = model
 
             # train
-            xmp.spawn(self.tpu_train, args=(model,), nprocs=self.num_tpu_cores, start_method=start_method)
+            if self.tpu_id is not None:
+                self.tpu_train(self.tpu_id, model)
+            else:
+                xmp.spawn(self.tpu_train, args=(model,), nprocs=self.num_tpu_cores, start_method=start_method)
 
             # load weights if not interrupted
             self.load_spawn_weights(model)

@@ -231,6 +231,7 @@ class TrainerTrainLoopMixin(ABC):
     total_batch_idx: int
     checkpoint_callback: ...
     terminate_on_nan: bool
+    tpu_id: int
 
     # Callback system
     callbacks: List[Callback]
@@ -393,8 +394,8 @@ def run_training_epoch(self):
         train_dataloader = self.train_dataloader
 
         # on TPU we have to wrap it under the ParallelLoader
-        if self.use_tpu:
-            device = xm.xla_device()
+        if self.use_tpu and self.tpu_id is None:
+            device = xm.xla_device(self.tpu_id)
             train_dataloader = xla_pl.ParallelLoader(train_dataloader, [device])
             train_dataloader = train_dataloader.per_device_loader(device)