minist ddp You requested GPUs: [6, 7] But your machine only has: [0, 1] #1710

Closed · xinfangliu opened this issue May 3, 2020 · 1 comment · Fixed by #2029
Labels: bug (Something isn't working), help wanted (Open to be worked on)

Comments

@xinfangliu

import os

import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST
from torchvision import transforms

import pytorch_lightning as pl

class CoolSystem(pl.LightningModule):

    def __init__(self, hparams=None):
        super().__init__()

        # get hyperparams, etc...
        self.hparams = hparams

        # not the best model...
        self.l1 = torch.nn.Linear(28 * 28, 10)

    def forward(self, x):
        # called with self(x)
        return torch.relu(self.l1(x.view(x.size(0), -1)))

    def training_step(self, batch, batch_idx):
        # REQUIRED
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y)
        tensorboard_logs = {'train_loss': loss}
        return {'loss': loss, 'log': tensorboard_logs}

    def validation_step(self, batch, batch_idx):
        # OPTIONAL
        x, y = batch
        y_hat = self(x)
        val_loss = F.cross_entropy(y_hat, y)
        return {'val_loss': val_loss}

    def validation_epoch_end(self, outputs):
        # OPTIONAL
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        tensorboard_logs = {'val_loss': avg_loss}
        return {'val_loss': avg_loss, 'log': tensorboard_logs}

    def test_step(self, batch, batch_idx):
        # OPTIONAL
        x, y = batch
        y_hat = self(x)
        return {'test_loss': F.cross_entropy(y_hat, y)}

    def test_epoch_end(self, outputs):
        avg_loss = torch.stack([x['test_loss'] for x in outputs]).mean()
        tensorboard_logs = {'test_loss': avg_loss}
        return {'test_loss': avg_loss, 'log': tensorboard_logs}

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.001)

    def prepare_data(self):
        self.mnist_train = MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor())
        self.mnist_test = MNIST(os.getcwd(), train=False, download=True, transform=transforms.ToTensor())

    def train_dataloader(self):
        loader = DataLoader(self.mnist_train, batch_size=32, num_workers=4)
        return loader

    def val_dataloader(self):
        loader = DataLoader(self.mnist_test, batch_size=32, num_workers=4)
        return loader

    def test_dataloader(self):
        loader = DataLoader(self.mnist_test, batch_size=32, num_workers=4)
        return loader

from pytorch_lightning import Trainer

model = CoolSystem()

# most basic trainer, uses good defaults
trainer = Trainer(gpus=[6, 7],
                  num_nodes=4,
                  distributed_backend='ddp',
                  progress_bar_refresh_rate=10, max_epochs=10)
trainer.fit(model)
INFO:lightning:GPU available: True, used: True
INFO:lightning:CUDA_VISIBLE_DEVICES: [6,7]
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/home1/liuxinfang/anaconda3/envs/MomentRetrival/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main
    exitcode = _main(fd)
  File "/home1/liuxinfang/anaconda3/envs/MomentRetrival/lib/python3.7/multiprocessing/spawn.py", line 114, in _main
    prepare(preparation_data)
  File "/home1/liuxinfang/anaconda3/envs/MomentRetrival/lib/python3.7/multiprocessing/spawn.py", line 225, in prepare
    _fixup_main_from_path(data['init_main_from_path'])
  File "/home1/liuxinfang/anaconda3/envs/MomentRetrival/lib/python3.7/multiprocessing/spawn.py", line 277, in _fixup_main_from_path
    run_name="__mp_main__")
  File "/home1/liuxinfang/anaconda3/envs/MomentRetrival/lib/python3.7/runpy.py", line 263, in run_path
    pkg_name=pkg_name, script_name=fname)
  File "/home1/liuxinfang/anaconda3/envs/MomentRetrival/lib/python3.7/runpy.py", line 96, in _run_module_code
    mod_name, mod_spec, pkg_name, script_name)
  File "/home1/liuxinfang/anaconda3/envs/MomentRetrival/lib/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home1/liuxinfang/projects/minist/model.py", line 87, in <module>
    progress_bar_refresh_rate=10, max_epochs=10)
  File "/home1/liuxinfang/anaconda3/envs/MomentRetrival/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 438, in __init__
    self.data_parallel_device_ids = parse_gpu_ids(self.gpus)
  File "/home1/liuxinfang/anaconda3/envs/MomentRetrival/lib/python3.7/site-packages/pytorch_lightning/trainer/distrib_parts.py", line 712, in parse_gpu_ids
    gpus = sanitize_gpu_ids(gpus)
  File "/home1/liuxinfang/anaconda3/envs/MomentRetrival/lib/python3.7/site-packages/pytorch_lightning/trainer/distrib_parts.py", line 678, in sanitize_gpu_ids
    """)
pytorch_lightning.utilities.exceptions.MisconfigurationException:
You requested GPUs: [6, 7]
But your machine only has: [0, 1]
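
The spawn frames above show multiprocessing re-importing model.py in the worker process and re-running the module-level Trainer(...) call (line 87). By that point CUDA_VISIBLE_DEVICES has already been narrowed to 6,7 (second INFO line), so the worker most likely sees only two devices, renumbered [0, 1], and re-parsing gpus=[6, 7] against that set raises the exception. Independent of whatever the linked fix in #2029 changes, spawn-based DDP expects the launching script to be import-safe; a minimal sketch of that layout (keeping the Trainer arguments from the report) is:

# model.py -- sketch of an import-safe layout for spawn-based DDP, not the
# reporter's exact script. The LightningModule stays at module level; the code
# that launches training lives under the __main__ guard so the spawn re-import
# of this file does not construct a second Trainer.
from pytorch_lightning import Trainer

def main():
    model = CoolSystem()
    trainer = Trainer(gpus=[6, 7],
                      num_nodes=4,
                      distributed_backend='ddp',
                      progress_bar_refresh_rate=10,
                      max_epochs=10)
    trainer.fit(model)

if __name__ == '__main__':
    main()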

My machine actually has 8 GPUs. Since GPUs 0 and 1 are in use by other users, I need to use 6 and 7, which have enough free memory. The code runs normally on a single GPU (6 or 7), but fails with more than one GPU.
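
A workaround often used in this situation (not from this thread, and assuming the installed Lightning version honours a CUDA_VISIBLE_DEVICES value that is already set): mask the devices yourself before anything initializes CUDA, then request the GPUs by their remapped indices, since the visible devices are renumbered starting from 0.

# Hypothetical workaround sketch: expose only physical GPUs 6 and 7 to this
# process, so they appear as cuda:0 and cuda:1. Equivalent to launching with
# `CUDA_VISIBLE_DEVICES=6,7 python model.py`.
import os
os.environ.setdefault('CUDA_VISIBLE_DEVICES', '6,7')  # must happen before CUDA is initialized

from pytorch_lightning import Trainer
# CoolSystem defined or imported exactly as in the report

if __name__ == '__main__':
    model = CoolSystem()
    trainer = Trainer(gpus=2,  # the two visible devices, i.e. physical GPUs 6 and 7
                      distributed_backend='ddp',
                      max_epochs=10)
    trainer.fit(model)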

@xinfangliu added the bug (Something isn't working) and help wanted (Open to be worked on) labels on May 3, 2020
@xinfangliu (Author)

My machine actually has 8 GPUs. Since GPUs 0 and 1 are in use by other users, I need to use 6 and 7, which have enough free memory. The code runs normally on a single GPU (6 or 7), but fails with more than one GPU.
