Skip to content

Commit

Permalink
Replaces ddp .spawn with subprocess (#2029)
Browse files Browse the repository at this point in the history
* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* replace ddp spawn with subprocess

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix

* hot fix
  • Loading branch information
williamFalcon committed Jun 1, 2020
1 parent fd38f52 commit 82a2029
Show file tree
Hide file tree
Showing 19 changed files with 283 additions and 174 deletions.
4 changes: 2 additions & 2 deletions .run_local_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ rm -rf ./tests/cometruns*
rm -rf ./tests/wandb*
rm -rf ./tests/tests/*
rm -rf ./lightning_logs
python -m coverage run --source pytorch_lightning -m py.test pytorch_lightning tests pl_examples -v --doctest-modules --flake8
python -m coverage run --source pytorch_lightning -m py.test pytorch_lightning tests pl_examples -v --doctest-modules --flake8 --durations=0
python -m coverage report -m

# specific file
# python -m coverage run --source pytorch_lightning -m py.test -k test_trainer.py --flake8
# python -m coverage run --source pytorch_lightning -m py.test -k test_trainer.py --flake8 --durations=0
17 changes: 8 additions & 9 deletions pl_examples/basic_examples/cpu_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,25 +10,23 @@
import pytorch_lightning as pl
from pl_examples.models.lightning_template import LightningTemplateModel

SEED = 2334
torch.manual_seed(SEED)
np.random.seed(SEED)
pl.seed_everything(234)


def main(hparams):
def main(args):
"""
Main training routine specific for this project
:param hparams:
:param args:
"""
# ------------------------
# 1 INIT LIGHTNING MODEL
# ------------------------
model = LightningTemplateModel(hparams)
model = LightningTemplateModel(**vars(args))

# ------------------------
# 2 INIT TRAINER
# ------------------------
trainer = pl.Trainer(max_epochs=hparams.epochs, overfit_pct=0.01, early_stop_callback=True)
trainer = pl.Trainer.from_argparse_args(args)

# ------------------------
# 3 START TRAINING
Expand All @@ -46,9 +44,10 @@ def main(hparams):

# each LightningModule defines arguments relevant to it
parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir)
hyperparams = parser.parse_args()
parser = pl.Trainer.add_argparse_args(parser)
args = parser.parse_args()

# ---------------------
# RUN TRAINING
# ---------------------
main(hyperparams)
main(args)
2 changes: 1 addition & 1 deletion pytorch_lightning/core/lightning.py
Original file line number Diff line number Diff line change
Expand Up @@ -957,7 +957,7 @@ def init_ddp_connection(
f"is not equal to the computed world size ({world_size}). Ignored.")

torch_backend = "nccl" if self.trainer.on_gpu else "gloo"
log.info(f"initializing proc_rank {proc_rank} world {world_size}")
log.info(f"initializing ddp: LOCAL_RANK: {proc_rank}/{world_size - 1} WORLD_SIZE:{world_size}")
torch_distrib.init_process_group(torch_backend, rank=proc_rank, world_size=world_size)

def configure_apex(
Expand Down
88 changes: 82 additions & 6 deletions pytorch_lightning/trainer/distrib_data_parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,11 @@ def train_fx(trial_hparams, cluster_manager, _):
import re
from abc import ABC, abstractmethod
from typing import Union
import subprocess
import sys
from time import sleep
import numpy as np
from os.path import abspath

import torch
from pytorch_lightning import _logger as log
Expand Down Expand Up @@ -311,7 +316,7 @@ def set_nvidia_flags(self, is_slurm_managing_tasks, data_parallel_device_ids):
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

# when slurm is managing the task it sets the visible devices
if not is_slurm_managing_tasks:
if not is_slurm_managing_tasks and 'CUDA_VISIBLE_DEVICES' not in os.environ:
if isinstance(data_parallel_device_ids, int):
id_str = ','.join(str(x) for x in list(range(data_parallel_device_ids)))
os.environ["CUDA_VISIBLE_DEVICES"] = id_str
Expand All @@ -322,7 +327,74 @@ def set_nvidia_flags(self, is_slurm_managing_tasks, data_parallel_device_ids):
# don't make this debug... this is good UX
log.info(f'CUDA_VISIBLE_DEVICES: [{os.environ["CUDA_VISIBLE_DEVICES"]}]')

def ddp_train(self, process_idx, model):
def __set_random_port(self):
"""
When running DDP NOT managed by SLURM, the ports might collide
:return:
"""
try:
default_port = os.environ['MASTER_PORT']
except Exception:
import random
default_port = random.randint(10000, 19000)
os.environ['MASTER_PORT'] = str(default_port)

def spawn_ddp_children(self, model):
self.__set_random_port()
port = os.environ['MASTER_PORT']

master_address = '127.0.0.1' if 'MASTER_ADDR' not in os.environ else os.environ['MASTER_ADDR']
os.environ['MASTER_PORT'] = f'{port}'
os.environ['MASTER_ADDR'] = f'{master_address}'

# allow the user to pass the node rank
node_rank = '0'
if 'NODE_RANK' in os.environ:
node_rank = os.environ['NODE_RANK']
if 'GROUP_RANK' in os.environ:
node_rank = os.environ['GROUP_RANK']

os.environ['NODE_RANK'] = node_rank
os.environ['LOCAL_RANK'] = '0'

# pull out the commands used to run the script and resolve the abs file path
command = sys.argv
full_path = abspath(command[0])
command[0] = full_path
command = ['python'] + command

# since this script sets the visible devices we replace the gpus flag with a number
num_gpus = os.environ['CUDA_VISIBLE_DEVICES'].split(',').__len__()

# if script called without a flag, pass in a flag anyhow
if '--gpus' not in command:
arg_gpus = len(self.gpus) if isinstance(self.gpus, list) else self.gpus
command += ['--gpus', arg_gpus]

gpu_flag_idx = command.index('--gpus')
command[gpu_flag_idx + 1] = f'{num_gpus}'

os.environ['WORLD_SIZE'] = f'{num_gpus * self.num_nodes}'

self.interactive_ddp_procs = []
for local_rank in range(1, self.num_processes):
env_copy = os.environ.copy()
env_copy['LOCAL_RANK'] = f'{local_rank}'

# import pdb; pdb.set_trace()
# start process
proc = subprocess.Popen(command, env=env_copy)
self.interactive_ddp_procs.append(proc)

# starting all processes at once can cause issues
# with dataloaders delay between 1-10 seconds
delay = np.random.uniform(1, 5, 1)[0]
sleep(delay)

local_rank = 0
self.ddp_train(local_rank, model, is_master=True)

def ddp_train(self, process_idx, model, is_master=False):
"""
Entry point into a DP thread
:param gpu_idx:
Expand Down Expand Up @@ -359,7 +431,14 @@ def ddp_train(self, process_idx, model):
# MODEL
# copy model to each gpu
if self.on_gpu:
self.root_gpu = process_idx
gpu_idx = process_idx
if is_master:
# source of truth is cuda for gpu idx
gpus = os.environ['CUDA_VISIBLE_DEVICES'].split(',')
local_rank = int(os.environ['LOCAL_RANK'])
gpu_idx = int(gpus[local_rank])

self.root_gpu = gpu_idx
torch.cuda.set_device(self.root_gpu)
model.cuda(self.root_gpu)

Expand Down Expand Up @@ -388,9 +467,6 @@ def ddp_train(self, process_idx, model):
# continue training routine
self.run_pretrain_routine(model)

# when ddp ends, we save the model
self.save_spawn_weights(model)

def save_spawn_weights(self, model):
"""
Dump a temporary checkpoint after ddp ends to get weights out of the process
Expand Down
10 changes: 10 additions & 0 deletions pytorch_lightning/trainer/distrib_parts.py
Original file line number Diff line number Diff line change
Expand Up @@ -685,8 +685,18 @@ def sanitize_gpu_ids(gpus):
:return: unmodified gpus variable
"""
all_available_gpus = get_all_available_gpus()
misconfig = False
for gpu in gpus:
if gpu not in all_available_gpus:
misconfig = True

if misconfig:
# sometimes auto ddp might have different flags
# but this is not what the user intended
# correct for the user
if len(gpus) == len(all_available_gpus):
gpus = all_available_gpus
else:
raise MisconfigurationException(f"""
You requested GPUs: {gpus}
But your machine only has: {all_available_gpus}
Expand Down
38 changes: 14 additions & 24 deletions pytorch_lightning/trainer/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities import rank_zero_warn, parsing


try:
from apex import amp
except ImportError:
Expand Down Expand Up @@ -119,7 +118,7 @@ def __init__(
distributed_backend: Optional[str] = None,
precision: int = 32,
print_nan_grads: bool = False, # backward compatible, todo: remove in v0.9.0
weights_summary: Optional[str] = 'full',
weights_summary: Optional[str] = 'top',
weights_save_path: Optional[str] = None,
num_sanity_val_steps: int = 2,
truncated_bptt_steps: Optional[int] = None,
Expand Down Expand Up @@ -494,6 +493,7 @@ def __init__(
# init flags for SLURM+ddp to work
self.proc_rank = 0
self.world_size = 1
self.interactive_ddp_procs = []
self.configure_slurm_ddp(self.num_nodes)
self.node_rank = self.determine_ddp_node_rank()

Expand Down Expand Up @@ -871,16 +871,12 @@ def fit(
task = int(os.environ['LOCAL_RANK'])
self.ddp_train(task, model)

else:
self.__set_random_port()
# track for predict
elif self.distributed_backend == 'cpu_ddp':
self.model = model
# train
mp.spawn(self.ddp_train, nprocs=self.num_processes, args=(model,))
# load weights if not interrupted
if self.on_colab_kaggle:
self.load_spawn_weights(model)
self.model = model

elif self.distributed_backend == 'ddp':
self.spawn_ddp_children(model)

# 1 gpu or dp option triggers training using DP module
# easier to avoid NCCL issues
Expand Down Expand Up @@ -928,18 +924,6 @@ def fit(
# used for testing or when we need to know that training succeeded
return 1

def __set_random_port(self):
"""
When running DDP NOT managed by SLURM, the ports might collide
:return:
"""
try:
default_port = os.environ['MASTER_PORT']
except Exception:
import random
default_port = random.randint(10000, 19000)
os.environ['MASTER_PORT'] = str(default_port)

def __attach_dataloaders(self, model, train_dataloader=None, val_dataloaders=None, test_dataloaders=None):
# when dataloader is passed via fit, patch the train_dataloader
# functions to overwrite with these implementations
Expand Down Expand Up @@ -1046,7 +1030,10 @@ def run_pretrain_routine(self, model: LightningModule):

# clear cache before training
if self.on_gpu:
torch.cuda.empty_cache()
# use context because of:
# https://discuss.pytorch.org/t/out-of-memory-when-i-use-torch-cuda-empty-cache/57898
with torch.cuda.device(f'cuda:{self.root_gpu}'):
torch.cuda.empty_cache()

# CORE TRAINING LOOP
self.train()
Expand Down Expand Up @@ -1096,7 +1083,10 @@ def test(
if model is not None:
self.model = model
self.fit(model)
elif self.use_ddp or self.use_tpu: # pragma: no-cover

# on tpu, .spawn means we don't have a trained model
# TODO: remove TPU spawn
elif self.use_tpu: # pragma: no-cover
# attempt to load weights from a spawn
path = os.path.join(self.default_root_dir, '__temp_weight_ddp_end.ckpt')
test_model = self.model
Expand Down
35 changes: 19 additions & 16 deletions pytorch_lightning/trainer/training_loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ def training_step(self, batch, batch_idx):
from pytorch_lightning.trainer.supporters import TensorRunningAccum
from pytorch_lightning.utilities import rank_zero_warn
from pytorch_lightning.utilities.exceptions import MisconfigurationException
import subprocess

try:
from apex import amp
Expand Down Expand Up @@ -305,13 +306,13 @@ def has_arg(self, *args):

def train(self):
# add signal handlers for process kills
def _signal_kill_handler(*args):
return TrainerTrainLoopMixin.run_training_teardown(self)

orig_signal_handlers = {}
for sig_name in SIGNAL_TERMINATE:
orig_signal_handlers[sig_name] = signal.signal(getattr(signal, sig_name),
_signal_kill_handler)
# def _signal_kill_handler(*args):
# return TrainerTrainLoopMixin.run_training_teardown(self)
#
# orig_signal_handlers = {}
# for sig_name in SIGNAL_TERMINATE:
# orig_signal_handlers[sig_name] = signal.signal(getattr(signal, sig_name),
# _signal_kill_handler)

# get model
model = self.get_model()
Expand Down Expand Up @@ -384,15 +385,17 @@ def _signal_kill_handler(*args):

self.run_training_teardown()

# reset signal handlers
for sig_name in SIGNAL_TERMINATE:
signal.signal(getattr(signal, sig_name), orig_signal_handlers[sig_name])

except KeyboardInterrupt:
if self.proc_rank == 0:
log.info('Detected KeyboardInterrupt, attempting graceful shutdown...')
self.interrupted = True
self.run_training_teardown()
rank_zero_warn('Detected KeyboardInterrupt, attempting graceful shutdown...')

# user could press ctrl+c many times... only shutdown once
if not self.interrupted:
self.interrupted = True

for proc in self.interactive_ddp_procs:
subprocess.Popen.kill(proc)

self.run_training_teardown()

def run_training_epoch(self):

Expand Down Expand Up @@ -678,7 +681,7 @@ def _get_optimizers_iterable(self):
opt_idx = np.argmax(optimizer_freq_cumsum > current_place_in_loop)
return [(opt_idx, self.optimizers[opt_idx])]

@atexit.register
# @atexit.register
def run_training_teardown(self):
if hasattr(self, '_teardown_already_run') and self._teardown_already_run:
return
Expand Down
2 changes: 1 addition & 1 deletion tests/base/model_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def dataloader(self, train):
loader = DataLoader(
dataset=dataset,
batch_size=self.batch_size,
# test and valid shall not be shuffled
num_workers=3,
shuffle=train,
)
return loader
Expand Down
4 changes: 2 additions & 2 deletions tests/base/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def assert_speed_parity(pl_times, pt_times, num_epochs):
f"lightning was slower than PT (threshold {max_diff_per_epoch})"


def run_model_test_without_loggers(trainer_options, model, min_acc=0.50):
def run_model_test_without_loggers(trainer_options, model, min_acc=0.30):
reset_seed()

# fit model
Expand Down Expand Up @@ -155,7 +155,7 @@ def load_model_from_checkpoint(root_weights_dir, module_class=EvalModelTemplate)
return trained_model


def run_prediction(dataloader, trained_model, dp=False, min_acc=0.5):
def run_prediction(dataloader, trained_model, dp=False, min_acc=0.3):
# run prediction on 1 batch
for batch in dataloader:
break
Expand Down
Loading

0 comments on commit 82a2029

Please sign in to comment.