Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix ddp tests + .test() #2512

Merged
merged 436 commits into from
Jul 7, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
436 commits
Select commit Hold shift + click to select a range
5c2976c
added base tests for tpu
williamFalcon Jul 5, 2020
d11bb1a
added base tests for tpu
williamFalcon Jul 5, 2020
3f67989
added base tests for tpu
williamFalcon Jul 5, 2020
b75e5d4
added base tests for tpu
williamFalcon Jul 5, 2020
ccca15e
added base tests for tpu
williamFalcon Jul 5, 2020
22e756e
added base tests for tpu
williamFalcon Jul 5, 2020
e977380
added base tests for tpu
williamFalcon Jul 5, 2020
b9de0c4
added base tests for tpu
williamFalcon Jul 5, 2020
a9b67db
added base tests for tpu
williamFalcon Jul 5, 2020
e5585a5
added base tests for tpu
williamFalcon Jul 5, 2020
b7b378d
added base tests for tpu
williamFalcon Jul 5, 2020
fb9c139
added base tests for tpu
williamFalcon Jul 5, 2020
2cab5df
added base tests for tpu
williamFalcon Jul 5, 2020
4dbef0f
added base tests for tpu
williamFalcon Jul 5, 2020
8687047
added base tests for tpu
williamFalcon Jul 5, 2020
906cfbb
added base tests for tpu
williamFalcon Jul 5, 2020
38cce5d
added base tests for tpu
williamFalcon Jul 5, 2020
6ba26f8
added base tests for tpu
williamFalcon Jul 5, 2020
7fe58fd
added base tests for tpu
williamFalcon Jul 5, 2020
8e96881
added base tests for tpu
williamFalcon Jul 5, 2020
ce7eab3
added base tests for tpu
williamFalcon Jul 5, 2020
30a5228
added base tests for tpu
williamFalcon Jul 5, 2020
977284d
added base tests for tpu
williamFalcon Jul 5, 2020
fda1f3d
added base tests for tpu
williamFalcon Jul 5, 2020
8fcc6eb
added base tests for tpu
williamFalcon Jul 5, 2020
307e08b
added base tests for tpu
williamFalcon Jul 5, 2020
f0227e8
added base tests for tpu
williamFalcon Jul 5, 2020
4f55014
added base tests for tpu
williamFalcon Jul 5, 2020
cf2f22f
added base tests for tpu
williamFalcon Jul 5, 2020
2e11dc2
added base tests for tpu
williamFalcon Jul 5, 2020
ed78895
added base tests for tpu
williamFalcon Jul 5, 2020
7a2cdbc
added base tests for tpu
williamFalcon Jul 5, 2020
3518b7f
added base tests for tpu
williamFalcon Jul 5, 2020
3dcaf09
added base tests for tpu
williamFalcon Jul 5, 2020
61ad421
added base tests for tpu
williamFalcon Jul 5, 2020
336bfec
added base tests for tpu
williamFalcon Jul 5, 2020
ffa8fc7
added base tests for tpu
williamFalcon Jul 5, 2020
8d8d21e
added base tests for tpu
williamFalcon Jul 5, 2020
d112067
added base tests for tpu
williamFalcon Jul 5, 2020
0df633a
added base tests for tpu
williamFalcon Jul 5, 2020
080ccbf
added base tests for tpu
williamFalcon Jul 5, 2020
6ebd9ab
added base tests for tpu
williamFalcon Jul 5, 2020
f7ec5c2
added base tests for tpu
williamFalcon Jul 5, 2020
de4f485
added base tests for tpu
williamFalcon Jul 5, 2020
5e4ab40
added base tests for tpu
williamFalcon Jul 5, 2020
8d2b282
added base tests for tpu
williamFalcon Jul 5, 2020
4fdca74
added base tests for tpu
williamFalcon Jul 5, 2020
6e401c6
added base tests for tpu
williamFalcon Jul 5, 2020
7790675
added base tests for tpu
williamFalcon Jul 5, 2020
4b7afc6
added base tests for tpu
williamFalcon Jul 5, 2020
c20da12
added base tests for tpu
williamFalcon Jul 5, 2020
6ab4df6
added base tests for tpu
williamFalcon Jul 5, 2020
81b5052
added base tests for tpu
williamFalcon Jul 5, 2020
c2fe20a
added base tests for tpu
williamFalcon Jul 5, 2020
72aa9fd
added base tests for tpu
williamFalcon Jul 5, 2020
23b6c1d
added base tests for tpu
williamFalcon Jul 5, 2020
fa32ef7
added base tests for tpu
williamFalcon Jul 5, 2020
4a340b5
added base tests for tpu
williamFalcon Jul 5, 2020
72b6e93
added base tests for tpu
williamFalcon Jul 5, 2020
5f72245
added base tests for tpu
williamFalcon Jul 5, 2020
7fa9864
added base tests for tpu
williamFalcon Jul 5, 2020
065c020
added base tests for tpu
williamFalcon Jul 5, 2020
8996962
added base tests for tpu
williamFalcon Jul 5, 2020
4d34ac7
added base tests for tpu
williamFalcon Jul 5, 2020
2036861
added base tests for tpu
williamFalcon Jul 5, 2020
0abd7c0
added base tests for tpu
williamFalcon Jul 5, 2020
1ed4317
added base tests for tpu
williamFalcon Jul 5, 2020
eec34f3
added base tests for tpu
williamFalcon Jul 5, 2020
5790ce2
added base tests for tpu
williamFalcon Jul 5, 2020
b532422
added base tests for tpu
williamFalcon Jul 5, 2020
6c26d9d
added base tests for tpu
williamFalcon Jul 5, 2020
2e82faf
added base tests for tpu
williamFalcon Jul 5, 2020
5d7ad63
added base tests for tpu
williamFalcon Jul 5, 2020
41f5c1d
added base tests for tpu
williamFalcon Jul 5, 2020
9595874
added base tests for tpu
williamFalcon Jul 5, 2020
6d4014e
added base tests for tpu
williamFalcon Jul 5, 2020
36a8cf6
added base tests for tpu
williamFalcon Jul 5, 2020
15ced0f
added base tests for tpu
williamFalcon Jul 5, 2020
eaf3478
added base tests for tpu
williamFalcon Jul 5, 2020
7c16724
added base tests for tpu
williamFalcon Jul 5, 2020
33fbbc8
added base tests for tpu
williamFalcon Jul 5, 2020
ed1f697
added base tests for tpu
williamFalcon Jul 5, 2020
a9fd3e0
added base tests for tpu
williamFalcon Jul 5, 2020
69ac053
added base tests for tpu
williamFalcon Jul 5, 2020
4bfee48
added base tests for tpu
williamFalcon Jul 5, 2020
c2e3c9d
added base tests for tpu
williamFalcon Jul 5, 2020
4be6eda
added base tests for tpu
williamFalcon Jul 5, 2020
fbd3ad7
added base tests for tpu
williamFalcon Jul 5, 2020
003f564
added base tests for tpu
williamFalcon Jul 5, 2020
196488a
added base tests for tpu
williamFalcon Jul 5, 2020
45f2556
added base tests for tpu
williamFalcon Jul 5, 2020
9fb51a8
added base tests for tpu
williamFalcon Jul 5, 2020
061c5c1
added base tests for tpu
williamFalcon Jul 5, 2020
408c802
added base tests for tpu
williamFalcon Jul 5, 2020
c3708cf
added base tests for tpu
williamFalcon Jul 5, 2020
043ae97
added base tests for tpu
williamFalcon Jul 5, 2020
44949aa
added base tests for tpu
williamFalcon Jul 6, 2020
c1b88b1
added base tests for tpu
williamFalcon Jul 6, 2020
95cab83
added base tests for tpu
williamFalcon Jul 6, 2020
07fdb41
added base tests for tpu
williamFalcon Jul 6, 2020
74c5464
added base tests for tpu
williamFalcon Jul 6, 2020
2ee786e
added base tests for tpu
williamFalcon Jul 6, 2020
e400ded
added base tests for tpu
williamFalcon Jul 6, 2020
ba50de3
added base tests for tpu
williamFalcon Jul 6, 2020
8ca1aa4
added base tests for tpu
williamFalcon Jul 6, 2020
cda6b94
added base tests for tpu
williamFalcon Jul 6, 2020
37184aa
added base tests for tpu
williamFalcon Jul 6, 2020
c96d7a2
added base tests for tpu
williamFalcon Jul 6, 2020
341d9b2
added base tests for tpu
williamFalcon Jul 6, 2020
b064e81
added base tests for tpu
williamFalcon Jul 6, 2020
ae49eb0
added base tests for tpu
williamFalcon Jul 6, 2020
7d3c181
added base tests for tpu
williamFalcon Jul 6, 2020
acc9707
added base tests for tpu
williamFalcon Jul 6, 2020
e3e46d8
added base tests for tpu
williamFalcon Jul 6, 2020
f4c7073
added base tests for tpu
williamFalcon Jul 6, 2020
8d3ba76
added base tests for tpu
williamFalcon Jul 6, 2020
92efcd8
added base tests for tpu
williamFalcon Jul 6, 2020
af890e6
added base tests for tpu
williamFalcon Jul 6, 2020
78ebf54
added base tests for tpu
williamFalcon Jul 6, 2020
f3e47cf
added base tests for tpu
williamFalcon Jul 6, 2020
c660dd1
added base tests for tpu
williamFalcon Jul 6, 2020
32a4b2e
added base tests for tpu
williamFalcon Jul 6, 2020
111555c
added base tests for tpu
williamFalcon Jul 6, 2020
ff66eab
added base tests for tpu
williamFalcon Jul 6, 2020
ffeb1a1
added base tests for tpu
williamFalcon Jul 6, 2020
5a58e8a
added base tests for tpu
williamFalcon Jul 6, 2020
31a7487
added base tests for tpu
williamFalcon Jul 6, 2020
32929e4
added base tests for tpu
williamFalcon Jul 6, 2020
bb2b8a8
added base tests for tpu
williamFalcon Jul 6, 2020
202b621
added base tests for tpu
williamFalcon Jul 6, 2020
bd66b59
added base tests for tpu
williamFalcon Jul 6, 2020
8c61f32
added base tests for tpu
williamFalcon Jul 6, 2020
d33ca8a
added base tests for tpu
williamFalcon Jul 6, 2020
3697588
added base tests for tpu
williamFalcon Jul 6, 2020
2c59d66
added base tests for tpu
williamFalcon Jul 6, 2020
e5742d8
added base tests for tpu
williamFalcon Jul 6, 2020
e87612c
added base tests for tpu
williamFalcon Jul 6, 2020
20fe68d
added base tests for tpu
williamFalcon Jul 6, 2020
502f900
added base tests for tpu
williamFalcon Jul 6, 2020
d188609
added base tests for tpu
williamFalcon Jul 6, 2020
49af187
added base tests for tpu
williamFalcon Jul 6, 2020
faf51eb
added base tests for tpu
williamFalcon Jul 6, 2020
6cd9d9c
added base tests for tpu
williamFalcon Jul 6, 2020
a801af5
added base tests for tpu
williamFalcon Jul 6, 2020
9a35bfa
added base tests for tpu
williamFalcon Jul 6, 2020
050b449
added base tests for tpu
williamFalcon Jul 6, 2020
7638438
added base tests for tpu
williamFalcon Jul 6, 2020
2df850f
added base tests for tpu
williamFalcon Jul 6, 2020
2440e75
added base tests for tpu
williamFalcon Jul 6, 2020
86d95c9
added base tests for tpu
williamFalcon Jul 6, 2020
772c521
added base tests for tpu
williamFalcon Jul 6, 2020
a227aff
added base tests for tpu
williamFalcon Jul 6, 2020
34591b9
added base tests for tpu
williamFalcon Jul 6, 2020
281344d
added base tests for tpu
williamFalcon Jul 6, 2020
e4a7de4
added base tests for tpu
williamFalcon Jul 6, 2020
da1486d
added base tests for tpu
williamFalcon Jul 6, 2020
65026f4
added base tests for tpu
williamFalcon Jul 6, 2020
22fa88c
added base tests for tpu
williamFalcon Jul 6, 2020
6f9c9dc
added base tests for tpu
williamFalcon Jul 6, 2020
1d3ef64
added base tests for tpu
williamFalcon Jul 6, 2020
cd8e7f9
added base tests for tpu
williamFalcon Jul 6, 2020
38c62d2
added base tests for tpu
williamFalcon Jul 6, 2020
68dd3bc
added base tests for tpu
williamFalcon Jul 6, 2020
0f8c60c
added base tests for tpu
williamFalcon Jul 6, 2020
8982911
added base tests for tpu
williamFalcon Jul 6, 2020
e1a5661
added base tests for tpu
williamFalcon Jul 6, 2020
153e0b1
added base tests for tpu
williamFalcon Jul 6, 2020
27a8d14
added base tests for tpu
williamFalcon Jul 6, 2020
2aa94c8
added base tests for tpu
williamFalcon Jul 6, 2020
cc2483d
added base tests for tpu
williamFalcon Jul 6, 2020
a06128c
added base tests for tpu
williamFalcon Jul 6, 2020
2f81c0d
added base tests for tpu
williamFalcon Jul 6, 2020
73ac3eb
added base tests for tpu
williamFalcon Jul 6, 2020
f04d147
added base tests for tpu
williamFalcon Jul 6, 2020
0df9467
added base tests for tpu
williamFalcon Jul 6, 2020
5a339cd
added base tests for tpu
williamFalcon Jul 6, 2020
2e322e0
added base tests for tpu
williamFalcon Jul 6, 2020
f60c085
added base tests for tpu
williamFalcon Jul 6, 2020
57144f9
added base tests for tpu
williamFalcon Jul 6, 2020
99247a2
fix deprecation warnings
Borda Jul 7, 2020
81edf53
added base tests for tpu
williamFalcon Jul 7, 2020
825a454
added base tests for tpu
williamFalcon Jul 7, 2020
2facba2
Update pytorch_lightning/trainer/trainer.py
williamFalcon Jul 7, 2020
789e65b
added base tests for tpu
williamFalcon Jul 7, 2020
4bae01b
added base tests for tpu
williamFalcon Jul 7, 2020
a8fe4c1
added base tests for tpu
williamFalcon Jul 7, 2020
d742572
added base tests for tpu
williamFalcon Jul 7, 2020
ef26a62
added base tests for tpu
williamFalcon Jul 7, 2020
223b6b2
added base tests for tpu
williamFalcon Jul 7, 2020
9af87b7
added base tests for tpu
williamFalcon Jul 7, 2020
e143840
added base tests for tpu
williamFalcon Jul 7, 2020
8693c7c
added base tests for tpu
williamFalcon Jul 7, 2020
87f87a3
added base tests for tpu
williamFalcon Jul 7, 2020
bf4fe46
added base tests for tpu
williamFalcon Jul 7, 2020
ae98c6e
added base tests for tpu
williamFalcon Jul 7, 2020
5d01d4b
added base tests for tpu
williamFalcon Jul 7, 2020
aa30e40
added base tests for tpu
williamFalcon Jul 7, 2020
7b0deeb
added base tests for tpu
williamFalcon Jul 7, 2020
aa3a9b6
added base tests for tpu
williamFalcon Jul 7, 2020
6405a29
added base tests for tpu
williamFalcon Jul 7, 2020
64e1c1f
added base tests for tpu
williamFalcon Jul 7, 2020
ed4a295
added base tests for tpu
williamFalcon Jul 7, 2020
8ff2dec
added base tests for tpu
williamFalcon Jul 7, 2020
118a875
added base tests for tpu
williamFalcon Jul 7, 2020
5f6557d
added base tests for tpu
williamFalcon Jul 7, 2020
a179513
added base tests for tpu
williamFalcon Jul 7, 2020
50470af
added base tests for tpu
williamFalcon Jul 7, 2020
e76da74
added base tests for tpu
williamFalcon Jul 7, 2020
3c58e1b
added base tests for tpu
williamFalcon Jul 7, 2020
2e52107
added base tests for tpu
williamFalcon Jul 7, 2020
e689360
added base tests for tpu
williamFalcon Jul 7, 2020
0aa1920
added base tests for tpu
williamFalcon Jul 7, 2020
656ccf8
added base tests for tpu
williamFalcon Jul 7, 2020
cd25b09
added base tests for tpu
williamFalcon Jul 7, 2020
ba7c546
added base tests for tpu
williamFalcon Jul 7, 2020
c377af7
added base tests for tpu
williamFalcon Jul 7, 2020
0dd05b6
added base tests for tpu
williamFalcon Jul 7, 2020
fbaa81d
added base tests for tpu
williamFalcon Jul 7, 2020
98ea716
added base tests for tpu
williamFalcon Jul 7, 2020
79a8cb7
added base tests for tpu
williamFalcon Jul 7, 2020
6a9e9f6
added base tests for tpu
williamFalcon Jul 7, 2020
ab36924
added base tests for tpu
williamFalcon Jul 7, 2020
01ba566
added base tests for tpu
williamFalcon Jul 7, 2020
a8b034c
added base tests for tpu
williamFalcon Jul 7, 2020
9473a34
added base tests for tpu
williamFalcon Jul 7, 2020
f549d14
added base tests for tpu
williamFalcon Jul 7, 2020
13ba1a5
added base tests for tpu
williamFalcon Jul 7, 2020
b3d6b3f
added base tests for tpu
williamFalcon Jul 7, 2020
5067e87
added base tests for tpu
williamFalcon Jul 7, 2020
4c99856
added base tests for tpu
williamFalcon Jul 7, 2020
0640c38
added base tests for tpu
williamFalcon Jul 7, 2020
05d9e0f
added base tests for tpu
williamFalcon Jul 7, 2020
6a9d317
added base tests for tpu
williamFalcon Jul 7, 2020
0024941
added base tests for tpu
williamFalcon Jul 7, 2020
00f3624
added base tests for tpu
williamFalcon Jul 7, 2020
743d6d6
added base tests for tpu
williamFalcon Jul 7, 2020
5d11a0c
added base tests for tpu
williamFalcon Jul 7, 2020
7d08987
added base tests for tpu
williamFalcon Jul 7, 2020
890c846
added base tests for tpu
williamFalcon Jul 7, 2020
33939b6
added base tests for tpu
williamFalcon Jul 7, 2020
0f5c54e
added base tests for tpu
williamFalcon Jul 7, 2020
16b6eb0
added base tests for tpu
williamFalcon Jul 7, 2020
face96f
added base tests for tpu
williamFalcon Jul 7, 2020
7cc326f
added base tests for tpu
williamFalcon Jul 7, 2020
7abaae7
added base tests for tpu
williamFalcon Jul 7, 2020
3ba0f74
added base tests for tpu
williamFalcon Jul 7, 2020
e076453
added base tests for tpu
williamFalcon Jul 7, 2020
786f893
added base tests for tpu
williamFalcon Jul 7, 2020
7bd0feb
added base tests for tpu
williamFalcon Jul 7, 2020
47d8f0d
added base tests for tpu
williamFalcon Jul 7, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pytorch_lightning/core/decorators.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def data_loader(fn):
Warnings:
This decorator deprecated in v0.7.0 and it will be removed v0.9.0.
"""
rank_zero_warn('`data_loader` decorator deprecated in v0.7.0. Will be removed v0.9.0', DeprecationWarning)
rank_zero_warn("`data_loader` decorator deprecated in v0.7.0. It will be removed in v0.9.0", DeprecationWarning)

def inner_fx(self):
return fn(self)
Expand Down
4 changes: 4 additions & 0 deletions pytorch_lightning/loggers/tensorboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,10 @@ def experiment(self) -> SummaryWriter:
self._experiment = SummaryWriter(log_dir=self.log_dir, **self._kwargs)
return self._experiment

@experiment.setter
def experiment(self, exp):
self._experiment = exp

Comment on lines +109 to +112
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
@experiment.setter
def experiment(self, exp):
self._experiment = exp

I think this is not needed anymore?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rank_zero_only
def log_hyperparams(self, params: Union[Dict[str, Any], Namespace],
metrics: Optional[Dict[str, Any]] = None) -> None:
Expand Down
7 changes: 3 additions & 4 deletions pytorch_lightning/trainer/data_loading.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ def reset_train_dataloader(self, model: LightningModule) -> None:
self.num_training_batches = len(self.train_dataloader)
self.num_training_batches = int(self.num_training_batches * self.limit_train_batches)
else:
self.num_training_batches = self.limit_train_batches
self.num_training_batches = min(len(self.train_dataloader), self.limit_train_batches)

# determine when to check validation
# if int passed in, val checks that often
Expand Down Expand Up @@ -313,7 +313,7 @@ def _reset_eval_dataloader(
if isinstance(limit_eval_batches, float):
num_batches = int(num_batches * limit_eval_batches)
else:
num_batches = limit_eval_batches
num_batches = min(len(dataloader), limit_eval_batches)

elif limit_eval_batches not in (0.0, 1.0):
raise MisconfigurationException(
Expand All @@ -340,8 +340,7 @@ def reset_val_dataloader(self, model: LightningModule) -> None:
model: The current `LightningModule`
"""
if self.is_overridden('validation_step'):
self.num_val_batches, self.val_dataloaders = \
self._reset_eval_dataloader(model, 'val')
self.num_val_batches, self.val_dataloaders = self._reset_eval_dataloader(model, 'val')

def reset_test_dataloader(self, model) -> None:
"""Resets the validation dataloader and determines the number of batches.
Expand Down
60 changes: 44 additions & 16 deletions pytorch_lightning/trainer/distrib_data_parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,8 @@ def train_fx(trial_hparams, cluster_manager, _):
from time import sleep
import numpy as np
from os.path import abspath
from torch import distributed as dist
import queue

import torch
from pytorch_lightning import _logger as log
Expand Down Expand Up @@ -163,6 +165,10 @@ def train_fx(trial_hparams, cluster_manager, _):
else:
XLA_AVAILABLE = True

pid = os.getpid()
rng1 = np.random.RandomState(pid)
RANDOM_PORTS = rng1.randint(10000, 19999, 100)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

does this cause a failure for a distributed cluster > 100 nodes?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the random_port thing is only used in non-multi node ddp.



class TrainerDDPMixin(ABC):

Expand All @@ -178,6 +184,7 @@ class TrainerDDPMixin(ABC):
use_tpu: bool
default_root_dir: str
progress_bar_callback: ...
checkpoint_callback: ...
num_processes: int
num_nodes: int
node_rank: int
Expand Down Expand Up @@ -377,17 +384,19 @@ def set_nvidia_flags(self, is_slurm_managing_tasks, data_parallel_device_ids):
# don't make this debug... this is good UX
rank_zero_info(f'CUDA_VISIBLE_DEVICES: [{os.environ["CUDA_VISIBLE_DEVICES"]}]')

def set_random_port(self):
def set_random_port(self, force=False):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jeremyjordan this function is only ever called from ddp on a single node... not distributed

"""
When running DDP NOT managed by SLURM, the ports might collide
"""
try:
default_port = os.environ['MASTER_PORT']
except Exception:
# use the process id as a seed to a generator for port only
pid = os.getpid()
rng1 = np.random.RandomState(pid)
default_port = rng1.randint(10000, 19999, 1)[0]
# pick a random port first
assert self.num_nodes == 1, 'random port can only be called from single node training'
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jeremyjordan added this to make sure it's used as expected

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I'm understanding this correctly, it looks like this will disable multi-node support (at least, I'm not able to run across multiple nodes anymore due to this assertion - see issue here: flatironinstitute/deepblast#46)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mortonjt can you open a github issue about this and explain how you launched your script?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure thing. See #2578

global RANDOM_PORTS
default_port = RANDOM_PORTS[-1]
RANDOM_PORTS = RANDOM_PORTS[:-1]

# when not forced, use the user port
if not force:
default_port = os.environ.get('MASTER_PORT', default_port)

os.environ['MASTER_PORT'] = str(default_port)

Expand Down Expand Up @@ -446,15 +455,24 @@ def spawn_ddp_children(self, model):
sleep(delay)

local_rank = 0
self.ddp_train(local_rank, model, is_master=True)
results = self.ddp_train(local_rank, q=None, model=model, is_master=True)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

rather longer var name the q

del os.environ['WORLD_SIZE']

def ddp_train(self, process_idx, model, is_master=False, proc_offset=0):
return results

def ddp_train(self, process_idx, q, model, is_master=False, proc_offset=0):
"""
Entry point into a DP thread
:param gpu_idx:
:param model:
:param cluster_obj:
:return:
Entry point for ddp

Args:
process_idx:
q:
model:
is_master:
proc_offset:

Returns:

"""
# offset the process id if requested
process_idx = process_idx + proc_offset
Expand Down Expand Up @@ -535,7 +553,17 @@ def ddp_train(self, process_idx, model, is_master=False, proc_offset=0):
model = model.configure_ddp(model, device_ids)

# continue training routine
self.run_pretrain_routine(model)
results = self.run_pretrain_routine(model)

# clean up memory
torch.cuda.empty_cache()

if self.global_rank == 0 and q is not None:
q.put(self.checkpoint_callback.best_model_path)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this feels hacky, what are we trying to do here? return the state of a callback to the main node? why put this specific attribute in the queue?

Copy link
Member

@awaelchli awaelchli Jul 7, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think he did this so that one can call .test in the main process, which will access the best model to test on. But I agree with you, this seems fragile and dangerous to modify state across processes, there's gotta be a better way. Could one put the whole trainer in the queue in theory?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried that but it didn’t work. I think in a different PR we can do something like a state_dict for the trainer.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i agree this isn't optimal but let's get a release that fixes all the test issues (which this PR does) and then we can figure out a longer term strategy

q.put(results)

if self.global_rank == 0 and self.distributed_backend != 'ddp_spawn':
return results

def save_spawn_weights(self, model):
"""
Expand Down
17 changes: 11 additions & 6 deletions pytorch_lightning/trainer/distrib_parts.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from pytorch_lightning.utilities import move_data_to_device, NATIVE_AMP_AVALAIBLE
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities.distributed import rank_zero_only
from pytorch_lightning.utilities import rank_zero_warn

try:
from apex import amp
Expand Down Expand Up @@ -182,7 +183,8 @@ def single_gpu_train(self, model):
self.optimizers = optimizers
self.reinit_scheduler_properties(self.optimizers, self.lr_schedulers)

self.run_pretrain_routine(model)
results = self.run_pretrain_routine(model)
return results

def tpu_train(self, tpu_core_idx, model):
# call setup after the ddp process has connected
Expand Down Expand Up @@ -221,6 +223,7 @@ def tpu_train(self, tpu_core_idx, model):

# when training ends on these platforms dump weights to get out of the main process
if self.on_colab_kaggle:
rank_zero_warn('cleaning up... please do not interrupt')
self.save_spawn_weights(model)

def dp_train(self, model):
Expand All @@ -229,12 +232,12 @@ def dp_train(self, model):
if self.is_function_implemented('setup', model):
model.setup('fit')

model.cuda(self.root_gpu)

# CHOOSE OPTIMIZER
# allow for lr schedulers as well
self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers(model)

model.cuda(self.root_gpu)

# hack forward to do autocast for the user
model_autocast_original_forward = model.forward
if self.use_amp and NATIVE_AMP_AVALAIBLE:
Expand Down Expand Up @@ -264,10 +267,11 @@ def dp_train(self, model):

model = LightningDataParallel(model, device_ids=device_ids)

self.run_pretrain_routine(model)

result = self.run_pretrain_routine(model)
model.forward = model_autocast_original_forward

return result

def horovod_train(self, model):
# call setup after the ddp process has connected
self.setup('fit')
Expand Down Expand Up @@ -325,10 +329,11 @@ def filter_named_parameters(model, optimizer):
# Synchronization will be performed explicitly following backward()
stack.enter_context(optimizer.skip_synchronize())

self.run_pretrain_routine(model)
result = self.run_pretrain_routine(model)

# Make sure all workers have finished training before returning to the user
hvd.join()
return result


def _normalize_parse_gpu_string_input(s: Union[int, str, List[int]]) -> Union[int, List[int]]:
Expand Down
7 changes: 5 additions & 2 deletions pytorch_lightning/trainer/evaluation_loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,7 @@ def _evaluate(
if self.is_overridden('test_end', model=model):
# TODO: remove in v1.0.0
eval_results = model.test_end(outputs)
rank_zero_warn('Method `test_end` was deprecated in v0.7 and will be removed v1.0.'
rank_zero_warn('Method `test_end` was deprecated in v0.7 and will be removed in v1.0.'
' Use `test_epoch_end` instead.', DeprecationWarning)

elif self.is_overridden('test_epoch_end', model=model):
Expand All @@ -335,7 +335,7 @@ def _evaluate(
if self.is_overridden('validation_end', model=model):
# TODO: remove in v1.0.0
eval_results = model.validation_end(outputs)
rank_zero_warn('Method `validation_end` was deprecated in v0.7 and will be removed v1.0.'
rank_zero_warn('Method `validation_end` was deprecated in v0.7 and will be removed in v1.0.'
' Use `validation_epoch_end` instead.', DeprecationWarning)

elif self.is_overridden('validation_epoch_end', model=model):
Expand Down Expand Up @@ -391,6 +391,7 @@ def run_evaluation(self, test_mode: bool = False):
eval_results = self._evaluate(self.model, dataloaders, max_batches, test_mode)

# enable no returns
callback_metrics = {}
if eval_results is not None and len(eval_results) > 0:
_, prog_bar_metrics, log_metrics, callback_metrics, _ = self.process_output(eval_results)

Expand Down Expand Up @@ -428,6 +429,8 @@ def run_evaluation(self, test_mode: bool = False):
else:
self.on_validation_end()

return callback_metrics

def evaluation_forward(self, model, batch, batch_idx, dataloader_idx, test_mode: bool = False):
# make dataloader_idx arg in validation_step optional
args = [batch, batch_idx]
Expand Down
Loading