Commit

Merge branch 'master' into horovod_fix
Borda committed Apr 30, 2020
2 parents 6cd2834 + 142bc02 commit bcaafc3
Showing 14 changed files with 242 additions and 18 deletions.
2 changes: 1 addition & 1 deletion .github/PULL_REQUEST_TEMPLATE.md
@@ -4,7 +4,7 @@
- [ ] Did you read the [contributor guideline](https://github.com/PyTorchLightning/pytorch-lightning/blob/master/.github/CONTRIBUTING.md), Pull Request section?
- [ ] Did you make sure to update the docs?
- [ ] Did you write any new necessary tests?
- [ ] If you made a notable change (that affects users), did you update the [CHANGELOG](https://github.com/PyTorchLightning/pytorch-lightning/blob/master/.github/CHANGELOG.md)?
- [ ] If you made a notable change (that affects users), did you update the [CHANGELOG](https://github.com/PyTorchLightning/pytorch-lightning/blob/master/CHANGELOG.md)?

<!-- For CHANGELOG separate each item in unreleased section by blank line to reduce collisions -->

4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -8,6 +8,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

### Added

- Added callback for logging learning rates ([#1498](https://github.com/PyTorchLightning/pytorch-lightning/pull/1498))

### Changed

### Deprecated
@@ -18,6 +20,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

- Fixed ModelCheckpoint not None checking filepath ([#1654](https://github.com/PyTorchLightning/pytorch-lightning/pull/1654))

- Trainer now calls `on_load_checkpoint()` when resuming from a checkpoint ([#1666](https://github.com/PyTorchLightning/pytorch-lightning/pull/1666))

- Fixed Horovod distributed backend to set the `root_gpu` property ([#1669](https://github.com/PyTorchLightning/pytorch-lightning/pull/1669))


8 changes: 8 additions & 0 deletions docs/source/callbacks.rst
@@ -84,3 +84,11 @@ We successfully extended functionality without polluting our super clean
.. automodule:: pytorch_lightning.callbacks.progress
   :noindex:
   :exclude-members:

---------

.. automodule:: pytorch_lightning.callbacks.lr_logger
   :noindex:
   :exclude-members:
        _extract_lr,
        _find_names
2 changes: 1 addition & 1 deletion docs/source/new-project.rst
@@ -100,7 +100,7 @@ To also add a validation loop add the following functions
    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        tensorboard_logs = {'val_loss': avg_loss}
        return {'val_loss': avg_loss, 'log': tensorboard_logs
        return {'val_loss': avg_loss, 'log': tensorboard_logs}

    def val_dataloader(self):
        # TODO: do a real train/val split
2 changes: 2 additions & 0 deletions pytorch_lightning/callbacks/__init__.py
@@ -2,13 +2,15 @@
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks.gradient_accumulation_scheduler import GradientAccumulationScheduler
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
from pytorch_lightning.callbacks.lr_logger import LearningRateLogger
from pytorch_lightning.callbacks.progress import ProgressBarBase, ProgressBar

__all__ = [
    'Callback',
    'EarlyStopping',
    'ModelCheckpoint',
    'GradientAccumulationScheduler',
    'LearningRateLogger',
    'ProgressBarBase',
    'ProgressBar',
]
118 changes: 118 additions & 0 deletions pytorch_lightning/callbacks/lr_logger.py
@@ -0,0 +1,118 @@
r"""
Logging of learning rates
=========================

Log learning rates for lr schedulers during training

"""

from pytorch_lightning.callbacks.base import Callback
from pytorch_lightning.utilities.exceptions import MisconfigurationException


class LearningRateLogger(Callback):
    r"""
    Automatically logs learning rate for learning rate schedulers during training.

    Example::

        >>> from pytorch_lightning import Trainer
        >>> from pytorch_lightning.callbacks import LearningRateLogger
        >>> lr_logger = LearningRateLogger()
        >>> trainer = Trainer(callbacks=[lr_logger])

    Logging names are automatically determined based on the optimizer class name.
    In case of multiple optimizers of the same type, they will be named `Adam`,
    `Adam-1` etc. If an optimizer has multiple parameter groups they will
    be named `Adam/pg1`, `Adam/pg2` etc. To control naming, pass in a
    `name` keyword in the construction of the learning rate schedulers.

    Example::

        def configure_optimizers(self):
            optimizer = torch.optim.Adam(...)
            lr_scheduler = {'scheduler': torch.optim.lr_scheduler.LambdaLR(optimizer, ...),
                            'name': 'my_logging_name'}
            return [optimizer], [lr_scheduler]

    """
    def __init__(self):
        self.lrs = None
        self.lr_sch_names = []

    def on_train_start(self, trainer, pl_module):
        """ Called before training; determines unique names for all lr
            schedulers in the case of multiple of the same type or in
            the case of multiple parameter groups.
        """
        if trainer.lr_schedulers == []:
            raise MisconfigurationException(
                'Cannot use LearningRateLogger callback with models that have no'
                ' learning rate schedulers. Please see documentation for'
                ' `configure_optimizers` method.')

        if not trainer.logger:
            raise MisconfigurationException(
                'Cannot use LearningRateLogger callback with Trainer that has no logger.')

        # Find names for schedulers
        names = self._find_names(trainer.lr_schedulers)

        # Initialize for storing values
        self.lrs = dict.fromkeys(names, [])

    def on_batch_start(self, trainer, pl_module):
        latest_stat = self._extract_lr(trainer, 'step')
        if trainer.logger and latest_stat:
            trainer.logger.log_metrics(latest_stat, step=trainer.global_step)

    def on_epoch_start(self, trainer, pl_module):
        latest_stat = self._extract_lr(trainer, 'epoch')
        if trainer.logger and latest_stat:
            trainer.logger.log_metrics(latest_stat, step=trainer.global_step)

    def _extract_lr(self, trainer, interval):
        """ Extracts learning rates for lr schedulers and saves information
            into a dict structure. """
        latest_stat = {}
        for name, scheduler in zip(self.lr_sch_names, trainer.lr_schedulers):
            if scheduler['interval'] == interval:
                param_groups = scheduler['scheduler'].optimizer.param_groups
                if len(param_groups) != 1:
                    for i, pg in enumerate(param_groups):
                        lr, key = pg['lr'], f'{name}/{i + 1}'
                        self.lrs[key].append(lr)
                        latest_stat[key] = lr
                else:
                    self.lrs[name].append(param_groups[0]['lr'])
                    latest_stat[name] = param_groups[0]['lr']
        return latest_stat

    def _find_names(self, lr_schedulers):
        # Create unique names in the case we have multiple of the same learning
        # rate scheduler + multiple parameter groups
        names = []
        for scheduler in lr_schedulers:
            sch = scheduler['scheduler']
            if 'name' in scheduler:
                name = scheduler['name']
            else:
                opt_name = 'lr-' + sch.optimizer.__class__.__name__
                i, name = 1, opt_name
                # Multiple schedulers of the same type
                while True:
                    if name not in names:
                        break
                    i, name = i + 1, f'{opt_name}-{i}'

            # Multiple param groups for the same scheduler
            param_groups = sch.optimizer.param_groups
            if len(param_groups) != 1:
                for i, pg in enumerate(param_groups):
                    temp = name + '/pg' + str(i + 1)
                    names.append(temp)
            else:
                names.append(name)

            self.lr_sch_names.append(name)
        return names
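
A short usage sketch of the naming behaviour described in the docstring above (the LightningModule method below is illustrative and assumes torch is imported; it is not part of this diff): a scheduler dict that carries a 'name' key is logged under that name, otherwise the optimizer class name ('lr-Adam', 'lr-Adam-1', ...) is used.

    # Hypothetical configure_optimizers showing how LearningRateLogger picks names
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=0.02)
        scheduler = {
            'scheduler': torch.optim.lr_scheduler.StepLR(optimizer, step_size=10),
            'interval': 'epoch',        # picked up by on_epoch_start above
            'name': 'my_logging_name',  # logged under this key instead of 'lr-Adam'
        }
        return [optimizer], [scheduler]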
8 changes: 5 additions & 3 deletions pytorch_lightning/core/lightning.py
@@ -930,10 +930,12 @@ def init_ddp_connection(
        if 'MASTER_ADDR' not in os.environ:
            log.warning("MASTER_ADDR environment variable is not defined. Set as localhost")
            os.environ['MASTER_ADDR'] = '127.0.0.1'
        log.debug(f"MASTER_ADDR: {os.environ['MASTER_ADDR']}")

        if 'MASTER_PORT' not in os.environ:
            log.warning("MASTER_PORT environment variable is not defined. Set as 12910")
            os.environ['MASTER_PORT'] = '12910'
        log.debug(f"MASTER_PORT: {os.environ['MASTER_PORT']}")

        if 'WORLD_SIZE' in os.environ and os.environ['WORLD_SIZE'] != world_size:
            log.warning("WORLD_SIZE environment variable is not equal to the computed "
@@ -1160,9 +1162,9 @@ def optimizer_step(self, current_epoch, batch_idx, optimizer,

            # native amp + lbfgs is a no go right now
            if self.trainer.use_amp and self.trainer.use_native_amp:
                m = 'native PyTorch amp and lbfgs are not compatible. To request, please file' \
                    'a Github issue in PyTorch and tag @mcarilli'
                raise MisconfigurationException(m)
                raise MisconfigurationException(
                    'native PyTorch amp and lbfgs are not compatible.'
                    ' To request, please file a Github issue in PyTorch and tag @mcarilli')
            optimizer.step(second_order_closure)
        else:
            if self.trainer.use_amp and self.trainer.use_native_amp:
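
The MASTER_ADDR / MASTER_PORT fallbacks added above only kick in when the variables are absent. A minimal sketch of providing them explicitly before training starts (the address value is a placeholder, not taken from the diff):

    import os

    # Setting these before the Trainer initializes DDP avoids the "not defined"
    # warnings from init_ddp_connection and keeps the values under your control.
    os.environ.setdefault('MASTER_ADDR', '10.0.0.1')   # rank-0 node address (example)
    os.environ.setdefault('MASTER_PORT', '12910')      # any free port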
6 changes: 5 additions & 1 deletion pytorch_lightning/trainer/distrib_data_parallel.py
@@ -275,6 +275,10 @@ def configure_slurm_ddp(self, num_gpu_nodes):
        except Exception as e:
            pass

        # notify user that slurm is managing tasks
        if self.is_slurm_managing_tasks:
            log.info('Multi-processing is handled by Slurm.')

    def set_nvidia_flags(self, is_slurm_managing_tasks, data_parallel_device_ids):
        if data_parallel_device_ids is None:
            return
@@ -291,7 +295,7 @@ def set_nvidia_flags(self, is_slurm_managing_tasks, data_parallel_device_ids):
        gpu_str = ','.join([str(x) for x in data_parallel_device_ids])
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu_str

        log.info(f'CUDA_VISIBLE_DEVICES: [{os.environ["CUDA_VISIBLE_DEVICES"]}]')
        log.debug(f'CUDA_VISIBLE_DEVICES: [{os.environ["CUDA_VISIBLE_DEVICES"]}]')

    def ddp_train(self, process_idx, model):
        """
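
For reference, a tiny sketch of what set_nvidia_flags writes (the device ids are illustrative, not from the diff):

    # data_parallel_device_ids=[0, 2] produces CUDA_VISIBLE_DEVICES="0,2",
    # so only those GPUs are visible to the training processes.
    device_ids = [0, 2]
    gpu_str = ','.join(str(x) for x in device_ids)   # -> "0,2"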
13 changes: 9 additions & 4 deletions pytorch_lightning/trainer/distrib_parts.py
@@ -461,10 +461,15 @@ def __transfer_data_to_device(self, batch, device, gpu_id=None):

        # when tuple
        if isinstance(batch, tuple):
            batch = list(batch)
            for i, x in enumerate(batch):
                batch[i] = self.__transfer_data_to_device(x, device, gpu_id)
            return tuple(batch)
            # when namedtuple
            if hasattr(batch, '_fields'):
                elem_type = type(batch)
                return elem_type(*(self.__transfer_data_to_device(x, device, gpu_id) for x in batch))
            else:
                batch = list(batch)
                for i, x in enumerate(batch):
                    batch[i] = self.__transfer_data_to_device(x, device, gpu_id)
                return tuple(batch)

        # when dict
        if isinstance(batch, dict):
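
A self-contained sketch of why the namedtuple branch above rebuilds the batch with its own type instead of a plain tuple (the names here are illustrative):

    from collections import namedtuple

    Batch = namedtuple('Batch', ['a', 'b'])
    batch = Batch(a=1, b=2)

    # Reconstructing via the original type keeps the field names,
    # so downstream code can still access batch.a / batch.b after the move.
    moved = type(batch)(*(x * 10 for x in batch))
    print(moved)      # Batch(a=10, b=20)
    print(moved.a)    # 10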
14 changes: 11 additions & 3 deletions pytorch_lightning/trainer/training_io.py
@@ -215,7 +215,7 @@ def sig_handler(self, signum, frame):  # pragma: no-cover
            if result == 0:
                log.info(f'requeued exp {job_id}')
            else:
                log.info('requeue failed...')
                log.warning('requeue failed...')

            # close experiment to avoid issues
            self.logger.close()
@@ -251,9 +251,11 @@ def save_checkpoint(self, filepath):
        # do the actual save
        try:
            self._atomic_save(checkpoint, filepath)
        except AttributeError:
        except AttributeError as e:
            if 'hparams' in checkpoint:
                del checkpoint['hparams']
            rank_zero_warn('warning, `hparams` dropped from checkpoint.'
                           f' An attribute is not picklable {e}')

            self._atomic_save(checkpoint, filepath)

@@ -278,6 +280,10 @@ def restore(self, checkpoint_path: str, on_gpu: bool):

        # load the state_dict on the model automatically
        model.load_state_dict(checkpoint['state_dict'])

        # give model a chance to load something
        model.on_load_checkpoint(checkpoint)

        if on_gpu:
            model.cuda(self.root_gpu)
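
The `model.on_load_checkpoint(checkpoint)` call added above lets a LightningModule restore custom state when resuming. A minimal sketch of the paired hooks (the class and attribute names are assumed, not part of this diff):

    class MyModel(LightningModule):
        def on_save_checkpoint(self, checkpoint):
            checkpoint['my_extra_state'] = self.my_extra_state

        def on_load_checkpoint(self, checkpoint):
            self.my_extra_state = checkpoint['my_extra_state']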

@@ -430,9 +436,11 @@ def hpc_save(self, folderpath: str, logger):
        # TODO: fix for anything with multiprocess DP, DDP, DDP2
        try:
            self._atomic_save(checkpoint, filepath)
        except AttributeError:
        except AttributeError as e:
            if 'hparams' in checkpoint:
                del checkpoint['hparams']
            rank_zero_warn('warning, `hparams` dropped from checkpoint.'
                           f' An attribute is not picklable {e}')

            self._atomic_save(checkpoint, filepath)

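
The `hparams`-dropping fallback in save_checkpoint and hpc_save exists because the checkpoint is pickled on save, and an unpicklable entry (for example a locally defined lambda stored in hparams) surfaces as an AttributeError. A rough self-contained sketch of that pattern using pickle directly (illustrative objects, not the Trainer's real checkpoint):

    import pickle

    def build_hparams():
        # a lambda defined inside a function is a "local object" that pickle cannot serialize
        return {'activation': lambda x: x}

    checkpoint = {'state_dict': {}, 'hparams': build_hparams()}
    try:
        blob = pickle.dumps(checkpoint)
    except AttributeError:
        checkpoint.pop('hparams', None)   # same drop-and-retry as in the diff
        blob = pickle.dumps(checkpoint)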
1 change: 1 addition & 0 deletions setup.cfg
@@ -18,6 +18,7 @@ exclude_lines =
    pragma: no-cover
    warnings
    pass
    rank_zero_warn

[flake8]
# TODO: this should be 88 or 100 according to PEP8
59 changes: 58 additions & 1 deletion tests/callbacks/test_callbacks.py
@@ -2,11 +2,12 @@
import tests.base.utils as tutils
from pytorch_lightning import Callback
from pytorch_lightning import Trainer, LightningModule
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.callbacks import EarlyStopping, LearningRateLogger, ModelCheckpoint
from tests.base import (
    LightTrainDataloader,
    LightTestMixin,
    LightValidationMixin,
    LightTestOptimizersWithMixedSchedulingMixin,
    TestModelBase
)

@@ -273,3 +274,59 @@ class CurrentTestModel(LightTrainDataloader, TestModelBase):

    # These should be different if the dirpath has been overridden
    assert trainer.ckpt_path != trainer.default_root_dir


def test_lr_logger_single_lr(tmpdir):
    """ Test that learning rates are extracted and logged for single lr scheduler"""
    tutils.reset_seed()

    class CurrentTestModel(LightTrainDataloader, TestModelBase):
        pass

    hparams = tutils.get_default_hparams()
    model = CurrentTestModel(hparams)

    lr_logger = LearningRateLogger()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=5,
        val_percent_check=0.1,
        train_percent_check=0.5,
        callbacks=[lr_logger]
    )
    results = trainer.fit(model)

    assert lr_logger.lrs, 'No learning rates logged'
    assert len(lr_logger.lrs) == len(trainer.lr_schedulers), \
        'Number of learning rates logged does not match number of lr schedulers'
    assert all([k in ['lr-Adam'] for k in lr_logger.lrs.keys()]), \
        'Names of learning rates not set correctly'


def test_lr_logger_multi_lrs(tmpdir):
    """ Test that learning rates are extracted and logged for multi lr schedulers """
    tutils.reset_seed()

    class CurrentTestModel(LightTestOptimizersWithMixedSchedulingMixin,
                           LightTrainDataloader,
                           TestModelBase):
        pass

    hparams = tutils.get_default_hparams()
    model = CurrentTestModel(hparams)

    lr_logger = LearningRateLogger()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        val_percent_check=0.1,
        train_percent_check=0.5,
        callbacks=[lr_logger]
    )
    results = trainer.fit(model)

    assert lr_logger.lrs, 'No learning rates logged'
    assert len(lr_logger.lrs) == len(trainer.lr_schedulers), \
        'Number of learning rates logged does not match number of lr schedulers'
    assert all([k in ['lr-Adam', 'lr-Adam-1'] for k in lr_logger.lrs.keys()]), \
        'Names of learning rates not set correctly'
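
For context, a sketch of the kind of configuration the multi-lr test exercises: two Adam optimizers, each with its own scheduler, yield the keys 'lr-Adam' and 'lr-Adam-1' in `lr_logger.lrs`. The layer attributes below are assumed for illustration and are not taken from the test mixin; torch is assumed imported.

    def configure_optimizers(self):
        opt1 = torch.optim.Adam(self.layer1.parameters(), lr=1e-3)
        opt2 = torch.optim.Adam(self.layer2.parameters(), lr=1e-4)
        sch1 = {'scheduler': torch.optim.lr_scheduler.StepLR(opt1, step_size=1), 'interval': 'step'}
        sch2 = {'scheduler': torch.optim.lr_scheduler.StepLR(opt2, step_size=1), 'interval': 'epoch'}
        return [opt1, opt2], [sch1, sch2]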
8 changes: 8 additions & 0 deletions tests/models/test_cpu.py
@@ -1,3 +1,4 @@
from collections import namedtuple
import platform

import pytest
@@ -221,6 +222,13 @@ def test_single_gpu_batch_parse():
    assert batch[1][0]['b'].device.index == 0
    assert batch[1][0]['b'].type() == 'torch.cuda.FloatTensor'

    # namedtuple of tensor
    BatchType = namedtuple('BatchType', ['a', 'b'])
    batch = [BatchType(a=torch.rand(2, 3), b=torch.rand(2, 3)) for _ in range(2)]
    batch = trainer.transfer_batch_to_gpu(batch, 0)
    assert batch[0].a.device.index == 0
    assert batch[0].a.type() == 'torch.cuda.FloatTensor'


def test_simple_cpu(tmpdir):
    """Verify continue training session on CPU."""