diff --git a/.github/workflows/docker_builds.yml b/.github/workflows/docker_builds.yml index 1ac289ba61f7db..736ff72460d749 100644 --- a/.github/workflows/docker_builds.yml +++ b/.github/workflows/docker_builds.yml @@ -22,7 +22,7 @@ jobs: - uses: actions/checkout@v2 - name: Publish Releases to Docker # only on releases - uses: elgohr/Publish-Docker-Github-Action@master + uses: elgohr/Publish-Docker-Github-Action@2.14 if: contains(github.ref, 'refs/tags/') && !contains(${{ steps.get_version.outputs.VERSION }}, 'rc') && !contains(${{ steps.get_version.outputs.VERSION }}, 'dev') with: name: pytorchlightning/pytorch_lightning @@ -30,10 +30,10 @@ jobs: password: ${{ secrets.DOCKER_PASSWORD }} dockerfile: docker/Dockerfile buildargs: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},LIGHTNING_VERSION=${{ steps.get_version.outputs.VERSION }} - tags: "${{ steps.get_version.outputs.VERSION }}_py${{ matrix.python_version }}_torch${{ matrix.pytorch_version }},stable_py${{ matrix.python_version }}_torch${{ matrix.pytorch_version }}" + tags: "${{ steps.get_version.outputs.VERSION }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }},stable-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}" - name: Publish Master # publish master - uses: elgohr/Publish-Docker-Github-Action@master + uses: elgohr/Publish-Docker-Github-Action@2.14 if: github.event_name == 'push' with: name: pytorchlightning/pytorch_lightning @@ -41,4 +41,4 @@ jobs: password: ${{ secrets.DOCKER_PASSWORD }} dockerfile: docker/Dockerfile buildargs: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},LIGHTNING_VERSION=${{ steps.get_version.outputs.VERSION }} - tags: "latest_py${{ matrix.python_version }}_torch${{ matrix.pytorch_version }}" + tags: "nightly-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}" diff --git a/.github/workflows/rebase.yml b/.github/workflows/rebase.yml index 2aa94bea056118..06d20652c6b5a4 100644 --- a/.github/workflows/rebase.yml +++ b/.github/workflows/rebase.yml @@ -1,8 +1,9 @@ name: Automatic Rebase # https://github.com/marketplace/actions/automatic-rebase -on: - - pull_request +on: + issue_comment: + types: [created] jobs: rebase: @@ -17,10 +18,3 @@ jobs: uses: cirrus-actions/rebase@1.2 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # https://github.community/t5/GitHub-Actions/Workflow-is-failing-if-no-job-can-be-ran-due-to-condition/m-p/38186#M3250 - always_job: - name: Always run job - runs-on: ubuntu-latest - steps: - - name: Always run - run: echo "This job is used to prevent the workflow to fail when all other jobs are skipped." diff --git a/CHANGELOG.md b/CHANGELOG.md index d5ac610feb32ad..b2ec1c763d4eb5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,11 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added callback for logging learning rates ([#1498](https://github.com/PyTorchLightning/pytorch-lightning/pull/1498)) +- Added transfer learning example (for a binary classification task in computer vision) ([#1564](https://github.com/PyTorchLightning/pytorch-lightning/pull/1564)) + ### Changed + +- Reduction when `batch_size < num_gpus` ([#1609](https://github.com/PyTorchLightning/pytorch-lightning/pull/1609)) ### Deprecated @@ -26,6 +30,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed bugs that prevented the lr finder from being used together with early stopping and validation dataloaders ([#1676](https://github.com/PyTorchLightning/pytorch-lightning/pull/1676)) +- Fixed wandb logger's `global_step` affecting other loggers ([#1492](https://github.com/PyTorchLightning/pytorch-lightning/issues/1485)) + + ## [0.7.5] - 2020-04-27 ### Changed @@ -70,6 +77,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Defines shared proc. rank, remove rank from instances (e.g. loggers) ([#1408](https://github.com/PyTorchLightning/pytorch-lightning/pull/1408)) - Updated semantic segmentation example with custom U-Net and logging ([#1371](https://github.com/PyTorchLightning/pytorch-lightning/pull/1371)) - Disabled val and test shuffling ([#1600](https://github.com/PyTorchLightning/pytorch-lightning/pull/1600)) +- Updated LightningTemplateModel to look more like the Colab example ([#1546](https://github.com/PyTorchLightning/pytorch-lightning/pull/1577)) ### Deprecated diff --git a/pl_examples/domain_templates/computer_vision_fine_tuning.py b/pl_examples/domain_templates/computer_vision_fine_tuning.py new file mode 100644 index 00000000000000..42a0a936d9e340 --- /dev/null +++ b/pl_examples/domain_templates/computer_vision_fine_tuning.py @@ -0,0 +1,440 @@ +"""Computer vision example on Transfer Learning. + +This computer vision example illustrates how one could fine-tune a pre-trained +network (by default, a ResNet50 is used) using pytorch-lightning. For the sake +of this example, the proposed network (denoted by `TransferLearningModel`, see +below) is trained for 15 epochs on the 'cats and dogs dataset' (~60MB, see +`DATA_URL` below). The training consists of three stages. From epoch 0 to 4, +the feature extractor (the pre-trained network) is frozen, except possibly for +the BatchNorm layers (depending on whether `train_bn = True`). The BatchNorm +layers (if `train_bn = True`) and the parameters of the classifier are trained +as a single parameter group with lr = 1e-2. From epoch 5 to 9, the last two +layer groups of the pre-trained network are unfrozen and added to the +optimizer as a new parameter group with lr = 1e-4 (while lr = 1e-3 for the +first parameter group in the optimizer). Eventually, from epoch 10 on, all the +remaining layer groups of the pre-trained network are unfrozen and added to +the optimizer as a third parameter group; from then on, the parameters of the +pre-trained network are trained with lr = 1e-5 while those of the classifier +are trained with lr = 1e-4. 
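To make the staged schedule above concrete, here is a minimal sketch (toy modules standing in for the real backbone and classifier, not part of this patch) of how the three stages map onto optimizer parameter groups via `add_param_group`, the same mechanism `_unfreeze_and_add_param_group` below relies on:

```python
import torch

# Toy stand-ins for the pre-trained backbone and the classifier head.
feature_extractor = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.Linear(8, 8))
classifier = torch.nn.Linear(8, 1)

# Stage 1 (epochs 0-4): backbone frozen, only the classifier is optimized.
for p in feature_extractor.parameters():
    p.requires_grad = False
optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-2)

# Stage 2 (epoch 5): unfreeze the last layer group and add it as a second
# parameter group with a smaller learning rate.
for p in feature_extractor[-1].parameters():
    p.requires_grad = True
optimizer.add_param_group({'params': feature_extractor[-1].parameters(), 'lr': 1e-4})

# Stage 3 (epoch 10): unfreeze the remaining layer groups as a third group.
for p in feature_extractor[:-1].parameters():
    p.requires_grad = True
optimizer.add_param_group({'params': feature_extractor[:-1].parameters(), 'lr': 1e-5})
```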
+ +Note: + See: https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html +""" + +import argparse +from collections import OrderedDict +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import Optional, Generator, Union + +import pytorch_lightning as pl +import torch +import torch.nn.functional as F +from pytorch_lightning import _logger as log +from torch import optim +from torch.optim.lr_scheduler import MultiStepLR +from torch.optim.optimizer import Optimizer +from torch.utils.data import DataLoader +from torchvision import models +from torchvision import transforms +from torchvision.datasets import ImageFolder +from torchvision.datasets.utils import download_and_extract_archive + +BN_TYPES = (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d, torch.nn.BatchNorm3d) +DATA_URL = 'https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip' + + +# --- Utility functions --- + + +def _make_trainable(module: torch.nn.Module) -> None: + """Unfreezes a given module. + + Args: + module: The module to unfreeze + """ + for param in module.parameters(): + param.requires_grad = True + module.train() + + +def _recursive_freeze(module: torch.nn.Module, + train_bn: bool = True) -> None: + """Freezes the layers of a given module. + + Args: + module: The module to freeze + train_bn: If True, leave the BatchNorm layers in training mode + """ + children = list(module.children()) + if not children: + if not (isinstance(module, BN_TYPES) and train_bn): + for param in module.parameters(): + param.requires_grad = False + module.eval() + else: + # Make the BN layers trainable + _make_trainable(module) + else: + for child in children: + _recursive_freeze(module=child, train_bn=train_bn) + + +def freeze(module: torch.nn.Module, + n: Optional[int] = None, + train_bn: bool = True) -> None: + """Freezes the layers up to index n (if n is not None). + + Args: + module: The module to freeze (at least partially) + n: Max depth at which we stop freezing the layers. If None, all + the layers of the given module will be frozen. + train_bn: If True, leave the BatchNorm layers in training mode + """ + children = list(module.children()) + n_max = len(children) if n is None else int(n) + + for child in children[:n_max]: + _recursive_freeze(module=child, train_bn=train_bn) + + for child in children[n_max:]: + _make_trainable(module=child) + + +def filter_params(module: torch.nn.Module, + train_bn: bool = True) -> Generator: + """Yields the trainable parameters of a given module. + + Args: + module: A given module + train_bn: If True, leave the BatchNorm layers in training mode + + Returns: + Generator + """ + children = list(module.children()) + if not children: + if not (isinstance(module, BN_TYPES) and train_bn): + for param in module.parameters(): + if param.requires_grad: + yield param + else: + for child in children: + for param in filter_params(module=child, train_bn=train_bn): + yield param + + +def _unfreeze_and_add_param_group(module: torch.nn.Module, + optimizer: Optimizer, + lr: Optional[float] = None, + train_bn: bool = True): + """Unfreezes a module and adds its parameters to an optimizer.""" + _make_trainable(module) + params_lr = optimizer.param_groups[0]['lr'] if lr is None else float(lr) + optimizer.add_param_group( + {'params': filter_params(module=module, train_bn=train_bn), + 'lr': params_lr / 10., + }) + + +# --- Pytorch-lightning module --- + + +class TransferLearningModel(pl.LightningModule): + """Transfer Learning with pre-trained ResNet50. 
+ + Args: + hparams: Model hyperparameters + dl_path: Path where the data will be downloaded + """ + def __init__(self, + hparams: argparse.Namespace, + dl_path: Union[str, Path]) -> None: + super().__init__() + self.hparams = hparams + self.dl_path = dl_path + self.__build_model() + + def __build_model(self): + """Define model layers & loss.""" + + # 1. Load pre-trained network: + model_func = getattr(models, self.hparams.backbone) + backbone = model_func(pretrained=True) + + _layers = list(backbone.children())[:-1] + self.feature_extractor = torch.nn.Sequential(*_layers) + freeze(module=self.feature_extractor, train_bn=self.hparams.train_bn) + + # 2. Classifier: + _fc_layers = [torch.nn.Linear(2048, 256), + torch.nn.Linear(256, 32), + torch.nn.Linear(32, 1)] + self.fc = torch.nn.Sequential(*_fc_layers) + + # 3. Loss: + self.loss_func = F.binary_cross_entropy_with_logits + + def forward(self, x): + """Forward pass. Returns logits.""" + + # 1. Feature extraction: + x = self.feature_extractor(x) + x = x.squeeze(-1).squeeze(-1) + + # 2. Classifier (returns logits): + x = self.fc(x) + + return x + + def loss(self, labels, logits): + return self.loss_func(input=logits, target=labels) + + def train(self, mode=True): + super().train(mode=mode) + + epoch = self.current_epoch + if epoch < self.hparams.milestones[0] and mode: + # feature extractor is frozen (except for BatchNorm layers) + freeze(module=self.feature_extractor, + train_bn=self.hparams.train_bn) + + elif self.hparams.milestones[0] <= epoch < self.hparams.milestones[1] and mode: + # Unfreeze last two layers of the feature extractor + freeze(module=self.feature_extractor, + n=-2, + train_bn=self.hparams.train_bn) + + def on_epoch_start(self): + """Use `on_epoch_start` to unfreeze layers progressively.""" + optimizer = self.trainer.optimizers[0] + if self.current_epoch == self.hparams.milestones[0]: + _unfreeze_and_add_param_group(module=self.feature_extractor[-2:], + optimizer=optimizer, + train_bn=self.hparams.train_bn) + + elif self.current_epoch == self.hparams.milestones[1]: + _unfreeze_and_add_param_group(module=self.feature_extractor[:-2], + optimizer=optimizer, + train_bn=self.hparams.train_bn) + + def training_step(self, batch, batch_idx): + + # 1. Forward pass: + x, y = batch + y_logits = self.forward(x) + y_true = y.view((-1, 1)).type_as(x) + y_bin = torch.ge(y_logits, 0) + + # 2. Compute loss & accuracy: + train_loss = self.loss(y_true, y_logits) + num_correct = torch.eq(y_bin.view(-1), y_true.view(-1)).sum() + + # 3. Outputs: + tqdm_dict = {'train_loss': train_loss} + output = OrderedDict({'loss': train_loss, + 'num_correct': num_correct, + 'log': tqdm_dict, + 'progress_bar': tqdm_dict}) + + return output + + def training_epoch_end(self, outputs): + """Compute and log training loss and accuracy at the epoch level.""" + + train_loss_mean = torch.stack([output['loss'] + for output in outputs]).mean() + train_acc_mean = torch.stack([output['num_correct'] + for output in outputs]).sum().float() + train_acc_mean /= (len(outputs) * self.hparams.batch_size) + return {'log': {'train_loss': train_loss_mean, + 'train_acc': train_acc_mean, + 'step': self.current_epoch}} + + def validation_step(self, batch, batch_idx): + + # 1. Forward pass: + x, y = batch + y_logits = self.forward(x) + y_true = y.view((-1, 1)).type_as(x) + y_bin = torch.ge(y_logits, 0) + + # 2. 
Compute loss & accuracy: + val_loss = self.loss(y_true, y_logits) + num_correct = torch.eq(y_bin.view(-1), y_true.view(-1)).sum() + + return {'val_loss': val_loss, + 'num_correct': num_correct} + + def validation_epoch_end(self, outputs): + """Compute and log validation loss and accuracy at the epoch level.""" + + val_loss_mean = torch.stack([output['val_loss'] + for output in outputs]).mean() + val_acc_mean = torch.stack([output['num_correct'] + for output in outputs]).sum().float() + val_acc_mean /= (len(outputs) * self.hparams.batch_size) + return {'log': {'val_loss': val_loss_mean, + 'val_acc': val_acc_mean, + 'step': self.current_epoch}} + + def configure_optimizers(self): + optimizer = optim.Adam(filter(lambda p: p.requires_grad, + self.parameters()), + lr=self.hparams.lr) + + scheduler = MultiStepLR(optimizer, + milestones=self.hparams.milestones, + gamma=self.hparams.lr_scheduler_gamma) + + return [optimizer], [scheduler] + + def prepare_data(self): + """Download images and prepare images datasets.""" + + # 1. Download the images + download_and_extract_archive(url=DATA_URL, + download_root=self.dl_path, + remove_finished=True) + + data_path = Path(self.dl_path).joinpath('cats_and_dogs_filtered') + + # 2. Load the data + preprocessing & data augmentation + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + + train_dataset = ImageFolder(root=data_path.joinpath('train'), + transform=transforms.Compose([ + transforms.Resize((224, 224)), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ])) + + valid_dataset = ImageFolder(root=data_path.joinpath('validation'), + transform=transforms.Compose([ + transforms.Resize((224, 224)), + transforms.ToTensor(), + normalize, + ])) + + self.train_dataset = train_dataset + self.valid_dataset = valid_dataset + + def __dataloader(self, train): + """Train/validation loaders.""" + + _dataset = self.train_dataset if train else self.valid_dataset + loader = DataLoader(dataset=_dataset, + batch_size=self.hparams.batch_size, + num_workers=self.hparams.num_workers, + shuffle=True if train else False) + + return loader + + def train_dataloader(self): + log.info('Training data loaded.') + return self.__dataloader(train=True) + + def val_dataloader(self): + log.info('Validation data loaded.') + return self.__dataloader(train=False) + + @staticmethod + def add_model_specific_args(parent_parser): + parser = argparse.ArgumentParser(parents=[parent_parser]) + parser.add_argument('--backbone', + default='resnet50', + type=str, + metavar='BK', + help='Name (as in ``torchvision.models``) of the feature extractor') + parser.add_argument('--epochs', + default=15, + type=int, + metavar='N', + help='total number of epochs', + dest='nb_epochs') + parser.add_argument('--batch-size', + default=8, + type=int, + metavar='B', + help='batch size', + dest='batch_size') + parser.add_argument('--gpus', + type=int, + default=1, + help='number of gpus to use') + parser.add_argument('--lr', + '--learning-rate', + default=1e-2, + type=float, + metavar='LR', + help='initial learning rate', + dest='lr') + parser.add_argument('--lr-scheduler-gamma', + default=1e-1, + type=float, + metavar='LRG', + help='Factor by which the learning rate is reduced at each milestone', + dest='lr_scheduler_gamma') + parser.add_argument('--num-workers', + default=6, + type=int, + metavar='W', + help='number of CPU workers', + dest='num_workers') + parser.add_argument('--train-bn', + default=True, + type=bool, + metavar='TB', + help='Whether 
the BatchNorm layers should be trainable', + dest='train_bn') + parser.add_argument('--milestones', + default=[5, 10], + type=list, + metavar='M', + help='List of two epochs milestones') + return parser + + +def main(hparams: argparse.Namespace) -> None: + """Train the model. + + Args: + hparams: Model hyper-parameters + + Note: + For the sake of the example, the images dataset will be downloaded + to a temporary directory. + """ + + with TemporaryDirectory(dir=hparams.root_data_path) as tmp_dir: + + model = TransferLearningModel(hparams, dl_path=tmp_dir) + + trainer = pl.Trainer( + weights_summary=None, + show_progress_bar=True, + num_sanity_val_steps=0, + gpus=hparams.gpus, + min_epochs=hparams.nb_epochs, + max_epochs=hparams.nb_epochs) + + trainer.fit(model) + + +def get_args() -> argparse.Namespace: + parent_parser = argparse.ArgumentParser(add_help=False) + parent_parser.add_argument('--root-data-path', + metavar='DIR', + type=str, + default=Path.cwd().as_posix(), + help='Root directory where to download the data', + dest='root_data_path') + parser = TransferLearningModel.add_model_specific_args(parent_parser) + return parser.parse_args() + + +if __name__ == '__main__': + + main(get_args()) diff --git a/pl_examples/models/lightning_template.py b/pl_examples/models/lightning_template.py index c5a7f9396f287b..13b3bc67a912b4 100644 --- a/pl_examples/models/lightning_template.py +++ b/pl_examples/models/lightning_template.py @@ -46,22 +46,6 @@ def __init__(self, hparams): # init superclass super().__init__() self.hparams = hparams - - self.batch_size = hparams.batch_size - - # if you specify an example input, the summary will show input/output for each layer - self.example_input_array = torch.rand(5, 28 * 28) - - # build model - self.__build_model() - - # --------------------- - # MODEL SETUP - # --------------------- - def __build_model(self): - """ - Layout the model. - """ self.c_d1 = nn.Linear(in_features=self.hparams.in_features, out_features=self.hparams.hidden_dim) self.c_d1_bn = nn.BatchNorm1d(self.hparams.hidden_dim) @@ -70,27 +54,17 @@ def __build_model(self): self.c_d2 = nn.Linear(in_features=self.hparams.hidden_dim, out_features=self.hparams.out_features) - # --------------------- - # TRAINING - # --------------------- def forward(self, x): """ No special modification required for Lightning, define it as you normally would in the `nn.Module` in vanilla PyTorch. """ - x = self.c_d1(x) + x = self.c_d1(x.view(x.size(0), -1)) x = torch.tanh(x) x = self.c_d1_bn(x) x = self.c_d1_drop(x) - x = self.c_d2(x) - logits = F.log_softmax(x, dim=1) - - return logits - - def loss(self, labels, logits): - nll = F.nll_loss(logits, labels) - return nll + return x def training_step(self, batch, batch_idx): """ @@ -99,22 +73,10 @@ def training_step(self, batch, batch_idx): """ # forward pass x, y = batch - x = x.view(x.size(0), -1) - y_hat = self(x) - - # calculate loss - loss_val = self.loss(y, y_hat) - - tqdm_dict = {'train_loss': loss_val} - output = OrderedDict({ - 'loss': loss_val, - 'progress_bar': tqdm_dict, - 'log': tqdm_dict - }) - - # can also return just a scalar instead of a dict (return loss_val) - return output + loss = F.cross_entropy(y_hat, y) + tensorboard_logs = {'train_loss': loss} + return {'loss': loss, 'log': tensorboard_logs} def validation_step(self, batch, batch_idx): """ @@ -122,58 +84,35 @@ def validation_step(self, batch, batch_idx): passed in as `batch`. 
""" x, y = batch - x = x.view(x.size(0), -1) y_hat = self(x) - - loss_val = self.loss(y, y_hat) - - # acc + val_loss = F.cross_entropy(y_hat, y) labels_hat = torch.argmax(y_hat, dim=1) - val_acc = torch.sum(y == labels_hat).item() / (len(y) * 1.0) - val_acc = torch.tensor(val_acc) - - if self.on_gpu: - val_acc = val_acc.cuda(loss_val.device.index) - - output = OrderedDict({ - 'val_loss': loss_val, - 'val_acc': val_acc, - }) + n_correct_pred = torch.sum(y == labels_hat).item() + return {'val_loss': val_loss, "n_correct_pred": n_correct_pred, "n_pred": len(x)} - # can also return just a scalar instead of a dict (return loss_val) - return output + def test_step(self, batch, batch_idx): + x, y = batch + y_hat = self(x) + test_loss = F.cross_entropy(y_hat, y) + labels_hat = torch.argmax(y_hat, dim=1) + n_correct_pred = torch.sum(y == labels_hat).item() + return {'test_loss': test_loss, "n_correct_pred": n_correct_pred, "n_pred": len(x)} def validation_epoch_end(self, outputs): """ Called at the end of validation to aggregate outputs. :param outputs: list of individual outputs of each validation step. """ - # if returned a scalar from validation_step, outputs is a list of tensor scalars - # we return just the average in this case (if we want) - # return torch.stack(outputs).mean() - - val_loss_mean = 0 - val_acc_mean = 0 - for output in outputs: - val_loss = output['val_loss'] - - # reduce manually when using dp - if self.trainer.use_dp or self.trainer.use_ddp2: - val_loss = torch.mean(val_loss) - val_loss_mean += val_loss + avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean() + val_acc = sum([x['n_correct_pred'] for x in outputs]) / sum(x['n_pred'] for x in outputs) + tensorboard_logs = {'val_loss': avg_loss, 'val_acc': val_acc} + return {'val_loss': avg_loss, 'log': tensorboard_logs} - # reduce manually when using dp - val_acc = output['val_acc'] - if self.trainer.use_dp or self.trainer.use_ddp2: - val_acc = torch.mean(val_acc) - - val_acc_mean += val_acc - - val_loss_mean /= len(outputs) - val_acc_mean /= len(outputs) - tqdm_dict = {'val_loss': val_loss_mean, 'val_acc': val_acc_mean} - result = {'progress_bar': tqdm_dict, 'log': tqdm_dict, 'val_loss': val_loss_mean} - return result + def test_epoch_end(self, outputs): + avg_loss = torch.stack([x['test_loss'] for x in outputs]).mean() + test_acc = sum([x['n_correct_pred'] for x in outputs]) / sum(x['n_pred'] for x in outputs) + tensorboard_logs = {'test_loss': avg_loss, 'test_acc': test_acc} + return {'test_loss': avg_loss, 'log': tensorboard_logs} # --------------------- # TRAINING SETUP @@ -187,72 +126,23 @@ def configure_optimizers(self): scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10) return [optimizer], [scheduler] - def __dataloader(self, train): - # this is neede when you want some info about dataset before binding to trainer - self.prepare_data() - # init data generators - transform = transforms.Compose([transforms.ToTensor(), - transforms.Normalize((0.5,), (1.0,))]) - dataset = MNIST(root=self.hparams.data_root, train=train, - transform=transform, download=False) - - # when using multi-node (ddp) we need to add the datasampler - batch_size = self.hparams.batch_size - - loader = DataLoader( - dataset=dataset, - batch_size=batch_size, - num_workers=0 - ) - - return loader - def prepare_data(self): transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (1.0,))]) - _ = MNIST(root=self.hparams.data_root, train=True, - transform=transform, download=True) + self.mnist_train = 
MNIST(self.hparams.data_root, train=True, download=True, transform=transform) + self.mnist_test = MNIST(self.hparams.data_root, train=False, download=True, transform=transform) def train_dataloader(self): log.info('Training data loader called.') - return self.__dataloader(train=True) + return DataLoader(self.mnist_train, batch_size=self.hparams.batch_size, num_workers=4) def val_dataloader(self): log.info('Validation data loader called.') - return self.__dataloader(train=False) + return DataLoader(self.mnist_test, batch_size=self.hparams.batch_size, num_workers=4) def test_dataloader(self): log.info('Test data loader called.') - return self.__dataloader(train=False) - - def test_step(self, batch, batch_idx): - """ - Lightning calls this during testing, similar to `validation_step`, - with the data from the test dataloader passed in as `batch`. - """ - output = self.validation_step(batch, batch_idx) - # Rename output keys - output['test_loss'] = output.pop('val_loss') - output['test_acc'] = output.pop('val_acc') - - return output - - def test_epoch_end(self, outputs): - """ - Called at the end of test to aggregate outputs, similar to `validation_epoch_end`. - :param outputs: list of individual outputs of each test step - """ - results = self.validation_step_end(outputs) - - # rename some keys - results['progress_bar'].update({ - 'test_loss': results['progress_bar'].pop('val_loss'), - 'test_acc': results['progress_bar'].pop('val_acc'), - }) - results['log'] = results['progress_bar'] - results['test_loss'] = results.pop('val_loss') - - return results + return DataLoader(self.mnist_test, batch_size=self.hparams.batch_size, num_workers=4) @staticmethod def add_model_specific_args(parent_parser, root_dir): # pragma: no-cover diff --git a/pytorch_lightning/callbacks/progress.py b/pytorch_lightning/callbacks/progress.py index a397c0d2c8d56b..a770c6c9d95e7d 100644 --- a/pytorch_lightning/callbacks/progress.py +++ b/pytorch_lightning/callbacks/progress.py @@ -24,10 +24,10 @@ class LitProgressBar(ProgressBarBase): def __init__(self): super().__init__() # don't forget this :) - self.enabled = True + self.enable = True def disable(self): - self.enableenabled = False + self.enable = False def on_batch_end(self, trainer, pl_module): super().on_batch_end(trainer, pl_module) # don't forget this :) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index fc88fb8c786871..2f1de6412f0f05 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1550,8 +1550,9 @@ def _load_model_state(cls, checkpoint: Dict[str, Any], *args, **kwargs) -> 'Ligh ) # load the state_dict on the model automatically - model_args = [hparams] if hparams else [] - model = cls(*model_args, *args, **kwargs) + if hparams: + kwargs.update(hparams=hparams) + model = cls(*args, **kwargs) model.load_state_dict(checkpoint['state_dict']) # give model a chance to load something diff --git a/pytorch_lightning/loggers/base.py b/pytorch_lightning/loggers/base.py index 39891c447bba2d..857d661fdb5b37 100644 --- a/pytorch_lightning/loggers/base.py +++ b/pytorch_lightning/loggers/base.py @@ -125,7 +125,7 @@ def agg_and_log_metrics(self, metrics: Dict[str, float], step: Optional[int] = N """ agg_step, metrics_to_log = self._aggregate_metrics(metrics=metrics, step=step) - if metrics_to_log is not None: + if metrics_to_log: self.log_metrics(metrics=metrics_to_log, step=agg_step) @abstractmethod diff --git a/pytorch_lightning/loggers/wandb.py b/pytorch_lightning/loggers/wandb.py index 
c348644141fca0..0d5ff9855a40df 100644 --- a/pytorch_lightning/loggers/wandb.py +++ b/pytorch_lightning/loggers/wandb.py @@ -119,9 +119,7 @@ def log_hyperparams(self, params: Union[Dict[str, Any], Namespace]) -> None: @rank_zero_only def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: - if step is not None: - metrics['global_step'] = step - self.experiment.log(metrics) + self.experiment.log(metrics, step=step) @property def name(self) -> str: diff --git a/pytorch_lightning/trainer/logging.py b/pytorch_lightning/trainer/logging.py index c1d598dc71875f..978ac5df78d816 100644 --- a/pytorch_lightning/trainer/logging.py +++ b/pytorch_lightning/trainer/logging.py @@ -196,8 +196,8 @@ def reduce_distributed_output(self, output, num_gpus): elif isinstance(output[k], torch.Tensor) and output[k].dim() == 0: pass - # reduce only metrics that have the same number of gpus - elif output[k].size(0) == num_gpus: - reduced = torch.mean(output[k]) - output[k] = reduced + # do not reduce metrics that have batch size > num gpus + elif output[k].size(0) <= num_gpus: + output[k] = torch.mean(output[k]) + return output diff --git a/tests/base/eval_model_optimizers.py b/tests/base/eval_model_optimizers.py index 1666e26ed81aa0..bcce319d4a5659 100644 --- a/tests/base/eval_model_optimizers.py +++ b/tests/base/eval_model_optimizers.py @@ -15,7 +15,7 @@ def configure_optimizers(self): def configure_optimizers_empty(self): return None - def configure_optimizers_lbfgs(self): + def configure_optimizers__lbfgs(self): """ return whatever optimizers we want here. :return: list of optimizers @@ -23,7 +23,7 @@ def configure_optimizers_lbfgs(self): optimizer = optim.LBFGS(self.parameters(), lr=self.hparams.learning_rate) return optimizer - def configure_optimizers_multiple_optimizers(self): + def configure_optimizers__multiple_optimizers(self): """ return whatever optimizers we want here. 
:return: list of optimizers @@ -33,12 +33,12 @@ def configure_optimizers_multiple_optimizers(self): optimizer1 = optim.Adam(self.parameters(), lr=self.hparams.learning_rate) optimizer2 = optim.Adam(self.parameters(), lr=self.hparams.learning_rate) return optimizer1, optimizer2 - def configure_optimizers_single_scheduler(self): + def configure_optimizers__single_scheduler(self): optimizer = optim.Adam(self.parameters(), lr=self.hparams.learning_rate) lr_scheduler = optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.1) return [optimizer], [lr_scheduler] - def configure_optimizers_multiple_schedulers(self): + def configure_optimizers__multiple_schedulers(self): optimizer1 = optim.Adam(self.parameters(), lr=self.hparams.learning_rate) optimizer2 = optim.Adam(self.parameters(), lr=self.hparams.learning_rate) lr_scheduler1 = optim.lr_scheduler.StepLR(optimizer1, 1, gamma=0.1) @@ -46,7 +46,7 @@ return [optimizer1, optimizer2], [lr_scheduler1, lr_scheduler2] - def configure_optimizers_mixed_scheduling(self): + def configure_optimizers__mixed_scheduling(self): optimizer1 = optim.Adam(self.parameters(), lr=self.hparams.learning_rate) optimizer2 = optim.Adam(self.parameters(), lr=self.hparams.learning_rate) lr_scheduler1 = optim.lr_scheduler.StepLR(optimizer1, 4, gamma=0.1) @@ -55,7 +55,7 @@ return [optimizer1, optimizer2], \ [{'scheduler': lr_scheduler1, 'interval': 'step'}, lr_scheduler2] - def configure_optimizers_reduce_lr_on_plateau(self): + def configure_optimizers__reduce_lr_on_plateau(self): optimizer = optim.Adam(self.parameters(), lr=self.hparams.learning_rate) lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer) return [optimizer], [lr_scheduler] diff --git a/tests/base/eval_model_template.py b/tests/base/eval_model_template.py index 77d83b483bcb99..37f4dfbd04144f 100644 --- a/tests/base/eval_model_template.py +++ b/tests/base/eval_model_template.py @@ -33,7 +33,7 @@ class EvalModelTemplate( """ This template houses all combinations of model configurations we want to test """ - def __init__(self, hparams): + def __init__(self, hparams: object) -> None: """Pass in parsed HyperOptArgumentParser to the model.""" # init superclass super().__init__() diff --git a/tests/base/eval_model_test_steps.py b/tests/base/eval_model_test_steps.py index ed8fe75cd37771..b4c80cff06421f 100644 --- a/tests/base/eval_model_test_steps.py +++ b/tests/base/eval_model_test_steps.py @@ -45,7 +45,7 @@ def test_step(self, batch, batch_idx, *args, **kwargs): }) return output - def test_step_multiple_dataloaders(self, batch, batch_idx, dataloader_idx, **kwargs): + def test_step__multiple_dataloaders(self, batch, batch_idx, dataloader_idx, **kwargs): """ Default, baseline test_step :param batch: diff --git a/tests/base/eval_model_valid_steps.py b/tests/base/eval_model_valid_steps.py index 1f40b45f804345..d6c9a847920541 100644 --- a/tests/base/eval_model_valid_steps.py +++ b/tests/base/eval_model_valid_steps.py @@ -51,7 +51,7 @@ def validation_step(self, batch, batch_idx, *args, **kwargs): }) return output - def validation_step_multiple_dataloaders(self, batch, batch_idx, dataloader_idx, **kwargs): + def validation_step__multiple_dataloaders(self, batch, batch_idx, dataloader_idx, **kwargs): """ Lightning calls this inside the validation loop :param batch: diff --git a/tests/base/utils.py b/tests/base/utils.py index 42e6d17d224d11..f27d0bbdcb39c9 100644 --- a/tests/base/utils.py +++ b/tests/base/utils.py @@ -9,7 +9,7 @@ from pytorch_lightning.callbacks import ModelCheckpoint from 
pytorch_lightning.loggers import TensorBoardLogger from tests import TEMP_PATH, RANDOM_PORTS, RANDOM_SEEDS -from tests.base import LightningTestModel +from tests.base import LightningTestModel, EvalModelTemplate from tests.base.datasets import PATH_DATASETS @@ -27,6 +27,8 @@ def assert_speed_parity(pl_times, pt_times, num_epochs): def run_model_test_without_loggers(trainer_options, model, min_acc=0.50): + reset_seed() + # fit model trainer = Trainer(**trainer_options) result = trainer.fit(model) @@ -54,6 +56,7 @@ def run_model_test_without_loggers(trainer_options, model, min_acc=0.50): def run_model_test(trainer_options, model, on_gpu=True, version=None, with_hpc=True): + reset_seed() save_dir = trainer_options['default_root_dir'] # logger file to get meta @@ -95,8 +98,6 @@ def run_model_test(trainer_options, model, on_gpu=True, version=None, with_hpc=T def get_default_hparams(continue_training=False, hpc_exp_number=0): - _ = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) - args = { 'drop_prob': 0.2, 'batch_size': 32, @@ -120,18 +121,6 @@ def get_default_hparams(continue_training=False, hpc_exp_number=0): return hparams -def get_default_model(lbfgs=False): - # set up model with these hyperparams - hparams = get_default_hparams() - if lbfgs: - setattr(hparams, 'optimizer_name', 'lbfgs') - setattr(hparams, 'learning_rate', 0.005) - - model = LightningTestModel(hparams) - - return model, hparams - - def get_default_logger(save_dir, version=None): # set up logger object without actually saving logs logger = TensorBoardLogger(save_dir, name='lightning_logs', version=version) @@ -229,6 +218,7 @@ def reset_seed(): def set_random_master_port(): + reset_seed() port = RANDOM_PORTS.pop() os.environ['MASTER_PORT'] = str(port) diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py index fcd0836f2545a3..2bbcfaea1f191a 100644 --- a/tests/callbacks/test_callbacks.py +++ b/tests/callbacks/test_callbacks.py @@ -214,8 +214,6 @@ def on_test_end(self, trainer, pl_module): def test_early_stopping_no_val_step(tmpdir): """Test that early stopping callback falls back to training metrics when no validation defined.""" - tutils.reset_seed() - class ModelWithoutValStep(LightTrainDataloader, TestModelBase): def training_step(self, *args, **kwargs): @@ -224,8 +222,7 @@ def training_step(self, *args, **kwargs): output.update({'my_train_metric': loss}) return output - hparams = tutils.get_default_hparams() - model = ModelWithoutValStep(hparams) + model = ModelWithoutValStep(tutils.get_default_hparams()) stopping = EarlyStopping(monitor='my_train_metric', min_delta=0.1) @@ -269,7 +266,7 @@ class CurrentTestModel(LightTrainDataloader, TestModelBase): overfit_pct=0.20, max_epochs=5 ) - result = trainer.fit(model) + trainer.fit(model) # These should be different if the dirpath has be overridden assert trainer.ckpt_path != trainer.default_root_dir diff --git a/tests/loggers/test_all.py b/tests/loggers/test_all.py index 383ca263c77099..06e93fa6a23f41 100644 --- a/tests/loggers/test_all.py +++ b/tests/loggers/test_all.py @@ -7,6 +7,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.loggers import ( TensorBoardLogger, MLFlowLogger, NeptuneLogger, TestTubeLogger, CometLogger) +from tests.base import EvalModelTemplate def _get_logger_args(logger_class, save_dir): @@ -29,14 +30,12 @@ def _get_logger_args(logger_class, save_dir): ]) def test_loggers_fit_test(tmpdir, monkeypatch, logger_class): """Verify that basic functionality of all loggers.""" - tutils.reset_seed() - # 
prevent comet logger from trying to print at exit, since # pytest's stdout/stderr redirection breaks it import atexit monkeypatch.setattr(atexit, 'register', lambda _: None) - model, _ = tutils.get_default_model() + model = EvalModelTemplate(tutils.get_default_hparams()) class StoreHistoryLogger(logger_class): def __init__(self, *args, **kwargs): @@ -78,8 +77,6 @@ def log_metrics(self, metrics, step): ]) def test_loggers_pickle(tmpdir, monkeypatch, logger_class): """Verify that pickling trainer with logger works.""" - tutils.reset_seed() - # prevent comet logger from trying to print at exit, since # pytest's stdout/stderr redirection breaks it import atexit diff --git a/tests/loggers/test_base.py b/tests/loggers/test_base.py index 56f6b97b0caa80..1a52dadf9c6218 100644 --- a/tests/loggers/test_base.py +++ b/tests/loggers/test_base.py @@ -7,7 +7,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.loggers import LightningLoggerBase, LoggerCollection from pytorch_lightning.utilities import rank_zero_only -from tests.base import LightningTestModel +from tests.base import LightningTestModel, EvalModelTemplate def test_logger_collection(): @@ -139,7 +139,7 @@ def decorated(metrics, step): return decorated - model, hparams = tutils.get_default_model() + model = EvalModelTemplate(tutils.get_default_hparams()) model.validation_epoch_end = _validation_epoch_end model.training_epoch_end = _training_epoch_end trainer = Trainer( diff --git a/tests/loggers/test_neptune.py b/tests/loggers/test_neptune.py index 4cfdd673478ba9..11961234ed41b1 100644 --- a/tests/loggers/test_neptune.py +++ b/tests/loggers/test_neptune.py @@ -61,10 +61,7 @@ def test_neptune_additional_methods(neptune): def test_neptune_leave_open_experiment_after_fit(tmpdir): """Verify that neptune experiment was closed after training""" - tutils.reset_seed() - - hparams = tutils.get_default_hparams() - model = LightningTestModel(hparams) + model = LightningTestModel(tutils.get_default_hparams()) def _run_training(logger): logger._experiment = MagicMock() diff --git a/tests/loggers/test_trains.py b/tests/loggers/test_trains.py index e4ee78c65419cf..305d0707079b08 100644 --- a/tests/loggers/test_trains.py +++ b/tests/loggers/test_trains.py @@ -8,8 +8,6 @@ def test_trains_logger(tmpdir): """Verify that basic functionality of TRAINS logger works.""" - tutils.reset_seed() - hparams = tutils.get_default_hparams() model = LightningTestModel(hparams) TrainsLogger.set_bypass_mode(True) @@ -33,8 +31,6 @@ def test_trains_logger(tmpdir): def test_trains_pickle(tmpdir): """Verify that pickling trainer with TRAINS logger works.""" - tutils.reset_seed() - # hparams = tutils.get_default_hparams() # model = LightningTestModel(hparams) TrainsLogger.set_bypass_mode(True) diff --git a/tests/loggers/test_wandb.py b/tests/loggers/test_wandb.py index bb2739f95f6e09..3a63fcb9da57e8 100644 --- a/tests/loggers/test_wandb.py +++ b/tests/loggers/test_wandb.py @@ -11,16 +11,14 @@ def test_wandb_logger(wandb): """Verify that basic functionality of wandb logger works. 
Wandb doesn't work well with pytest so we have to mock it out here.""" - tutils.reset_seed() - logger = WandbLogger(anonymous=True, offline=True) logger.log_metrics({'acc': 1.0}) - wandb.init().log.assert_called_once_with({'acc': 1.0}) + wandb.init().log.assert_called_once_with({'acc': 1.0}, step=None) wandb.init().log.reset_mock() logger.log_metrics({'acc': 1.0}, step=3) - wandb.init().log.assert_called_once_with({'global_step': 3, 'acc': 1.0}) + wandb.init().log.assert_called_once_with({'acc': 1.0}, step=3) logger.log_hyperparams({'test': None}) wandb.init().config.update.assert_called_once_with({'test': None}, allow_val_change=True) @@ -38,8 +36,6 @@ def test_wandb_pickle(wandb): Wandb doesn't work well with pytest so we have to mock it out here. """ - tutils.reset_seed() - class Experiment: id = 'the_id' diff --git a/tests/models/data/horovod/train_default_model.py b/tests/models/data/horovod/train_default_model.py index 3410cdc1d50514..5b9a08c6bd420a 100644 --- a/tests/models/data/horovod/train_default_model.py +++ b/tests/models/data/horovod/train_default_model.py @@ -29,7 +29,8 @@ from pytorch_lightning import Trainer # noqa: E402 from pytorch_lightning.callbacks import ModelCheckpoint # noqa: E402 -import tests.base.utils as tutils # noqa: E402 +from tests.base import EvalModelTemplate # noqa: E402 +from tests.base.utils import set_random_master_port, get_default_hparams, run_model_test # noqa: E402 parser = argparse.ArgumentParser() @@ -39,14 +40,13 @@ def run_test_from_config(trainer_options): """Trains the default model with the given config.""" - tutils.reset_seed() - tutils.set_random_master_port() + set_random_master_port() ckpt_path = trainer_options['default_root_dir'] - trainer_options['checkpoint_callback'] = ModelCheckpoint(ckpt_path) + trainer_options.update(checkpoint_callback=ModelCheckpoint(ckpt_path)) - model, hparams = tutils.get_default_model() - tutils.run_model_test(trainer_options, model, on_gpu=args.on_gpu, version=0, with_hpc=False) + model = EvalModelTemplate(get_default_hparams()) + run_model_test(trainer_options, model, on_gpu=args.on_gpu, version=0, with_hpc=False) # Horovod should be initialized following training. If not, this will raise an exception. 
assert hvd.size() == 2 diff --git a/tests/models/test_amp.py b/tests/models/test_amp.py index 81a2325c098974..f4f1d9c20a6e9e 100644 --- a/tests/models/test_amp.py +++ b/tests/models/test_amp.py @@ -6,9 +6,7 @@ import tests.base.utils as tutils from pytorch_lightning import Trainer from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.base import ( - LightningTestModel, -) +from tests.base import LightningTestModel, EvalModelTemplate @pytest.mark.spawn @@ -18,8 +16,6 @@ def test_amp_single_gpu(tmpdir, backend): """Make sure DP/DDP + AMP work.""" tutils.reset_seed() - model, hparams = tutils.get_default_model() - trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, @@ -28,6 +24,7 @@ def test_amp_single_gpu(tmpdir, backend): precision=16 ) + model = EvalModelTemplate(tutils.get_default_hparams()) # tutils.run_model_test(trainer_options, model) result = trainer.fit(model) @@ -39,10 +36,9 @@ def test_amp_single_gpu(tmpdir, backend): @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_amp_multi_gpu(tmpdir, backend): """Make sure DP/DDP + AMP work.""" - tutils.reset_seed() tutils.set_random_master_port() - model, hparams = tutils.get_default_model() + model = EvalModelTemplate(tutils.get_default_hparams()) trainer_options = dict( default_root_dir=tmpdir, @@ -63,8 +59,6 @@ def test_amp_multi_gpu(tmpdir, backend): @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_amp_gpu_ddp_slurm_managed(tmpdir): """Make sure DDP + AMP work.""" - tutils.reset_seed() - # simulate setting slurm flags tutils.set_random_master_port() os.environ['SLURM_LOCALID'] = str(0) @@ -102,8 +96,6 @@ def test_amp_gpu_ddp_slurm_managed(tmpdir): def test_cpu_model_with_amp(tmpdir): """Make sure model trains on CPU.""" - tutils.reset_seed() - trainer_options = dict( default_root_dir=tmpdir, progress_bar_refresh_rate=0, @@ -113,7 +105,7 @@ def test_cpu_model_with_amp(tmpdir): precision=16 ) - model, hparams = tutils.get_default_model() + model = EvalModelTemplate(tutils.get_default_hparams()) with pytest.raises((MisconfigurationException, ModuleNotFoundError)): tutils.run_model_test(trainer_options, model, on_gpu=False) diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index e7b422dcb22cf6..46d1ba6e44aaf3 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -15,13 +15,12 @@ LightTrainDataloader, LightningTestModel, LightTestMixin, + EvalModelTemplate, ) def test_early_stopping_cpu_model(tmpdir): """Test each of the trainer options.""" - tutils.reset_seed() - stopping = EarlyStopping(monitor='val_loss', min_delta=0.1) trainer_options = dict( default_root_dir=tmpdir, @@ -33,7 +32,7 @@ def test_early_stopping_cpu_model(tmpdir): val_percent_check=0.1, ) - model, hparams = tutils.get_default_model() + model = EvalModelTemplate(tutils.get_default_hparams()) tutils.run_model_test(trainer_options, model, on_gpu=False) # test freeze on cpu @@ -49,10 +48,8 @@ def test_early_stopping_cpu_model(tmpdir): reason="Distributed training is not supported on MacOS before Torch 1.3.0") def test_multi_cpu_model_ddp(tmpdir): """Make sure DDP works.""" - tutils.reset_seed() tutils.set_random_master_port() - model, hparams = tutils.get_default_model() trainer_options = dict( default_root_dir=tmpdir, progress_bar_refresh_rate=0, @@ -64,13 +61,12 @@ def test_multi_cpu_model_ddp(tmpdir): distributed_backend='ddp_cpu' ) + model = EvalModelTemplate(tutils.get_default_hparams()) 
tutils.run_model_test(trainer_options, model, on_gpu=False) def test_lbfgs_cpu_model(tmpdir): """Test each of the trainer options.""" - tutils.reset_seed() - trainer_options = dict( default_root_dir=tmpdir, max_epochs=2, @@ -80,15 +76,16 @@ def test_lbfgs_cpu_model(tmpdir): val_percent_check=0.2, ) - model, hparams = tutils.get_default_model(lbfgs=True) - # the test is there for the closure not the performance - tutils.run_model_test_without_loggers(trainer_options, model, min_acc=0.) + hparams = tutils.get_default_hparams() + setattr(hparams, 'optimizer_name', 'lbfgs') + setattr(hparams, 'learning_rate', 0.002) + model = EvalModelTemplate(hparams) + model.configure_optimizers = model.configure_optimizers__lbfgs + tutils.run_model_test_without_loggers(trainer_options, model, min_acc=0.5) def test_default_logger_callbacks_cpu_model(tmpdir): """Test each of the trainer options.""" - tutils.reset_seed() - trainer_options = dict( default_root_dir=tmpdir, max_epochs=1, @@ -99,7 +96,7 @@ def test_default_logger_callbacks_cpu_model(tmpdir): val_percent_check=0.01, ) - model, hparams = tutils.get_default_model() + model = EvalModelTemplate(tutils.get_default_hparams()) tutils.run_model_test_without_loggers(trainer_options, model) # test freeze on cpu @@ -109,8 +106,6 @@ def test_default_logger_callbacks_cpu_model(tmpdir): def test_running_test_after_fitting(tmpdir): """Verify test() on fitted model.""" - tutils.reset_seed() - hparams = tutils.get_default_hparams() model = LightningTestModel(hparams) @@ -143,8 +138,6 @@ def test_running_test_after_fitting(tmpdir): def test_running_test_no_val(tmpdir): """Verify `test()` works on a model with no `val_loader`.""" - tutils.reset_seed() - class CurrentTestModel(LightTrainDataloader, LightTestMixin, TestModelBase): pass @@ -180,8 +173,6 @@ class CurrentTestModel(LightTrainDataloader, LightTestMixin, TestModelBase): @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") def test_single_gpu_batch_parse(): - tutils.reset_seed() - trainer = Trainer() # batch is just a tensor @@ -229,8 +220,6 @@ def test_single_gpu_batch_parse(): def test_simple_cpu(tmpdir): """Verify continue training session on CPU.""" - tutils.reset_seed() - hparams = tutils.get_default_hparams() model = LightningTestModel(hparams) @@ -249,8 +238,6 @@ def test_simple_cpu(tmpdir): def test_cpu_model(tmpdir): """Make sure model trains on CPU.""" - tutils.reset_seed() - trainer_options = dict( default_root_dir=tmpdir, progress_bar_refresh_rate=0, @@ -259,15 +246,13 @@ def test_cpu_model(tmpdir): val_percent_check=0.4 ) - model, hparams = tutils.get_default_model() + model = EvalModelTemplate(tutils.get_default_hparams()) tutils.run_model_test(trainer_options, model, on_gpu=False) def test_all_features_cpu_model(tmpdir): """Test each of the trainer options.""" - tutils.reset_seed() - trainer_options = dict( default_root_dir=tmpdir, gradient_clip_val=1.0, @@ -280,14 +265,12 @@ def test_all_features_cpu_model(tmpdir): val_percent_check=0.4 ) - model, hparams = tutils.get_default_model() + model = EvalModelTemplate(tutils.get_default_hparams()) tutils.run_model_test(trainer_options, model, on_gpu=False) def test_tbptt_cpu_model(tmpdir): """Test truncated back propagation through time works.""" - tutils.reset_seed() - truncated_bptt_steps = 2 sequence_size = 30 batch_size = 30 @@ -358,10 +341,6 @@ def train_dataloader(self): @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") def test_single_gpu_model(tmpdir): """Make sure single 
GPU works (DP mode).""" - tutils.reset_seed() - - model, hparams = tutils.get_default_model() - trainer_options = dict( default_root_dir=tmpdir, progress_bar_refresh_rate=0, @@ -371,8 +350,5 @@ def test_single_gpu_model(tmpdir): gpus=1 ) + model = EvalModelTemplate(tutils.get_default_hparams()) tutils.run_model_test(trainer_options, model) - - -# if __name__ == '__main__': -# pytest.main([__file__]) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index dcd90b08ce9114..dbaf4db8f8ed2f 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -9,7 +9,7 @@ from pytorch_lightning.core import memory from pytorch_lightning.trainer.distrib_parts import parse_gpu_ids, determine_root_gpu_device from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.base import LightningTestModel +from tests.base import LightningTestModel, EvalModelTemplate PRETEND_N_OF_GPUS = 16 @@ -19,11 +19,8 @@ @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_multi_gpu_model(tmpdir, backend): """Make sure DDP works.""" - - tutils.reset_seed() tutils.set_random_master_port() - model, hparams = tutils.get_default_model() trainer_options = dict( default_root_dir=tmpdir, max_epochs=1, @@ -33,6 +30,7 @@ def test_multi_gpu_model(tmpdir, backend): distributed_backend=backend, ) + model = EvalModelTemplate(tutils.get_default_hparams()) # tutils.run_model_test(trainer_options, model) trainer = Trainer(**trainer_options) result = trainer.fit(model) @@ -45,31 +43,27 @@ def test_multi_gpu_model(tmpdir, backend): @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_ddp_all_dataloaders_passed_to_fit(tmpdir): """Make sure DDP works with dataloaders passed to fit()""" - - tutils.reset_seed() tutils.set_random_master_port() - model, hparams = tutils.get_default_model() + trainer_options = dict(default_root_dir=tmpdir, + progress_bar_refresh_rate=0, + max_epochs=1, + train_percent_check=0.4, + val_percent_check=0.2, + gpus=[0, 1], + distributed_backend='ddp') - trainer = Trainer( - default_root_dir=tmpdir, - progress_bar_refresh_rate=0, - max_epochs=1, - train_percent_check=0.4, - val_percent_check=0.2, - gpus=[0, 1], - distributed_backend='ddp' - ) - result = trainer.fit(model, - train_dataloader=model.train_dataloader(), - val_dataloaders=model.val_dataloader()) + model = EvalModelTemplate(tutils.get_default_hparams()) + fit_options = dict(train_dataloader=model.train_dataloader(), + val_dataloaders=model.val_dataloader()) + + trainer = Trainer(**trainer_options) + result = trainer.fit(model, **fit_options) assert result == 1, "DDP doesn't work with dataloaders passed to fit()." 
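The pattern the test above verifies — dataloaders handed straight to `fit()` instead of being defined on the model — distilled into a minimal single-process sketch (no DDP; `EvalModelTemplate` and `tutils` reused from these tests):

```python
import pytorch_lightning as pl

import tests.base.utils as tutils
from tests.base import EvalModelTemplate

# Any LightningModule works; the test template model is reused here.
model = EvalModelTemplate(tutils.get_default_hparams())

# Loaders passed to fit() are used in place of the ones defined on the model.
trainer = pl.Trainer(max_epochs=1)
trainer.fit(
    model,
    train_dataloader=model.train_dataloader(),
    val_dataloaders=model.val_dataloader(),
)
```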
def test_cpu_slurm_save_load(tmpdir): """Verify model save/load/checkpoint on CPU.""" - tutils.reset_seed() - hparams = tutils.get_default_hparams() model = LightningTestModel(hparams) @@ -139,9 +133,6 @@ def assert_pred_same(): @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_multi_gpu_none_backend(tmpdir): """Make sure when using multiple GPUs the user can't use `distributed_backend = None`.""" - tutils.reset_seed() - - model, hparams = tutils.get_default_model() trainer_options = dict( default_root_dir=tmpdir, progress_bar_refresh_rate=0, @@ -151,6 +142,7 @@ def test_multi_gpu_none_backend(tmpdir): gpus='-1' ) + model = EvalModelTemplate(tutils.get_default_hparams()) with pytest.warns(UserWarning): tutils.run_model_test(trainer_options, model) diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index 21a90c191579be..0f41dee6e4fb07 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -40,8 +40,12 @@ def _nccl_available(): def _run_horovod(trainer_options, on_gpu=False): """Execute the training script across multiple workers in parallel.""" - cmdline = ['horovodrun', '-np', '2', sys.executable, TEST_SCRIPT, - '--trainer-options', shlex.quote(json.dumps(trainer_options))] + cmdline = [ + 'horovodrun', + '-np', '2', + sys.executable, TEST_SCRIPT, + '--trainer-options', shlex.quote(json.dumps(trainer_options)) + ] if on_gpu: cmdline += ['--on-gpu'] exit_code = subprocess.call(' '.join(cmdline), shell=True, env=os.environ.copy()) diff --git a/tests/models/test_restore.py b/tests/models/test_restore.py index 0921a3a871f10a..af0165d498ab0e 100644 --- a/tests/models/test_restore.py +++ b/tests/models/test_restore.py @@ -21,8 +21,6 @@ @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_running_test_pretrained_model_distrib(tmpdir, backend): """Verify `test()` on pretrained model.""" - - tutils.reset_seed() tutils.set_random_master_port() hparams = tutils.get_default_hparams() @@ -74,8 +72,6 @@ def test_running_test_pretrained_model_distrib(tmpdir, backend): def test_running_test_pretrained_model_cpu(tmpdir): """Verify test() on pretrained model.""" - tutils.reset_seed() - hparams = tutils.get_default_hparams() model = LightningTestModel(hparams) @@ -113,8 +109,6 @@ def test_running_test_pretrained_model_cpu(tmpdir): def test_load_model_from_checkpoint(tmpdir): """Verify test() on pretrained model.""" - tutils.reset_seed() - hparams = tutils.get_default_hparams() model = LightningTestModel(hparams) @@ -157,9 +151,6 @@ def test_load_model_from_checkpoint(tmpdir): @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_dp_resume(tmpdir): """Make sure DP continues training correctly.""" - - tutils.reset_seed() - hparams = tutils.get_default_hparams() model = LightningTestModel(hparams) @@ -232,8 +223,6 @@ def assert_good_acc(): def test_model_saving_loading(tmpdir): """Tests use case where trainer saves the model, and user loads it from tags independently.""" - tutils.reset_seed() - hparams = tutils.get_default_hparams() model = LightningTestModel(hparams) diff --git a/tests/trainer/test_checks.py b/tests/trainer/test_checks.py index d69ec1e6e243a9..7a68b2ab13bcaa 100755 --- a/tests/trainer/test_checks.py +++ b/tests/trainer/test_checks.py @@ -15,7 +15,6 @@ def test_error_on_no_train_step(tmpdir): """ Test that an error is thrown when no `training_step()` is defined """ - tutils.reset_seed() class 
CurrentTestModel(LightningModule): def forward(self, x): @@ -30,7 +29,6 @@ def forward(self, x): def test_error_on_no_train_dataloader(tmpdir): """ Test that an error is thrown when no `training_dataloader()` is defined """ - tutils.reset_seed() hparams = tutils.get_default_hparams() class CurrentTestModel(TestModelBase): @@ -45,7 +43,6 @@ class CurrentTestModel(TestModelBase): def test_error_on_no_configure_optimizers(tmpdir): """ Test that an error is thrown when no `configure_optimizers()` is defined """ - tutils.reset_seed() class CurrentTestModel(LightTrainDataloader, LightningModule): def forward(self, x): @@ -68,7 +65,6 @@ def test_warning_on_wrong_validation_settings(tmpdir): throw warning if `val_epoch_end()` is not defined * error if `validation_step()` is overriden but `val_dataloader()` is not """ - tutils.reset_seed() hparams = tutils.get_default_hparams() trainer = Trainer(default_root_dir=tmpdir, max_epochs=1) @@ -111,7 +107,6 @@ def test_warning_on_wrong_test_settigs(tmpdir): throw warning if `test_epoch_end()` is not defined * error if `test_step()` is overriden but `test_dataloader()` is not """ - tutils.reset_seed() hparams = tutils.get_default_hparams() trainer = Trainer(default_root_dir=tmpdir, max_epochs=1) diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index 83ff481d694f94..b6f6262ee90e12 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -2,6 +2,8 @@ import pytest import torch +from torch.utils.data.dataloader import DataLoader +from torch.utils.data.dataset import Subset import tests.base.utils as tutils from pytorch_lightning import Trainer @@ -55,7 +57,6 @@ class CurrentTestModel( def test_multiple_val_dataloader(tmpdir): """Verify multiple val_dataloader.""" - tutils.reset_seed() class CurrentTestModel( LightTrainDataloader, @@ -90,7 +91,6 @@ class CurrentTestModel( def test_multiple_test_dataloader(tmpdir): """Verify multiple test_dataloader.""" - tutils.reset_seed() class CurrentTestModel( LightTrainDataloader, @@ -127,7 +127,6 @@ class CurrentTestModel( def test_train_dataloaders_passed_to_fit(tmpdir): """Verify that train dataloader can be passed to fit """ - tutils.reset_seed() class CurrentTestModel(LightTrainDataloader, TestModelBase): pass @@ -149,7 +148,6 @@ class CurrentTestModel(LightTrainDataloader, TestModelBase): def test_train_val_dataloaders_passed_to_fit(tmpdir): """ Verify that train & val dataloader can be passed to fit """ - tutils.reset_seed() class CurrentTestModel( LightTrainDataloader, @@ -178,7 +176,6 @@ class CurrentTestModel( def test_all_dataloaders_passed_to_fit(tmpdir): """Verify train, val & test dataloader can be passed to fit """ - tutils.reset_seed() class CurrentTestModel( LightTrainDataloader, @@ -215,7 +212,6 @@ class CurrentTestModel( def test_multiple_dataloaders_passed_to_fit(tmpdir): """Verify that multiple val & test dataloaders can be passed to fit.""" - tutils.reset_seed() class CurrentTestModel( LightningTestModel, @@ -252,7 +248,6 @@ class CurrentTestModel( def test_mixing_of_dataloader_options(tmpdir): """Verify that dataloaders can be passed to fit""" - tutils.reset_seed() class CurrentTestModel( LightTrainDataloader, @@ -294,7 +289,6 @@ class CurrentTestModel( def test_inf_train_dataloader(tmpdir): """Test inf train data loader (e.g. IterableDataset)""" - tutils.reset_seed() class CurrentTestModel( LightInfTrainDataloader, @@ -336,7 +330,6 @@ class CurrentTestModel( def test_inf_val_dataloader(tmpdir): """Test inf val data loader (e.g. 
IterableDataset)""" - tutils.reset_seed() class CurrentTestModel( LightInfValDataloader, @@ -369,7 +362,6 @@ class CurrentTestModel( def test_inf_test_dataloader(tmpdir): """Test inf test data loader (e.g. IterableDataset)""" - tutils.reset_seed() class CurrentTestModel( LightInfTestDataloader, @@ -404,7 +396,6 @@ class CurrentTestModel( def test_error_on_zero_len_dataloader(tmpdir): """ Test that error is raised if a zero-length dataloader is defined """ - tutils.reset_seed() class CurrentTestModel( LightZeroLenDataloader, @@ -428,7 +419,6 @@ class CurrentTestModel( @pytest.mark.skipif(platform.system() == 'Windows', reason='Does not apply to Windows platform.') def test_warning_with_few_workers(tmpdir): """ Test that error is raised if dataloader with only a few workers is used """ - tutils.reset_seed() class CurrentTestModel( LightTrainDataloader, @@ -494,3 +484,46 @@ class CustomDummyObj: assert isinstance(result, torch.utils.data.DataLoader) assert isinstance(result, CustomDataLoader) assert hasattr(result, 'dummy_kwarg') + + +@pytest.mark.skipif(torch.cuda.device_count() < 3, reason='Test requires multiple GPUs') +def test_batch_size_smaller_than_num_gpus(): + # we need at least 3 gpus for this test + num_gpus = 3 + batch_size = 3 + + class CurrentTestModel( + LightTrainDataloader, + TestModelBase, + ): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.c_d1_bn = torch.nn.ReLU() + + def train_dataloader(self): + dataloader = super().train_dataloader() + # construct a dataset with a size that is not divisible by num_gpus + # therefore the last batch will have a size < num_gpus + size = num_gpus * batch_size + (num_gpus - 1) + dataset = Subset(dataloader.dataset, range(size)) + dataloader = DataLoader( + dataset, + batch_size=self.hparams.batch_size, + drop_last=False, + ) + return dataloader + + hparams = tutils.get_default_hparams() + hparams.batch_size = batch_size + model = CurrentTestModel(hparams) + + trainer = Trainer( + max_epochs=1, + gpus=num_gpus, + ) + + # we expect the reduction for the metrics also to happen on the last batch + # where we will get fewer metrics than gpus + result = trainer.fit(model) + assert 1 == result diff --git a/tests/trainer/test_lr_finder.py b/tests/trainer/test_lr_finder.py index 89154c8231c134..ea2eca3d712ad3 100755 --- a/tests/trainer/test_lr_finder.py +++ b/tests/trainer/test_lr_finder.py @@ -12,8 +12,7 @@ def test_error_on_more_than_1_optimizer(tmpdir): - ''' Check that error is thrown when more than 1 optimizer is passed ''' - tutils.reset_seed() + """ Check that error is thrown when more than 1 optimizer is passed """ class CurrentTestModel( LightTestMultipleOptimizersWithSchedulingMixin, @@ -36,8 +35,7 @@ class CurrentTestModel( def test_model_reset_correctly(tmpdir): - ''' Check that model weights are correctly reset after lr_find() ''' - tutils.reset_seed() + """ Check that model weights are correctly reset after lr_find() """ class CurrentTestModel( LightTrainDataloader, @@ -66,8 +64,7 @@ class CurrentTestModel( def test_trainer_reset_correctly(tmpdir): - ''' Check that all trainer parameters are reset correctly after lr_find() ''' - tutils.reset_seed() + """ Check that all trainer parameters are reset correctly after lr_find() """ class CurrentTestModel( LightTrainDataloader, @@ -104,7 +101,6 @@ class CurrentTestModel( def test_trainer_arg_bool(tmpdir): - tutils.reset_seed() class CurrentTestModel( LightTrainDataloader, @@ -129,7 +125,6 @@ class CurrentTestModel( def test_trainer_arg_str(tmpdir): - 
diff --git a/tests/trainer/test_lr_finder.py b/tests/trainer/test_lr_finder.py
index 89154c8231c134..ea2eca3d712ad3 100755
--- a/tests/trainer/test_lr_finder.py
+++ b/tests/trainer/test_lr_finder.py
@@ -12,8 +12,7 @@

 def test_error_on_more_than_1_optimizer(tmpdir):
-    ''' Check that error is thrown when more than 1 optimizer is passed '''
-    tutils.reset_seed()
+    """ Check that error is thrown when more than 1 optimizer is passed """

     class CurrentTestModel(
         LightTestMultipleOptimizersWithSchedulingMixin,
@@ -36,8 +35,7 @@ class CurrentTestModel(

 def test_model_reset_correctly(tmpdir):
-    ''' Check that model weights are correctly reset after lr_find() '''
-    tutils.reset_seed()
+    """ Check that model weights are correctly reset after lr_find() """

     class CurrentTestModel(
         LightTrainDataloader,
@@ -66,8 +64,7 @@ class CurrentTestModel(

 def test_trainer_reset_correctly(tmpdir):
-    ''' Check that all trainer parameters are reset correctly after lr_find() '''
-    tutils.reset_seed()
+    """ Check that all trainer parameters are reset correctly after lr_find() """

     class CurrentTestModel(
         LightTrainDataloader,
@@ -104,7 +101,6 @@ class CurrentTestModel(

 def test_trainer_arg_bool(tmpdir):
-    tutils.reset_seed()

     class CurrentTestModel(
         LightTrainDataloader,
@@ -129,7 +125,6 @@ class CurrentTestModel(

 def test_trainer_arg_str(tmpdir):
-    tutils.reset_seed()

     class CurrentTestModel(
         LightTrainDataloader,
@@ -155,7 +150,6 @@ class CurrentTestModel(

 def test_call_to_trainer_method(tmpdir):
-    tutils.reset_seed()

     class CurrentTestModel(
         LightTrainDataloader,
diff --git a/tests/trainer/test_optimizers.py b/tests/trainer/test_optimizers.py
index b445dcb2f7173e..be0ac5471d24c7 100644
--- a/tests/trainer/test_optimizers.py
+++ b/tests/trainer/test_optimizers.py
@@ -12,13 +12,12 @@
     LightTestMultipleOptimizersWithSchedulingMixin,
     LightTestOptimizersWithMixedSchedulingMixin,
     LightTestReduceLROnPlateauMixin,
-    LightTestNoneOptimizerMixin
+    LightTestNoneOptimizerMixin, EvalModelTemplate
 )


 def test_optimizer_with_scheduling(tmpdir):
     """ Verify that learning rate scheduling is working """
-    tutils.reset_seed()

     class CurrentTestModel(
         LightTestOptimizerWithSchedulingMixin,
@@ -54,7 +53,6 @@ class CurrentTestModel(

 def test_multi_optimizer_with_scheduling(tmpdir):
     """ Verify that learning rate scheduling is working """
-    tutils.reset_seed()

     class CurrentTestModel(
         LightTestMultipleOptimizersWithSchedulingMixin,
@@ -94,7 +92,6 @@ class CurrentTestModel(

 def test_multi_optimizer_with_scheduling_stepping(tmpdir):
-    tutils.reset_seed()

     class CurrentTestModel(
         LightTestOptimizersWithMixedSchedulingMixin,
@@ -138,7 +135,6 @@ class CurrentTestModel(

 def test_reduce_lr_on_plateau_scheduling(tmpdir):
-    tutils.reset_seed()

     class CurrentTestModel(
         LightTestReduceLROnPlateauMixin,
@@ -168,10 +164,9 @@ class CurrentTestModel(

 def test_optimizer_return_options():
-    tutils.reset_seed()

     trainer = Trainer()
-    model, hparams = tutils.get_default_model()
+    model = EvalModelTemplate(tutils.get_default_hparams())

     # single optimizer
     opt_a = torch.optim.Adam(model.parameters(), lr=0.002)
@@ -226,11 +221,10 @@ def test_optimizer_return_options():

 def test_none_optimizer_warning():
-    tutils.reset_seed()

     trainer = Trainer()

-    model, hparams = tutils.get_default_model()
+    model = EvalModelTemplate(tutils.get_default_hparams())
     model.configure_optimizers = lambda: None

     with pytest.warns(UserWarning, match='will run with no optimizer'):
@@ -238,7 +232,6 @@ def test_none_optimizer_warning():

 def test_none_optimizer(tmpdir):
-    tutils.reset_seed()

     class CurrentTestModel(
         LightTestNoneOptimizerMixin,
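# Sketch, not part of this PR: `test_optimizer_return_options` above checks
# the shapes `configure_optimizers()` may return. Common variants, with the
# lr and step_size values purely illustrative (`self` is a LightningModule):
import torch


def configure_optimizers(self):
    opt = torch.optim.Adam(self.parameters(), lr=0.002)
    sched = torch.optim.lr_scheduler.StepLR(opt, step_size=10)

    return opt                     # 1) a single optimizer
    # return [opt_a, opt_b]        # 2) a list of optimizers
    # return [opt], [sched]        # 3) two lists: optimizers and schedulers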
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index b7344a70b3e400..d72c04ed80e365 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -1,6 +1,7 @@
 import glob
 import math
 import os
+import types
 from argparse import Namespace

 import pytest
@@ -22,7 +23,7 @@
     LightValidationMultipleDataloadersMixin,
     LightTrainDataloader,
     LightTestDataloader,
-    LightValidationMixin,
+    LightValidationMixin, EvalModelTemplate,
 )
@@ -53,7 +54,6 @@ def test_hparams_save_load(tmpdir):

 def test_no_val_module(tmpdir):
     """Tests use case where trainer saves the model, and user loads it from tags independently."""
-    tutils.reset_seed()

     hparams = tutils.get_default_hparams()
@@ -92,7 +92,6 @@ class CurrentTestModel(LightTrainDataloader, TestModelBase):

 def test_no_val_end_module(tmpdir):
     """Tests use case where trainer saves the model, and user loads it from tags independently."""
-    tutils.reset_seed()

     class CurrentTestModel(LightTrainDataloader, LightValidationStepMixin, TestModelBase):
         pass
@@ -132,7 +131,6 @@ def test_gradient_accumulation_scheduling(tmpdir):
     """
     Test grad accumulation by the freq of optimizer updates
     """
-    tutils.reset_seed()

     # test incorrect configs
     with pytest.raises(IndexError):
@@ -205,7 +203,6 @@ def _optimizer_step(self, epoch, batch_idx, optimizer,

 def test_loading_meta_tags(tmpdir):
-    tutils.reset_seed()

     hparams = tutils.get_default_hparams()
@@ -225,7 +222,6 @@ def test_loading_meta_tags(tmpdir):

 def test_dp_output_reduce():
     mixin = TrainerLoggingMixin()
-    tutils.reset_seed()

     # test identity when we have a single gpu
     out = torch.rand(3, 1)
@@ -291,7 +287,6 @@ def mock_save_function(filepath):

 def test_model_freeze_unfreeze():
-    tutils.reset_seed()

     hparams = tutils.get_default_hparams()
     model = LightningTestModel(hparams)
@@ -300,11 +295,8 @@ def test_model_freeze_unfreeze():
     model.unfreeze()


-def test_resume_from_checkpoint(tmpdir):
-    """Verify resuming from checkpoint (epoch, batch numbers and on_load_checkpoint())"""
-    import types
-
-    tutils.reset_seed()
+def test_resume_from_checkpoint_epoch_restored(tmpdir):
+    """Verify resuming from checkpoint runs the right number of epochs"""

     hparams = tutils.get_default_hparams()
@@ -371,8 +363,7 @@ def increment_on_load_checkpoint(self, _):

 def _init_steps_model():
     """private method for initializing a model with 5% train epochs"""
-    tutils.reset_seed()
-    model, _ = tutils.get_default_model()
+    model = EvalModelTemplate(tutils.get_default_hparams())

     # define train epoch to 5% of data
     train_percent = 0.5
@@ -460,7 +451,6 @@ def test_trainer_min_steps_and_epochs(tmpdir):

 def test_benchmark_option(tmpdir):
     """Verify benchmark option."""
-    tutils.reset_seed()

     class CurrentTestModel(
         LightValidationMultipleDataloadersMixin,
@@ -523,7 +513,6 @@ def test_epoch_end(self, outputs):

 def test_disabled_validation():
     """Verify that `val_percent_check=0` disables the validation loop unless `fast_dev_run=True`."""
-    tutils.reset_seed()

     class CurrentModel(LightTrainDataloader, LightValidationMixin, TestModelBase):
@@ -666,7 +655,6 @@ def test_gradient_clipping(tmpdir):
     """
     Test gradient clipping
     """
-    tutils.reset_seed()

     hparams = tutils.get_default_hparams()
     model = LightningTestModel(hparams)
diff --git a/tests/trainer/test_trainer_cli.py b/tests/trainer/test_trainer_cli.py
index b2d1da957bc172..c4c23d0fff4ed3 100644
--- a/tests/trainer/test_trainer_cli.py
+++ b/tests/trainer/test_trainer_cli.py
@@ -13,7 +13,6 @@
             return_value=Namespace(**Trainer.default_attributes()))
 def test_default_args(tmpdir):
     """Tests default argument parser for Trainer"""
-    tutils.reset_seed()

     # logger file to get meta
     logger = tutils.get_default_logger(tmpdir)
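# Sketch, not part of this PR: `test_default_args` above mocks `parse_args`
# to hand back a Namespace built from `Trainer.default_attributes()`, i.e. the
# Trainer's own defaults round-tripped through argparse. Assuming those
# attributes mirror the `__init__` signature (as the mock implies), the
# equivalence the test relies on looks like this:
from argparse import Namespace

from pytorch_lightning import Trainer

args = Namespace(**Trainer.default_attributes())
trainer = Trainer(**vars(args))
assert trainer.max_epochs == args.max_epochs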