From bc67689068a0db11adaf10b32a41bcd33b8ae88e Mon Sep 17 00:00:00 2001 From: William Falcon Date: Fri, 17 Jan 2020 06:03:31 -0500 Subject: [PATCH] clean v2 docs (#691) * updated gitignore * Update README.md * updated gitignore * updated links in ninja file * updated docs * Update README.md * Update README.md * finished callbacks * finished callbacks * finished callbacks * fixed left menu * added callbacks to menu * added direct links to docs * added direct links to docs * added direct links to docs * added direct links to docs * added direct links to docs * fixing TensorBoard (#687) * flake8 * fix typo * fix tensorboardlogger drop test_tube dependence * formatting * fix tensorboard & tests * upgrade Tensorboard * test formatting separately * try to fix JIT issue * add tests for 1.4 * added direct links to docs * updated gitignore * updated links in ninja file * updated docs * finished callbacks * finished callbacks * finished callbacks * fixed left menu * added callbacks to menu * added direct links to docs * added direct links to docs * added direct links to docs * added direct links to docs * added direct links to docs * added direct links to docs * finished rebase * making private members * making private members * making private members * working on trainer docs * working on trainer docs * working on trainer docs * working on trainer docs * working on trainer docs * working on trainer docs * set auto dp if no backend * working on trainer docs * working on trainer docs * working on trainer docs * working on trainer docs * working on trainer docs * working on trainer docs * working on trainer docs * working on trainer docs * fixed lightning import * cleared spaces * cleared spaces * cleared spaces * cleared spaces * cleared spaces * cleared spaces * cleared spaces * cleared spaces * cleared spaces * cleared spaces * finished lightning module * finished lightning module * finished lightning module * finished lightning module * added callbacks * added loggers * added loggers * added loggers * added loggers * added loggers * added loggers * added loggers * added loggers * set auto dp if no backend * added loggers * added loggers * added loggers * added loggers * added loggers * added loggers * flake 8 * flake 8 Co-authored-by: Jirka Borovec --- .gitignore | 2 + docs/source/_templates/theme_variables.jinja | 24 +- docs/source/conf.py | 1 + docs/source/examples.rst | 32 +- docs/source/index.rst | 37 +- docs/source/new-project.rst | 15 +- pytorch_lightning/callbacks/pt_callbacks.py | 127 +++-- pytorch_lightning/core/__init__.py | 90 +-- pytorch_lightning/core/decorators.py | 2 + pytorch_lightning/core/lightning.py | 525 +++++++++++------- pytorch_lightning/logging/__init__.py | 151 ++--- pytorch_lightning/logging/comet.py | 114 ++-- pytorch_lightning/logging/mlflow.py | 18 + pytorch_lightning/logging/neptune.py | 100 +++- pytorch_lightning/logging/tensorboard.py | 27 +- pytorch_lightning/logging/test_tube.py | 37 ++ pytorch_lightning/logging/wandb.py | 43 +- pytorch_lightning/trainer/__init__.py | 24 +- .../trainer/auto_mix_precision.py | 1 + pytorch_lightning/trainer/trainer.py | 410 ++++++++++++-- pytorch_lightning/trainer/training_io.py | 31 -- tests/test_trainer.py | 4 +- 22 files changed, 1158 insertions(+), 657 deletions(-) diff --git a/.gitignore b/.gitignore index 654e70b1cbc1a..43541c9dcbe80 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,8 @@ tests/save_dir default/ lightning_logs/ tests/tests/ +*.rst +/docs/source/*.md # Byte-compiled / optimized / DLL files __pycache__/ diff 
--git a/docs/source/_templates/theme_variables.jinja b/docs/source/_templates/theme_variables.jinja index 4982f35867a49..3a67ad64d384d 100644 --- a/docs/source/_templates/theme_variables.jinja +++ b/docs/source/_templates/theme_variables.jinja @@ -1,17 +1,17 @@ {%- set external_urls = { - 'github': 'https://github.com/williamFalcon/pytorch-lightning', - 'github_issues': 'https://github.com/williamFalcon/pytorch-lightning/issues', - 'contributing': 'https://github.com/williamFalcon/pytorch-lightning/blob/master/CONTRIBUTING.md', - 'docs': 'https://williamfalcon.github.io/pytorch-lightning', + 'github': 'https://github.com/PytorchLightning/pytorch-lightning', + 'github_issues': 'https://github.com/PytorchLightning/pytorch-lightning/issues', + 'contributing': 'https://github.com/PytorchLightning/pytorch-lightning/blob/master/CONTRIBUTING.md', + 'docs': 'https://pytorchlightning.github.io/pytorch-lightning', 'twitter': 'https://twitter.com/PyTorchLightnin', 'discuss': 'https://discuss.pytorch.org', - 'tutorials': 'https://williamfalcon.github.io/pytorch-lightning/', - 'previous_pytorch_versions': 'https://williamfalcon.github.io/pytorch-lightning/', - 'home': 'https://williamfalcon.github.io/pytorch-lightning/', - 'get_started': 'https://williamfalcon.github.io/pytorch-lightning/', - 'features': 'https://williamfalcon.github.io/pytorch-lightning/', - 'blog': 'https://williamfalcon.github.io/pytorch-lightning/', - 'resources': 'https://williamfalcon.github.io/pytorch-lightning/', - 'support': 'https://williamfalcon.github.io/pytorch-lightning/', + 'tutorials': 'https://pytorchlightning.github.io/pytorch-lightning/', + 'previous_pytorch_versions': 'https://pytorchlightning.github.io/pytorch-lightning/', + 'home': 'https://pytorchlightning.github.io/pytorch-lightning/', + 'get_started': 'https://pytorchlightning.github.io/pytorch-lightning/', + 'features': 'https://pytorchlightning.github.io/pytorch-lightning/', + 'blog': 'https://pytorchlightning.github.io/pytorch-lightning/', + 'resources': 'https://pytorchlightning.github.io/pytorch-lightning/', + 'support': 'https://pytorchlightning.github.io/pytorch-lightning/', } -%} diff --git a/docs/source/conf.py b/docs/source/conf.py index da0774a1d864d..fa3558ab7968c 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -83,6 +83,7 @@ 'sphinx.ext.autosummary', 'sphinx.ext.napoleon', 'recommonmark', + 'sphinx.ext.autosectionlabel', # 'm2r', 'nbsphinx', ] diff --git a/docs/source/examples.rst b/docs/source/examples.rst index 3f15f06e4e99d..a734243d5d41b 100644 --- a/docs/source/examples.rst +++ b/docs/source/examples.rst @@ -1,8 +1,34 @@ -Examples & Tutorials -==================== +GAN +==== +.. toctree:: + :maxdepth: 3 + + pl_examples.domain_templates.gan + +MNIST +==== +.. toctree:: + :maxdepth: 3 + + pl_examples.basic_examples.lightning_module_template + +Multi-node (ddp) MNIST +==== +.. toctree:: + :maxdepth: 3 + + pl_examples.multi_node_examples.multi_node_ddp_demo + +Multi-node (ddp2) MNIST +==== +.. toctree:: + :maxdepth: 3 + pl_examples.multi_node_examples.multi_node_ddp2_demo +Imagenet +==== .. toctree:: :maxdepth: 3 - pl_examples \ No newline at end of file + pl_examples.full_examples.imagenet.imagenet_example diff --git a/docs/source/index.rst b/docs/source/index.rst index 0dd7a6af9f681..234331f1e248e 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -3,23 +3,47 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. 
-Welcome to PyTorch-Lightning! +PyTorch-Lightning Documentation ============================= .. toctree:: - :maxdepth: 4 + :maxdepth: 1 :name: start - :caption: Quick Start + :caption: Start Here new-project - examples .. toctree:: :maxdepth: 4 :name: docs - :caption: Docs + :caption: Python API + + callbacks + lightning-module + logging + trainer + +.. toctree:: + :maxdepth: 1 + :name: Examples + :caption: Examples + + examples + +.. toctree:: + :maxdepth: 1 + :name: Tutorials + :caption: Tutorials + + tutorials + +.. toctree:: + :maxdepth: 1 + :name: Common Use Cases + :caption: Common Use Cases + + common-cases - documentation .. toctree:: :maxdepth: 1 @@ -29,6 +53,7 @@ Welcome to PyTorch-Lightning! CODE_OF_CONDUCT.md CONTRIBUTING.md BECOMING_A_CORE_CONTRIBUTOR.md + governance.md Indices and tables diff --git a/docs/source/new-project.rst b/docs/source/new-project.rst index 448e7e3817090..35834518355d8 100644 --- a/docs/source/new-project.rst +++ b/docs/source/new-project.rst @@ -1,13 +1,13 @@ Quick Start =========== -To start a new project define two files, a LightningModule and a Trainer file. -To illustrate Lightning power and simplicity, here's an example of a typical research flow. +| To start a new project define two files, a LightningModule and a Trainer file. +| To illustrate the power of Lightning and its simplicity, here's an example of a typical research flow. Case 1: BERT ------------ -Let's say you're working on something like BERT but want to try different ways of training or even different networks. -You would define a single LightningModule and use flags to switch between your different ideas. +| Let's say you're working on something like BERT but want to try different ways of training or even different networks. +| You would define a single LightningModule and use flags to switch between your different ideas. .. code-block:: python @@ -66,6 +66,7 @@ Then you could do rapid research by switching between these two and using the sa **Notice a few things about this flow:** -1. You're writing pure PyTorch... no unnecessary abstractions or new libraries to learn. -2. You get free GPU and 16-bit support without writing any of that code in your model. -3. You also get all of the capabilities below (without coding or testing yourself). +1. You're writing pure PyTorch... no unnecessary abstractions or new libraries to learn. +2. You get free GPU and 16-bit support without writing any of that code in your model. +3. You also get early stopping, multi-gpu training, 16-bit and MUCH more without coding anything! + diff --git a/pytorch_lightning/callbacks/pt_callbacks.py b/pytorch_lightning/callbacks/pt_callbacks.py index 1951719877199..4c7d877a85bd6 100644 --- a/pytorch_lightning/callbacks/pt_callbacks.py +++ b/pytorch_lightning/callbacks/pt_callbacks.py @@ -1,3 +1,9 @@ +""" +Callbacks +==================================== +Callbacks supported by Lightning +""" + import os import shutil import logging @@ -8,26 +14,7 @@ class Callback(object): - """Abstract base class used to build new callbacks. - - # Properties - * params: dict. Training parameters - (eg. verbosity, batch size, number of epochs...). - Reference of the model being trained. - - The `logs` dictionary that callback methods take as argument will contain keys - for quantities relevant to the current batch or epoch. 
- Currently, the `.fit()` method of the `Sequential` model class will include the following - quantities in the `logs` that it passes to its callbacks: - * on_epoch_end: logs include `acc` and `loss`, and - optionally include `val_loss` - (if validation is enabled in `fit`), and `val_acc` - (if validation and accuracy monitoring are enabled). - * on_batch_begin: logs include `size`, - the number of samples in the current batch. - * on_batch_end: logs include `loss`, and optionally `acc` - (if accuracy monitoring is enabled). - + r"""Abstract base class used to build new callbacks. """ def __init__(self): @@ -43,12 +30,30 @@ def set_model(self, model): self.model = model def on_epoch_begin(self, epoch, logs=None): + """ + called when the epoch begins + + Args: + epoch (int): current epoch + logs (dict): key-value pairs of quantities to monitor + + Example: + + on_epoch_begin(epoch=2, logs={'val_loss': 0.2}) + """ pass def on_epoch_end(self, epoch, logs=None): pass def on_batch_begin(self, batch, logs=None): + """ + called when the batch starts. + + Args: + batch (Tensor): current batch tensor + logs (dict): key-value pairs of quantities to monitor + """ pass def on_batch_end(self, batch, logs=None): @@ -62,18 +67,19 @@ def on_train_end(self, logs=None): class EarlyStopping(Callback): - """Stop training when a monitored quantity has stopped improving. + r""" + Stop training when a monitored quantity has stopped improving. - # Arguments - monitor: quantity to be monitored. - min_delta: minimum change in the monitored quantity + Args: + monitor (str): quantity to be monitored. + min_delta (float): minimum change in the monitored quantity to qualify as an improvement, i.e. an absolute change of less than min_delta, will count as no improvement. - patience: number of epochs with no improvement + patience (int): number of epochs with no improvement after which training will be stopped. - verbose: verbosity mode. - mode: one of {auto, min, max}. In `min` mode, + verbose (bool): verbosity mode. + mode (str): one of {auto, min, max}. In `min` mode, training will stop when the quantity monitored has stopped decreasing; in `max` mode it will stop when the quantity @@ -81,6 +87,13 @@ class EarlyStopping(Callback): mode, the direction is automatically inferred from the name of the monitored quantity. + Example:: + + from pytorch_lightning import Trainer + from pytorch_lightning.callbacks import EarlyStopping + + early_stopping = EarlyStopping('val_loss') + Trainer(early_stop_callback=early_stopping) """ def __init__(self, monitor='val_loss', @@ -150,20 +163,22 @@ def on_train_end(self, logs=None): class ModelCheckpoint(Callback): - """Save the model after every epoch. - - The `filepath` can contain named formatting options, - which will be filled the value of `epoch` and - keys in `logs` (passed in `on_epoch_end`). - For example: if `filepath` is `weights.{epoch:02d}-{val_loss:.2f}.hdf5`, - then the model checkpoints will be saved with the epoch number and - the validation loss in the filename. - - # Arguments - filepath: string, path to save the model file. - monitor: quantity to monitor. - verbose: verbosity mode, 0 or 1. - save_top_k: if `save_top_k == k`, + r""" + + Save the model after every epoch. + + Args: + filepath (str): path to save the model file. + Can contain named formatting options to be auto-filled. 
+ + Example:: + + # save epoch and val_loss in name + ModelCheckpoint(filepath='{epoch:02d}-{val_loss:.2f}.hdf5') + # saves file like: /path/epoch_2-val_loss_0.2.hdf5 + monitor (str): quantity to monitor. + verbose (bool): verbosity mode, 0 or 1. + save_top_k (int): if `save_top_k == k`, the best k models according to the quantity monitored will be saved. if `save_top_k == 0`, no models are saved. @@ -172,7 +187,7 @@ class ModelCheckpoint(Callback): if `save_top_k >= 2` and the callback is called multiple times inside an epoch, the name of the saved file will be appended with a version count starting with `v0`. - mode: one of {auto, min, max}. + mode (str): one of {auto, min, max}. If `save_top_k != 0`, the decision to overwrite the current save file is made based on either the maximization or the @@ -180,11 +195,20 @@ class ModelCheckpoint(Callback): this should be `max`, for `val_loss` this should be `min`, etc. In `auto` mode, the direction is automatically inferred from the name of the monitored quantity. - save_weights_only: if True, then only the model's weights will be + save_weights_only (bool): if True, then only the model's weights will be saved (`model.save_weights(filepath)`), else the full model is saved (`model.save(filepath)`). - period: Interval (number of epochs) between checkpoints. + period (int): Interval (number of epochs) between checkpoints. + + Example:: + from pytorch_lightning import Trainer + from pytorch_lightning.callbacks import ModelCheckpoint + + checkpoint_callback = ModelCheckpoint(filepath='my_path') + Trainer(checkpoint_callback=checkpoint_callback) + + # saves checkpoints to my_path whenever 'val_loss' has a new min """ def __init__(self, filepath, monitor='val_loss', verbose=0, @@ -330,11 +354,20 @@ def on_epoch_end(self, epoch, logs=None): class GradientAccumulationScheduler(Callback): - """Change gradient accumulation factor according to scheduling. + r""" + Change gradient accumulation factor according to scheduling. + + Args: + scheduling (dict): scheduling in format {epoch: accumulation_factor} + + Example:: - # Arguments - scheduling: dict, scheduling in format {epoch: accumulation_factor} + from pytorch_lightning import Trainer + from pytorch_lightning.callbacks import GradientAccumulationScheduler + # at epoch 5 start accumulating every 2 batches + accumulator = GradientAccumulationScheduler(scheduling: {5: 2}) + Trainer(accumulate_grad_batches=accumulator) """ def __init__(self, scheduling: dict): diff --git a/pytorch_lightning/core/__init__.py b/pytorch_lightning/core/__init__.py index c2694eabf5758..17d7619de3663 100644 --- a/pytorch_lightning/core/__init__.py +++ b/pytorch_lightning/core/__init__.py @@ -1,17 +1,16 @@ """ -Lightning Module interface -========================== +A LightningModule is a strict superclass of torch.nn.Module but provides an interface to standardize +the "ingredients" for a research or production system. -A lightning module is a strict superclass of nn.Module, it provides a standard interface - for the trainer to interact with the model. +- The model/system definition (__init__) +- The model/system computations (forward) +- What happens in the training loop (training_step, training_end) +- What happens in the validation loop (validation_step, validation_end) +- What happens in the test loop (test_step, test_end) +- What optimizers to use (configure_optimizers) +- What data to use (train_dataloader, val_dataloader, test_dataloader) -The easiest thing to do is copy the minimal example below and modify accordingly. 
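A quick aside on the GradientAccumulationScheduler docstring added in the callbacks diff above: its example is written as ``GradientAccumulationScheduler(scheduling: {5: 2})``, which is not valid Python for a call (the colon belongs inside a dict literal, not an argument). A minimal sketch of the intended keyword-argument form, using the same epoch-to-factor schedule:

.. code-block:: python

    from pytorch_lightning.callbacks import GradientAccumulationScheduler

    # at epoch 5, start accumulating gradients over every 2 batches
    # (note the keyword argument -- the colon form in the docstring above is a typo)
    accumulator = GradientAccumulationScheduler(scheduling={5: 2})

Whether ``Trainer(accumulate_grad_batches=...)`` expects this callback object or the raw ``{5: 2}`` dict is worth double-checking against the Trainer changes later in this patch; the dict form is the safer assumption.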
- -Otherwise, to Define a Lightning Module, implement the following methods: - - -Minimal example ---------------- +Most methods are optional. Here's a minimal example. .. code-block:: python @@ -28,14 +27,12 @@ class CoolModel(pl.LightningModule): def __init__(self): super(CoolModel, self).__init__() - # not the best model... self.l1 = torch.nn.Linear(28 * 28, 10) def forward(self, x): return torch.relu(self.l1(x.view(x.size(0), -1))) def training_step(self, batch, batch_idx): - # REQUIRED x, y = batch y_hat = self.forward(x) return {'loss': F.cross_entropy(y_hat, y)} @@ -85,66 +82,19 @@ def test_dataloader(self): return DataLoader(MNIST(os.getcwd(), train=False, download=True, transform=transforms.ToTensor()), batch_size=32) - -How do these methods fit into the broader training? ---------------------------------------------------- - -The LightningModule interface is on the right. Each method corresponds - to a part of a research project. Lightning automates everything not in blue. - -.. figure:: docs/source/_static/images/overview_flat.jpg - :align: center - - Overview. - - -Optional Methods ----------------- - -**add_model_specific_args** +Once you've defined the LightningModule, fit it using a trainer. .. code-block:: python + trainer = pl.Trainer() + model = CoolModel() - @staticmethod - def add_model_specific_args(parent_parser, root_dir) + trainer.fit(model) -Lightning has a list of default argparse commands. - This method is your chance to add or modify commands specific to your model. - The `hyperparameter argument parser - `_ - is available anywhere in your model by calling self.hparams. - -**Return** -An argument parser - -**Example** - -.. code-block:: python - - @staticmethod - def add_model_specific_args(parent_parser, root_dir): - parser = HyperOptArgumentParser(strategy=parent_parser.strategy, parents=[parent_parser]) - - # param overwrites - # parser.set_defaults(gradient_clip_val=5.0) - - # network params - parser.opt_list('--drop_prob', default=0.2, options=[0.2, 0.5], type=float, tunable=False) - parser.add_argument('--in_features', default=28*28) - parser.add_argument('--out_features', default=10) - # use 500 for CPU, 50000 for GPU to see speed difference - parser.add_argument('--hidden_dim', default=50000) - - # data - parser.add_argument('--data_root', default=os.path.join(root_dir, 'mnist'), type=str) - - # training params (opt) - parser.opt_list('--learning_rate', default=0.001, type=float, - options=[0.0001, 0.0005, 0.001, 0.005], tunable=False) - parser.opt_list('--batch_size', default=256, type=int, - options=[32, 64, 128, 256], tunable=False) - parser.opt_list('--optimizer_name', default='adam', type=str, - options=['adam'], tunable=False) - return parser +Check out this +`COLAB `_ +for a live demo. 
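Building on the minimal example and the bare ``Trainer()`` call above, here is a hedged sketch of how the pieces documented elsewhere in this patch (TensorBoard logging, early stopping, checkpointing) are typically wired together. ``CoolModel`` is the module from the minimal example; the paths and ``max_epochs`` value are placeholders:

.. code-block:: python

    from pytorch_lightning import Trainer
    from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
    from pytorch_lightning.logging import TensorBoardLogger

    model = CoolModel()

    # logger and callbacks as documented in their own sections of this patch
    trainer = Trainer(
        logger=TensorBoardLogger('tb_logs', name='cool_model'),
        early_stop_callback=EarlyStopping(monitor='val_loss'),   # expects 'val_loss' from the validation loop
        checkpoint_callback=ModelCheckpoint(filepath='checkpoints'),
        max_epochs=10,
    )
    trainer.fit(model)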
""" +from .lightning import LightningModule + +__all__ = ['LightningModule'] diff --git a/pytorch_lightning/core/decorators.py b/pytorch_lightning/core/decorators.py index 0a87e00f57fc7..aeea1a7e44256 100644 --- a/pytorch_lightning/core/decorators.py +++ b/pytorch_lightning/core/decorators.py @@ -1,4 +1,5 @@ import traceback +from functools import wraps def data_loader(fn): @@ -8,6 +9,7 @@ def data_loader(fn): :return: """ + wraps(fn) attr_name = '_lazy_' + fn.__name__ def _get_data_loader(self): diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 3fca3968454ed..e1a328e48cf8b 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1,109 +1,112 @@ + + import os import warnings import collections import logging +import pandas as pd from abc import ABC, abstractmethod from argparse import Namespace import torch import torch.distributed as dist - +# from pytorch_lightning.core.decorators import data_loader from pytorch_lightning.core.grads import GradInformation from pytorch_lightning.core.hooks import ModelHooks -from pytorch_lightning.core.memory import ModelSummary from pytorch_lightning.core.saving import ModelIO -from pytorch_lightning.trainer.training_io import load_hparams_from_tags_csv +from pytorch_lightning.core.memory import ModelSummary from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel class LightningModule(ABC, GradInformation, ModelIO, ModelHooks): - """ - A LightningModule has the following properties which you can access at any time - - **logger** - A reference to the logger you passed into trainer. - Passing a logger is optional. If you don't pass one in, Lightning will create one - for you automatically. This logger saves logs to `/os.getcwd()/lightning_logs`:: - - Trainer(logger=your_logger) - - - Call it from anywhere in your LightningModule to add metrics, images, etc... - whatever your logger supports. - - Here is an example using the TestTubeLogger (which is a wrapper - on 'PyTorch SummaryWriter `_ - with versioned folder structure). - - .. code-block:: python - - # if logger is a tensorboard logger or TestTubeLogger - self.logger.experiment.add_embedding(...) - self.logger.experiment.log({'val_loss': 0.9}) - self.logger.experiment.add_scalars(...) - - - **trainer** - Last resort access to any state the trainer has. - Changing certain properties here could affect your training run. - - .. code-block:: python - - self.trainer.optimizers - self.trainer.current_epoch - ... - - Debugging - --------- - - The LightningModule also offers these tricks to help debug. - - **example_input_array** - - In the LightningModule init, you can set a dummy tensor for this property - to get a print out of sizes coming into and out of every layer. - - .. code-block:: python - - def __init__(self): - # put the dimensions of the first input to your system - self.example_input_array = torch.rand(5, 28 * 28) - - - """ - def __init__(self, *args, **kwargs): super(LightningModule, self).__init__(*args, **kwargs) #: Current dtype self.dtype = torch.FloatTensor + self.exp_save_path = None + #: The current epoch self.current_epoch = 0 + #: Total training batches seen across all epochs self.global_step = 0 + self.loaded_optimizer_states_dict = {} + + #: Pointer to the trainer object self.trainer = None + + #: Pointer to the logger object self.logger = None self.example_input_array = None - # track if gpu was requested for checkpointing #: True if your model is currently running on GPUs. 
#: Useful to set flags around the LightningModule for different CPU vs GPU behavior. self.on_gpu = False + + #: True if using dp self.use_dp = False + + #: True if using ddp self.use_ddp = False + + #: True if using ddp2 self.use_ddp2 = False + + #: True if using amp self.use_amp = False @abstractmethod def forward(self, *args, **kwargs): - """ - Expand model in into whatever you need. - Also need to return the target - :param x: - :return: + r""" + Same as torch.nn.Module.forward(), however in Lightning you want this to define + the operations you want to use for prediction (ie: on a server or as a feature extractor). + + Normally you'd call self.forward() from your training_step() method. This makes it easy to write a complex + system for training with the outputs you'd want in a prediction setting. + + Args: + x (tensor): Whatever you decide to define in the forward method + + Return: + Predicted output + + Example + ------- + + .. code-block:: python + + # example if we were using this model as a feature extractor + def forward(self, x): + feature_maps = self.convnet(x) + return feature_maps + + def training_step(self, batch, batch_idx): + x, y = batch + feature_maps = self.forward(x) + logits = self.classifier(feature_maps) + + # ... + return loss + + # splitting it this way allows model to be used a feature extractor + model = MyModelAbove() + + inputs = server.get_request() + results = model(inputs) + server.write_results(results) + + # ------------- + # This is in stark contrast to torch.nn.Module where normally you would have this: + def forward(self, batch): + x, y = batch + feature_maps = self.convnet(x) + logits = self.classifier(feature_maps) + return logits + """ @abstractmethod @@ -247,12 +250,21 @@ def training_step(self, batch, batch_idx, hiddens): pass def validation_step(self, *args, **kwargs): - """return whatever outputs will need to be aggregated in validation_end + r""" - :param batch: The output of your dataloader. A tensor, tuple or list - :param int batch_idx: Integer displaying which batch this is - :param int dataloader_idx: Integer displaying which dataloader this is (only if multiple val datasets used) - :return dict: Dict or OrderedDict - passed to the validation_end step + This is the validation loop. It is called for each batch of the validation set. + Whatever is returned from here will be passed in as a list on validation_end. + In this step you'd normally generate examples or calculate anything of interest such as accuracy. + + Args: + batch (torch.nn.Tensor | (Tensor, Tensor) | [Tensor, Tensor]): The output of your dataloader. + A tensor, tuple or list + batch_idx (int): The index of this batch + dataloader_idx (int): The index of the dataloader that produced this batch (only if multiple + val datasets used) + + Return: + Dict or OrderedDict - passed to the validation_end step .. code-block:: python @@ -262,14 +274,6 @@ def validation_step(self, batch, batch_idx) # if you have multiple val dataloaders: def validation_step(self, batch, batch_idx, dataloader_idxdx) - If you don't need to validate you don't need to implement this method. - In this step you'd normally generate examples or calculate anything of interest such as accuracy. - - When the validation_step is called, the model has been put in eval mode and PyTorch gradients - have been disabled. At the end of validation, model goes back to training mode and gradients are enabled. - - The dict you return here will be available in the `validation_end` method. 
- Example ------- @@ -311,7 +315,10 @@ def validation_step(self, batch, batch_idx): def validation_step(self, batch, batch_idx, dataset_idx): # dataset_idx tells you which dataset this is. - The `dataset_idx` corresponds to the order of datasets returned in `val_dataloader`. + .. note:: If you don't need to validate you don't need to implement this method. + + .. note:: When the validation_step is called, the model has been put in eval mode and PyTorch gradients + have been disabled. At the end of validation, model goes back to training mode and gradients are enabled. """ pass @@ -522,20 +529,27 @@ def test_end(self, outputs): pass def configure_ddp(self, model, device_ids): - """Override to init DDP in a different way or use your own wrapper. + r""" - :param model: - :param device_ids: - :return: DDP wrapped model + Override to init DDP in your own way or with your own wrapper. + The only requirements are that: - Overwrite to define your own DDP implementation init. - The only requirement is that: 1. On a validation batch the call goes to model.validation_step. 2. On a training batch the call goes to model.training_step. 3. On a testing batch, the call goes to model.test_step + Args: + model (LightningModule): the LightningModule currently being optimized + device_ids (list): the list of GPU ids + + Return: + DDP wrapped model + + Example + ------- .. code-block:: python + # default implementation used in Trainer def configure_ddp(self, model, device_ids): # Lightning DDP simply routes to test_step, val_step, etc... model = LightningDistributedDataParallel( @@ -555,11 +569,17 @@ def configure_ddp(self, model, device_ids): return model def init_ddp_connection(self, proc_rank, world_size): - """Connect all procs in the world using the env:// init - Use the first node as the root address + r""" - Override to init DDP in your own way. + Override to define your custom way of setting up a distributed environment. + Lightning's implementation uses env:// init by default and sets the first node as root. + + Args: + proc_rank (int): The current process rank within the node. + world_size (int): Number of GPUs being use across all nodes. (num_nodes*nb_gpu_nodes). + Example + ------- .. code-block:: python def init_ddp_connection(self): @@ -590,7 +610,11 @@ def init_ddp_connection(self): root_node = self.trainer.resolve_root_node_address(root_node) os.environ['MASTER_ADDR'] = root_node - dist.init_process_group('nccl', rank=self.proc_rank, world_size=self.world_size) + dist.init_process_group( + 'nccl', + rank=self.proc_rank, + world_size=self.world_size + ) """ # use slurm job id for the port number @@ -623,19 +647,24 @@ def init_ddp_connection(self): dist.init_process_group('nccl', rank=proc_rank, world_size=world_size) def configure_apex(self, amp, model, optimizers, amp_level): - """ + r""" Override to init AMP your own way Must return a model and list of optimizers - :param amp: - :param model: - :param optimizers: - :param amp_level: - :return: Apex wrapped model and optimizers - Overwrite to define your own Apex implementation init. + Args: + amp (object): pointer to amp library object + model (LightningModule): pointer to current lightningModule + optimizers (list): list of optimizers passed in configure_optimizers() + amp_level (str): AMP mode chosen ('O1', 'O2', etc...) + + Return: + Apex wrapped model and optimizers + Example + ------- .. code-block:: python + # Default implementation used by Trainer. 
def configure_apex(self, amp, model, optimizers, amp_level): model, optimizers = amp.initialize( model, optimizers, opt_level=amp_level, @@ -651,25 +680,16 @@ def configure_apex(self, amp, model, optimizers, amp_level): @abstractmethod def configure_optimizers(self): - """Return a list of optimizers and a list of schedulers (could be empty) + r""" + + This is where you choose what optimizers and learning-rate schedulers to use in your optimization. + Normally you'd need one. But in the case of GANs or something more esoteric you might have multiple. - :return: any of these 3 options: + Return: any of these 3 options: - Single optimizer - List or Tuple - List of optimizers - Two lists - The first list has multiple optimizers, the second a list of learning-rate schedulers - Set up as many optimizers and (optionally) learning rate schedulers as you need. - Normally you'd need one. But in the case of GANs or something more esoteric you might have multiple. - Lightning will call .backward() and .step() on each one in every epoch. - If you use 16 bit precision it will also handle that. - - .. note:: If you use multiple optimizers, training_step will have an additional `optimizer_idx` parameter. - - .. note:: If you use LBFGS lightning handles the closure function automatically for you - - .. note:: If you use multiple optimizers, gradients will be calculated only - for the parameters of current optimizer at each training step. - Example ------- @@ -693,26 +713,39 @@ def configure_optimizers(self): discriminator_sched = CosineAnnealing(discriminator_opt, T_max=10) return [generator_opt, disriminator_opt], [discriminator_sched] - If you need to control how often those optimizers step or override the default .step() schedule, - override the `optimizer_step` hook. + .. note:: Lightning calls .backward() and .step() on each optimizer and learning rate scheduler as needed. + + .. note:: If you use 16-bit precision (use_amp=True), Lightning will automatically + handle the optimizers for you. + + .. note:: If you use multiple optimizers, training_step will have an additional `optimizer_idx` parameter. + + .. note:: If you use LBFGS lightning handles the closure function automatically for you + + .. note:: If you use multiple optimizers, gradients will be calculated only + for the parameters of current optimizer at each training step. + + .. note:: If you need to control how often those optimizers step or override the default .step() schedule, + override the `optimizer_step` hook. + """ def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None): - """Do something instead of the standard optimizer behavior - - :param int epoch: - :param int batch_idx: - :param optimizer: - :param optimizer_idx: - :param second_order_closure: closure for second order methods - :return: + r""" - Calls `.step()` and `.zero_grad` for each optimizer. - You can override this method to adjust how you do the optimizer step for each optimizer + Override this method to adjust the default way the Trainer calls each optimizer. By default, Lightning + calls .step() and zero_grad() as shown in the example once per optimizer. - Called once per optimizer + Args: + epoch (int): Current epoch + batch_idx (int): Index of current batch + optimizer (torch.nn.Optimizer): A PyTorch optimizer + optimizer_idx (int): If you used multiple optimizers this indexes into that list + second_order_closure (int): closure for second order methods + Example + ------- .. 
code-block:: python # DEFAULT @@ -738,7 +771,7 @@ def optimizer_step(self, current_epoch, batch_idx, optimizer, optimizer_idx, sec # add as many optimizers as you want - This step allows you to do a lot of non-standard training tricks such as learning-rate warm-up: + Here's another example showing how to use this for more advanced things such as learning-rate warm-up: .. code-block:: python @@ -764,18 +797,22 @@ def optimizer_step(self, current_epoch, batch_idx, optimizer, optimizer_idx, sec optimizer.zero_grad() def tbptt_split_batch(self, batch, split_size): - """ - Return list of batch splits. Each split will be passed to forward_step to enable truncated - back propagation through time. The default implementation splits root level Tensors and - Sequences at dim=1 (i.e. time dim). It assumes that each time dim is the same length. + r""" - :param batch: - :param split_size: - :return: + When using truncated backpropagation through time, each batch must be split along the time dimension. + Lightning handles this by default, but for custom behavior override this function. - Called in the training loop after on_batch_start if `truncated_bptt_steps > 0`. - Each returned batch split is passed separately to training_step(...). + Args: + batch (torch.nn.Tensor): Current batch + split_size (int): How big the split is + Return: + list of batch splits. Each split will be passed to forward_step to enable truncated + back propagation through time. The default implementation splits root level Tensors and + Sequences at dim=1 (i.e. time dim). It assumes that each time dim is the same length. + + Example + ------- .. code-block:: python def tbptt_split_batch(self, batch, split_size): @@ -795,6 +832,10 @@ def tbptt_split_batch(self, batch, split_size): splits.append(batch_split) return splits + + .. note:: Called in the training loop after on_batch_start if `truncated_bptt_steps > 0`. + Each returned batch split is passed separately to training_step(...). + """ time_dims = [len(x[0]) for x in batch if isinstance( x, torch.Tensor) or isinstance(x, collections.Sequence)] @@ -861,15 +902,13 @@ def tng_dataloader(self): @data_loader def test_dataloader(self): - """Implement a PyTorch DataLoader. - - :return: PyTorch DataLoader - - If you don't need a test dataset and a test_step, you don't need to implement this method. + r""" Called by lightning during test loop. Make sure to use the @pl.data_loader decorator, - this ensures not calling this function until the data are needed. - If you want to change the data during every epoch DON'T use the data_loader decorator. + this ensures not calling this function until the data are needed. + + Return: + PyTorch DataLoader Example ------- @@ -888,20 +927,44 @@ def test_dataloader(self): return loader + .. note:: If you don't need a test dataset and a test_step, you don't need to implement this method. + + .. note:: If you want to change the data during every epoch DON'T use the data_loader decorator. + """ return None @data_loader def val_dataloader(self): - """Implement a PyTorch DataLoader. + r""" - :return: PyTorch DataLoader or list of PyTorch Dataloaders. + Called by lightning during validation loop. Make sure to use the @pl.data_loader decorator, + this ensures not calling this function until the data are needed. - If you don't need a validation dataset and a validation_step, you don't need to implement this method. + Return: + PyTorch DataLoader - Called by lightning during validation loop. 
Make sure to use the @pl.data_loader decorator, - this ensures not calling this function until the data are needed. - If you want to change the data during every epoch DON'T use the data_loader decorator. + Example + ------- + + .. code-block:: python + + @pl.data_loader + def val_dataloader(self): + transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (1.0,))]) + dataset = MNIST(root='/path/to/mnist/', train=False, transform=transform, download=True) + loader = torch.utils.data.DataLoader( + dataset=dataset, + batch_size=self.hparams.batch_size, + shuffle=True + ) + + return loader + + # can also return multiple dataloaders + @pl.data_loader + def val_dataloader(self): + return [loader_a, loader_b, ..., loader_n] Example ------- @@ -925,30 +988,46 @@ def val_dataloader(self): def val_dataloader(self): return [loader_a, loader_b, ..., loader_n] - In the case where you return multiple `val_dataloaders`, the `validation_step` - will have an arguement `dataset_idx` which matches the order here. + .. note:: If you don't need a validation dataset and a validation_step, you don't need to implement this method. + + .. note:: If you want to change the data during every epoch DON'T use the data_loader decorator. + + .. note:: In the case where you return multiple `val_dataloaders`, the `validation_step` + will have an argument `dataset_idx` which matches the order here. """ return None @classmethod def load_from_metrics(cls, weights_path, tags_csv, map_location=None): - """Primary way of loading model from csv weights path. + r""" + + You should use `load_from_checkpoint` instead! + However, if your .ckpt weights don't have the hyperparameters saved, use this method to pass + in a .csv with the hparams you'd like to use. These will be converted into a argparse.Namespace + and passed into your LightningModule for use. + + Args: - :param str weights_path: Path to a PyTorch checkpoint - :param str tags_csv: Path to meta_tags.csv file generated by the test-tube Experiment - :param dict map_location: A dictionary mapping saved weight GPU devices to new GPU devices - for mapping storage {'cuda:1':'cuda:0'} - :return: The pretrained LightningModule + weights_path (str): Path to a PyTorch checkpoint + tags_csv (str): Path to a .csv with two columns (key, value) as in this + Example:: + key,value + drop_prob,0.2 + batch_size,32 - If you're using `test-tube`, there is an alternate method which uses the meta_tags.csv - file from test-tube to rebuild the model. The `meta_tags.csv` file can be found in the - `test-tube` experiment save_dir. + map_location (dict): A dictionary mapping saved weight GPU devices to new + GPU devices (example: {'cuda:1':'cuda:0'}) + Return: + LightningModule with loaded weights + + Example + ------- .. code-block:: python pretrained_model = MyLightningModule.load_from_metrics( weights_path='/path/to/pytorch_checkpoint.ckpt', - tags_csv='/path/to/test_tube/experiment/version/meta_tags.csv', + tags_csv='/path/to/hparams_file.csv', on_gpu=True, map_location=None ) @@ -957,22 +1036,8 @@ def load_from_metrics(cls, weights_path, tags_csv, map_location=None): pretrained_model.eval() pretrained_model.freeze() y_hat = pretrained_model(x) - - This is the easiest/fastest way which loads hyperparameters and weights from a checkpoint, - such as the one saved by the `ModelCheckpoint` callback - - .. 
code-block:: python - - pretrained_model = MyLightningModule.load_from_checkpoint( - checkpoint_path='/path/to/pytorch_checkpoint.ckpt' - ) - - # predict - pretrained_model.eval() - pretrained_model.freeze() - y_hat = pretrained_model(x) - """ + hparams = load_hparams_from_tags_csv(tags_csv) hparams.__setattr__('on_gpu', False) @@ -992,11 +1057,56 @@ def load_from_metrics(cls, weights_path, tags_csv, map_location=None): @classmethod def load_from_checkpoint(cls, checkpoint_path, map_location=None): - """ - Primary way of loading model from a checkpoint - :param checkpoint_path: - :param map_location: dic for mapping storage {'cuda:1':'cuda:0'} - :return: + r""" + + Primary way of loading model from a checkpoint. When Lightning saves a checkpoint + it stores the hyperparameters in the checkpoint if you initialized your LightningModule + with an argument called `hparams` which is a Namespace or dictionary of hyperparameters + + Example + ------- + .. code-block:: python + + # -------------- + # Case 1 + # when using Namespace (output of using Argparse to parse command line arguments) + from argparse import Namespace + hparams = Namespace(**{'learning_rate': 0.1}) + + model = MyModel(hparams) + + class MyModel(pl.LightningModule): + def __init__(self, hparams): + self.learning_rate = hparams.learning_rate + + # -------------- + # Case 2 + # when using a dict + model = MyModel({'learning_rate': 0.1}) + + class MyModel(pl.LightningModule): + def __init__(self, hparams): + self.learning_rate = hparams['learning_rate'] + + Args: + checkpoint_path (str): Path to checkpoint. + map_location (dic): If your checkpoint saved from a GPU model and you now load on CPUs + or a different number of GPUs, use this to map to the new setup. + + Return: + LightningModule with loaded weights. + + Example + ------- + .. code-block:: python + + # load weights without mapping + MyLightningModule.load_from_checkpoint('path/to/checkpoint.ckpt') + + # load weights mapping all weights from GPU 1 to GPU 0 + map_location = {'cuda:1':'cuda:0'} + MyLightningModule.load_from_checkpoint('path/to/checkpoint.ckpt', map_location=map_location) + """ if map_location is not None: @@ -1027,8 +1137,12 @@ def summarize(self, mode): logging.info('\n' + model_summary.__str__()) def freeze(self): - """Freeze all params for inference + r""" + + Freeze all params for inference + Example + ------- .. code-block:: python model = MyLightningModule(...) @@ -1055,13 +1169,14 @@ def unfreeze(self): self.train() def on_load_checkpoint(self, checkpoint): - """ + r""" - :param checkpoint: + Called by lightning to restore your model. + If you saved something with **on_save_checkpoint** this is your chance to restore this. + + Args: + checkpoint (dict): Loaded checkpoint - Called by lightning to restore your model. Lighting auto-restores global step, epoch, etc... - It also restores the model state_dict. - If you saved something with **on_save_checkpoint** this is your chance to restore this. Example ------- @@ -1072,17 +1187,19 @@ def on_load_checkpoint(self, checkpoint): # 99% of the time you don't need to implement this method self.something_cool_i_want_to_save = checkpoint['something_cool_i_want_to_save'] + .. note:: Lighting auto-restores global step, epoch, and all training state including amp scaling. + No need for you to restore anything regarding training. 
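Circling back to the ``load_from_checkpoint`` docstring above: its Case 1 snippet instantiates ``MyModel`` before the class is defined and omits both ``super().__init__()`` and the ``self.hparams`` assignment that, per this same docstring, is what lets Lightning write the hyperparameters into the checkpoint. A small corrected sketch, assuming the ``CoolModel`` from the core docstring earlier in this patch and a placeholder checkpoint path:

.. code-block:: python

    from argparse import Namespace

    import torch


    class CoolModelWithHparams(CoolModel):
        """CoolModel from the minimal example, extended with an hparams Namespace."""

        def __init__(self, hparams):
            super(CoolModelWithHparams, self).__init__()
            # keeping the Namespace on self.hparams lets the Trainer store the
            # hyperparameters in the checkpoint for load_from_checkpoint
            self.hparams = hparams

        def configure_optimizers(self):
            return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)


    model = CoolModelWithHparams(Namespace(learning_rate=0.02))

    # after Trainer.fit() has written a checkpoint (placeholder path):
    # pretrained = CoolModelWithHparams.load_from_checkpoint('path/to/checkpoint.ckpt')
    # pretrained.freeze()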
""" pass def on_save_checkpoint(self, checkpoint): - """ + r""" - :param checkpoint: + Called by lightning when saving a checkpoint to give you a chance to store anything else you + might want to save - Called by lightning to checkpoint your model. Lightning saves the training state - (current epoch, global_step, etc) and also saves the model state_dict. - If you want to save anything else, use this method to add your own key-value pair. + Args: + checkpoint (dic): Checkpoint to be saved Example ------- @@ -1093,5 +1210,37 @@ def on_save_checkpoint(self, checkpoint): # 99% of use cases you don't need to implement this method checkpoint['something_cool_i_want_to_save'] = my_cool_pickable_object + .. note:: Lighting saves all aspects of training (epoch, global step, etc...) including amp scaling. No need + for you to store anything about training. + """ pass + + +def load_hparams_from_tags_csv(tags_csv): + if not os.path.isfile(tags_csv): + logging.warning(f'Missing Tags: {tags_csv}.') + return Namespace() + + tags_df = pd.read_csv(tags_csv) + dic = tags_df.to_dict(orient='records') + ns_dict = {row['key']: convert(row['value']) for row in dic} + ns = Namespace(**ns_dict) + return ns + + +def convert(val): + constructors = [int, float, str] + + if type(val) is str: + if val.lower() == 'true': + return True + if val.lower() == 'false': + return False + + for c in constructors: + try: + return c(val) + except ValueError: + pass + return val diff --git a/pytorch_lightning/logging/__init__.py b/pytorch_lightning/logging/__init__.py index 9a588bacb87fd..5fbb93cddc14d 100644 --- a/pytorch_lightning/logging/__init__.py +++ b/pytorch_lightning/logging/__init__.py @@ -1,36 +1,20 @@ """ -Lighting offers options for logging information about model, gpu usage, etc, - via several different logging frameworks. It also offers printing options for training monitoring. - -**default_save_path** - -Lightning sets a default TestTubeLogger and CheckpointCallback for you which log to -`os.getcwd()` by default. To modify the logging path you can set:: - - Trainer(default_save_path='/your/path/to/save/checkpoints') - - -If you need more custom behavior (different paths for both, different metrics, etc...) - from the logger and the checkpointCallback, pass in your own instances as explained below. - -Setting up logging ------------------- - -The trainer inits a default logger for you (TestTubeLogger). All logs will -go to the current working directory under a folder named `os.getcwd()/lightning_logs`. - -If you want to modify the default logging behavior even more, pass in a logger - (which should inherit from `LightningBaseLogger`). +Lightning supports most popular logging frameworks (Tensorboard, comet, weights and biases, etc...). +To use a logger, simply pass it into the trainer. .. code-block:: python + from pytorch_lightning import logging - my_logger = MyLightningLogger(...) - trainer = Trainer(logger=my_logger) + # lightning uses tensorboard by default + tb_logger = logging.TensorBoardLogger() + trainer = Trainer(logger=tb_logger) + # or choose from any of the others such as MLFlow, Comet, Neptune, Wandb + comet_logger = logging.CometLogger() + trainer = Trainer(logger=comet_logger) -The path in this logger will overwrite `default_save_path`. - -Lightning supports several common experiment tracking frameworks out of the box +.. note:: All loggers log by default to `os.getcwd()`. 
To change the path without creating a logger set + Trainer(default_save_path='/your/path/to/save/checkpoints') Custom logger ------------- @@ -73,7 +57,7 @@ def finalize(self, status): Using loggers ------------- -You can call the logger anywhere from your LightningModule by doing: +Call the logger anywhere from your LightningModule by doing: .. code-block:: python @@ -84,85 +68,8 @@ def train_step(...): def any_lightning_module_function_or_hook(...): self.logger.experiment.add_histogram(...) -Display metrics in progress bar -------------------------------- - -.. code-block:: python - - # DEFAULT - trainer = Trainer(show_progress_bar=True) - -Log metric row every k batches ------------------------------- - -Every k batches lightning will make an entry in the metrics log - -.. code-block:: python - - # DEFAULT (ie: save a .csv log file every 10 batches) - trainer = Trainer(row_log_interval=10) - -Log GPU memory --------------- - -Logs GPU memory when metrics are logged. - -.. code-block:: python - - # DEFAULT - trainer = Trainer(log_gpu_memory=None) - - # log only the min/max utilization - trainer = Trainer(log_gpu_memory='min_max') - - # log all the GPU memory (if on DDP, logs only that node) - trainer = Trainer(log_gpu_memory='all') - -Process position ----------------- - -When running multiple models on the same machine we want to decide which progress bar to use. - Lightning will stack progress bars according to this value. - -.. code-block:: python - - # DEFAULT - trainer = Trainer(process_position=0) - - # if this is the second model on the node, show the second progress bar below - trainer = Trainer(process_position=1) - - -Save a snapshot of all hyperparameters --------------------------------------- - -Automatically log hyperparameters stored in the `hparams` attribute as an `argparse.Namespace` - -.. code-block:: python - - class MyModel(pl.Lightning): - def __init__(self, hparams): - self.hparams = hparams - - ... - - args = parser.parse_args() - model = MyModel(args) - - logger = TestTubeLogger(...) - t = Trainer(logger=logger) - trainer.fit(model) - -Write logs file to csv every k batches --------------------------------------- - -Every k batches, lightning will write the new logs to disk - -.. code-block:: python - - # DEFAULT (ie: save a .csv log file every 100 batches) - trainer = Trainer(log_save_interval=100) - +Supported Loggers +----------------- """ from os import environ @@ -170,29 +77,41 @@ def __init__(self, hparams): from .base import LightningLoggerBase, rank_zero_only from .tensorboard import TensorBoardLogger +all = [] + try: - from .test_tube import TestTubeLogger + # needed to prevent ImportError and duplicated logs. + environ["COMET_DISABLE_AUTO_LOGGING"] = "1" + + from .comet import CometLogger + all.append('CometLogger') except ImportError: - pass + del environ["COMET_DISABLE_AUTO_LOGGING"] try: from .mlflow import MLFlowLogger + all.append('MLFlowLogger') except ImportError: pass try: - from .wandb import WandbLogger + from .neptune import NeptuneLogger + all.append('NeptuneLogger') except ImportError: pass -try: - # needed to prevent ImportError and duplicated logs. 
- environ["COMET_DISABLE_AUTO_LOGGING"] = "1" - from .comet import CometLogger +all.append('TensorBoardLogger') + +try: + from .test_tube import TestTubeLogger + all.append('TestTubeLogger') except ImportError: - del environ["COMET_DISABLE_AUTO_LOGGING"] + pass try: - from .neptune import NeptuneLogger + from .wandb import WandbLogger + all.append('WandbLogger') except ImportError: pass + +__all__ = all diff --git a/pytorch_lightning/logging/comet.py b/pytorch_lightning/logging/comet.py index 3fe254185b827..fbf4f839cda7a 100644 --- a/pytorch_lightning/logging/comet.py +++ b/pytorch_lightning/logging/comet.py @@ -1,52 +1,3 @@ -""" -Log using `comet `_ - -Comet logger can be used in either online or offline mode. -To log in online mode, CometLogger requries an API key: - -.. code-block:: python - - from pytorch_lightning.logging import CometLogger - # arguments made to CometLogger are passed on to the comet_ml.Experiment class - comet_logger = CometLogger( - api_key=os.environ["COMET_KEY"], - workspace=os.environ["COMET_WORKSPACE"], # Optional - project_name="default_project", # Optional - rest_api_key=os.environ["COMET_REST_KEY"], # Optional - experiment_name="default" # Optional - ) - trainer = Trainer(logger=comet_logger) - -To log in offline mode, CometLogger requires a path to a local directory: - -.. code-block:: python - - from pytorch_lightning.logging import CometLogger - # arguments made to CometLogger are passed on to the comet_ml.Experiment class - comet_logger = CometLogger( - save_dir=".", - workspace=os.environ["COMET_WORKSPACE"], # Optional - project_name="default_project", # Optional - rest_api_key=os.environ["COMET_REST_KEY"], # Optional - experiment_name="default" # Optional - ) - trainer = Trainer(logger=comet_logger) - - -Use the logger anywhere in you LightningModule as follows: - -.. code-block:: python - - def train_step(...): - # example - self.logger.experiment.whatever_comet_ml_supports(...) - - def any_lightning_module_function_or_hook(...): - self.logger.experiment.whatever_comet_ml_supports(...) - - -""" - from logging import getLogger try: @@ -71,18 +22,54 @@ def any_lightning_module_function_or_hook(...): class CometLogger(LightningLoggerBase): def __init__(self, api_key=None, save_dir=None, workspace=None, rest_api_key=None, project_name=None, experiment_name=None, **kwargs): - """Initialize a Comet.ml logger. + r""" + + Log using `comet `_. + Requires either an API Key (online mode) or a local directory path (offline mode) - :param str api_key: Required in online mode. API key, found on Comet.ml - :param str save_dir: Required in offline mode. The path for the directory to save local comet logs - :param str workspace: Optional. Name of workspace for this user - :param str project_name: Optional. Send your experiment to a specific project. - Otherwise will be sent to Uncategorized Experiments. - If project name does not already exists Comet.ml will create a new project. - :param str rest_api_key: Optional. Rest API key found in Comet.ml settings. - This is used to determine version number - :param str experiment_name: Optional. String representing the name for this particular experiment on Comet.ml + .. 
code-block:: python + + # ONLINE MODE + from pytorch_lightning.logging import CometLogger + + # arguments made to CometLogger are passed on to the comet_ml.Experiment class + comet_logger = CometLogger( + api_key=os.environ["COMET_KEY"], + workspace=os.environ["COMET_WORKSPACE"], # Optional + project_name="default_project", # Optional + rest_api_key=os.environ["COMET_REST_KEY"], # Optional + experiment_name="default" # Optional + ) + trainer = Trainer(logger=comet_logger) + + + .. code-block:: python + + # OFFLINE MODE + from pytorch_lightning.logging import CometLogger + + # arguments made to CometLogger are passed on to the comet_ml.Experiment class + comet_logger = CometLogger( + save_dir=".", + workspace=os.environ["COMET_WORKSPACE"], # Optional + project_name="default_project", # Optional + rest_api_key=os.environ["COMET_REST_KEY"], # Optional + experiment_name="default" # Optional + ) + trainer = Trainer(logger=comet_logger) + + Args: + api_key (str): Required in online mode. API key, found on Comet.ml + save_dir (str): Required in offline mode. The path for the directory to save local comet logs + workspace (str): Optional. Name of workspace for this user + project_name (str): Optional. Send your experiment to a specific project. + Otherwise will be sent to Uncategorized Experiments. + If project name does not already exists Comet.ml will create a new project. + rest_api_key (str): Optional. Rest API key found in Comet.ml settings. + This is used to determine version number + experiment_name (str): Optional. String representing the name for this particular experiment on Comet.ml + """ super().__init__() self._experiment = None @@ -124,6 +111,15 @@ def __init__(self, api_key=None, save_dir=None, workspace=None, @property def experiment(self): + r""" + + Actual comet object. To use comet features do the following. + + Example:: + + self.logger.experiment.some_comet_function() + + """ if self._experiment is not None: return self._experiment diff --git a/pytorch_lightning/logging/mlflow.py b/pytorch_lightning/logging/mlflow.py index 5769693d6cc57..50f4843e0f6c9 100644 --- a/pytorch_lightning/logging/mlflow.py +++ b/pytorch_lightning/logging/mlflow.py @@ -39,6 +39,15 @@ def any_lightning_module_function_or_hook(...): class MLFlowLogger(LightningLoggerBase): def __init__(self, experiment_name, tracking_uri=None, tags=None): + r""" + + Logs using MLFlow + + Args: + experiment_name (str): The name of the experiment + tracking_uri (str): where this should track + tags (dict): todo this param + """ super().__init__() self._mlflow_client = mlflow.tracking.MlflowClient(tracking_uri) self.experiment_name = experiment_name @@ -47,6 +56,15 @@ def __init__(self, experiment_name, tracking_uri=None, tags=None): @property def experiment(self): + r""" + + Actual mlflow object. To use mlflow features do the following. + + Example:: + + self.logger.experiment.some_mlflow_function() + + """ return self._mlflow_client @property diff --git a/pytorch_lightning/logging/neptune.py b/pytorch_lightning/logging/neptune.py index 7ce9f1f1f1e5a..7c677962df70e 100644 --- a/pytorch_lightning/logging/neptune.py +++ b/pytorch_lightning/logging/neptune.py @@ -57,34 +57,68 @@ class NeptuneLogger(LightningLoggerBase): def __init__(self, api_key=None, project_name=None, offline_mode=False, experiment_name=None, upload_source_files=None, params=None, properties=None, tags=None, **kwargs): - """Initialize a neptune.ml logger. 
- Requires either an API Key (online mode) or a local directory path (offline mode)
-
- :param str|None api_key: Required in online mode. Neputne API token, found on https://neptune.ml.
- Read how to get your API key https://docs.neptune.ml/python-api/tutorials/get-started.html#copy-api-token.
- :param str project_name: Required in online mode. Qualified name of a project in a form of
- "namespace/project_name" for example "tom/minst-classification".
- If None, the value of NEPTUNE_PROJECT environment variable will be taken.
- You need to create the project in https://neptune.ml first.
- :param bool offline_mode: Optional default False. If offline_mode=True no logs will be send to neptune.
- Usually used for debug purposes.
- :param str|None experiment_name: Optional. Editable name of the experiment.
- Name is displayed in the experiment’s Details (Metadata section) and in experiments view as a column.
- :param list|None upload_source_files: Optional. List of source files to be uploaded.
- Must be list of str or single str. Uploaded sources are displayed in the experiment’s Source code tab.
- If None is passed, Python file from which experiment was created will be uploaded.
- Pass empty list ([]) to upload no files. Unix style pathname pattern expansion is supported.
- For example, you can pass '*.py' to upload all python source files from the current directory.
- For recursion lookup use '**/*.py' (for Python 3.5 and later). For more information see glob library.
- :param dict|None params: Optional. Parameters of the experiment. After experiment creation params are read-only.
- Parameters are displayed in the experiment’s Parameters section and each key-value pair can be
- viewed in experiments view as a column.
- :param dict|None properties: Optional default is {}. Properties of the experiment.
- They are editable after experiment is created. Properties are displayed in the experiment’s Details and
- each key-value pair can be viewed in experiments view as a column.
- :param list|None tags: Optional default []. Must be list of str. Tags of the experiment.
- They are editable after experiment is created (see: append_tag() and remove_tag()).
- Tags are displayed in the experiment’s Details and can be viewed in experiments view as a column.
+ r"""
+
+ Initialize a neptune.ml logger.
+
+ .. note:: Requires either an API Key (online mode) or a local directory path (offline mode)
+
+ .. code-block:: python
+
+ # ONLINE MODE
+ from pytorch_lightning.logging import NeptuneLogger
+ # arguments made to NeptuneLogger are passed on to the neptune.experiments.Experiment class
+
+ neptune_logger = NeptuneLogger(
+ api_key=os.environ["NEPTUNE_API_TOKEN"],
+ project_name="USER_NAME/PROJECT_NAME",
+ experiment_name="default", # Optional
+ params={"max_epochs": 10}, # Optional
+ tags=["pytorch-lightning", "mlp"] # Optional
+ )
+ trainer = Trainer(max_epochs=10, logger=neptune_logger)
+
+ .. code-block:: python
+
+ # OFFLINE MODE
+ from pytorch_lightning.logging import NeptuneLogger
+ # arguments made to NeptuneLogger are passed on to the neptune.experiments.Experiment class
+
+ neptune_logger = NeptuneLogger(
+ offline_mode=True, # run locally, no logs are sent to the neptune server
+ project_name="USER_NAME/PROJECT_NAME",
+ experiment_name="default", # Optional
+ params={"max_epochs": 10}, # Optional
+ tags=["pytorch-lightning", "mlp"] # Optional
+ )
+ trainer = Trainer(max_epochs=10, logger=neptune_logger)
+
+ Args:
+ api_key (str | None): Required in online mode. Neptune API token, found on https://neptune.ml.
+ Read how to get your API key
+ https://docs.neptune.ml/python-api/tutorials/get-started.html#copy-api-token.
+ project_name (str): Required in online mode. Qualified name of a project in the form of
+ "namespace/project_name", for example "tom/mnist-classification".
+ If None, the value of the NEPTUNE_PROJECT environment variable will be taken.
+ You need to create the project in https://neptune.ml first.
+ offline_mode (bool): Optional, default False. If offline_mode=True no logs will be sent to neptune.
+ Usually used for debug purposes.
+ experiment_name (str|None): Optional. Editable name of the experiment.
+ Name is displayed in the experiment’s Details (Metadata section) and in experiments view as a column.
+ upload_source_files (list|None): Optional. List of source files to be uploaded.
+ Must be list of str or single str. Uploaded sources are displayed in the experiment’s Source code tab.
+ If None is passed, the Python file from which the experiment was created will be uploaded.
+ Pass an empty list ([]) to upload no files. Unix style pathname pattern expansion is supported.
+ For example, you can pass '*.py' to upload all python source files from the current directory.
+ For recursive lookup use '**/*.py' (for Python 3.5 and later). For more information see the glob library.
+ params (dict|None): Optional. Parameters of the experiment. After experiment creation params are read-only.
+ Parameters are displayed in the experiment’s Parameters section and each key-value pair can be
+ viewed in experiments view as a column.
+ properties (dict|None): Optional, default is {}. Properties of the experiment.
+ They are editable after the experiment is created. Properties are displayed in the experiment’s Details and
+ each key-value pair can be viewed in experiments view as a column.
+ tags (list|None): Optional, default []. Must be list of str. Tags of the experiment.
+ They are editable after the experiment is created (see: append_tag() and remove_tag()).
+ Tags are displayed in the experiment’s Details and can be viewed in experiments view as a column.
 """
 super().__init__()
 self.api_key = api_key
@@ -111,6 +145,16 @@ def __init__(self, api_key=None, project_name=None, offline_mode=False,
 @property
 def experiment(self):
+ r"""
+
+ Actual neptune object. To use neptune features do the following.
+
+ Example::
+
+ self.logger.experiment.some_neptune_function()
+
+ """
+
 if self._experiment is not None:
 return self._experiment
 else:
diff --git a/pytorch_lightning/logging/tensorboard.py b/pytorch_lightning/logging/tensorboard.py
index 85279ce728d7b..937fc3e6fd37b 100644
--- a/pytorch_lightning/logging/tensorboard.py
+++ b/pytorch_lightning/logging/tensorboard.py
@@ -11,12 +11,15 @@
 class TensorBoardLogger(LightningLoggerBase):
- r"""Log to local file system in TensorBoard format
+ r"""
+
+ Log to local file system in TensorBoard format
 Implemented using :class:`torch.utils.tensorboard.SummaryWriter`. Logs are saved to
 `os.path.join(save_dir, name, version)`
- :example:
+ Example
+ --------
 .. code-block:: python
@@ -24,11 +27,12 @@ class TensorBoardLogger(LightningLoggerBase):
 trainer = Trainer(logger=logger)
 trainer.train(model)
- :param str save_dir: Save directory
- :param str name: Experiment name. Defaults to "default".
- :param int version: Experiment version. If version is not specified the logger inspects the save
+ Args:
+ save_dir (str): Save directory
+ name (str): Experiment name. Defaults to "default".
+ version (int): Experiment version. 
If version is not specified the logger inspects the save directory for existing versions, then automatically assigns the next available version. - :param \**kwargs: Other arguments are passed directly to the :class:`SummaryWriter` constructor. + \**kwargs (dict): Other arguments are passed directly to the :class:`SummaryWriter` constructor. """ NAME_CSV_TAGS = 'meta_tags.csv' @@ -45,10 +49,15 @@ def __init__(self, save_dir, name="default", version=None, **kwargs): @property def experiment(self): - """The underlying :class:`torch.utils.tensorboard.SummaryWriter`. + r""" + + Actual tensorboard object. To use tensorboard features do the following. + + Example:: + + self.logger.experiment.some_tensorboard_function() - :rtype: torch.utils.tensorboard.SummaryWriter - """ + """ if self._experiment is not None: return self._experiment diff --git a/pytorch_lightning/logging/test_tube.py b/pytorch_lightning/logging/test_tube.py index 73da794f7119e..ccb50ef3ec700 100644 --- a/pytorch_lightning/logging/test_tube.py +++ b/pytorch_lightning/logging/test_tube.py @@ -38,6 +38,33 @@ def any_lightning_module_function_or_hook(...): class TestTubeLogger(LightningLoggerBase): + r""" + + Log to local file system in TensorBoard format but using a nicer folder structure. + + Implemented using :class:`torch.utils.tensorboard.SummaryWriter`. Logs are saved to + `os.path.join(save_dir, name, version)` + + Example + -------- + + .. code-block:: python + + logger = TestTubeLogger("tt_logs", name="my_exp_name") + trainer = Trainer(logger=logger) + trainer.train(model) + + Args: + save_dir (str): Save directory + name (str): Experiment name. Defaults to "default". + description (str): A short snippet about this experiment + debug (bool): If True, it doesn't log anything + version (int): Experiment version. If version is not specified the logger inspects the save + directory for existing versions, then automatically assigns the next available version. + create_git_tag (bool): If True creates a git tag to save the code used in this experiment + + """ + __test__ = False def __init__( @@ -55,6 +82,16 @@ def __init__( @property def experiment(self): + r""" + + Actual test-tube object. To use test-tube features do the following. + + Example:: + + self.logger.experiment.some_test_tube_function() + + """ + if self._experiment is not None: return self._experiment diff --git a/pytorch_lightning/logging/wandb.py b/pytorch_lightning/logging/wandb.py index 6846f868f650b..829006d85d0dd 100644 --- a/pytorch_lightning/logging/wandb.py +++ b/pytorch_lightning/logging/wandb.py @@ -1,27 +1,3 @@ -""" -Log using `W&B `_ - -.. code-block:: python - - >>> from pytorch_lightning.logging import WandbLogger - >>> from pytorch_lightning import Trainer - >>> wandb_logger = WandbLogger() - >>> trainer = Trainer(logger=wandb_logger) - - -Use the logger anywhere in you LightningModule as follows: - -.. code-block:: python - - def train_step(...): - # example - self.logger.experiment.whatever_wandb_supports(...) - - def any_lightning_module_function_or_hook(...): - self.logger.experiment.whatever_wandb_supports(...) - -""" - import os try: @@ -44,6 +20,16 @@ class WandbLogger(LightningLoggerBase): anonymous (bool): enables or explicitly disables anonymous logging. project (str): the name of the project to which this run will belong. tags (list of str): tags associated with this run. + + Example + -------- + .. 
code-block:: python + + from pytorch_lightning.logging import WandbLogger + from pytorch_lightning import Trainer + + wandb_logger = WandbLogger() + trainer = Trainer(logger=wandb_logger) """ def __init__(self, name=None, save_dir=None, offline=False, id=None, anonymous=False, @@ -68,6 +54,15 @@ def __getstate__(self): @property def experiment(self): + r""" + + Actual wandb object. To use wandb features do the following. + + Example:: + + self.logger.experiment.some_wandb_function() + + """ if self._experiment is None: if self._offline: os.environ["WANDB_MODE"] = "dryrun" diff --git a/pytorch_lightning/trainer/__init__.py b/pytorch_lightning/trainer/__init__.py index d38d208d9408b..98c2b99b56357 100644 --- a/pytorch_lightning/trainer/__init__.py +++ b/pytorch_lightning/trainer/__init__.py @@ -1,10 +1,17 @@ """ -Trainer -======= -The lightning trainer abstracts best practices for running a training, val, test routine. - It calls parts of your model when it wants to hand over full control and otherwise makes - training assumptions which are now standard practice in AI research. +The trainer de-couples the engineering code (16-bit, early stopping, GPU distribution, etc...) from the +science code (GAN, BERT, your project, etc...). It uses many assumptions which are best practices in +AI research today. + +The trainer automates all parts of training except: + +- what happens in training , test, val loop +- where the data come from +- which optimizers to use +- how to do the computations + +The Trainer delegates those calls to your LightningModule which defines how to do those parts. This is the basic use of the trainer: @@ -12,9 +19,12 @@ from pytorch_lightning import Trainer - model = LightningTemplate() + model = MyLightningModule() trainer = Trainer() trainer.fit(model) - """ + +from .trainer import Trainer + +__all__ = ['Trainer'] diff --git a/pytorch_lightning/trainer/auto_mix_precision.py b/pytorch_lightning/trainer/auto_mix_precision.py index b28193c0bd12d..2915f2465fbb9 100644 --- a/pytorch_lightning/trainer/auto_mix_precision.py +++ b/pytorch_lightning/trainer/auto_mix_precision.py @@ -1,3 +1,4 @@ + from abc import ABC try: diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index a1133004bc448..dd68293a1cf36 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1,6 +1,4 @@ -""" -The trainer handles all the logic for running a val loop, training loop, distributing, etc.. . -""" + import os import sys @@ -50,7 +48,6 @@ class Trainer(TrainerIOMixin, TrainerTrainLoopMixin, TrainerCallbackConfigMixin, ): - def __init__( self, logger=True, @@ -92,52 +89,335 @@ def __init__( truncated_bptt_steps=None, resume_from_checkpoint=None, ): + r""" + + Customize every aspect of training via flags + + Args: + logger (:class:`.Logger`): Logger for experiment tracking. + Example:: + from pytorch_lightning.logging import TensorBoardLogger + + # default logger used by trainer + logger = TensorBoardLogger( + save_dir=os.getcwd(), + version=self.slurm_job_id, + name='lightning_logs' + ) + + Trainer(logger=logger) + checkpoint_callback (:class:`CheckpointCallback`): Callback for checkpointing. 
+ Example:: + from pytorch_lightning.callbacks import ModelCheckpoint + + # default used by the Trainer + checkpoint_callback = ModelCheckpoint( + filepath=os.getcwd(), + save_best_only=True, + verbose=True, + monitor='val_loss', + mode='min', + prefix='' + ) + + trainer = Trainer(checkpoint_callback=checkpoint_callback) + early_stop_callback (:class:`.EarlyStopping`): Callback for early stopping + Example:: + from pytorch_lightning.callbacks import EarlyStopping + + # default used by the Trainer + early_stop_callback = EarlyStopping( + monitor='val_loss', + patience=3, + verbose=True, + mode='min' + ) + + trainer = Trainer(early_stop_callback=early_stop_callback) + default_save_path (str): Default path for logs and weights when no logger/ckpt_callback passed + Example:: + # default used by the Trainer + trainer = Trainer(default_save_path=os.getcwd()) + gradient_clip_val (float): 0 means don't clip. + Example:: + # default used by the Trainer + trainer = Trainer(gradient_clip_val=0.0) + gradient_clip (int): + .. deprecated:: 0.5.0 + Use `gradient_clip_val` instead. Will remove 0.8.0. + + process_position (int): orders the tqdm bar when running multiple models on same machine. + Example:: + # default used by the Trainer + trainer = Trainer(process_position=0) + + num_nodes (int): number of GPU nodes for distributed training. + Example:: + # default used by the Trainer + trainer = Trainer(num_nodes=1) + + # to train on 8 nodes + trainer = Trainer(num_nodes=8) + + nb_gpu_nodes (int): + .. deprecated:: 0.5.0 + Use `num_nodes` instead. Will remove 0.8.0. + + gpus (list|str|int): Which GPUs to train on. + Example:: + # default used by the Trainer (ie: train on CPU) + trainer = Trainer(gpus=None) + + # int: train on 2 gpus + trainer = Trainer(gpus=2) + + # list: train on GPUs 1, 4 (by bus ordering) + trainer = Trainer(gpus=[1, 4]) + trainer = Trainer(gpus='1, 4') # equivalent + + # -1: train on all gpus + trainer = Trainer(gpus=-1) + trainer = Trainer(gpus='-1') # equivalent + + # combine with num_nodes to train on multiple GPUs across nodes + trainer = Trainer(gpus=2, num_nodes=4) # uses 8 gpus in total + + log_gpu_memory (str): None, 'min_max', 'all'. Might slow performance + because it uses the output of nvidia-smi. + Example:: + # default used by the Trainer + trainer = Trainer(log_gpu_memory=None) + + # log all the GPUs (on master node only) + trainer = Trainer(log_gpu_memory='all') + + # log only the min and max memory on the master node + trainer = Trainer(log_gpu_memory='min_max') + + show_progress_bar (bool): If true shows tqdm progress bar + Example:: + # default used by the Trainer + trainer = Trainer(show_progress_bar=True) + + overfit_pct (float): uses this much data of all datasets. + Example:: + # default used by the Trainer + trainer = Trainer(overfit_pct=0.0) + + # use only 1% of the train, test, val datasets + trainer = Trainer(overfit_pct=0.01) + + track_grad_norm (int): -1 no tracking. Otherwise tracks that norm + Example:: + # default used by the Trainer + trainer = Trainer(track_grad_norm=-1) + + # track the 2-norm + trainer = Trainer(track_grad_norm=2) + + check_val_every_n_epoch (int): check val every n train epochs + Example:: + # default used by the Trainer + trainer = Trainer(check_val_every_n_epoch=1) + + # run val loop every 10 training epochs + trainer = Trainer(check_val_every_n_epoch=10) + + fast_dev_run (bool): runs 1 batch of train, test and val to find any bugs (ie: a sort of unit test). 
+ Example::
+ # default used by the Trainer
+ trainer = Trainer(fast_dev_run=False)
+
+ # runs 1 train, val, test batch and program ends
+ trainer = Trainer(fast_dev_run=True)
+
+ accumulate_grad_batches (int|dict): Accumulates grads every k batches or as set up in the dict.
+ Example::
+ # default used by the Trainer (no accumulation)
+ trainer = Trainer(accumulate_grad_batches=1)
+
+ # accumulate every 4 batches (effective batch size is batch*4)
+ trainer = Trainer(accumulate_grad_batches=4)
+
+ # no accumulation for epochs 1-4. accumulate 3 for epochs 5-10. accumulate 20 after that
+ trainer = Trainer(accumulate_grad_batches={5: 3, 10: 20})
+
+ max_epochs (int): Stop training once this number of epochs is reached
+ Example::
+ # default used by the Trainer
+ trainer = Trainer(max_epochs=1000)
+
+ max_nb_epochs (int):
+ .. deprecated:: 0.5.0
+ Use `max_epochs` instead. Will remove 0.8.0.
+
+ min_epochs (int): Force training for at least this many epochs
+ Example::
+ # default used by the Trainer
+ trainer = Trainer(min_epochs=1)
+
+ min_nb_epochs (int):
+ .. deprecated:: 0.5.0
+ Use `min_epochs` instead. Will remove 0.8.0.
+
+ train_percent_check (float): How much of the training dataset to check.
+ Useful when debugging or testing something that happens at the end of an epoch.
+ Example::
+ # default used by the Trainer
+ trainer = Trainer(train_percent_check=1.0)
+
+ # run through only 25% of the training set each epoch
+ trainer = Trainer(train_percent_check=0.25)
+
+ val_percent_check (float): How much of the validation dataset to check.
+ Useful when debugging or testing something that happens at the end of an epoch.
+ Example::
+ # default used by the Trainer
+ trainer = Trainer(val_percent_check=1.0)
+
+ # run through only 25% of the validation set each epoch
+ trainer = Trainer(val_percent_check=0.25)
+
+ test_percent_check (float): How much of the test dataset to check.
+ Useful when debugging or testing something that happens at the end of an epoch.
+ Example::
+ # default used by the Trainer
+ trainer = Trainer(test_percent_check=1.0)
+
+ # run through only 25% of the test set each epoch
+ trainer = Trainer(test_percent_check=0.25)
+
+ val_check_interval (float|int): How often within one training epoch to check the validation set.
+ If float, % of the training epoch. If int, check every n batches.
+ Example::
+ # default used by the Trainer
+ trainer = Trainer(val_check_interval=1.0)
+
+ # check validation set 4 times during a training epoch
+ trainer = Trainer(val_check_interval=0.25)
+
+ # check validation set every 1000 training batches
+ # use this when using an IterableDataset and your dataset has no length
+ # (ie: production cases with streaming data)
+ trainer = Trainer(val_check_interval=1000)
+
+ log_save_interval (int): Writes logs to disk this often
+ Example::
+ # default used by the Trainer
+ trainer = Trainer(log_save_interval=100)
+
+ row_log_interval (int): How often to add logging rows (does not write to disk)
+ Example::
+ # default used by the Trainer
+ trainer = Trainer(row_log_interval=10)
+
+ add_row_log_interval (int):
+ .. deprecated:: 0.5.0
+ Use `row_log_interval` instead. Will remove 0.8.0.
+
+ distributed_backend (str): The distributed backend to use.
+ Options: 'dp', 'ddp', 'ddp2'.
+ Example::
+ # default used by the Trainer
+ trainer = Trainer(distributed_backend=None)
+
+ # dp = DataParallel (split a batch onto k gpus on same machine). 
+ trainer = Trainer(gpus=2, distributed_backend='dp')
+
+ # ddp = DistributedDataParallel
+ # Each gpu trains by itself on a subset of the data.
+ # Gradients sync across all gpus and all machines.
+ trainer = Trainer(gpus=2, num_nodes=2, distributed_backend='ddp')
+
+ # ddp2 = DistributedDataParallel + dp
+ # behaves like dp on every node
+ # syncs gradients across nodes like ddp
+ # useful for things like increasing the number of negative samples
+ trainer = Trainer(gpus=2, num_nodes=2, distributed_backend='ddp2')
+
+ use_amp (bool): If True, uses apex for 16-bit precision
+ Example::
+ # default used by the Trainer
+ trainer = Trainer(use_amp=False)
+
+ print_nan_grads (bool): Prints gradients with nan values
+ Example::
+ # default used by the Trainer
+ trainer = Trainer(print_nan_grads=False)
+
+ weights_summary (str): Prints a summary of the weights when training begins.
+ Options: 'full', 'top', None.
+ Example::
+ # default used by the Trainer (ie: print all weights)
+ trainer = Trainer(weights_summary='full')
+
+ # print only the top level modules
+ trainer = Trainer(weights_summary='top')
+
+ # don't print a summary
+ trainer = Trainer(weights_summary=None)
+
+ weights_save_path (str): Where to save weights if specified.
+ Example::
+ # default used by the Trainer
+ trainer = Trainer(weights_save_path=os.getcwd())
+
+ # save to your custom path
+ trainer = Trainer(weights_save_path='my/path')
+
+ # if checkpoint callback used, then overrides the weights path
+ # **NOTE: this saves weights to some/path NOT my/path
+ checkpoint_callback = ModelCheckpoint(filepath='some/path')
+ trainer = Trainer(
+ checkpoint_callback=checkpoint_callback,
+ weights_save_path='my/path'
+ )
+
+ amp_level (str): The optimization level to use (O1, O2, etc...).
+ Check nvidia docs for level (https://nvidia.github.io/apex/amp.html#opt-levels)
+ Example::
+ # default used by the Trainer
+ trainer = Trainer(amp_level='O1')
+
+ num_sanity_val_steps (int): Sanity check runs n batches of val before starting the training routine.
+ This catches any bugs in your validation without having to wait for the first validation check.
+ The Trainer uses 5 steps by default. Turn it off or modify it here.
+ Example::
+ # default used by the Trainer
+ trainer = Trainer(num_sanity_val_steps=5)
+
+ # turn it off
+ trainer = Trainer(num_sanity_val_steps=0)
+
+ nb_sanity_val_steps (int):
+ .. deprecated:: 0.5.0
+ Use `num_sanity_val_steps` instead. Will remove 0.8.0.
+
+ truncated_bptt_steps (int): Truncated back prop performs backprop every k steps of
+ a much longer sequence. If this is enabled, your batches will automatically get truncated
+ and the trainer will apply Truncated Backprop to it. Make sure your batches have a sequence
+ dimension. (`Williams et al. "An efficient gradient-based algorithm for on-line training of
+ recurrent network trajectories."
+ `_)
+ Example::
+ # default used by the Trainer (ie: disabled)
+ trainer = Trainer(truncated_bptt_steps=None)
+
+ # backprop every 5 steps in a batch
+ trainer = Trainer(truncated_bptt_steps=5)
+
+ resume_from_checkpoint (str): To resume training from a specific checkpoint pass in the path here.
+ Example::
+ # default used by the Trainer
+ trainer = Trainer(resume_from_checkpoint=None)
+
+ # resume from a specific checkpoint
+ trainer = Trainer(resume_from_checkpoint='some/path/to/my_checkpoint.ckpt')
 """
+ #
+ # .. 
warning:: Following arguments become deprecated and they will be removed in v0.8.0: + # - `nb_sanity_val_steps` - :param logger: Logger for experiment tracking - :param checkpoint_callback: Callback for checkpointing - :param early_stop_callback: Callback for early stopping - :param str default_save_path: Default path for logs+weights if no logger/ckpt_callback passed - :param int gradient_clip_val: 0 means don't clip. - :param int gradient_clip: 0 means don't clip. Deprecated. - :param process_position: shown in the tqdm bar - :param int num_nodes: number of GPU nodes - :param list|str|int gpus: int. (ie: 2 gpus) OR list to specify which GPUs [0, 1] OR '0,1' - OR '-1' / -1 to use all available gpus - :param str log_gpu_memory: None, 'min_max', 'all' - :param bool show_progress_bar: If true shows tqdm bar - :param float overfit_pct: uses this much of all datasets - :param int track_grad_norm: -1 no tracking. Otherwise tracks that norm - :param int check_val_every_n_epoch: check val every n train epochs - :param bool fast_dev_run: runs full iteration over everything to find bugs - :param int accumulate_grad_batches: Accumulates grads every k batches - :param int max_epochs: - :param int min_epochs: - :param int train_percent_check: How much of train set to check - :param int val_percent_check: How much of val set to check - :param int test_percent_check: How much of test set to check - :param float|int val_check_interval: If float, % of tng epoch. If int, check every n batch - :param int log_save_interval: Writes logs to disk this often - :param int row_log_interval: How often to add logging rows - :param int add_row_log_interval: How often to add logging rows. Deprecated. - :param str distributed_backend: Options: 'dp', 'ddp', 'ddp2'. - :param bool use_amp: If true uses apex for 16bit precision - :param bool print_nan_grads: Prints nan gradients - :param str weights_summary: Options: 'full', 'top', None to not print. - :param bool weights_save_path: Where to save weights if on cluster - :param str amp_level: Check nvidia docs for level - :param int num_sanity_val_steps: How many val steps before a full train loop. - :param int truncated_bptt_steps: Enables multiple backward passes for each batch. - - .. warning:: Following arguments become deprecated and they will be removed in v0.8.0: - - `gradient_clip`, - - `nb_gpu_nodes`, - - `max_nb_epochs`, - - `min_nb_epochs`, - - `add_row_log_interval`, - - `nb_sanity_val_steps` - - """ # Transfer params # Backward compatibility if nb_gpu_nodes is not None: @@ -384,6 +664,16 @@ def tng_tqdm_dic(self): # MODEL TRAINING # ----------------------------- def fit(self, model): + r""" + Runs the full optimization routine. + + Example:: + + trainer = Trainer() + model = LightningModule() + + trainer.fit() + """ # when using multi-node or DDP within a node start each module in a separate process if self.use_ddp2: task = int(os.environ['SLURM_LOCALID']) @@ -539,6 +829,30 @@ def run_pretrain_routine(self, model): self.train() def test(self, model=None): + r""" + + Separates from fit to make sure you never run on your test set until you want to. + + Args: + model (LightningModule): The model to test. 
+ + Example:: + + # Option 1 + # run test after fitting + trainer = Trainer() + model = LightningModule() + + trainer.fit() + trainer.test() + + # Option 2 + # run test from a loaded model + model = LightningModule.load_from_checkpoint('path/to/checkpoint.ckpt') + trainer = Trainer() + + trainer.test(model) + """ self.testing = True if model is not None: self.fit(model) diff --git a/pytorch_lightning/trainer/training_io.py b/pytorch_lightning/trainer/training_io.py index 91c77fea78d39..168e983585c2a 100644 --- a/pytorch_lightning/trainer/training_io.py +++ b/pytorch_lightning/trainer/training_io.py @@ -96,9 +96,7 @@ from subprocess import call import logging from abc import ABC -from argparse import Namespace -import pandas as pd import torch import torch.distributed as dist @@ -459,32 +457,3 @@ def max_ckpt_in_folder(self, path, name_key='ckpt_'): ckpt_vs.append(int(name)) return max(ckpt_vs) - - -def load_hparams_from_tags_csv(tags_csv): - if not os.path.isfile(tags_csv): - logging.warning(f'Missing Tags: {tags_csv}.') - return Namespace() - - tags_df = pd.read_csv(tags_csv) - dic = tags_df.to_dict(orient='records') - ns_dict = {row['key']: convert(row['value']) for row in dic} - ns = Namespace(**ns_dict) - return ns - - -def convert(val): - constructors = [int, float, str] - - if type(val) is str: - if val.lower() == 'true': - return True - if val.lower() == 'false': - return False - - for c in constructors: - try: - return c(val) - except ValueError: - pass - return val diff --git a/tests/test_trainer.py b/tests/test_trainer.py index 90430e5c01ff4..2301104531cec 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -15,7 +15,7 @@ LightningValidationMultipleDataloadersMixin, LightningTestMultipleDataloadersMixin, ) -from pytorch_lightning.trainer import training_io +from pytorch_lightning.core.lightning import load_hparams_from_tags_csv from pytorch_lightning.trainer.logging import TrainerLoggingMixin @@ -186,7 +186,7 @@ def test_loading_meta_tags(tmpdir): # load tags path_expt_dir = tutils.get_data_path(logger, path_dir=tmpdir) tags_path = os.path.join(path_expt_dir, 'meta_tags.csv') - tags = training_io.load_hparams_from_tags_csv(tags_path) + tags = load_hparams_from_tags_csv(tags_path) assert tags.batch_size == 32 and tags.hidden_dim == 1000
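
Note on the final hunks: the tests now import load_hparams_from_tags_csv from pytorch_lightning.core.lightning instead of trainer.training_io, whose implementation was removed above. The following is a minimal usage sketch of the relocated helper, assuming it keeps the behaviour shown in the removed training_io implementation (CSV path in, argparse.Namespace out); the meta_tags.csv path below is hypothetical:

    import os

    from pytorch_lightning.core.lightning import load_hparams_from_tags_csv

    # hypothetical run directory created by a Lightning logger
    tags_path = os.path.join('lightning_logs', 'version_0', 'meta_tags.csv')

    # returns an argparse.Namespace; values are converted back to int/float/bool/str
    hparams = load_hparams_from_tags_csv(tags_path)
    print(hparams.batch_size)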