diff --git a/docs/source/debugging.rst b/docs/source/debugging.rst
index 96edc8ab81447..bad72541f74e9 100644
--- a/docs/source/debugging.rst
+++ b/docs/source/debugging.rst
@@ -6,7 +6,7 @@ Debugging
=========
The following are flags that make debugging much easier.
------------------
+---
fast_dev_run
------------
@@ -21,7 +21,7 @@ argument of :class:`~pytorch_lightning.trainer.trainer.Trainer`)
    trainer = Trainer(fast_dev_run=True)
------------------
+---
Inspect gradient norms
----------------------
@@ -35,7 +35,7 @@ argument of :class:`~pytorch_lightning.trainer.trainer.Trainer`)
    # the 2-norm
    trainer = Trainer(track_grad_norm=2)
------------------
+---
Log GPU usage
-------------
@@ -48,7 +48,7 @@ argument of :class:`~pytorch_lightning.trainer.trainer.Trainer`)
    trainer = Trainer(log_gpu_memory=True)
------------------
+---
Make model overfit on subset of data
------------------------------------
@@ -70,7 +70,7 @@ argument of :class:`~pytorch_lightning.trainer.trainer.Trainer`)
With this flag, the train, val, and test sets will all be the same train set. We will also replace the sampler in the training set to turn off shuffle for you.
------------------
+---
Print a summary of your LightningModule
---------------------------------------
@@ -99,7 +99,7 @@ See Also:
- :paramref:`~pytorch_lightning.trainer.trainer.Trainer.weights_summary` Trainer argument
- :class:`~pytorch_lightning.core.memory.ModelSummary`
------------------
+---
Shorten epochs
--------------
@@ -116,7 +116,7 @@ On larger datasets like Imagenet, this can help you debug or test a few things f
    # use 10 batches of train and 5 batches of val
    trainer = Trainer(limit_train_batches=10, limit_val_batches=5)
------------------
+---
Set the number of validation sanity steps
-----------------------------------------
diff --git a/docs/source/experiment_logging.rst b/docs/source/experiment_logging.rst
index 199e81c4a00c1..6d6d96a4157f3 100644
--- a/docs/source/experiment_logging.rst
+++ b/docs/source/experiment_logging.rst
@@ -7,7 +7,7 @@ Experiment Logging
==================
--------------------
+---
Comet.ml
^^^^^^^^
@@ -49,7 +49,7 @@ The :class:`~pytorch_lightning.loggers.CometLogger` is available anywhere except
.. seealso:: :class:`~pytorch_lightning.loggers.CometLogger` docs.
--------------------
+---
MLflow
^^^^^^
@@ -76,7 +76,7 @@ Then configure the logger and pass it to the :class:`~pytorch_lightning.trainer.
.. seealso:: :class:`~pytorch_lightning.loggers.MLFlowLogger` docs.
--------------------
+---
Neptune.ai
^^^^^^^^^^
@@ -116,7 +116,7 @@ The :class:`~pytorch_lightning.loggers.NeptuneLogger` is available anywhere exce
.. seealso:: :class:`~pytorch_lightning.loggers.NeptuneLogger` docs.
--------------------
+---
allegro.ai TRAINS
^^^^^^^^^^^^^^^^^
@@ -160,7 +160,7 @@ The :class:`~pytorch_lightning.loggers.TrainsLogger` is available anywhere in yo
.. seealso:: :class:`~pytorch_lightning.loggers.TrainsLogger` docs.
--------------------
+---
Tensorboard
^^^^^^^^^^^
@@ -186,7 +186,7 @@ The :class:`~pytorch_lightning.loggers.TensorBoardLogger` is available anywhere
.. seealso:: :class:`~pytorch_lightning.loggers.TensorBoardLogger` docs.
--------------------
+---
Test Tube
^^^^^^^^^
@@ -221,7 +221,7 @@ The :class:`~pytorch_lightning.loggers.TestTubeLogger` is available anywhere exc
.. seealso:: :class:`~pytorch_lightning.loggers.TestTubeLogger` docs.
--------------------
+---
Weights and Biases
^^^^^^^^^^^^^^^^^^
@@ -257,7 +257,7 @@ The :class:`~pytorch_lightning.loggers.WandbLogger` is available anywhere except
.. seealso:: :class:`~pytorch_lightning.loggers.WandbLogger` docs.
--------------------
+---
Multiple Loggers
^^^^^^^^^^^^^^^^
diff --git a/docs/source/experiment_reporting.rst b/docs/source/experiment_reporting.rst
index 8e534f4cc6d26..6ced7bc786fcb 100644
--- a/docs/source/experiment_reporting.rst
+++ b/docs/source/experiment_reporting.rst
@@ -104,6 +104,7 @@ Here we show the validation loss in the progress bar
Snapshot hyperparameters
^^^^^^^^^^^^^^^^^^^^^^^^
+
When training a model, it's useful to know what hyperparams went into that model. When Lightning creates a checkpoint, it stores a key "hparams" with the hyperparams.
@@ -118,6 +119,7 @@ in the `hparams tab `_.
----------
+---
Callbacks
---------
@@ -940,10 +939,10 @@ And pass the callbacks into the trainer
.. note:: See full list of 12+ hooks in the :ref:`callbacks`.
----------
+---
.. include:: child_modules.rst
----------
+---
.. include:: transfer_learning.rst
diff --git a/docs/source/metrics.rst b/docs/source/metrics.rst
index 38c52dfb8565c..a712059f40cd1 100644
--- a/docs/source/metrics.rst
+++ b/docs/source/metrics.rst
@@ -1,5 +1,6 @@
.. testsetup:: *
+    import torch
    from torch.nn import Module
    from pytorch_lightning.core.lightning import LightningModule
    from pytorch_lightning.metrics import TensorMetric, NumpyMetric
@@ -25,10 +26,6 @@ Example::
    # calculates accuracy across all GPUs and all Nodes used in training
    accuracy(pred, target)
-Out::
-
-    tensor(0.7500)
-
.. warning:: The metrics package is still in development! If we're missing a metric or you find a mistake, please send a PR!
    to a few metrics. Please feel free to create an issue/PR if you have a proposed
@@ -228,7 +225,7 @@ Functional Metrics
------------------
Functional metrics can be called anywhere (even used with just plain PyTorch).
-.. testcode::
+.. code-block:: python
    from pytorch_lightning.metrics.functional import accuracy
@@ -238,10 +235,6 @@ Functional metrics can be called anywhere (even used with just plain PyTorch).
    # calculates accuracy across all GPUs and all Nodes used in training
    accuracy(pred, target)
-.. testoutput::
-
-    tensor(0.7500)
-
These metrics even work when using distributed training:
.. code-block:: python
diff --git a/docs/source/optimizers.rst b/docs/source/optimizers.rst
index d4cb6fe5a93f4..b0d7c5949e991 100644
--- a/docs/source/optimizers.rst
+++ b/docs/source/optimizers.rst
@@ -2,7 +2,7 @@ Optimization
===============
Learning rate scheduling
--------------------------------------
+------------------------
Every optimizer you use can be paired with any `LearningRateScheduler `_.
.. testcode::
@@ -41,7 +41,7 @@ Every optimizer you use can be paired with any `LearningRateScheduler 1 optimizers from :meth:`pytorch_lightning.core.LightningModule.configure_optimizers`
.. testcode::
@@ -73,7 +73,7 @@ Lightning will call each optimizer sequentially:
Step optimizers at arbitrary intervals
-----------------------------------------
+--------------------------------------
To do more interesting things with your optimizers such as learning rate warm-up or odd scheduling, override the :meth:`optimizer_step` function.
diff --git a/docs/source/sequences.rst b/docs/source/sequences.rst
index 857fd08198de8..6b75f323ed3c8 100644
--- a/docs/source/sequences.rst
+++ b/docs/source/sequences.rst
@@ -9,7 +9,7 @@ Lightning has built in support for dealing with sequential data.
Packed sequences as inputs
-----------------------------
+--------------------------
When using PackedSequence, do 2 things:
1. return either a padded tensor in dataset or a list of variable length tensors in the dataloader collate_fn (example above shows the list implementation).
@@ -29,7 +29,7 @@ When using PackedSequence, do 2 things:
    y = rnn.pack_sequence(batch[1], enforce_sorted=False)
Truncated Backpropagation Through Time
----------------------------------------
+--------------------------------------
There are times when multiple backwards passes are needed for each batch. For example, it may save memory to use Truncated Backpropagation Through Time when training RNNs.
@@ -50,7 +50,7 @@ Lightning can handle TBTT automatically via this flag.
    a `hiddens` arg.
Iterable Datasets
----------------------------------------
+-----------------
Lightning supports using IterableDatasets as well as map-style Datasets. IterableDatasets provide a more natural option when using sequential data.
diff --git a/docs/source/single_gpu.rst b/docs/source/single_gpu.rst
index c6fa1b9af9bbc..4348197fbabf2 100644
--- a/docs/source/single_gpu.rst
+++ b/docs/source/single_gpu.rst
@@ -3,7 +3,7 @@ from pytorch_lightning.trainer.trainer import Trainer
Single GPU Training
-====================
+===================
Make sure you are running on a machine that has at least one GPU. Lightning handles all the NVIDIA flags for you, there's no need to set them yourself.
diff --git a/docs/source/tpu.rst b/docs/source/tpu.rst
index 774af763c78f1..ddc633b2754b2 100644
--- a/docs/source/tpu.rst
+++ b/docs/source/tpu.rst
@@ -5,13 +5,13 @@ Lightning supports running on TPUs. At this moment, TPUs are available
on Google Cloud (GCP), Google Colab and Kaggle Environments. For more information on TPUs
`watch this video `_.
----------------
+---
Live demo
----------
Check out this `Google Colab `_ to see how to train MNIST on TPUs.
----------------
+---
TPU Terminology
---------------
@@ -23,17 +23,17 @@ A TPU pod hosts many TPUs on it. Currently, TPU pod v2 has 2048 cores!
You can request a full pod from Google cloud or a "slice" which gives you some subset of those 2048 cores.
----------------
+---
How to access TPUs
--------------------
+------------------
To access TPUs there are two main ways.
1. Using google colab.
2. Using Google Cloud (GCP).
3. Using Kaggle.
----------------
+---
Colab TPUs
-----------
@@ -65,7 +65,7 @@ To get a TPU on colab, follow these steps:
6. Then set up your LightningModule as normal.
----------------
+---
DistributedSamplers
-------------------
@@ -122,27 +122,27 @@ To use a full TPU pod skip to the TPU pod section.
That's it! Your model will train on all 8 TPU cores.
----------------
+---
Single TPU core training
-----------------------------
+------------------------
Lightning supports training on a single TPU core. Just pass the TPU core ID [1-8] in a list.
.. code-block:: python
    trainer = pl.Trainer(tpu_cores=[1])
----------------
+---
Distributed Backend with TPU
----------------------------
The ```distributed_backend``` option used for GPUs does not apply to TPUs. TPUs work in DDP mode by default (distributing over each core)
----------------
+---
TPU Pod
---------
+-------
To train on more than 8 cores, your code actually doesn't change! All you need to do is submit the following command:
@@ -153,7 +153,7 @@ All you need to do is submit the following command:
    --conda-env=torch-xla-nightly
    -- python /usr/share/torch-xla-0.5/pytorch/xla/test/test_train_imagenet.py --fake_data
----------------
+---
16 bit precision
-----------------
@@ -171,7 +171,7 @@ set the 16-bit flag.
Under the hood the xla library will use the `bfloat16 type `_.
----------------
+---
About XLA
----------
diff --git a/docs/source/training_tricks.rst b/docs/source/training_tricks.rst
index 9140f52aba363..1d15981a0910d 100644
--- a/docs/source/training_tricks.rst
+++ b/docs/source/training_tricks.rst
@@ -8,7 +8,7 @@ Training Tricks
Lightning implements various tricks to help during training
Accumulate gradients
-------------------------------------
+--------------------
Accumulated gradients runs K small batches of size N before doing a backwards pass. The effect is a large effective batch size of size KxN.
@@ -21,7 +21,7 @@ The effect is a large effective batch size of size KxN.
Gradient Clipping
-------------------------------------
+-----------------
Gradient clipping may be enabled to avoid exploding gradients. Specifically, this will `clip the gradient norm `_ computed over all model parameters together.
diff --git a/pytorch_lightning/metrics/functional/classification.py b/pytorch_lightning/metrics/functional/classification.py
index 4578851cf660b..74d8cf3d62a7e 100644
--- a/pytorch_lightning/metrics/functional/classification.py
+++ b/pytorch_lightning/metrics/functional/classification.py
@@ -209,14 +209,14 @@ def accuracy(
    Example:
-        >>> x = torch.tensor([1, 2, 3])
-        >>> y = torch.tensor([0, 2, 3])
+        >>> x = torch.tensor([0, 1, 2, 3])
+        >>> y = torch.tensor([0, 1, 2, 2])
        >>> accuracy(x, y)
-        tensor(0.6667)
+        tensor(0.7500)
    """
-    tps, fps, tns, fns, sups = stat_scores_multiple_classes(pred=pred, target=target,
-                                                            num_classes=num_classes)
+    tps, fps, tns, fns, sups = stat_scores_multiple_classes(
+        pred=pred, target=target, num_classes=num_classes)
    if not (target > 0).any() and num_classes is None:
        raise RuntimeError("cannot infer num_classes when target is all zero")
@@ -539,7 +539,7 @@ def roc(
        >>> x = torch.tensor([0, 1, 2, 3])
        >>> y = torch.tensor([0, 1, 2, 2])
-        >>> fpr, tpr, thresholds = roc(x,y)
+        >>> fpr, tpr, thresholds = roc(x, y)
        >>> fpr
        tensor([0.0000, 0.3333, 0.6667, 0.6667, 1.0000])
        >>> tpr
diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py
index 5b7b0aeac76db..86ef38b2e2f90 100644
--- a/pytorch_lightning/trainer/data_loading.py
+++ b/pytorch_lightning/trainer/data_loading.py
@@ -285,8 +285,6 @@ def _reset_eval_dataloader(
        for i, dataloader in enumerate(dataloaders):
            num_batches = 0
            self._worker_check(dataloader, f'{mode} dataloader {i}')
-            if not _has_len(dataloader):
-                num_batches = float('inf')
            # percent or num_steps
            limit_eval_batches = getattr(self, f'limit_{mode}_batches')
diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py
index 230538ed89c01..b6037d7338ae5 100644
--- a/pytorch_lightning/trainer/evaluation_loop.py
+++ b/pytorch_lightning/trainer/evaluation_loop.py
@@ -222,13 +222,20 @@ def reset_test_dataloader(self, *args):
    def reset_val_dataloader(self, *args):
        """Warning: this is just empty shell for code implemented in other class."""
-    def _evaluate(self, model: LightningModule, dataloaders, max_batches: List[int], test_mode: bool = False):
+    def _evaluate(
+        self,
+        model: LightningModule,
+        dataloaders: List[DataLoader],
+        max_batches: Union[int, List[int]],
+        test_mode: bool = False
+    ):
        """Run evaluation code.
        Args:
-            model: PT model
-            dataloaders: list of PT dataloaders
-            max_batches: List of scalars
+            model: The model to evaluate.
+            dataloaders: A list of PyTorch dataloaders.
+            max_batches: An integer (applied to every dataloader) or a list of integers with one
+                entry per dataloader, giving the number of batches to run in that dataloader.
            test_mode:
        """
        # enable eval mode
@@ -244,6 +251,10 @@ def _evaluate(self, model: LightningModule, dataloaders, max_batches: List[int],
        # bookkeeping
        outputs = []
+        # convert max_batches to list
+        if isinstance(max_batches, int):
+            max_batches = [max_batches] * len(dataloaders)
+
        # run validation
        for dataloader_idx, dataloader in enumerate(dataloaders):
            dl_outputs = []
diff --git a/tests/test_deprecated.py b/tests/test_deprecated.py
index be2fda9c905ba..119067520cea3 100644
--- a/tests/test_deprecated.py
+++ b/tests/test_deprecated.py
@@ -134,7 +134,7 @@ def test_tbd_remove_in_v1_0_0_model_hooks():
    with pytest.deprecated_call(match='v1.0'):
        trainer = Trainer(logger=False)
        # TODO: why `dataloder` is required if it is not used
-        result = trainer._evaluate(model, dataloaders=[[None]], max_batches=[1])
+        result = trainer._evaluate(model, dataloaders=[[None]], max_batches=1)
    assert result == {'val_loss': torch.tensor(0.6)}
    model = ModelVer0_7(hparams)
@@ -147,5 +147,5 @@ def test_tbd_remove_in_v1_0_0_model_hooks():
    with pytest.deprecated_call(match='v1.0'):
        trainer = Trainer(logger=False)
        # TODO: why `dataloder` is required if it is not used
-        result = trainer._evaluate(model, dataloaders=[[None]], max_batches=[1])
+        result = trainer._evaluate(model, dataloaders=[[None]], max_batches=1)
    assert result == {'val_loss': torch.tensor(0.7)}
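The `_evaluate` change above lets callers pass `max_batches` either as a single int or as a per-dataloader list of ints. Below is a minimal sketch of that broadcasting rule, outside the patch itself; the helper name `normalize_max_batches` is hypothetical and only illustrates the behaviour the new `isinstance` check provides.

.. code-block:: python

    from typing import List, Union

    def normalize_max_batches(max_batches: Union[int, List[int]], num_dataloaders: int) -> List[int]:
        # Mirror the new _evaluate behaviour: a single int limit is broadcast
        # so that every dataloader gets the same batch budget, while an
        # explicit list is used as-is, one entry per dataloader.
        if isinstance(max_batches, int):
            return [max_batches] * num_dataloaders
        return max_batches

    # An int limit applies to each of the 3 dataloaders ...
    assert normalize_max_batches(5, 3) == [5, 5, 5]
    # ... while a list already spells out the per-dataloader limits.
    assert normalize_max_batches([2, 4, 8], 3) == [2, 4, 8]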