From 8e4d284861ee5051406ed342ef2d85e37de42871 Mon Sep 17 00:00:00 2001 From: edenlightning <66261195+edenlightning@users.noreply.github.com> Date: Wed, 17 Feb 2021 16:46:00 -0500 Subject: [PATCH 01/20] docs for prun + quantization --- docs/source/advanced/pruning_quantization.rst | 94 +++++++++++++++++++ docs/source/extensions/callbacks.rst | 1 + docs/source/index.rst | 1 + pytorch_lightning/callbacks/quantization.py | 55 ++++++----- 4 files changed, 125 insertions(+), 26 deletions(-) create mode 100644 docs/source/advanced/pruning_quantization.rst diff --git a/docs/source/advanced/pruning_quantization.rst b/docs/source/advanced/pruning_quantization.rst new file mode 100644 index 0000000000000..46616cc513571 --- /dev/null +++ b/docs/source/advanced/pruning_quantization.rst @@ -0,0 +1,94 @@ +.. testsetup:: * + + import os + from pytorch_lightning.trainer.trainer import Trainer + from pytorch_lightning.core.lightning import LightningModule + +.. _pruning_quantization: + +######################## +Pruning and Quantization +######################## + +Pruning and Quantization are tecniques to compress models for deployment, allowing memory and energy reduction without significant accuracy losses. + +******* +Pruning +******* + +.. warning :: + Pruning is in beta and subject to change. + +Pruning is a technique to optimize model memory, hardware, and energy requirements by eliminating some of the model weights. Pruning has been shown to achieve significant efficiency improvements while minimizing the drop in performance. The pruned model is smaller in size, more memory-efficient requires less energy and memory, and is faster to run with minimal accuracy drop. + +TODO: when is it recomended? + +To enable pruning during training in Lightning, simply pass in the :class:`~pytorch_lightning.callbacks.ModelPruning` callback to the Lighting Trainer (using native PyTorch pruning implementation under the hood). + +This callback suports multiple pruning functions: pass any `torch.nn.utils.prune `_ function as a string to select which weights to pruned (`random_unstructured `_, `RandomStructured `_, etc) or implement your own by subclassing `BasePruningMethod `_. + +TODO: what do you have to set? + +You can also set the pruning percentage, perform iterative pruning, apply the `_ and more! + +.. code-block:: python + + + from pytorch_lightning.callbacks import ModelPruning + + trainer = Trainer(callbacks=[ModelPruning("random_unstructured")]) + + +************ +Quantization +************ + +.. warning :: + Quantization is in beta and subject to change. + +Model quantization is another performance optimization technique allows speeding up inference and decreasing memory requirements by performing computations and storing tensors at lower bitwidths (such as INT8 or FLOAT16) than floating point precision. Quantization not only reduces the model size, but also speeds up loading since operations on fixpoint are faster than on floating-point. + +Quantization Aware Training (QAT) mimics the effects of quantization during training: all computations are carried out in floating points while training, simulating the effects of ints, and weights and activations are quantized into lower precision only once training is completed. + +TODO: when is it recomended? + +Lightning includes :class:`~pytorch_lightning.callbacks.QuantizationAwareTraining` callback (using PyTorch native quantization, read more `here `_), which allows creating fully quantized models (compatible with torchscript). 
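Once training with this callback finishes, the fully quantized model can be exported for deployment like any other ``LightningModule``. A minimal sketch of the export step, where ``qmodel`` and the file name are illustrative placeholders and ``trainer.fit`` is assumed to have run already:

.. code-block:: python

    import torch

    # assumption: `qmodel` is a LightningModule trained with QuantizationAwareTraining
    script = qmodel.to_torchscript()          # returns a torch.jit.ScriptModule
    torch.jit.save(script, "qmodel_int8.pt")  # self-contained artifact for serving

    restored = torch.jit.load("qmodel_int8.pt")
    restored.eval()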
+ +To quantize your model, specify TODO(borda). + +.. code-block:: python + + from pytorch_lightning.callbacks import QuantizationAwareTraining + + class RegressionModel(LightningModule): + + def __init__(self): + super().__init__() + self.layer_0 = nn.Linear(16, 64) + self.layer_0a = torch.nn.ReLU() + self.layer_1 = nn.Linear(64, 64) + self.layer_1a = torch.nn.ReLU() + self.layer_end = nn.Linear(64, 1) + + def forward(self, x): + x = self.layer_0(x) + x = self.layer_0a(x) + x = self.layer_1(x) + x = self.layer_1a(x) + x = self.layer_end(x) + return x + + qcb = QuantizationAwareTraining( + # specification of quant estimation quaity + observer_type='histogram', + # specify which layers shall be merged together to increase efficiency + modules_to_fuse=[(f'layer_{i}', f'layer_{i}a') for i in range(2)], + ) + + trainer = Trainer(callbacks=[qcb]) + trainer.fit(model, ...) + + You can also make your model compatible with all original input/outputs, in such case the model is wrapped in a shell with entry/exit layers. + + TODO(borda): add code example + diff --git a/docs/source/extensions/callbacks.rst b/docs/source/extensions/callbacks.rst index b4e45042aca5b..63a221a06119f 100644 --- a/docs/source/extensions/callbacks.rst +++ b/docs/source/extensions/callbacks.rst @@ -106,6 +106,7 @@ Lightning has a few built-in callbacks. ModelPruning ProgressBar ProgressBarBase + QuantizationAwareTraining StochasticWeightAveraging ---------- diff --git a/docs/source/index.rst b/docs/source/index.rst index 6b435c3d5828b..81011cbf14724 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -111,6 +111,7 @@ PyTorch Lightning Documentation common/single_gpu advanced/sequences advanced/training_tricks + advanced/pruning_quantization advanced/transfer_learning advanced/tpu advanced/cluster diff --git a/pytorch_lightning/callbacks/quantization.py b/pytorch_lightning/callbacks/quantization.py index f0458ff3b1369..ba8c7b30a3f50 100644 --- a/pytorch_lightning/callbacks/quantization.py +++ b/pytorch_lightning/callbacks/quantization.py @@ -83,15 +83,6 @@ def _recursive_hasattr(obj: Any, attribs: str, state: bool = True) -> bool: class QuantizationAwareTraining(Callback): - """ - Quantization allows speeding up inference and decreasing memory requirements by performing computations - and storing tensors at lower bitwidths (such as INT8 or FLOAT16) than floating point precision. - We use native PyTorch API so for more information see - `Quantization _` - - .. warning:: ``QuantizationAwareTraining`` is in beta and subject to change. - """ - OBSERVER_TYPES = ('histogram', 'average') def __init__( @@ -103,30 +94,42 @@ def __init__( input_compatible: bool = True, ) -> None: """ + Quantization allows speeding up inference and decreasing memory requirements by performing computations and storing tensors at lower bitwidths (such as INT8 or FLOAT16) than floating point precision. + We use native PyTorch API so for more information see `Quantization `_. + + .. warning:: ``QuantizationAwareTraining`` is in beta and subject to change. + + Args: - qconfig: define quantization configuration see: `torch.quantization.QConfig - _` - or use pre-defined: 'fbgemm' for server inference and 'qnnpack' for mobile inference + + qconfig: quantization configuration: + + - 'fbgemm' for server inference. + - 'qnnpack' for mobile inference. + - define custom quantization configuration (see `torch.quantization.QConfig `_). 
+ observer_type: allows switching between ``MovingAverageMinMaxObserver`` as "average" (default) - and ``HistogramObserver`` as "histogram" which is more computationally expensive - collect_quantization: count or custom function to collect quantization statistics + and ``HistogramObserver`` as "histogram" which is more computationally expensive. + + collect_quantization: count or custom function to collect quantization statistics: - - with default ``None`` the quantization observer is called each module forward, - typical use-case can be collecting extended statistic when user uses image/data augmentation - - custom call count to set a fixed number of calls, starting from the beginning - - custom ``Callable`` function with single trainer argument, - see example when you limit call only for last epoch:: + - ``None``(deafult). The quantization observer is called in each module forward (useful for collecting extended statistic when useing image/data augmentation). + - ``int``. Use to set a fixed number of calls, starting from the beginning. + - ``Callable``. Custom function with single trainer argument. See this example to trigger only the last epoch: - def custom_trigger_last(trainer): - return trainer.current_epoch == (trainer.max_epochs - 1) + .. code-block:: python - QuantizationAwareTraining(collect_quantization=custom_trigger_last) + def custom_trigger_last(trainer): + return trainer.current_epoch == (trainer.max_epochs - 1) + + QuantizationAwareTraining(collect_quantization=custom_trigger_last) + + modules_to_fuse: allows you fuse a few layers together as shown in `diagram `_ + to find which layer types can be fused, check https://github.com/pytorch/pytorch/pull/43286. - modules_to_fuse: allows you fuse a few layers together as shown in `diagram - _` - to find which layer types can be fused, check https://github.com/pytorch/pytorch/pull/43286 input_compatible: preserve quant/dequant layers. This allows to feat any input as to the original model, - but break compatibility to torchscript + but break compatibility to torchscript. + """ if not isinstance(qconfig, (str, QConfig)): raise MisconfigurationException(f"Unsupported qconfig: f{qconfig}.") From 5c7bdfc783232aff8d9693d1995e2f295a10ae88 Mon Sep 17 00:00:00 2001 From: edenlightning <66261195+edenlightning@users.noreply.github.com> Date: Wed, 17 Feb 2021 17:30:23 -0500 Subject: [PATCH 02/20] docs for prun + quantization --- docs/source/advanced/pruning_quantization.rst | 19 ++++++++++++++----- pytorch_lightning/callbacks/quantization.py | 2 +- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/docs/source/advanced/pruning_quantization.rst b/docs/source/advanced/pruning_quantization.rst index 46616cc513571..3620ce062c09f 100644 --- a/docs/source/advanced/pruning_quantization.rst +++ b/docs/source/advanced/pruning_quantization.rst @@ -29,7 +29,7 @@ This callback suports multiple pruning functions: pass any `torch.nn.utils.prune TODO: what do you have to set? -You can also set the pruning percentage, perform iterative pruning, apply the `_ and more! +You can also set the pruning percentage, perform iterative pruning, apply the `lottery ticket hypothesis `_ and more! .. code-block:: python @@ -82,13 +82,22 @@ To quantize your model, specify TODO(borda). 
# specification of quant estimation quaity observer_type='histogram', # specify which layers shall be merged together to increase efficiency - modules_to_fuse=[(f'layer_{i}', f'layer_{i}a') for i in range(2)], + modules_to_fuse=[(f'layer_{i}', f'layer_{i}a') for i in range(2)] + # make the model torchanble + input_compatible=False, ) trainer = Trainer(callbacks=[qcb]) - trainer.fit(model, ...) + qmodel = RegressionModel() + trainer.fit(qmodel, ...) - You can also make your model compatible with all original input/outputs, in such case the model is wrapped in a shell with entry/exit layers. + batch = iter(my_dataloader()).next() + qmodel(qmodel.quant(batch[0])) - TODO(borda): add code example + tsmodel = qmodel.to_torchscript() + tsmodel(tsmodel.quant(batch[0])) + +You can also set `input_compatible=True` to make your model compatible with all original input/outputs, in such case the model is wrapped in a shell with entry/exit layers. + +TODO(borda): add code example diff --git a/pytorch_lightning/callbacks/quantization.py b/pytorch_lightning/callbacks/quantization.py index ba8c7b30a3f50..c1577148de748 100644 --- a/pytorch_lightning/callbacks/quantization.py +++ b/pytorch_lightning/callbacks/quantization.py @@ -113,7 +113,7 @@ def __init__( collect_quantization: count or custom function to collect quantization statistics: - - ``None``(deafult). The quantization observer is called in each module forward (useful for collecting extended statistic when useing image/data augmentation). + - ``None`` (deafult). The quantization observer is called in each module forward (useful for collecting extended statistic when useing image/data augmentation). - ``int``. Use to set a fixed number of calls, starting from the beginning. - ``Callable``. Custom function with single trainer argument. See this example to trigger only the last epoch: From dd3e98771979db8f04905c98f72ff5f29d834750 Mon Sep 17 00:00:00 2001 From: edenlightning <66261195+edenlightning@users.noreply.github.com> Date: Wed, 17 Feb 2021 17:50:38 -0500 Subject: [PATCH 03/20] Apply suggestions from code review Co-authored-by: chaton Co-authored-by: Jirka Borovec --- docs/source/advanced/pruning_quantization.rst | 29 ++++++++++--------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/docs/source/advanced/pruning_quantization.rst b/docs/source/advanced/pruning_quantization.rst index 3620ce062c09f..97c43f07e56ad 100644 --- a/docs/source/advanced/pruning_quantization.rst +++ b/docs/source/advanced/pruning_quantization.rst @@ -10,18 +10,19 @@ Pruning and Quantization ######################## -Pruning and Quantization are tecniques to compress models for deployment, allowing memory and energy reduction without significant accuracy losses. +Pruning and Quantization are techniques to compress model size for deployment, allowing inference speed up and energy saving without significant accuracy losses. ******* Pruning ******* -.. warning :: +.. warning:: + Pruning is in beta and subject to change. -Pruning is a technique to optimize model memory, hardware, and energy requirements by eliminating some of the model weights. Pruning has been shown to achieve significant efficiency improvements while minimizing the drop in performance. The pruned model is smaller in size, more memory-efficient requires less energy and memory, and is faster to run with minimal accuracy drop. +Pruning is focussing on eliminating some of the model weights to reduce the model size and increase inference performance. 
Pruning has been shown to achieve significant efficiency improvements while minimizing the drop in model performance (prediction quality). -TODO: when is it recomended? +Pruning your model is recommended for cloud endpoint, on edge device, or mobile deployment. To enable pruning during training in Lightning, simply pass in the :class:`~pytorch_lightning.callbacks.ModelPruning` callback to the Lighting Trainer (using native PyTorch pruning implementation under the hood). @@ -46,11 +47,11 @@ Quantization .. warning :: Quantization is in beta and subject to change. -Model quantization is another performance optimization technique allows speeding up inference and decreasing memory requirements by performing computations and storing tensors at lower bitwidths (such as INT8 or FLOAT16) than floating point precision. Quantization not only reduces the model size, but also speeds up loading since operations on fixpoint are faster than on floating-point. +Model quantization is another performance optimization technique that allows speeding up inference and decreasing memory requirements by performing computations and storing tensors at lower bitwidths (such as INT8 or FLOAT16) than floating-point precision. Moreover smaller models also speed up model loading. Quantization Aware Training (QAT) mimics the effects of quantization during training: all computations are carried out in floating points while training, simulating the effects of ints, and weights and activations are quantized into lower precision only once training is completed. -TODO: when is it recomended? +Quantization is useful when serving large models on machines with limited memory or when there's a need to switch between models where each model has to be loaded from the drive. Lightning includes :class:`~pytorch_lightning.callbacks.QuantizationAwareTraining` callback (using PyTorch native quantization, read more `here `_), which allows creating fully quantized models (compatible with torchscript). @@ -79,12 +80,12 @@ To quantize your model, specify TODO(borda). return x qcb = QuantizationAwareTraining( - # specification of quant estimation quaity - observer_type='histogram', - # specify which layers shall be merged together to increase efficiency - modules_to_fuse=[(f'layer_{i}', f'layer_{i}a') for i in range(2)] - # make the model torchanble - input_compatible=False, + # specification of quant estimation quaity + observer_type='histogram', + # specify which layers shall be merged together to increase efficiency + modules_to_fuse=[(f'layer_{i}', f'layer_{i}a') for i in range(2)] + # make the model torchanble + input_compatible=False, ) trainer = Trainer(callbacks=[qcb]) @@ -99,5 +100,7 @@ To quantize your model, specify TODO(borda). You can also set `input_compatible=True` to make your model compatible with all original input/outputs, in such case the model is wrapped in a shell with entry/exit layers. -TODO(borda): add code example +.. 
code-block:: python + batch = iter(my_dataloader()).next() + qmodel(batch[0]) From 269278389c62258bb60764c9be8e9739cf0e2558 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 18 Feb 2021 00:11:44 +0100 Subject: [PATCH 04/20] Apply suggestions from code review Co-authored-by: chaton --- docs/source/advanced/pruning_quantization.rst | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/docs/source/advanced/pruning_quantization.rst b/docs/source/advanced/pruning_quantization.rst index 97c43f07e56ad..35c98bae07d84 100644 --- a/docs/source/advanced/pruning_quantization.rst +++ b/docs/source/advanced/pruning_quantization.rst @@ -20,13 +20,13 @@ Pruning Pruning is in beta and subject to change. -Pruning is focussing on eliminating some of the model weights to reduce the model size and increase inference performance. Pruning has been shown to achieve significant efficiency improvements while minimizing the drop in model performance (prediction quality). +Pruning is a technique which focuses on eliminating some of the model weights to reduce the model size and decrease inference requirements. Pruning has been shown to achieve significant efficiency improvements while minimizing the drop in model performance (prediction quality). Pruning your model is recommended for cloud endpoint, on edge device, or mobile deployment. -To enable pruning during training in Lightning, simply pass in the :class:`~pytorch_lightning.callbacks.ModelPruning` callback to the Lighting Trainer (using native PyTorch pruning implementation under the hood). +To enable pruning during training in Lightning, simply pass in the :class:`~pytorch_lightning.callbacks.ModelPruning` callback to the Lightning Trainer. PyTorch's native pruning implementation is used under the hood. -This callback suports multiple pruning functions: pass any `torch.nn.utils.prune `_ function as a string to select which weights to pruned (`random_unstructured `_, `RandomStructured `_, etc) or implement your own by subclassing `BasePruningMethod `_. +This callback supports multiple pruning functions: pass any `torch.nn.utils.prune `_ function as a string to select which weights to prune (`random_unstructured `_, `RandomStructured `_, etc) or implement your own by subclassing `BasePruningMethod `_. TODO: what do you have to set? @@ -34,10 +34,19 @@ You can also set the pruning percentage, perform iterative pruning, apply the `l .. 
code-block:: python + from pytorch_lightning.callbacks import ModelPruning - from pytorch_lightning.callbacks import ModelPruning + def compute_amount(epoch): + if epoch == 10: + return 0.5 - trainer = Trainer(callbacks=[ModelPruning("random_unstructured")]) + elif epoch == 50: + return 0.25 + + elif 75 < epoch < 99 : + return 0.01 + + trainer = Trainer(callbacks=[ModelPruning("l1_unstructured", amount=compute_amount)]) ************ From cc53b88e925b16aaec847762e3ea788ca14f3791 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 18 Feb 2021 00:12:05 +0100 Subject: [PATCH 05/20] Update docs/source/advanced/pruning_quantization.rst --- docs/source/advanced/pruning_quantization.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/advanced/pruning_quantization.rst b/docs/source/advanced/pruning_quantization.rst index 35c98bae07d84..1515c721077c0 100644 --- a/docs/source/advanced/pruning_quantization.rst +++ b/docs/source/advanced/pruning_quantization.rst @@ -22,7 +22,7 @@ Pruning Pruning is a technique which focuses on eliminating some of the model weights to reduce the model size and decrease inference requirements. Pruning has been shown to achieve significant efficiency improvements while minimizing the drop in model performance (prediction quality). -Pruning your model is recommended for cloud endpoint, on edge device, or mobile deployment. +Pruning your model is recommended for cloud endpoints, deploying models on edge devices, or mobile inference (among others). To enable pruning during training in Lightning, simply pass in the :class:`~pytorch_lightning.callbacks.ModelPruning` callback to the Lightning Trainer. PyTorch's native pruning implementation is used under the hood. From cdb3a724635b0bf1547296f4084a73056192a517 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 18 Feb 2021 00:13:32 +0100 Subject: [PATCH 06/20] Update docs/source/advanced/pruning_quantization.rst --- docs/source/advanced/pruning_quantization.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/advanced/pruning_quantization.rst b/docs/source/advanced/pruning_quantization.rst index 1515c721077c0..3918212667f3c 100644 --- a/docs/source/advanced/pruning_quantization.rst +++ b/docs/source/advanced/pruning_quantization.rst @@ -20,9 +20,9 @@ Pruning Pruning is in beta and subject to change. -Pruning is a technique which focuses on eliminating some of the model weights to reduce the model size and decrease inference requirements. Pruning has been shown to achieve significant efficiency improvements while minimizing the drop in model performance (prediction quality). +Pruning is a technique which focuses on eliminating some of the model weights to reduce the model size and decrease inference requirements. -Pruning your model is recommended for cloud endpoints, deploying models on edge devices, or mobile inference (among others). +Pruning has been shown to achieve significant efficiency improvements while minimizing the drop in model performance (prediction quality). Model pruning is recommended for cloud endpoints, deploying models on edge devices, or mobile inference (among others). To enable pruning during training in Lightning, simply pass in the :class:`~pytorch_lightning.callbacks.ModelPruning` callback to the Lightning Trainer. PyTorch's native pruning implementation is used under the hood. 
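As noted above, a custom pruning function can be provided by subclassing ``BasePruningMethod``. Below is a minimal sketch of a threshold-based method; the class name and the ``1e-3`` threshold are illustrative, and it is assumed that ``ModelPruning`` forwards ``amount`` to the method as its documentation describes:

.. code-block:: python

    import torch
    from torch.nn.utils import prune

    from pytorch_lightning import Trainer
    from pytorch_lightning.callbacks import ModelPruning


    class ThresholdPruning(prune.BasePruningMethod):
        """Zero out every weight whose magnitude falls below a fixed threshold."""

        PRUNING_TYPE = "unstructured"

        def __init__(self, amount):
            # `amount` doubles as the magnitude threshold in this sketch
            self.threshold = amount

        def compute_mask(self, tensor, default_mask):
            mask = default_mask.clone()
            mask[tensor.abs() < self.threshold] = 0  # drop small-magnitude weights
            return mask


    # assumption: passing the class (rather than a string) is accepted as `pruning_fn`
    trainer = Trainer(callbacks=[ModelPruning(ThresholdPruning, amount=1e-3)])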
From 3e7ad2a94de6fa87cfb98e238239e2c804be7b7d Mon Sep 17 00:00:00 2001 From: tchaton Date: Wed, 17 Feb 2021 23:20:43 +0000 Subject: [PATCH 07/20] update doc --- docs/source/advanced/pruning_quantization.rst | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/docs/source/advanced/pruning_quantization.rst b/docs/source/advanced/pruning_quantization.rst index 3918212667f3c..fb1f7d6ab036e 100644 --- a/docs/source/advanced/pruning_quantization.rst +++ b/docs/source/advanced/pruning_quantization.rst @@ -28,15 +28,19 @@ To enable pruning during training in Lightning, simply pass in the :class:`~pyto This callback supports multiple pruning functions: pass any `torch.nn.utils.prune `_ function as a string to select which weights to prune (`random_unstructured `_, `RandomStructured `_, etc) or implement your own by subclassing `BasePruningMethod `_. -TODO: what do you have to set? - You can also set the pruning percentage, perform iterative pruning, apply the `lottery ticket hypothesis `_ and more! .. code-block:: python from pytorch_lightning.callbacks import ModelPruning + # the amount can be a float between 0 and 1 + trainer = Trainer(callbacks=[ModelPruning("l1_unstructured", amount=0.5)]) + + # or + def compute_amount(epoch): + # the sum of all returned values need to be smaller than 1 if epoch == 10: return 0.5 @@ -44,8 +48,9 @@ You can also set the pruning percentage, perform iterative pruning, apply the `l return 0.25 elif 75 < epoch < 99 : - return 0.01 + return 0.01 + # the amount can be also be a callable trainer = Trainer(callbacks=[ModelPruning("l1_unstructured", amount=compute_amount)]) @@ -56,7 +61,7 @@ Quantization .. warning :: Quantization is in beta and subject to change. -Model quantization is another performance optimization technique that allows speeding up inference and decreasing memory requirements by performing computations and storing tensors at lower bitwidths (such as INT8 or FLOAT16) than floating-point precision. Moreover smaller models also speed up model loading. +Model quantization is another performance optimization technique that allows speeding up inference and decreasing memory requirements by performing computations and storing tensors at lower bitwidths (such as INT8 or FLOAT16) than floating-point precision. Moreover smaller models also speed up model loading. Quantization Aware Training (QAT) mimics the effects of quantization during training: all computations are carried out in floating points while training, simulating the effects of ints, and weights and activations are quantized into lower precision only once training is completed. 
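Whichever schedule is used, the effect of pruning can be verified afterwards by measuring how many weights were actually zeroed out. A rough sketch, assuming ``model`` was trained with the ``ModelPruning`` callback (the ``nn.Linear`` filter is just an example):

.. code-block:: python

    import torch


    def sparsity(module: torch.nn.Module) -> float:
        """Fraction of entries in `module.weight` that are exactly zero."""
        weight = module.weight.detach()
        return (weight == 0).float().mean().item()


    # assumption: `model` was fitted with Trainer(callbacks=[ModelPruning(...)])
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            print(f"{name}: {sparsity(module):.1%} of weights pruned")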
From fd13bd398e90c38365104506d28a0ed735ce4124 Mon Sep 17 00:00:00 2001 From: tchaton Date: Wed, 17 Feb 2021 23:26:47 +0000 Subject: [PATCH 08/20] replace for spaces --- docs/source/advanced/pruning_quantization.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/source/advanced/pruning_quantization.rst b/docs/source/advanced/pruning_quantization.rst index fb1f7d6ab036e..7a4b7adda73a3 100644 --- a/docs/source/advanced/pruning_quantization.rst +++ b/docs/source/advanced/pruning_quantization.rst @@ -34,23 +34,23 @@ You can also set the pruning percentage, perform iterative pruning, apply the `l from pytorch_lightning.callbacks import ModelPruning - # the amount can be a float between 0 and 1 - trainer = Trainer(callbacks=[ModelPruning("l1_unstructured", amount=0.5)]) + # the amount can be a float between 0 and 1 + trainer = Trainer(callbacks=[ModelPruning("l1_unstructured", amount=0.5)]) - # or + # or def compute_amount(epoch): - # the sum of all returned values need to be smaller than 1 + # the sum of all returned values need to be smaller than 1 if epoch == 10: return 0.5 elif epoch == 50: return 0.25 - elif 75 < epoch < 99 : + elif 75 < epoch < 99 : return 0.01 - # the amount can be also be a callable + # the amount can be also be a callable trainer = Trainer(callbacks=[ModelPruning("l1_unstructured", amount=compute_amount)]) From 49a70df57904458f677a4d8c981c78ea86a17f66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 18 Feb 2021 00:31:07 +0100 Subject: [PATCH 09/20] Update docs/source/advanced/pruning_quantization.rst --- docs/source/advanced/pruning_quantization.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/advanced/pruning_quantization.rst b/docs/source/advanced/pruning_quantization.rst index 7a4b7adda73a3..d55533b55adb8 100644 --- a/docs/source/advanced/pruning_quantization.rst +++ b/docs/source/advanced/pruning_quantization.rst @@ -61,7 +61,7 @@ Quantization .. warning :: Quantization is in beta and subject to change. -Model quantization is another performance optimization technique that allows speeding up inference and decreasing memory requirements by performing computations and storing tensors at lower bitwidths (such as INT8 or FLOAT16) than floating-point precision. Moreover smaller models also speed up model loading. +Model quantization is another performance optimization technique that allows speeding up inference and decreasing memory requirements by performing computations and storing tensors at lower bitwidths (such as INT8 or FLOAT16) than floating-point precision. Moreover, smaller models also speed up model loading. Quantization Aware Training (QAT) mimics the effects of quantization during training: all computations are carried out in floating points while training, simulating the effects of ints, and weights and activations are quantized into lower precision only once training is completed. 
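The idea of simulating the effects of ints can be made concrete with a toy round-trip: project a float tensor onto an int8 grid and back, so the values stay floating point but carry the quantization error that QAT teaches the network to tolerate. This is only an illustration of the concept, not PyTorch's actual fake-quantize implementation:

.. code-block:: python

    import torch


    def toy_fake_quantize(x: torch.Tensor, num_bits: int = 8) -> torch.Tensor:
        """Simulate quantization error by rounding onto a symmetric int grid."""
        qmax = 2 ** (num_bits - 1) - 1                 # 127 for int8
        scale = x.abs().max().clamp(min=1e-8) / qmax   # per-tensor symmetric scale
        x_int = torch.clamp(torch.round(x / scale), -qmax - 1, qmax)
        return x_int * scale                           # float again, error injected


    x = torch.randn(4)
    print(x)
    print(toy_fake_quantize(x))  # close to x, but snapped onto the int8 grid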
From 57fde148687a390e7982f49da54cc82a00163f9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 18 Feb 2021 00:32:11 +0100 Subject: [PATCH 10/20] Update docs/source/advanced/pruning_quantization.rst --- docs/source/advanced/pruning_quantization.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/advanced/pruning_quantization.rst b/docs/source/advanced/pruning_quantization.rst index d55533b55adb8..1c66cf7a57222 100644 --- a/docs/source/advanced/pruning_quantization.rst +++ b/docs/source/advanced/pruning_quantization.rst @@ -94,7 +94,7 @@ To quantize your model, specify TODO(borda). return x qcb = QuantizationAwareTraining( - # specification of quant estimation quaity + # specification of quant estimation quality observer_type='histogram', # specify which layers shall be merged together to increase efficiency modules_to_fuse=[(f'layer_{i}', f'layer_{i}a') for i in range(2)] From 0c1354373eada29d544f3b367ea55b6163e9e3d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 18 Feb 2021 00:32:24 +0100 Subject: [PATCH 11/20] Update docs/source/advanced/pruning_quantization.rst --- docs/source/advanced/pruning_quantization.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/advanced/pruning_quantization.rst b/docs/source/advanced/pruning_quantization.rst index 1c66cf7a57222..b391847cd7e9a 100644 --- a/docs/source/advanced/pruning_quantization.rst +++ b/docs/source/advanced/pruning_quantization.rst @@ -67,7 +67,7 @@ Quantization Aware Training (QAT) mimics the effects of quantization during trai Quantization is useful when serving large models on machines with limited memory or when there's a need to switch between models where each model has to be loaded from the drive. -Lightning includes :class:`~pytorch_lightning.callbacks.QuantizationAwareTraining` callback (using PyTorch native quantization, read more `here `_), which allows creating fully quantized models (compatible with torchscript). +Lightning includes :class:`~pytorch_lightning.callbacks.QuantizationAwareTraining` callback (using PyTorch's native quantization, read more `here `__), which allows creating fully quantized models (compatible with torchscript). To quantize your model, specify TODO(borda). From 1915a80d54ad93ef4a4586574d0bf2aeb808689b Mon Sep 17 00:00:00 2001 From: edenlightning <66261195+edenlightning@users.noreply.github.com> Date: Wed, 17 Feb 2021 21:24:44 -0500 Subject: [PATCH 12/20] docs for prun + quantization --- docs/source/advanced/pruning_quantization.rst | 39 +++++++++---------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/docs/source/advanced/pruning_quantization.rst b/docs/source/advanced/pruning_quantization.rst index b391847cd7e9a..ad5d600b55e5a 100644 --- a/docs/source/advanced/pruning_quantization.rst +++ b/docs/source/advanced/pruning_quantization.rst @@ -28,16 +28,16 @@ To enable pruning during training in Lightning, simply pass in the :class:`~pyto This callback supports multiple pruning functions: pass any `torch.nn.utils.prune `_ function as a string to select which weights to prune (`random_unstructured `_, `RandomStructured `_, etc) or implement your own by subclassing `BasePruningMethod `_. -You can also set the pruning percentage, perform iterative pruning, apply the `lottery ticket hypothesis `_ and more! - .. 
code-block:: python from pytorch_lightning.callbacks import ModelPruning - # the amount can be a float between 0 and 1 + # set the amount to be the fraction of parameters to prune trainer = Trainer(callbacks=[ModelPruning("l1_unstructured", amount=0.5)]) - # or +You can also perform iterative pruning, apply the `lottery ticket hypothesis `_ and more! + +.. code-block:: python def compute_amount(epoch): # the sum of all returned values need to be smaller than 1 @@ -69,8 +69,6 @@ Quantization is useful when serving large models on machines with limited memory Lightning includes :class:`~pytorch_lightning.callbacks.QuantizationAwareTraining` callback (using PyTorch's native quantization, read more `here `__), which allows creating fully quantized models (compatible with torchscript). -To quantize your model, specify TODO(borda). - .. code-block:: python from pytorch_lightning.callbacks import QuantizationAwareTraining @@ -85,7 +83,7 @@ To quantize your model, specify TODO(borda). self.layer_1a = torch.nn.ReLU() self.layer_end = nn.Linear(64, 1) - def forward(self, x): + def forward(self, x): x = self.layer_0(x) x = self.layer_0a(x) x = self.layer_1(x) @@ -93,16 +91,7 @@ To quantize your model, specify TODO(borda). x = self.layer_end(x) return x - qcb = QuantizationAwareTraining( - # specification of quant estimation quality - observer_type='histogram', - # specify which layers shall be merged together to increase efficiency - modules_to_fuse=[(f'layer_{i}', f'layer_{i}a') for i in range(2)] - # make the model torchanble - input_compatible=False, - ) - - trainer = Trainer(callbacks=[qcb]) + trainer = Trainer(callbacks=[QuantizationAwareTraining()]) qmodel = RegressionModel() trainer.fit(qmodel, ...) @@ -112,9 +101,19 @@ To quantize your model, specify TODO(borda). tsmodel = qmodel.to_torchscript() tsmodel(tsmodel.quant(batch[0])) -You can also set `input_compatible=True` to make your model compatible with all original input/outputs, in such case the model is wrapped in a shell with entry/exit layers. +You can further customize the callback: .. code-block:: python - batch = iter(my_dataloader()).next() - qmodel(batch[0]) + + qcb = QuantizationAwareTraining( + # specification of quant estimation quality + observer_type='histogram', + # specify which layers shall be merged together to increase efficiency + modules_to_fuse=[(f'layer_{i}', f'layer_{i}a') for i in range(2)] + # make your model compatible with all original input/outputs, in such case the model is wrapped in a shell with entry/exit layers. + input_compatible=True + ) + + batch = iter(my_dataloader()).next() + qmodel(batch[0]) From dae2d1ee0a22e8aaa63a623228e45323f3ab07e0 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Thu, 18 Feb 2021 11:02:43 +0100 Subject: [PATCH 13/20] Apply suggestions from code review --- docs/source/advanced/pruning_quantization.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/advanced/pruning_quantization.rst b/docs/source/advanced/pruning_quantization.rst index ad5d600b55e5a..28ded68ef5274 100644 --- a/docs/source/advanced/pruning_quantization.rst +++ b/docs/source/advanced/pruning_quantization.rst @@ -63,7 +63,7 @@ Quantization Model quantization is another performance optimization technique that allows speeding up inference and decreasing memory requirements by performing computations and storing tensors at lower bitwidths (such as INT8 or FLOAT16) than floating-point precision. Moreover, smaller models also speed up model loading. 
-Quantization Aware Training (QAT) mimics the effects of quantization during training: all computations are carried out in floating points while training, simulating the effects of ints, and weights and activations are quantized into lower precision only once training is completed. +Quantization Aware Training (QAT) mimics the effects of quantization during training: all computations are carried out in floating points while training, fake_quant simulats the effects of ints, and weights and activations are quantized into lower precision only once training is completed. Quantization is useful when serving large models on machines with limited memory or when there's a need to switch between models where each model has to be loaded from the drive. From 851aeff1049562b6391831f3c7d84ab2ebb8be7a Mon Sep 17 00:00:00 2001 From: tchaton Date: Thu, 18 Feb 2021 10:38:57 +0000 Subject: [PATCH 14/20] update doc --- docs/source/advanced/training_tricks.rst | 16 ++++++++++++++++ pytorch_lightning/callbacks/quantization.py | 19 +++++++++++++------ pytorch_lightning/callbacks/swa.py | 6 ++++++ 3 files changed, 35 insertions(+), 6 deletions(-) diff --git a/docs/source/advanced/training_tricks.rst b/docs/source/advanced/training_tricks.rst index d7230a1fd687a..d3649dbf90ca2 100644 --- a/docs/source/advanced/training_tricks.rst +++ b/docs/source/advanced/training_tricks.rst @@ -41,6 +41,22 @@ norm `_ ---------- +Stochastic Weight Averaging +--------------------------- +Stochastic Weight Averaging (SWA) is a training method where +the weights of the model are averaged on the N last epochs of the training. + +See this link for an in-depth explanation: + +.. seealso:: :class:`~pytorch_lightning.callbacks.StochasticWeightAveraging` + +.. testcode:: + + # activate SWA from Trainer directly. + trainer = Trainer(stochastic_weight_avg=True) + +---------- + Auto scaling of batch size -------------------------- Auto scaling of batch size may be enabled to find the largest batch size that fits into diff --git a/pytorch_lightning/callbacks/quantization.py b/pytorch_lightning/callbacks/quantization.py index c1577148de748..5e910ffcacc1e 100644 --- a/pytorch_lightning/callbacks/quantization.py +++ b/pytorch_lightning/callbacks/quantization.py @@ -94,8 +94,11 @@ def __init__( input_compatible: bool = True, ) -> None: """ - Quantization allows speeding up inference and decreasing memory requirements by performing computations and storing tensors at lower bitwidths (such as INT8 or FLOAT16) than floating point precision. - We use native PyTorch API so for more information see `Quantization `_. + Quantization allows speeding up inference and decreasing memory requirements + by performing computations and storing tensors at lower bitwidths + (such as INT8 or FLOAT16) than floating point precision. + We use native PyTorch API so for more information + see `Quantization `_. .. warning:: ``QuantizationAwareTraining`` is in beta and subject to change. @@ -106,16 +109,19 @@ def __init__( - 'fbgemm' for server inference. - 'qnnpack' for mobile inference. - - define custom quantization configuration (see `torch.quantization.QConfig `_). + - define custom quantization configuration `torch.quantization.QConfig + `_). observer_type: allows switching between ``MovingAverageMinMaxObserver`` as "average" (default) and ``HistogramObserver`` as "histogram" which is more computationally expensive. collect_quantization: count or custom function to collect quantization statistics: - - ``None`` (deafult). 
The quantization observer is called in each module forward (useful for collecting extended statistic when useing image/data augmentation). + - ``None`` (deafult). The quantization observer is called in each module forward + (useful for collecting extended statistic when useing image/data augmentation). - ``int``. Use to set a fixed number of calls, starting from the beginning. - - ``Callable``. Custom function with single trainer argument. See this example to trigger only the last epoch: + - ``Callable``. Custom function with single trainer argument. + See this example to trigger only the last epoch: .. code-block:: python @@ -124,7 +130,8 @@ def custom_trigger_last(trainer): QuantizationAwareTraining(collect_quantization=custom_trigger_last) - modules_to_fuse: allows you fuse a few layers together as shown in `diagram `_ + modules_to_fuse: allows you fuse a few layers together as shown in + `diagram `_ to find which layer types can be fused, check https://github.com/pytorch/pytorch/pull/43286. input_compatible: preserve quant/dequant layers. This allows to feat any input as to the original model, diff --git a/pytorch_lightning/callbacks/swa.py b/pytorch_lightning/callbacks/swa.py index fc7a2c75c0d51..b35677d47c5f8 100644 --- a/pytorch_lightning/callbacks/swa.py +++ b/pytorch_lightning/callbacks/swa.py @@ -63,6 +63,12 @@ def __init__( .. warning:: ``StochasticWeightAveraging`` is currently not supported for multiple optimizers/schedulers. + SWA can easily be activate directly from the Trainer as follow: + + .. code-block:: python + + Trainer(stochastic_weight_avg=True) + Arguments: swa_epoch_start: If provided as int, the procedure will start from From 524fae19f3ffeaad3f240f038eaa725984de3b2b Mon Sep 17 00:00:00 2001 From: chaton Date: Thu, 18 Feb 2021 11:44:11 +0000 Subject: [PATCH 15/20] Update docs/source/advanced/pruning_quantization.rst Co-authored-by: Sean Naren --- docs/source/advanced/pruning_quantization.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/advanced/pruning_quantization.rst b/docs/source/advanced/pruning_quantization.rst index 28ded68ef5274..bba28883384d7 100644 --- a/docs/source/advanced/pruning_quantization.rst +++ b/docs/source/advanced/pruning_quantization.rst @@ -65,7 +65,7 @@ Model quantization is another performance optimization technique that allows spe Quantization Aware Training (QAT) mimics the effects of quantization during training: all computations are carried out in floating points while training, fake_quant simulats the effects of ints, and weights and activations are quantized into lower precision only once training is completed. -Quantization is useful when serving large models on machines with limited memory or when there's a need to switch between models where each model has to be loaded from the drive. +Quantization is useful when it is required to serve large models on machines with limited memory, or when there's a need to switch between models and reducing the IO time is important i.e monolingual speech recognition models across multiple languages. Lightning includes :class:`~pytorch_lightning.callbacks.QuantizationAwareTraining` callback (using PyTorch's native quantization, read more `here `__), which allows creating fully quantized models (compatible with torchscript). 
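For the ``qconfig`` argument described above, a custom configuration is assembled from observer and fake-quantize factories. The sketch below roughly mirrors PyTorch's QAT defaults; the exact ranges and observers chosen here are assumptions and should be adjusted to the target backend:

.. code-block:: python

    import torch
    from torch.quantization import FakeQuantize, MovingAverageMinMaxObserver, QConfig

    from pytorch_lightning import Trainer
    from pytorch_lightning.callbacks import QuantizationAwareTraining

    custom_qconfig = QConfig(
        # fake-quantize activations to unsigned 8-bit
        activation=FakeQuantize.with_args(
            observer=MovingAverageMinMaxObserver, quant_min=0, quant_max=255, dtype=torch.quint8
        ),
        # fake-quantize weights to signed 8-bit, symmetric around zero
        weight=FakeQuantize.with_args(
            observer=MovingAverageMinMaxObserver,
            quant_min=-128,
            quant_max=127,
            dtype=torch.qint8,
            qscheme=torch.per_tensor_symmetric,
        ),
    )

    trainer = Trainer(callbacks=[QuantizationAwareTraining(qconfig=custom_qconfig)])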
From 9433e17700a57d1dc44e2e7b01b786b963a42aa6 Mon Sep 17 00:00:00 2001 From: Sean Naren Date: Thu, 18 Feb 2021 12:19:48 +0000 Subject: [PATCH 16/20] Update docs/source/advanced/pruning_quantization.rst --- docs/source/advanced/pruning_quantization.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/advanced/pruning_quantization.rst b/docs/source/advanced/pruning_quantization.rst index bba28883384d7..04b5424e7c566 100644 --- a/docs/source/advanced/pruning_quantization.rst +++ b/docs/source/advanced/pruning_quantization.rst @@ -65,7 +65,7 @@ Model quantization is another performance optimization technique that allows spe Quantization Aware Training (QAT) mimics the effects of quantization during training: all computations are carried out in floating points while training, fake_quant simulats the effects of ints, and weights and activations are quantized into lower precision only once training is completed. -Quantization is useful when it is required to serve large models on machines with limited memory, or when there's a need to switch between models and reducing the IO time is important i.e monolingual speech recognition models across multiple languages. +Quantization is useful when it is required to serve large models on machines with limited memory, or when there's a need to switch between models and reducing the IO time is important i.e switching between monolingual speech recognition models across multiple languages. Lightning includes :class:`~pytorch_lightning.callbacks.QuantizationAwareTraining` callback (using PyTorch's native quantization, read more `here `__), which allows creating fully quantized models (compatible with torchscript). From 8a8068b5db67340905329b9e547693368ca9c09d Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 18 Feb 2021 14:08:04 +0100 Subject: [PATCH 17/20] Quantization improvements --- docs/source/advanced/pruning_quantization.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/advanced/pruning_quantization.rst b/docs/source/advanced/pruning_quantization.rst index 04b5424e7c566..cd3ae2065db76 100644 --- a/docs/source/advanced/pruning_quantization.rst +++ b/docs/source/advanced/pruning_quantization.rst @@ -35,7 +35,7 @@ This callback supports multiple pruning functions: pass any `torch.nn.utils.prun # set the amount to be the fraction of parameters to prune trainer = Trainer(callbacks=[ModelPruning("l1_unstructured", amount=0.5)]) -You can also perform iterative pruning, apply the `lottery ticket hypothesis `_ and more! +You can also perform iterative pruning, apply the `lottery ticket hypothesis `__, and more! .. code-block:: python @@ -61,11 +61,11 @@ Quantization .. warning :: Quantization is in beta and subject to change. -Model quantization is another performance optimization technique that allows speeding up inference and decreasing memory requirements by performing computations and storing tensors at lower bitwidths (such as INT8 or FLOAT16) than floating-point precision. Moreover, smaller models also speed up model loading. +Model quantization is another performance optimization technique that allows speeding up inference and decreasing memory requirements by performing computations and storing tensors at lower bitwidths (such as INT8 or FLOAT16) than floating-point precision. This is particularly beneficial during model deployment. 
-Quantization Aware Training (QAT) mimics the effects of quantization during training: all computations are carried out in floating points while training, fake_quant simulats the effects of ints, and weights and activations are quantized into lower precision only once training is completed. +Quantization Aware Training (QAT) mimics the effects of quantization during training: The computations are carried-out in floating-point precision but the subsequent quantization effect is taken into account. The weights and activations are quantized into lower precision only for inference, when training is completed. -Quantization is useful when it is required to serve large models on machines with limited memory, or when there's a need to switch between models and reducing the IO time is important i.e switching between monolingual speech recognition models across multiple languages. +Quantization is useful when it is required to serve large models on machines with limited memory, or when there's a need to switch between models and reducing the I/O time is important. For example, switching between monolingual speech recognition models across multiple languages. Lightning includes :class:`~pytorch_lightning.callbacks.QuantizationAwareTraining` callback (using PyTorch's native quantization, read more `here `__), which allows creating fully quantized models (compatible with torchscript). From d795ea7b7fee3d61bf7adb2024d6ed27b6196fa6 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 18 Feb 2021 14:14:21 +0100 Subject: [PATCH 18/20] Improve SWA section --- docs/source/advanced/training_tricks.rst | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/source/advanced/training_tricks.rst b/docs/source/advanced/training_tricks.rst index d3649dbf90ca2..fc4dd69aa0af1 100644 --- a/docs/source/advanced/training_tricks.rst +++ b/docs/source/advanced/training_tricks.rst @@ -43,16 +43,18 @@ norm `_ Stochastic Weight Averaging --------------------------- -Stochastic Weight Averaging (SWA) is a training method where -the weights of the model are averaged on the N last epochs of the training. +Stochastic Weight Averaging (SWA) can make your models generalize better at virtually no additional cost. +This can be used with both non-trained and trained models. The SWA procedure smooths the loss landscape thus making +it harder to end up in a local minimum during optimization. -See this link for an in-depth explanation: +For a more detailed explanation of SWA and how it works, +read `this __` post by the PyTorch team. .. seealso:: :class:`~pytorch_lightning.callbacks.StochasticWeightAveraging` .. testcode:: - # activate SWA from Trainer directly. + # Enable Stochastic Weight Averaging trainer = Trainer(stochastic_weight_avg=True) ---------- From 6c9e855b3a8f131f710583efa35400f04861ebb9 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 18 Feb 2021 14:20:52 +0100 Subject: [PATCH 19/20] Fix docs --- docs/source/advanced/training_tricks.rst | 4 ++-- pytorch_lightning/callbacks/quantization.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/source/advanced/training_tricks.rst b/docs/source/advanced/training_tricks.rst index fc4dd69aa0af1..4f7452c2da1de 100644 --- a/docs/source/advanced/training_tricks.rst +++ b/docs/source/advanced/training_tricks.rst @@ -48,9 +48,9 @@ This can be used with both non-trained and trained models. The SWA procedure smo it harder to end up in a local minimum during optimization. 
For a more detailed explanation of SWA and how it works, -read `this __` post by the PyTorch team. +read `this `__ post by the PyTorch team. -.. seealso:: :class:`~pytorch_lightning.callbacks.StochasticWeightAveraging` +.. seealso:: :class:`~pytorch_lightning.callbacks.StochasticWeightAveraging` (Callback) .. testcode:: diff --git a/pytorch_lightning/callbacks/quantization.py b/pytorch_lightning/callbacks/quantization.py index 5a980c0fc9731..ae75438e6fff7 100644 --- a/pytorch_lightning/callbacks/quantization.py +++ b/pytorch_lightning/callbacks/quantization.py @@ -109,8 +109,7 @@ def __init__( - 'fbgemm' for server inference. - 'qnnpack' for mobile inference. - - define custom quantization configuration `torch.quantization.QConfig - `_). + - a custom `torch.quantization.QConfig `_. # noqa: E501 observer_type: allows switching between ``MovingAverageMinMaxObserver`` as "average" (default) and ``HistogramObserver`` as "histogram" which is more computationally expensive. From f4c1e4746f64f2858a51ea2b35813a03a84bc122 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 18 Feb 2021 14:26:28 +0100 Subject: [PATCH 20/20] Fix docs --- pytorch_lightning/callbacks/quantization.py | 4 ++-- pytorch_lightning/callbacks/swa.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/callbacks/quantization.py b/pytorch_lightning/callbacks/quantization.py index ae75438e6fff7..2b6064e232da7 100644 --- a/pytorch_lightning/callbacks/quantization.py +++ b/pytorch_lightning/callbacks/quantization.py @@ -109,7 +109,7 @@ def __init__( - 'fbgemm' for server inference. - 'qnnpack' for mobile inference. - - a custom `torch.quantization.QConfig `_. # noqa: E501 + - a custom `torch.quantization.QConfig `_. observer_type: allows switching between ``MovingAverageMinMaxObserver`` as "average" (default) and ``HistogramObserver`` as "histogram" which is more computationally expensive. @@ -136,7 +136,7 @@ def custom_trigger_last(trainer): input_compatible: preserve quant/dequant layers. This allows to feat any input as to the original model, but break compatibility to torchscript. - """ + """ # noqa: E501 _valid_qconf_str = isinstance(qconfig, str) and qconfig in torch.backends.quantized.supported_engines if not isinstance(qconfig, QConfig) and not _valid_qconf_str: raise MisconfigurationException( diff --git a/pytorch_lightning/callbacks/swa.py b/pytorch_lightning/callbacks/swa.py index b35677d47c5f8..c8cf367cb4d5e 100644 --- a/pytorch_lightning/callbacks/swa.py +++ b/pytorch_lightning/callbacks/swa.py @@ -63,7 +63,7 @@ def __init__( .. warning:: ``StochasticWeightAveraging`` is currently not supported for multiple optimizers/schedulers. - SWA can easily be activate directly from the Trainer as follow: + SWA can easily be activated directly from the Trainer as follow: .. code-block:: python
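Beyond the ``stochastic_weight_avg=True`` flag shown above, the callback can also be configured explicitly when the averaging schedule needs tuning. A sketch using what are believed to be the callback's argument names; the specific values are illustrative, not recommendations:

.. code-block:: python

    from pytorch_lightning import Trainer
    from pytorch_lightning.callbacks import StochasticWeightAveraging

    # start averaging after 75% of max_epochs and anneal the learning rate towards 0.05
    swa = StochasticWeightAveraging(swa_epoch_start=0.75, swa_lrs=0.05, annealing_epochs=5)
    trainer = Trainer(max_epochs=100, callbacks=[swa])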