From 745d54d1b84e844a0e64b4d572f9f66f7b610d16 Mon Sep 17 00:00:00 2001
From: Sean Naren
Date: Wed, 10 Mar 2021 09:40:24 +0000
Subject: [PATCH] Set find unused parameters to True by default to fix
 breaking compatibility (#6438)

* Set find unused parameters to True by default to fix breaking models, add suggestion to re-enable

* Add changelog

(cherry picked from commit c81b2a81891cc39e454b841b1ed221b734e39cee)
---
 CHANGELOG.md                                   |  3 +++
 docs/source/benchmarking/performance.rst       | 15 +++++++++++++++
 pytorch_lightning/plugins/training_type/ddp.py |  7 +++++++
 .../plugins/training_type/ddp_spawn.py         |  7 +++++++
 4 files changed, 32 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2f9d90faaef4d..26b444763eca7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -44,6 +44,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Changed `setup()` and `teardown()` stage argument to take any of `{fit,validate,test,predict}` ([#6386](https://github.com/PyTorchLightning/pytorch-lightning/pull/6386))
 
 
+- Changed the default of `find_unused_parameters` back to `True` in DDP and DDP Spawn ([#6438](https://github.com/PyTorchLightning/pytorch-lightning/pull/6438))
+
+
 ### Deprecated
 
 
diff --git a/docs/source/benchmarking/performance.rst b/docs/source/benchmarking/performance.rst
index 5f89c759e49bc..d1bc2c9ebc009 100644
--- a/docs/source/benchmarking/performance.rst
+++ b/docs/source/benchmarking/performance.rst
@@ -94,6 +94,21 @@ DP performs three GPU transfers for EVERY batch:
 
 Whereas DDP only performs 1 transfer to sync gradients. Because of this, DDP is MUCH faster than DP.
 
+When using DDP, set find_unused_parameters=False
+------------------------------------------------
+
+By default, we have set ``find_unused_parameters`` to ``True``. This maintains compatibility with models that have run into issues in the past (see the `discussion `_ for more information).
+This default comes with a performance hit, and in most cases it can safely be disabled.
+
+.. code-block:: python
+
+    from pytorch_lightning.plugins import DDPPlugin
+
+    trainer = pl.Trainer(
+        gpus=2,
+        plugins=DDPPlugin(find_unused_parameters=False),
+    )
+
 ----------
 
 16-bit precision
diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py
index 6a4e948e899cf..e0a52fc7609d6 100644
--- a/pytorch_lightning/plugins/training_type/ddp.py
+++ b/pytorch_lightning/plugins/training_type/ddp.py
@@ -182,6 +182,13 @@ def set_world_ranks(self):
         self.world_size = self.num_nodes * self.num_processes
 
     def pre_configure_ddp(self):
+        # If unset, default `find_unused_parameters` to `True`.
+        # Many models require setting this parameter to True, as there are corner cases
+        # when not all parameter backward hooks are fired by the autograd engine even if requires_grad is set to True.
+        # This flag does come with a performance hit, so it is suggested to disable it where possible.
+        self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get(
+            "find_unused_parameters", True
+        )
         # todo: PyTorch 1.7.0 DDP introduces ``self.reducer._rebuild_buckets()`` breaking manual_optimization
         if _TORCH_GREATER_EQUAL_1_7 and not self.lightning_module.automatic_optimization and not self._ddp_kwargs.get(
             "find_unused_parameters", False
diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py
index 66d3cb7bf4619..cde2f3dea711c 100644
--- a/pytorch_lightning/plugins/training_type/ddp_spawn.py
+++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py
@@ -171,6 +171,13 @@ def post_dispatch(self):
         self.__recover_child_process_weights(best_path, last_path)
 
     def pre_configure_ddp(self):
+        # If unset, default `find_unused_parameters` to `True`.
+        # Many models require setting this parameter to True, as there are corner cases
+        # when not all parameter backward hooks are fired by the autograd engine even if requires_grad is set to True.
+        # This flag does come with a performance hit, so it is suggested to disable it where possible.
+        self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get(
+            "find_unused_parameters", True
+        )
         # todo: PyTorch 1.7.0 DDP introduces ``self.reducer._rebuild_buckets()`` breaking manual_optimization
         if _TORCH_GREATER_EQUAL_1_7 and not self.lightning_module.automatic_optimization and not self._ddp_kwargs.get(
             "find_unused_parameters", False
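Below is a minimal, self-contained sketch (not part of the patch) of the defaulting logic that both `pre_configure_ddp` hunks add: `dict.get` only fills in `True` when the caller has not supplied `find_unused_parameters`, so an explicit `DDPPlugin(find_unused_parameters=False)` is respected. The `apply_default` helper and the plain dicts below are illustrative stand-ins for the plugin's `self._ddp_kwargs`, not code from the patch.

.. code-block:: python

    def apply_default(ddp_kwargs: dict) -> dict:
        # Default `find_unused_parameters` to True only when the user has not set it.
        ddp_kwargs["find_unused_parameters"] = ddp_kwargs.get("find_unused_parameters", True)
        return ddp_kwargs

    # No user value: the compatibility default kicks in.
    assert apply_default({}) == {"find_unused_parameters": True}

    # The user opted out for performance (e.g. via DDPPlugin(find_unused_parameters=False)):
    # the explicit value is preserved.
    assert apply_default({"find_unused_parameters": False}) == {"find_unused_parameters": False}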