diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index d88a31ae9775a..c48faadc4d976 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -82,7 +82,7 @@ jobs:
     displayName: 'Testing: standard'

   - bash: |
-      sh tests/special_tests.sh
+      bash tests/special_tests.sh
     displayName: 'Testing: special'

   - bash: |
diff --git a/benchmarks/test_sharded_parity.py b/benchmarks/test_sharded_parity.py
index 231556079e1ed..28cbd7828b108 100644
--- a/benchmarks/test_sharded_parity.py
+++ b/benchmarks/test_sharded_parity.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import os
 import time
 from typing import Type

@@ -21,113 +20,13 @@
 from pytorch_lightning import seed_everything, Trainer
 from pytorch_lightning.plugins import DDPSpawnShardedPlugin
-from tests.accelerators import DDPLauncher
 from tests.helpers.boring_model import BoringModel, RandomDataset
 from tests.helpers.runif import RunIf


-@RunIf(min_gpus=1, skip_windows=True, fairscale=True)
-def test_ddp_sharded_plugin_correctness_one_gpu():
-    plugin_parity_test(
-        gpus=1,
-        model_cls=SeedTrainLoaderModel,
-    )
-
-
-@RunIf(min_gpus=1, skip_windows=True, fairscale=True, amp_native=True)
-def test_ddp_sharded_plugin_correctness_amp_one_gpu():
-    plugin_parity_test(
-        gpus=1,
-        precision=16,
-        model_cls=SeedTrainLoaderModel,
-    )
-
-
-@pytest.mark.skip(reason="Not a critical test, skip till drone CI performance improves.")
-@RunIf(min_gpus=2, skip_windows=True, fairscale=True)
-def test_ddp_sharded_plugin_correctness_multi_gpu():
-    plugin_parity_test(
-        gpus=2,
-        model_cls=SeedTrainLoaderModel,
-        max_percent_speed_diff=0.25,  # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers
-    )
-
-
-@RunIf(min_gpus=2, skip_windows=True, fairscale=True, amp_native=True)
-def test_ddp_sharded_plugin_correctness_amp_multi_gpu():
-    plugin_parity_test(
-        gpus=2,
-        precision=16,
-        model_cls=SeedTrainLoaderModel,
-        max_percent_speed_diff=0.25,  # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers
-    )
-
-
-@RunIf(min_gpus=2, skip_windows=True, fairscale=True, amp_native=True)
-def test_ddp_string_sharded_plugin_correctness_amp_multi_gpu():
-    plugin_parity_test(
-        gpus=2,
-        precision=16,
-        model_cls=SeedTrainLoaderModel,
-        max_percent_speed_diff=0.25,  # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers
-    )
-
-
-@RunIf(min_gpus=2, fairscale=True)
-@pytest.mark.skipif(
-    not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest"
-)
-@DDPLauncher.run("--accelerator ddp --gpus 2 --precision 32")
-def test_ddp_sharded_plugin_correctness_multi_gpu_ddp(tmpdir, args=None):
-    plugin_parity_test(
-        gpus=args.gpus,
-        precision=args.precision,
-        model_cls=SeedTrainLoaderModel,
-    )
-
-
-@RunIf(min_gpus=2, fairscale=True)
-@pytest.mark.skipif(
-    not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest"
-)
-@DDPLauncher.run("--accelerator ddp --gpus 2 --precision 16")
-def test_ddp_sharded_plugin_correctness_amp_multi_gpu_ddp(tmpdir, args=None):
-    plugin_parity_test(
-        gpus=args.gpus,
-        precision=args.precision,
-        model_cls=SeedTrainLoaderModel,
-    )
-
-
-@pytest.mark.skip(reason="Current issue with multiple optimizers and FairScale.")
-@RunIf(min_gpus=2, skip_windows=True, fairscale=True)
-def test_ddp_sharded_plugin_correctness_multi_gpu_multi_optim():
-    """
-    Ensures same results using multiple optimizers across multiple GPUs
-    """
-    plugin_parity_test(
-        gpus=2,
-        model_cls=SeedTrainLoaderMultipleOptimizersModel,
-        max_percent_speed_diff=0.25,  # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers
-    )
-
-
-@pytest.mark.skip(reason="Current issue with multiple optimizers and FairScale.")
-@RunIf(min_gpus=2, skip_windows=True, fairscale=True)
-def test_ddp_sharded_plugin_correctness_multi_gpu_multi_optim_manual(tmpdir):
-    """
-    Ensures using multiple optimizers across multiple GPUs with manual optimization
-    """
-    plugin_parity_test(
-        gpus=2,
-        model_cls=SeedTrainLoaderManualModel,
-        max_percent_speed_diff=0.25,  # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers
-    )
-
-
 class SeedTrainLoaderModel(BoringModel):
     """
-    Overrides training loader to ensure we enforce the same seed for all DDP processes.
+    Overrides training loader to ensure we enforce the same seed for all DDP processes.
     """

     def train_dataloader(self):
@@ -177,7 +76,7 @@ class SeedTrainLoaderMultipleOptimizersModel(SeedTrainLoaderModel):
     def training_step(self, batch, batch_idx, optimizer_idx):
         output = self.layer(batch)
         loss = self.loss(batch, output)
-        return {"loss": loss}
+        return {'loss': loss}

     def training_epoch_end(self, outputs) -> None:
         # outputs should be an array with an entry per optimizer
@@ -279,11 +178,48 @@ def plugin_parity_test(

     # Assert speed parity by ensuring percentage difference between custom/ddp is below threshold
     percent_diff = (custom_model_time - ddp_time) / custom_model_time
-    assert percent_diff <= max_percent_speed_diff, \
-        f'Custom DDP plugin was too slow compared to DDP, Custom Plugin Time: {custom_model_time}, DDP Time: {ddp_time}'
+    assert (
+        percent_diff <= max_percent_speed_diff
+    ), f'Custom DDP plugin was too slow compared to DDP, Custom Plugin Time: {custom_model_time}, DDP Time: {ddp_time}'

     if use_cuda:
         # Assert CUDA memory parity
-        assert max_memory_custom <= max_memory_ddp, \
-            f'Custom plugin used too much memory compared to DDP,' \
+        assert max_memory_custom <= max_memory_ddp, (
+            'Custom plugin used too much memory compared to DDP, '
             f'Custom Mem: {max_memory_custom}, DDP Mem: {max_memory_ddp}'
+        )
+
+
+@RunIf(skip_windows=True, fairscale=True)
+@pytest.mark.parametrize(
+    'kwargs',
+    [
+        pytest.param(dict(gpus=1, model_cls=SeedTrainLoaderModel), marks=RunIf(min_gpus=1)),
+        pytest.param(
+            dict(gpus=1, precision=16, model_cls=SeedTrainLoaderModel), marks=RunIf(min_gpus=1, amp_native=True)
+        ),
+        pytest.param(dict(gpus=2, model_cls=SeedTrainLoaderModel), marks=RunIf(min_gpus=2)),
+        pytest.param(
+            dict(gpus=2, precision=16, model_cls=SeedTrainLoaderModel), marks=RunIf(min_gpus=2, amp_native=True)
+        ),
+        pytest.param(
+            dict(gpus=2, model_cls=SeedTrainLoaderMultipleOptimizersModel),
+            marks=[
+                RunIf(min_gpus=2),
+                pytest.mark.skip(reason='TODO: Current issue with multiple optimizers and FairScale.'),
+            ],
+        ),
+        pytest.param(
+            dict(gpus=2, model_cls=SeedTrainLoaderManualModel),
+            marks=[
+                RunIf(min_gpus=2),
+                pytest.mark.skip(reason='TODO: Current issue with multiple optimizers and FairScale.'),
+            ],
+        ),
+    ],
+)
+def test_ddp_spawn_sharded_plugin(kwargs):
+    if kwargs['gpus'] > 1:
+        # TODO: decrease speed diff since only 2 GPUs sharding 2 optimizers
+        kwargs['max_percent_speed_diff'] = 0.25
+    plugin_parity_test(**kwargs)
diff --git a/tests/accelerators/__init__.py b/tests/accelerators/__init__.py
index 9583ec9537437..e69de29bb2d1d 100644
--- a/tests/accelerators/__init__.py
+++ b/tests/accelerators/__init__.py
@@ -1,12 +0,0 @@
-try:
-    from dtrun.launcher import DDPLauncher
-except ImportError:
-
-    class DDPLauncher:
-
-        def run(cmd_line, **kwargs):
-
-            def inner(func):
-                pass
-
-            return inner
diff --git a/tests/accelerators/test_ddp.py b/tests/accelerators/test_ddp.py
index 541110ac8846b..06aed2c1020ff 100644
--- a/tests/accelerators/test_ddp.py
+++ b/tests/accelerators/test_ddp.py
@@ -20,7 +20,7 @@
 import torch

 from pytorch_lightning import Trainer
-from tests.accelerators import ddp_model, DDPLauncher
+from tests.accelerators import ddp_model
 from tests.helpers.boring_model import BoringModel
 from tests.helpers.runif import RunIf
 from tests.utilities.distributed import call_training_script
@@ -71,19 +71,6 @@ def test_multi_gpu_model_ddp_fit_test(tmpdir):
     assert out['test_acc'] > 0.7


-@RunIf(min_gpus=2)
-@DDPLauncher.run(
-    "--max_epochs [max_epochs] --gpus 2 --accelerator [accelerator]",
-    max_epochs=["1"],
-    accelerator=["ddp", "ddp_spawn"]
-)
-def test_cli_to_pass(tmpdir, args=None):
-    """
-    This test verify we can call function using test_cli name
-    """
-    return '1'
-
-
 @RunIf(skip_windows=True)
 @pytest.mark.skipif(torch.cuda.is_available(), reason="test doesn't requires GPU machine")
 def test_torch_distributed_backend_env_variables(tmpdir):
diff --git a/tests/accelerators/test_multi_nodes_gpu.py b/tests/accelerators/test_multi_nodes_gpu.py
index 3a5ba5f0d1d71..c086150a60528 100644
--- a/tests/accelerators/test_multi_nodes_gpu.py
+++ b/tests/accelerators/test_multi_nodes_gpu.py
@@ -15,6 +15,7 @@
 import sys
 from unittest import mock

+import pytest
 import torch

 from tests.helpers.runif import RunIf
@@ -28,6 +29,9 @@
 from tests.helpers.boring_model import BoringModel  # noqa: E402


+# TODO(Borda): When multi-node tests are re-enabled (.github/workflows/ci_test-mnodes.yml)
+# use an environment variable `PL_RUNNING_MULTINODE_TESTS` and set `RunIf(multinode=True)`
+@pytest.mark.skip("Multi-node testing is currently disabled")
 @RunIf(special=True)
 def test_logging_sync_dist_true_ddp(tmpdir):
     """
@@ -65,6 +69,9 @@ def validation_step(self, batch, batch_idx):
     assert trainer.logged_metrics['bar'] == fake_result


+# TODO(Borda): When multi-node tests are re-enabled (.github/workflows/ci_test-mnodes.yml)
+# use an environment variable `PL_RUNNING_MULTINODE_TESTS` and set `RunIf(multinode=True)`
+@pytest.mark.skip("Multi-node testing is currently disabled")
 @RunIf(special=True)
 @mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"})
 def test__validation_step__log(tmpdir):
diff --git a/tests/special_tests.sh b/tests/special_tests.sh
old mode 100644
new mode 100755
index c381b5e9feeb6..aa5d65844a1c5
--- a/tests/special_tests.sh
+++ b/tests/special_tests.sh
@@ -1,3 +1,4 @@
+#!/bin/bash
 # Copyright The PyTorch Lightning team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -11,32 +12,58 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Running special tests
 set -e
+
+# this environment variable allows special tests to run
 export PL_RUNNING_SPECIAL_TESTS=1
-DEFAULTS="-m coverage run --source pytorch_lightning --append -m pytest --verbose --capture=no"
-python ${DEFAULTS} tests/trainer/optimization/test_manual_optimization.py::test_step_with_optimizer_closure_with_different_frequencies_ddp
-python ${DEFAULTS} tests/models/test_sync_batchnorm.py::test_sync_batchnorm_ddp
-python ${DEFAULTS} tests/plugins/test_deepspeed_plugin.py::test_invalid_deepspeed_defaults_no_precision
-python ${DEFAULTS} tests/plugins/test_deepspeed_plugin.py::test_warn_deepspeed_override_backward
-python ${DEFAULTS} tests/plugins/test_deepspeed_plugin.py::test_deepspeed_run_configure_optimizers
-python ${DEFAULTS} tests/plugins/test_deepspeed_plugin.py::test_deepspeed_config
-python ${DEFAULTS} tests/plugins/test_deepspeed_plugin.py::test_deepspeed_custom_precision_params
-python ${DEFAULTS} tests/plugins/test_deepspeed_plugin.py::test_deepspeed_assert_config_zero_offload_disabled
-python ${DEFAULTS} tests/plugins/test_deepspeed_plugin.py::test_deepspeed_multigpu
-python ${DEFAULTS} tests/plugins/test_rpc_plugin.py::test_rpc_function_calls_ddp
-python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_manual
-python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_manual_amp
-python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_automatic
-python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_with_wrong_balance
-python ${DEFAULTS} tests/utilities/test_all_gather_grad.py::test_all_gather_collection
-python ${DEFAULTS} tests/trainer/test_trainer.py::test_trainer_predict_ddp
-python ${DEFAULTS} tests/trainer/test_trainer.py::test_trainer_predict_dp
-python ${DEFAULTS} tests/trainer/logging_/test_train_loop_logging_1_0.py::test_logging_sync_dist_true_ddp
-python ${DEFAULTS} tests/callbacks/test_pruning.py::test_pruning_callback_ddp
-python ${DEFAULTS} tests/test_profiler.py::test_pytorch_profiler_trainer_ddp
-python ${DEFAULTS} tests/models/test_hooks.py::test_transfer_batch_hook_ddp
-python ${DEFAULTS} tests/trainer/test_data_loading.py::test_replace_distrubuted_sampler_custom_dataloader_custom_batch_sampler
-python ${DEFAULTS} tests/trainer/optimization/test_manual_optimization.py::test_step_with_optimizer_closure_with_different_frequencies_ddp_with_toggle_model
-python ${DEFAULTS} tests/checkpointing/test_checkpoint_callback_frequency.py::test_top_k_ddp
-nvprof --profile-from-start off -o trace_name.prof -- python ${DEFAULTS} tests/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx
+# python arguments
+defaults='-m coverage run --source pytorch_lightning --append -m pytest --verbose --capture=no'
+
+# find tests marked as `@RunIf(special=True)`
+grep_output=$(grep --recursive --line-number --word-regexp 'tests' 'benchmarks' --regexp 'special=True')
+# file paths
+files=$(echo "$grep_output" | cut -f1 -d:)
+files_arr=($files)
+# line numbers
+linenos=$(echo "$grep_output" | cut -f2 -d:)
+linenos_arr=($linenos)
+
+# tests to skip - space separated
+blocklist='test_pytorch_profiler_nested_emit_nvtx'
+report=''
+
+for i in "${!files_arr[@]}"; do
+  file=${files_arr[$i]}
+  lineno=${linenos_arr[$i]}
+
+  # get code from `@RunIf(special=True)` line to EOF
+  test_code=$(tail -n +"$lineno" "$file")
+
+  # read line by line
+  while read -r line; do
+    # if it's a test
+    if [[ $line == def\ test_* ]]; then
+      # get the name
+      test_name=$(echo $line | cut -c 5- | cut -f1 -d\()
+
+      # check blocklist
+      if echo $blocklist | grep --word-regexp "$test_name" > /dev/null; then
+        report+="Skipped\t$file:$lineno::$test_name\n"
+        break
+      fi
+
+      # run the test
+      report+="Ran\t$file:$lineno::$test_name\n"
+      python ${defaults} "${file}::${test_name}"
+      break
+    fi
+  done < <(echo "$test_code")
+done
+
+nvprof --profile-from-start off -o trace_name.prof -- python ${defaults} tests/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx
+
+# echo test report
+printf '=%.s' {1..80}
+printf "\n$report"
+printf '=%.s' {1..80}
+printf '\n'
diff --git a/tests/utilities/test_all_gather_grad.py b/tests/utilities/test_all_gather_grad.py
index d67c9473bbb2e..6bad31634ce83 100644
--- a/tests/utilities/test_all_gather_grad.py
+++ b/tests/utilities/test_all_gather_grad.py
@@ -55,7 +55,6 @@ class TestModel(BoringModel):
         training_epoch_end_called = False

         def training_epoch_end(self, outputs) -> None:
-            self.training_epoch_end_called = True
             losses = torch.stack([x["loss"] for x in outputs])
             gathered_loss = self.all_gather({
                 "losses_tensor_int": torch.rand(2, 2).int().t(),
@@ -67,7 +66,7 @@ def training_epoch_end(self, outputs) -> None:
                 "losses": losses,
                 "losses_list": [losses, losses]
             })
-            assert gathered_loss["losses_tensor_int"][0].dtype == torch.int64
+            assert gathered_loss["losses_tensor_int"][0].dtype == torch.int32
             assert gathered_loss["losses_tensor_float"][0].dtype == torch.float
             assert gathered_loss["losses_np_ndarray"][0].dtype == torch.int64
             # torch.bool can't be all_gathered
@@ -76,6 +75,7 @@ def training_epoch_end(self, outputs) -> None:
             assert gathered_loss["losses_int"][0].dtype == torch.int
             assert gathered_loss["losses_list"][0].numel() == 2 * len(losses)
             assert gathered_loss["losses"].numel() == 2 * len(losses)
+            self.training_epoch_end_called = True

     seed_everything(42)

@@ -115,6 +115,6 @@ def training_step(self, batch, batch_idx):
             return loss

     model = TestModel()
-    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, gpus=2)
+    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, gpus=2, accelerator="ddp")
     trainer.fit(model)
     assert model.training_step_called
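# ---------------------------------------------------------------------------
# Note (illustration, not part of the patch above): the new
# `tests/special_tests.sh` discovers tests marked `@RunIf(special=True)` and
# launches each one in its own process with PL_RUNNING_SPECIAL_TESTS=1
# exported, so the marker must skip those tests under a plain `pytest` run.
# Below is a minimal sketch of how such a flag can be built on top of
# `pytest.mark.skipif`. The real helper lives in `tests/helpers/runif.py` and
# supports more flags (`min_gpus`, `skip_windows`, `fairscale`, `amp_native`,
# ...); the names and structure here are illustrative assumptions only.

import os

import pytest


class RunIf:
    """Sketch of a conditional-skip marker; models only the `special` flag."""

    def __new__(cls, special: bool = False, **kwargs):
        conditions = []
        reasons = []
        if special:
            # tests/special_tests.sh exports PL_RUNNING_SPECIAL_TESTS=1 before
            # running each selected test, so the condition below is False there
            # and True (i.e. skip) under a plain `pytest` invocation
            conditions.append(os.getenv("PL_RUNNING_SPECIAL_TESTS", "0") != "1")
            reasons.append("Special execution")
        # with no flags set, any([]) is False and the test always runs
        return pytest.mark.skipif(any(conditions), reason=f"Requires: [{' + '.join(reasons)}]")


# usage: skipped under `pytest`, executed by `bash tests/special_tests.sh`
@RunIf(special=True)
def test_runs_in_own_process():
    assert True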