From 838674af60a9779174d9b04a8022d852ff932c19 Mon Sep 17 00:00:00 2001 From: Nithin Rao Date: Thu, 11 Aug 2022 08:20:29 -0700 Subject: [PATCH] upgrade to PTL 1.7 (#4672) * upgrade to PTL 1.7 Signed-off-by: nithinraok * min version Signed-off-by: nithinraok * replace progressbar_refresh_rate with enable progressbar, this is callback now Signed-off-by: nithinraok * progressbar Signed-off-by: nithinraok * replace removed PTL 1.7 args, fix cpu tests, remove p-tune older script Signed-off-by: nithinraok * revert ssl test fixes Signed-off-by: nithinraok * override trainer property and fix numba grad check Signed-off-by: nithinraok * NLPDDPlugin -> NLPDDPStrategy Signed-off-by: nithinraok * style fix Signed-off-by: nithinraok * set max_steps default as -1 Signed-off-by: nithinraok * fix maxsteps in notebooks Signed-off-by: nithinraok * update trainer config Signed-off-by: nithinraok * fix speech2label jenkins Signed-off-by: nithinraok * fix speech2text jenkins Signed-off-by: nithinraok * DDPPlugin -> DDPStrategy Signed-off-by: nithinraok * remove provided strategy keys from trainer config nlp Signed-off-by: nithinraok * check other examples Signed-off-by: nithinraok * override LightningModule .cuda call to maintain pytorch default behavior Signed-off-by: ericharper * revert gpt eval jenkins test Signed-off-by: nithinraok * overwrite cuda class to PTL Signed-off-by: nithinraok * review feedback Signed-off-by: nithinraok * remove checkpoint callback from main config Signed-off-by: nithinraok * patch fix for intentslot classification test Signed-off-by: nithinraok * style fix Signed-off-by: nithinraok Signed-off-by: nithinraok Signed-off-by: ericharper Co-authored-by: ericharper Signed-off-by: Hainan Xu --- Jenkinsfile | 4 +- docs/source/core/core.rst | 2 +- docs/source/nlp/megatron.rst | 10 +- .../asr/conf/asr_adapters/asr_adaptation.yaml | 2 +- .../asr/conf/carnelinet/carnelinet_384.yaml | 2 +- examples/asr/conf/citrinet/citrinet_1024.yaml | 2 +- examples/asr/conf/citrinet/citrinet_384.yaml | 2 +- examples/asr/conf/citrinet/citrinet_512.yaml | 2 +- examples/asr/conf/citrinet/config_bpe.yaml | 2 +- examples/asr/conf/config.yaml | 2 +- .../asr/conf/conformer/conformer_ctc_bpe.yaml | 4 +- .../conf/conformer/conformer_ctc_char.yaml | 4 +- .../conformer/conformer_transducer_bpe.yaml | 4 +- .../conformer/conformer_transducer_char.yaml | 4 +- .../conformer_ctc_bpe_multilang.yaml | 4 +- .../conformer_transducer_bpe_multilang.yaml | 4 +- .../conformer_ctc_bpe_streaming.yaml | 4 +- .../conformer_transducer_bpe_streaming.yaml | 4 +- .../asr/conf/contextnet_rnnt/config_rnnt.yaml | 2 +- .../conf/contextnet_rnnt/config_rnnt_bpe.yaml | 2 +- .../conf/contextnet_rnnt/contextnet_rnnt.yaml | 2 +- .../contextnet_rnnt/contextnet_rnnt_char.yaml | 2 +- .../contextnet_rnnt_multilang.yaml | 2 +- examples/asr/conf/jasper/jasper_10x5dr.yaml | 2 +- examples/asr/conf/lstm/lstm_ctc_bpe.yaml | 4 +- .../asr/conf/lstm/lstm_transducer_bpe.yaml | 4 +- .../asr/conf/marblenet/marblenet_3x2x64.yaml | 2 +- .../matchboxnet/matchboxnet_3x1x64_v1.yaml | 2 +- .../matchboxnet/matchboxnet_3x1x64_v2.yaml | 2 +- .../asr/conf/quartznet/quartznet_15x5.yaml | 2 +- .../conf/quartznet/quartznet_15x5_aug.yaml | 2 +- .../asr/conf/quartznet/quartznet_15x5_ru.yaml | 2 +- .../asr/conf/quartznet/quartznet_15x5_zh.yaml | 2 +- .../squeezeformer/squeezeformer_ctc_bpe.yaml | 4 +- .../squeezeformer/squeezeformer_ctc_char.yaml | 4 +- .../conf/ssl/citrinet/citrinet_ssl_1024.yaml | 4 +- .../conf/ssl/citrinet/citrinet_ssl_ci.yaml | 2 +- 
.../asr/conf/ssl/conformer/conformer_ssl.yaml | 4 +- .../conf/ssl/contextnet/contextnet_ssl.yaml | 4 +- examples/asr/conf/ssl/wav2vec/wav2vec_ci.yaml | 2 +- examples/asr/conf/wav2vec_ctc/wav2vecCTC.yaml | 2 +- .../k2/conf/citrinet/citrinet_mmi_1024.yaml | 2 +- .../nlp/dialogue/conf/dialogue_config.yaml | 2 +- examples/nlp/dialogue/dialogue.py | 10 +- .../tiny_example_entity_linking_config.yaml | 2 +- .../umls_medical_entity_linking_config.yaml | 2 +- .../glue_benchmark/glue_benchmark_config.yaml | 2 +- .../conf/bert_ir_config.yaml | 2 +- .../intent_slot_classification_config.yaml | 8 +- ...bel_intent_slot_classification_config.yaml | 6 +- .../nlp/language_modeling/bert_pretraining.py | 4 +- .../bert_pretraining_from_text_config.yaml | 2 +- .../conf/megatron_bart_config.yaml | 2 +- .../conf/megatron_bert_config.yaml | 2 +- .../conf/megatron_gpt_config.yaml | 2 +- .../megatron_gpt_prompt_learning_config.yaml | 2 +- .../conf/megatron_retro_config.yaml | 2 +- .../conf/megatron_retro_mutransfer.yaml | 2 +- .../conf/megatron_t0_config.yaml | 2 +- .../conf/megatron_t5_config.yaml | 2 +- ...megatron_t5_config_finetune_glue_mnli.yaml | 2 +- ...megatron_t5_config_finetune_glue_xnli.yaml | 2 +- .../conf/megatron_t5_finetune.yaml | 2 +- .../megatron_t5_lm_adaptation_finetune.yaml | 2 +- .../conf/megatron_t5_prompt_learning.yaml | 2 +- .../conf/megatron_ul2_config.yaml | 2 +- .../megatron_bart_pretraining.py | 17 ++- .../megatron_bert_pretraining.py | 7 +- .../megatron_change_num_partitions.py | 8 +- .../language_modeling/megatron_gpt_eval.py | 4 +- .../megatron_gpt_pretraining.py | 17 ++- .../megatron_gpt_prompt_learning.py | 7 +- .../megatron_gpt_prompt_learning_eval.py | 4 +- .../language_modeling/megatron_gpt_test.py | 8 +- .../megatron_retro_cal_shape.py | 17 ++- .../megatron_retro_mutransfer_pretrain.py | 17 ++- .../megatron_retro_pretraining.py | 17 ++- .../nlp/language_modeling/megatron_t5_eval.py | 4 +- .../megatron_t5_lm_adaptation_finetune.py | 17 ++- .../megatron_t5_pretraining.py | 17 ++- .../megatron_t5_prompt_learning.py | 7 +- .../megatron_t5_prompt_learning_eval.py | 4 +- .../language_modeling/megatron_t5_ptune.py | 135 ------------------ .../megatron_t5_seq2seq_eval.py | 17 ++- .../megatron_t5_seq2seq_finetune.py | 17 ++- .../enc_dec_nmt-bottleneck.py | 6 +- .../nlp/machine_translation/enc_dec_nmt.py | 6 +- .../enc_dec_nmt_finetune.py | 6 +- .../megatron_nmt_training.py | 17 ++- .../nmt_transformer_infer_megatron.py | 4 +- .../conf/question_answering_squad_config.yaml | 2 +- .../text2sparql/conf/text2sparql_config.yaml | 2 +- .../ptune_text_classification_config.yaml | 2 +- .../conf/text_classification_config.yaml | 2 +- ...parallel_text_classification_evaluation.py | 4 +- .../text_classification_with_bert.py | 8 +- .../conf/thutmose_tagger_itn_config.yaml | 2 +- .../punctuation_capitalization_config.yaml | 2 +- .../conf/token_classification_config.yaml | 2 +- .../token_classification_train.py | 8 +- .../conf/zero_shot_intent_config.yaml | 2 +- .../conf/SpeakerNet_recognition_3x2x512.yaml | 4 +- nemo/collections/asr/modules/conv_asr.py | 2 +- .../machine_translation/megatron_nmt_model.py | 4 - nemo/collections/nlp/parts/nlp_overrides.py | 6 +- nemo/core/classes/modelPT.py | 37 ++++- nemo/core/config/pytorch_lightning.py | 13 +- nemo/core/optim/lr_scheduler.py | 2 +- requirements/requirements_lightning.txt | 2 +- .../megatron_checkpoint_averaging.py | 4 +- scripts/export.py | 2 +- .../nemo_legacy_import/nlp_checkpoint_port.py | 2 +- scripts/speaker_tasks/filelist_to_manifest.py | 2 +- 
.../asr/numba/rnnt_loss/test_rnnt_pytorch.py | 2 +- tests/collections/nlp/test_gpt_eval.py | 4 +- tests/collections/nlp/test_gpt_model.py | 6 +- tests/collections/nlp/test_nlp_exportables.py | 5 + .../collections/nlp/test_retrieval_module.py | 7 +- .../nlp/test_retrieval_module_inference.py | 7 +- tests/core/test_optimizers_schedulers.py | 6 +- tests/core_ptl/test_ptl_stateless_timer.py | 2 +- tutorials/00_NeMo_Primer.ipynb | 2 +- tutorials/asr/Multilang_ASR.ipynb | 6 +- tutorials/asr/Speech_Commands.ipynb | 4 +- .../nlp/Multitask_Prompt_and_PTuning.ipynb | 10 +- .../Speaker_Identification_Verification.ipynb | 2 +- 126 files changed, 314 insertions(+), 437 deletions(-) delete mode 100644 examples/nlp/language_modeling/megatron_t5_ptune.py diff --git a/Jenkinsfile b/Jenkinsfile index f88d93fef38e..68c8e7fb8413 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -597,7 +597,7 @@ pipeline { trainer.devices=[0] \ trainer.accelerator="gpu" \ trainer.max_epochs=1 \ - +trainer.max_steps=1 \ + trainer.max_steps=1 \ +trainer.num_sanity_val_steps=1 \ exp_manager.exp_dir=examples/asr/speech_to_text_results' sh 'rm -rf examples/asr/speech_to_text_results' @@ -612,7 +612,7 @@ pipeline { trainer.devices=[1] \ trainer.accelerator="gpu" \ trainer.max_epochs=1 \ - +trainer.max_steps=1 \ + trainer.max_steps=1 \ +trainer.num_sanity_val_steps=1 \ model.preprocessor._target_=nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor \ ~model.preprocessor.window_size \ diff --git a/docs/source/core/core.rst b/docs/source/core/core.rst index 098f3e04c643..32a36a4fbb6c 100644 --- a/docs/source/core/core.rst +++ b/docs/source/core/core.rst @@ -399,7 +399,7 @@ configuration for a Novograd optimizer with Cosine Annealing learning rate sched name: CosineAnnealing # Optional arguments - max_steps: null # computed at runtime or explicitly set here + max_steps: -1 # computed at runtime or explicitly set here monitor: val_loss reduce_on_plateau: false diff --git a/docs/source/nlp/megatron.rst b/docs/source/nlp/megatron.rst index b6ef3c293063..743aa2f84b53 100644 --- a/docs/source/nlp/megatron.rst +++ b/docs/source/nlp/megatron.rst @@ -30,15 +30,15 @@ the same features as other NeMo Models. Training ^^^^^^^^ -All of the necessary logic to train model parallel models in NeMo with PyTorch Lightning is contained in the ``NLPDDPPlugin``. -The ``NLPDDPPlugin`` subclasses the PyTorch Lightning training type plugin ``DDPPlugin``. -See `plugins `_ for more information on PyTorch Lightning Plugins. +All of the necessary logic to train model parallel models in NeMo with PyTorch Lightning is contained in the ``NLPDDPStrategy``. +The ``NLPDDPStrategy`` subclasses the PyTorch Lightning strategy type ``DDPStrategy``. +See `strategies `_ for more information on PyTorch Lightning Strategies To enable model parallel training in NeMo: .. code-block:: python - trainer = Trainer(plugins=[NLPDDPPlugin()], **cfg.trainer) + trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) Megatron-LM checkpoints have a specific format. One checkpoint is saved for each model parallel rank: @@ -157,7 +157,7 @@ Since model parallel models always require more than one GPU, the ``Trainer`` is .. 
code-block:: python - trainer = pl.Trainer(plugins=[NLPDDPPlugin()], **cfg.trainer) + trainer = pl.Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) model = TextClassificationModel.restore_from(cfg.model.nemo_path, trainer=trainer) model.setup_test_data(test_data_config=cfg.model.test_ds) diff --git a/examples/asr/conf/asr_adapters/asr_adaptation.yaml b/examples/asr/conf/asr_adapters/asr_adaptation.yaml index 7584e2220d10..59df7ee41ca7 100644 --- a/examples/asr/conf/asr_adapters/asr_adaptation.yaml +++ b/examples/asr/conf/asr_adapters/asr_adaptation.yaml @@ -164,7 +164,7 @@ trainer: gradient_clip_val: null precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. log_every_n_steps: 10 # Interval of logging. - progress_bar_refresh_rate: 10 + enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs diff --git a/examples/asr/conf/carnelinet/carnelinet_384.yaml b/examples/asr/conf/carnelinet/carnelinet_384.yaml index 2d3d567be510..6693247ab340 100644 --- a/examples/asr/conf/carnelinet/carnelinet_384.yaml +++ b/examples/asr/conf/carnelinet/carnelinet_384.yaml @@ -238,7 +238,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 100 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu strategy: ddp diff --git a/examples/asr/conf/citrinet/citrinet_1024.yaml b/examples/asr/conf/citrinet/citrinet_1024.yaml index 324623c5fd88..0722a7ec740a 100644 --- a/examples/asr/conf/citrinet/citrinet_1024.yaml +++ b/examples/asr/conf/citrinet/citrinet_1024.yaml @@ -448,7 +448,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 100 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu strategy: ddp diff --git a/examples/asr/conf/citrinet/citrinet_384.yaml b/examples/asr/conf/citrinet/citrinet_384.yaml index b49ab1f5aee5..f2ceb5f45f6c 100644 --- a/examples/asr/conf/citrinet/citrinet_384.yaml +++ b/examples/asr/conf/citrinet/citrinet_384.yaml @@ -403,7 +403,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 100 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu strategy: ddp diff --git a/examples/asr/conf/citrinet/citrinet_512.yaml b/examples/asr/conf/citrinet/citrinet_512.yaml index f5dc5ecd229b..a36cb1df7375 100644 --- a/examples/asr/conf/citrinet/citrinet_512.yaml +++ b/examples/asr/conf/citrinet/citrinet_512.yaml @@ -402,7 +402,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 100 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu strategy: ddp diff --git a/examples/asr/conf/citrinet/config_bpe.yaml b/examples/asr/conf/citrinet/config_bpe.yaml index 2cb2768793c0..887160142c1c 100644 --- a/examples/asr/conf/citrinet/config_bpe.yaml +++ b/examples/asr/conf/citrinet/config_bpe.yaml @@ -165,7 +165,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 5 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu 
strategy: ddp diff --git a/examples/asr/conf/config.yaml b/examples/asr/conf/config.yaml index 2b2163b57474..6ab764c0907b 100644 --- a/examples/asr/conf/config.yaml +++ b/examples/asr/conf/config.yaml @@ -168,7 +168,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 5 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu strategy: ddp diff --git a/examples/asr/conf/conformer/conformer_ctc_bpe.yaml b/examples/asr/conf/conformer/conformer_ctc_bpe.yaml index 1870d3069631..b0ec44a6a424 100644 --- a/examples/asr/conf/conformer/conformer_ctc_bpe.yaml +++ b/examples/asr/conf/conformer/conformer_ctc_bpe.yaml @@ -165,7 +165,7 @@ trainer: devices: -1 # number of GPUs, -1 would use all available GPUs num_nodes: 1 max_epochs: 1000 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: ddp @@ -173,7 +173,7 @@ trainer: gradient_clip_val: 0.0 precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. log_every_n_steps: 10 # Interval of logging. - progress_bar_refresh_rate: 10 + enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs diff --git a/examples/asr/conf/conformer/conformer_ctc_char.yaml b/examples/asr/conf/conformer/conformer_ctc_char.yaml index a7c6e3f1e916..ad6152311f6d 100644 --- a/examples/asr/conf/conformer/conformer_ctc_char.yaml +++ b/examples/asr/conf/conformer/conformer_ctc_char.yaml @@ -140,7 +140,7 @@ trainer: devices: -1 # number of GPUs, -1 would use all available GPUs num_nodes: 1 max_epochs: 1000 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: ddp @@ -148,7 +148,7 @@ trainer: gradient_clip_val: 0.0 precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. log_every_n_steps: 10 # Interval of logging. - progress_bar_refresh_rate: 10 + enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
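The trainer-config hunks above and below drop the removed progress_bar_refresh_rate key in favour of enable_progress_bar, matching the commit note that the refresh rate "is callback now": in PTL 1.7 the boolean switch stays on the Trainer while the rate moves to the TQDMProgressBar callback. A minimal sketch of that setup; the refresh-rate value of 10 mirrors the old configs, the other Trainer arguments are purely illustrative:

    import pytorch_lightning as pl
    from pytorch_lightning.callbacks import TQDMProgressBar

    # PTL 1.7: Trainer(progress_bar_refresh_rate=10) is gone.
    trainer = pl.Trainer(
        enable_progress_bar=True,                      # replaces the removed YAML key
        callbacks=[TQDMProgressBar(refresh_rate=10)],  # the refresh rate now lives on the callback
        max_epochs=1,
        accelerator="cpu",
        devices=1,
        logger=False,
        enable_checkpointing=False,
    )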
num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs diff --git a/examples/asr/conf/conformer/conformer_transducer_bpe.yaml b/examples/asr/conf/conformer/conformer_transducer_bpe.yaml index aa2f9ea2edeb..35cfe811953b 100644 --- a/examples/asr/conf/conformer/conformer_transducer_bpe.yaml +++ b/examples/asr/conf/conformer/conformer_transducer_bpe.yaml @@ -215,7 +215,7 @@ trainer: devices: -1 # number of GPUs, -1 would use all available GPUs num_nodes: 1 max_epochs: 500 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: ddp @@ -223,7 +223,7 @@ trainer: gradient_clip_val: 0.0 precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. log_every_n_steps: 10 # Interval of logging. - progress_bar_refresh_rate: 10 + enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs diff --git a/examples/asr/conf/conformer/conformer_transducer_char.yaml b/examples/asr/conf/conformer/conformer_transducer_char.yaml index 1d4c4d04db7e..24b28204176a 100644 --- a/examples/asr/conf/conformer/conformer_transducer_char.yaml +++ b/examples/asr/conf/conformer/conformer_transducer_char.yaml @@ -210,7 +210,7 @@ trainer: devices: -1 # number of GPUs, -1 would use all available GPUs num_nodes: 1 max_epochs: 500 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: ddp @@ -218,7 +218,7 @@ trainer: gradient_clip_val: 0.0 precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. log_every_n_steps: 10 # Interval of logging. - progress_bar_refresh_rate: 10 + enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs diff --git a/examples/asr/conf/conformer/multilang/conformer_ctc_bpe_multilang.yaml b/examples/asr/conf/conformer/multilang/conformer_ctc_bpe_multilang.yaml index 6be1461686f0..3a999e8819a0 100644 --- a/examples/asr/conf/conformer/multilang/conformer_ctc_bpe_multilang.yaml +++ b/examples/asr/conf/conformer/multilang/conformer_ctc_bpe_multilang.yaml @@ -166,7 +166,7 @@ trainer: devices: -1 # number of GPUs, -1 would use all available GPUs num_nodes: 1 max_epochs: 1000 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: ddp @@ -174,7 +174,7 @@ trainer: gradient_clip_val: 0.0 precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. log_every_n_steps: 10 # Interval of logging. - progress_bar_refresh_rate: 10 + enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs diff --git a/examples/asr/conf/conformer/multilang/conformer_transducer_bpe_multilang.yaml b/examples/asr/conf/conformer/multilang/conformer_transducer_bpe_multilang.yaml index 61bb35247494..0e73d166d2be 100644 --- a/examples/asr/conf/conformer/multilang/conformer_transducer_bpe_multilang.yaml +++ b/examples/asr/conf/conformer/multilang/conformer_transducer_bpe_multilang.yaml @@ -216,7 +216,7 @@ trainer: devices: -1 # number of GPUs, -1 would use all available GPUs num_nodes: 1 max_epochs: 1000 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: ddp @@ -224,7 +224,7 @@ trainer: gradient_clip_val: 0.0 precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. log_every_n_steps: 10 # Interval of logging. - progress_bar_refresh_rate: 10 + enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
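The repeated max_steps: null -> max_steps: -1 edits in these trainer sections track the PTL 1.7 behaviour where the Trainer no longer accepts null/None for max_steps and -1 is the "no step limit" sentinel. A small sketch of the two stopping modes, assuming nothing beyond stock Lightning (all values illustrative):

    import pytorch_lightning as pl

    # -1 disables the step limit, so max_epochs decides when training stops
    # (this is what the updated ASR configs rely on).
    epoch_bound = pl.Trainer(max_epochs=100, max_steps=-1, accelerator="cpu", devices=1)

    # With a positive max_steps, training stops at whichever of the two limits is hit first.
    step_bound = pl.Trainer(max_epochs=-1, max_steps=100_000, accelerator="cpu", devices=1)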
num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs diff --git a/examples/asr/conf/conformer/streaming/conformer_ctc_bpe_streaming.yaml b/examples/asr/conf/conformer/streaming/conformer_ctc_bpe_streaming.yaml index dc2eb4cdf131..f6563787ca55 100644 --- a/examples/asr/conf/conformer/streaming/conformer_ctc_bpe_streaming.yaml +++ b/examples/asr/conf/conformer/streaming/conformer_ctc_bpe_streaming.yaml @@ -153,7 +153,7 @@ trainer: devices: -1 # number of GPUs, -1 would use all available GPUs num_nodes: 1 max_epochs: 1000 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: ddp @@ -161,7 +161,7 @@ trainer: gradient_clip_val: 1.0 precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. log_every_n_steps: 10 # Interval of logging. - progress_bar_refresh_rate: 10 + enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs diff --git a/examples/asr/conf/conformer/streaming/conformer_transducer_bpe_streaming.yaml b/examples/asr/conf/conformer/streaming/conformer_transducer_bpe_streaming.yaml index 0034a0b194a2..1f75ddb265e2 100644 --- a/examples/asr/conf/conformer/streaming/conformer_transducer_bpe_streaming.yaml +++ b/examples/asr/conf/conformer/streaming/conformer_transducer_bpe_streaming.yaml @@ -213,7 +213,7 @@ trainer: devices: -1 # number of GPUs, -1 would use all available GPUs num_nodes: 1 max_epochs: 1000 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: ddp @@ -221,7 +221,7 @@ trainer: gradient_clip_val: 0.0 precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. log_every_n_steps: 10 # Interval of logging. - progress_bar_refresh_rate: 10 + enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs diff --git a/examples/asr/conf/contextnet_rnnt/config_rnnt.yaml b/examples/asr/conf/contextnet_rnnt/config_rnnt.yaml index a58c467b8110..ba8eca6833f3 100644 --- a/examples/asr/conf/contextnet_rnnt/config_rnnt.yaml +++ b/examples/asr/conf/contextnet_rnnt/config_rnnt.yaml @@ -235,7 +235,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 5 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu strategy: ddp diff --git a/examples/asr/conf/contextnet_rnnt/config_rnnt_bpe.yaml b/examples/asr/conf/contextnet_rnnt/config_rnnt_bpe.yaml index 1f4dd0e954c9..74cb0c92f194 100644 --- a/examples/asr/conf/contextnet_rnnt/config_rnnt_bpe.yaml +++ b/examples/asr/conf/contextnet_rnnt/config_rnnt_bpe.yaml @@ -235,7 +235,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 5 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu strategy: ddp diff --git a/examples/asr/conf/contextnet_rnnt/contextnet_rnnt.yaml b/examples/asr/conf/contextnet_rnnt/contextnet_rnnt.yaml index 2ec7590d8840..596434cd8a79 100644 --- a/examples/asr/conf/contextnet_rnnt/contextnet_rnnt.yaml +++ b/examples/asr/conf/contextnet_rnnt/contextnet_rnnt.yaml @@ -474,7 +474,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 100 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 # Should be set via SLURM variable `SLURM_JOB_NUM_NODES` accelerator: gpu strategy: ddp diff --git a/examples/asr/conf/contextnet_rnnt/contextnet_rnnt_char.yaml b/examples/asr/conf/contextnet_rnnt/contextnet_rnnt_char.yaml index 490394676acd..b190d7159529 100644 --- a/examples/asr/conf/contextnet_rnnt/contextnet_rnnt_char.yaml +++ b/examples/asr/conf/contextnet_rnnt/contextnet_rnnt_char.yaml @@ -476,7 +476,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 100 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 # Should be set via SLURM variable `SLURM_JOB_NUM_NODES` accelerator: gpu strategy: ddp diff --git a/examples/asr/conf/contextnet_rnnt/contextnet_rnnt_multilang.yaml b/examples/asr/conf/contextnet_rnnt/contextnet_rnnt_multilang.yaml index 972c3cd9a761..34ffdc923efb 100644 --- a/examples/asr/conf/contextnet_rnnt/contextnet_rnnt_multilang.yaml +++ b/examples/asr/conf/contextnet_rnnt/contextnet_rnnt_multilang.yaml @@ -481,7 +481,7 @@ model: trainer: devices: -1 # number of GPUs, -1 would use all available GPUs max_epochs: 100 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 # Should be set via SLURM variable `SLURM_JOB_NUM_NODES` accelerator: auto strategy: ddp diff --git a/examples/asr/conf/jasper/jasper_10x5dr.yaml b/examples/asr/conf/jasper/jasper_10x5dr.yaml index ad2f0536c133..e93b8b6043c8 100644 --- a/examples/asr/conf/jasper/jasper_10x5dr.yaml +++ b/examples/asr/conf/jasper/jasper_10x5dr.yaml @@ -190,7 +190,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 5 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu strategy: ddp diff --git 
a/examples/asr/conf/lstm/lstm_ctc_bpe.yaml b/examples/asr/conf/lstm/lstm_ctc_bpe.yaml index f81cb43ecc07..e899f44f97ef 100644 --- a/examples/asr/conf/lstm/lstm_ctc_bpe.yaml +++ b/examples/asr/conf/lstm/lstm_ctc_bpe.yaml @@ -123,7 +123,7 @@ trainer: devices: -1 # number of GPUs, -1 would use all available GPUs num_nodes: 1 max_epochs: 500 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: gpu strategy: ddp @@ -131,7 +131,7 @@ trainer: gradient_clip_val: 0.3 precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. log_every_n_steps: 10 # Interval of logging. - progress_bar_refresh_rate: 10 + enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs diff --git a/examples/asr/conf/lstm/lstm_transducer_bpe.yaml b/examples/asr/conf/lstm/lstm_transducer_bpe.yaml index 2ac1408c508c..e35bb95d291f 100644 --- a/examples/asr/conf/lstm/lstm_transducer_bpe.yaml +++ b/examples/asr/conf/lstm/lstm_transducer_bpe.yaml @@ -186,7 +186,7 @@ trainer: devices: -1 # number of GPUs, -1 would use all available GPUs num_nodes: 1 max_epochs: 500 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: ddp @@ -194,7 +194,7 @@ trainer: gradient_clip_val: 0.3 precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. log_every_n_steps: 10 # Interval of logging. - progress_bar_refresh_rate: 10 + enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
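These YAML trainer sections are unpacked directly into the Lightning Trainer by the example scripts (pl.Trainer(**cfg.trainer), as in dialogue.py later in this patch), which is why every key removed in PTL 1.7 has to be renamed here rather than silently ignored. A hedged sketch of that flow, with an inline config standing in for one of the files above:

    import pytorch_lightning as pl
    from omegaconf import OmegaConf

    # Inline stand-in for a trainer: block from one of the YAMLs in this patch.
    cfg = OmegaConf.create(
        {
            "trainer": {
                "devices": 1,
                "accelerator": "cpu",
                "max_epochs": 1,
                "max_steps": -1,
                "log_every_n_steps": 10,
                "enable_progress_bar": True,
                "num_sanity_val_steps": 0,
                "check_val_every_n_epoch": 1,
            }
        }
    )

    # The example scripts unpack the section directly; a leftover 1.6-only key such as
    # progress_bar_refresh_rate would now fail at construction time with a TypeError.
    trainer = pl.Trainer(**cfg.trainer)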
num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs diff --git a/examples/asr/conf/marblenet/marblenet_3x2x64.yaml b/examples/asr/conf/marblenet/marblenet_3x2x64.yaml index fe4dcc537f06..f9b3f26d114c 100644 --- a/examples/asr/conf/marblenet/marblenet_3x2x64.yaml +++ b/examples/asr/conf/marblenet/marblenet_3x2x64.yaml @@ -165,7 +165,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 150 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu strategy: ddp diff --git a/examples/asr/conf/matchboxnet/matchboxnet_3x1x64_v1.yaml b/examples/asr/conf/matchboxnet/matchboxnet_3x1x64_v1.yaml index ac462a273d96..af054aac2aba 100644 --- a/examples/asr/conf/matchboxnet/matchboxnet_3x1x64_v1.yaml +++ b/examples/asr/conf/matchboxnet/matchboxnet_3x1x64_v1.yaml @@ -177,7 +177,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 200 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu strategy: ddp diff --git a/examples/asr/conf/matchboxnet/matchboxnet_3x1x64_v2.yaml b/examples/asr/conf/matchboxnet/matchboxnet_3x1x64_v2.yaml index a7d4974ed7f3..f3f4639766c5 100644 --- a/examples/asr/conf/matchboxnet/matchboxnet_3x1x64_v2.yaml +++ b/examples/asr/conf/matchboxnet/matchboxnet_3x1x64_v2.yaml @@ -177,7 +177,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 200 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu strategy: ddp diff --git a/examples/asr/conf/quartznet/quartznet_15x5.yaml b/examples/asr/conf/quartznet/quartznet_15x5.yaml index 269be113e7be..d5f225365017 100644 --- a/examples/asr/conf/quartznet/quartznet_15x5.yaml +++ b/examples/asr/conf/quartznet/quartznet_15x5.yaml @@ -261,7 +261,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 5 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu strategy: ddp diff --git a/examples/asr/conf/quartznet/quartznet_15x5_aug.yaml b/examples/asr/conf/quartznet/quartznet_15x5_aug.yaml index 93487d86dff9..4daec79ff1dd 100644 --- a/examples/asr/conf/quartznet/quartznet_15x5_aug.yaml +++ b/examples/asr/conf/quartznet/quartznet_15x5_aug.yaml @@ -267,7 +267,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 5 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu strategy: ddp diff --git a/examples/asr/conf/quartznet/quartznet_15x5_ru.yaml b/examples/asr/conf/quartznet/quartznet_15x5_ru.yaml index 1a96811640d9..37f58f08ecf8 100644 --- a/examples/asr/conf/quartznet/quartznet_15x5_ru.yaml +++ b/examples/asr/conf/quartznet/quartznet_15x5_ru.yaml @@ -258,7 +258,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 5 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu strategy: ddp diff --git a/examples/asr/conf/quartznet/quartznet_15x5_zh.yaml b/examples/asr/conf/quartznet/quartznet_15x5_zh.yaml index b10c922db7fd..c26b63b2a23b 100644 --- a/examples/asr/conf/quartznet/quartznet_15x5_zh.yaml +++ b/examples/asr/conf/quartznet/quartznet_15x5_zh.yaml @@ -457,7 +457,7 @@ model: trainer: 
devices: 1 # number of gpus max_epochs: 5 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu strategy: ddp diff --git a/examples/asr/conf/squeezeformer/squeezeformer_ctc_bpe.yaml b/examples/asr/conf/squeezeformer/squeezeformer_ctc_bpe.yaml index c5d66043536f..430414994cb3 100644 --- a/examples/asr/conf/squeezeformer/squeezeformer_ctc_bpe.yaml +++ b/examples/asr/conf/squeezeformer/squeezeformer_ctc_bpe.yaml @@ -161,7 +161,7 @@ trainer: devices: -1 # number of GPUs, -1 would use all available GPUs num_nodes: 1 max_epochs: 1000 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: ddp @@ -169,7 +169,7 @@ trainer: gradient_clip_val: 0.0 precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. log_every_n_steps: 10 # Interval of logging. - progress_bar_refresh_rate: 10 + enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs diff --git a/examples/asr/conf/squeezeformer/squeezeformer_ctc_char.yaml b/examples/asr/conf/squeezeformer/squeezeformer_ctc_char.yaml index 8fd06e24ee26..eb1abafe74eb 100644 --- a/examples/asr/conf/squeezeformer/squeezeformer_ctc_char.yaml +++ b/examples/asr/conf/squeezeformer/squeezeformer_ctc_char.yaml @@ -146,7 +146,7 @@ trainer: devices: -1 # number of GPUs, -1 would use all available GPUs num_nodes: 1 max_epochs: 1000 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: ddp @@ -154,7 +154,7 @@ trainer: gradient_clip_val: 0.0 precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. log_every_n_steps: 10 # Interval of logging. - progress_bar_refresh_rate: 10 + enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs diff --git a/examples/asr/conf/ssl/citrinet/citrinet_ssl_1024.yaml b/examples/asr/conf/ssl/citrinet/citrinet_ssl_1024.yaml index bc6fc7536972..2579b9777199 100644 --- a/examples/asr/conf/ssl/citrinet/citrinet_ssl_1024.yaml +++ b/examples/asr/conf/ssl/citrinet/citrinet_ssl_1024.yaml @@ -472,7 +472,7 @@ trainer: devices: -1 # number of GPUs, -1 would use all available GPUs num_nodes: 1 max_epochs: 1000 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: ddp @@ -480,7 +480,7 @@ trainer: gradient_clip_val: 1.0 precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. 
log_every_n_steps: 10 # Interval of logging. - progress_bar_refresh_rate: 10 + enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs diff --git a/examples/asr/conf/ssl/citrinet/citrinet_ssl_ci.yaml b/examples/asr/conf/ssl/citrinet/citrinet_ssl_ci.yaml index ac3e1bc8dffe..749b97587814 100644 --- a/examples/asr/conf/ssl/citrinet/citrinet_ssl_ci.yaml +++ b/examples/asr/conf/ssl/citrinet/citrinet_ssl_ci.yaml @@ -431,7 +431,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 100 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu strategy: ddp diff --git a/examples/asr/conf/ssl/conformer/conformer_ssl.yaml b/examples/asr/conf/ssl/conformer/conformer_ssl.yaml index 6a200a3ba0f9..cb3843cdcdd5 100644 --- a/examples/asr/conf/ssl/conformer/conformer_ssl.yaml +++ b/examples/asr/conf/ssl/conformer/conformer_ssl.yaml @@ -181,7 +181,7 @@ trainer: devices: -1 # number of GPUs, -1 would use all available GPUs num_nodes: 1 max_epochs: 1000 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: ddp @@ -189,7 +189,7 @@ trainer: gradient_clip_val: 1.0 precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. log_every_n_steps: 10 # Interval of logging. - progress_bar_refresh_rate: 10 + enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs diff --git a/examples/asr/conf/ssl/contextnet/contextnet_ssl.yaml b/examples/asr/conf/ssl/contextnet/contextnet_ssl.yaml index e62d28511ed9..54e73213ae45 100644 --- a/examples/asr/conf/ssl/contextnet/contextnet_ssl.yaml +++ b/examples/asr/conf/ssl/contextnet/contextnet_ssl.yaml @@ -436,7 +436,7 @@ trainer: devices: -1 # number of GPUs, -1 would use all available GPUs num_nodes: 1 max_epochs: 1000 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: ddp @@ -444,7 +444,7 @@ trainer: gradient_clip_val: 1.0 precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. log_every_n_steps: 10 # Interval of logging. - progress_bar_refresh_rate: 10 + enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
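The "precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP" comments repeated through these configs refer to mixed precision. With PTL 1.7, precision=16 enables native AMP by default, and the O1/O2 levels mentioned in the comments belong to the apex backend, which 1.7 still exposes through amp_backend/amp_level. A small, hedged sketch; it assumes a CUDA GPU is available, and the apex variant is shown only as a comment because it additionally requires NVIDIA apex:

    import pytorch_lightning as pl

    # Native AMP: flip the config's precision from 32 to 16.
    trainer = pl.Trainer(precision=16, accelerator="gpu", devices=1, max_epochs=1)

    # Apex O1/O2, still available in PTL 1.7 (requires apex to be installed):
    # trainer = pl.Trainer(precision=16, amp_backend="apex", amp_level="O2",
    #                      accelerator="gpu", devices=1, max_epochs=1)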
num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs diff --git a/examples/asr/conf/ssl/wav2vec/wav2vec_ci.yaml b/examples/asr/conf/ssl/wav2vec/wav2vec_ci.yaml index 2f9cb76db760..7746b9a17e59 100644 --- a/examples/asr/conf/ssl/wav2vec/wav2vec_ci.yaml +++ b/examples/asr/conf/ssl/wav2vec/wav2vec_ci.yaml @@ -131,7 +131,7 @@ model: trainer: devices: 1 # number of gpus num_nodes: 1 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set accelerator: gpu strategy: ddp accumulate_grad_batches: 1 diff --git a/examples/asr/conf/wav2vec_ctc/wav2vecCTC.yaml b/examples/asr/conf/wav2vec_ctc/wav2vecCTC.yaml index 11c9576e6f6d..02c6eb2a6273 100644 --- a/examples/asr/conf/wav2vec_ctc/wav2vecCTC.yaml +++ b/examples/asr/conf/wav2vec_ctc/wav2vecCTC.yaml @@ -126,7 +126,7 @@ trainer: devices: 1 # number of gpus num_nodes: 1 max_epochs: 100 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set accelerator: gpu strategy: ddp accumulate_grad_batches: 1 diff --git a/examples/asr/experimental/k2/conf/citrinet/citrinet_mmi_1024.yaml b/examples/asr/experimental/k2/conf/citrinet/citrinet_mmi_1024.yaml index 60d5c2bfd95d..1c1be351ca35 100644 --- a/examples/asr/experimental/k2/conf/citrinet/citrinet_mmi_1024.yaml +++ b/examples/asr/experimental/k2/conf/citrinet/citrinet_mmi_1024.yaml @@ -460,7 +460,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 100 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu strategy: ddp diff --git a/examples/nlp/dialogue/conf/dialogue_config.yaml b/examples/nlp/dialogue/conf/dialogue_config.yaml index 11844a41f144..733f2f7fe7e4 100644 --- a/examples/nlp/dialogue/conf/dialogue_config.yaml +++ b/examples/nlp/dialogue/conf/dialogue_config.yaml @@ -18,7 +18,7 @@ trainer: devices: 1 # number of GPUs (0 for CPU), or list of the GPUs to use e.g. [0, 1] num_nodes: 1 max_epochs: 3 - max_steps: null # precedence over max_epochs + max_steps: -1 # precedence over max_epochs accumulate_grad_batches: 1 # accumulates grads every k batches gradient_clip_val: 1.0 precision: 16 # Should be set to 16 for O1 and O2 to enable the AMP. 
diff --git a/examples/nlp/dialogue/dialogue.py b/examples/nlp/dialogue/dialogue.py index e3ce88e33275..3f31185fdcaa 100644 --- a/examples/nlp/dialogue/dialogue.py +++ b/examples/nlp/dialogue/dialogue.py @@ -53,7 +53,7 @@ from nemo.collections.nlp.models.dialogue.intent_slot_classification_model import IntentSlotClassificationModel from nemo.collections.nlp.models.dialogue.sgdqa_model import SGDQAModel from nemo.collections.nlp.modules.common.megatron.megatron_utils import compute_model_parallel_rank -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.app_state import AppState @@ -66,11 +66,11 @@ def main(cfg: DictConfig) -> None: logging.info(f'Config: {OmegaConf.to_yaml(cfg)}') try: - plugins = NLPDDPPlugin() + strategy = NLPDDPStrategy() except (ImportError, ModuleNotFoundError): - plugins = None + strategy = None - trainer = pl.Trainer(**cfg.trainer, plugins=plugins) + trainer = pl.Trainer(**cfg.trainer, strategy=strategy) exp_manager(trainer, cfg.get("exp_manager", None)) @@ -139,7 +139,7 @@ def main(cfg: DictConfig) -> None: if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.ds_item is not None: eval_device = [cfg.trainer.devices[0]] if isinstance(cfg.trainer.devices, list) else 1 trainer = pl.Trainer( - devices=eval_device, accelerator=cfg.trainer.accelerator, precision=16, plugins=NLPDDPPlugin() + devices=eval_device, accelerator=cfg.trainer.accelerator, precision=16, strategy=NLPDDPStrategy() ) model.setup_multiple_test_data(test_data_config=cfg.model.test_ds) if model.prepare_test(trainer): diff --git a/examples/nlp/entity_linking/conf/tiny_example_entity_linking_config.yaml b/examples/nlp/entity_linking/conf/tiny_example_entity_linking_config.yaml index a08500e0467e..b7f538ccd68f 100644 --- a/examples/nlp/entity_linking/conf/tiny_example_entity_linking_config.yaml +++ b/examples/nlp/entity_linking/conf/tiny_example_entity_linking_config.yaml @@ -4,7 +4,7 @@ trainer: devices: 1 num_nodes: 1 max_epochs: 2 - max_steps: null + max_steps: -1 accumulate_grad_batches: 1 precision: 16 accelerator: gpu diff --git a/examples/nlp/entity_linking/conf/umls_medical_entity_linking_config.yaml b/examples/nlp/entity_linking/conf/umls_medical_entity_linking_config.yaml index cfc3442be87c..ad636ef23e18 100644 --- a/examples/nlp/entity_linking/conf/umls_medical_entity_linking_config.yaml +++ b/examples/nlp/entity_linking/conf/umls_medical_entity_linking_config.yaml @@ -4,7 +4,7 @@ trainer: devices: 1 num_nodes: 1 max_epochs: 2 - max_steps: null + max_steps: -1 accumulate_grad_batches: 1 precision: 16 accelerator: gpu diff --git a/examples/nlp/glue_benchmark/glue_benchmark_config.yaml b/examples/nlp/glue_benchmark/glue_benchmark_config.yaml index d94aa2fcfd2d..21cdc04db22f 100644 --- a/examples/nlp/glue_benchmark/glue_benchmark_config.yaml +++ b/examples/nlp/glue_benchmark/glue_benchmark_config.yaml @@ -5,7 +5,7 @@ trainer: devices: 1 # the number of gpus, 0 for CPU num_nodes: 1 max_epochs: 3 - max_steps: null # precedence over max_epochs + max_steps: -1 # precedence over max_epochs accumulate_grad_batches: 1 # accumulates grads every k batches precision: 16 accelerator: gpu diff --git a/examples/nlp/information_retrieval/conf/bert_ir_config.yaml b/examples/nlp/information_retrieval/conf/bert_ir_config.yaml index ba745134f89b..56e573e0bcf6 100644 --- a/examples/nlp/information_retrieval/conf/bert_ir_config.yaml +++ 
b/examples/nlp/information_retrieval/conf/bert_ir_config.yaml @@ -4,7 +4,7 @@ trainer: devices: 1 # the number of gpus, 0 for CPU, or list with gpu indices num_nodes: 1 max_epochs: 2 # the number of training epochs - max_steps: null # precedence over max_epochs + max_steps: -1 # precedence over max_epochs accumulate_grad_batches: 1 # accumulates grads every k batches precision: 16 # 16 to use AMP accelerator: gpu diff --git a/examples/nlp/intent_slot_classification/conf/intent_slot_classification_config.yaml b/examples/nlp/intent_slot_classification/conf/intent_slot_classification_config.yaml index beb88cd905d7..da6bc3c2579a 100644 --- a/examples/nlp/intent_slot_classification/conf/intent_slot_classification_config.yaml +++ b/examples/nlp/intent_slot_classification/conf/intent_slot_classification_config.yaml @@ -4,7 +4,7 @@ trainer: devices: 1 # the number of gpus, 0 for CPU num_nodes: 1 max_epochs: 50 - max_steps: null # precedence over max_epochs + max_steps: -1 # precedence over max_epochs accumulate_grad_batches: 1 # accumulates grads every k batches precision: 32 # Should be set to 16 for O1 and O2 amp_level to enable the AMP. accelerator: gpu @@ -13,7 +13,7 @@ trainer: val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. - checkpoint_callback: False + enable_checkpointing: False logger: false # Provided by exp_manager model: @@ -83,7 +83,7 @@ model: sched: name: WarmupAnnealing iters_per_batch: null # computed at runtime - max_steps: null # computed at runtime or explicitly set here + max_steps: -1 # computed at runtime or explicitly set here # pytorch lightning args monitor: val_loss @@ -108,4 +108,4 @@ hydra: dir: . job_logging: root: - handlers: null \ No newline at end of file + handlers: null diff --git a/examples/nlp/intent_slot_classification/conf/multi_label_intent_slot_classification_config.yaml b/examples/nlp/intent_slot_classification/conf/multi_label_intent_slot_classification_config.yaml index 442b0d3c72bd..7d534ca7f216 100644 --- a/examples/nlp/intent_slot_classification/conf/multi_label_intent_slot_classification_config.yaml +++ b/examples/nlp/intent_slot_classification/conf/multi_label_intent_slot_classification_config.yaml @@ -4,7 +4,7 @@ trainer: devices: -1 # number of GPUs, -1 would use all available GPUs num_nodes: 1 max_epochs: 5 - max_steps: null # precedence over max_epochs + max_steps: -1 # precedence over max_epochs accumulate_grad_batches: 1 # accumulates grads every k batches precision: 32 # Should be set to 16 for O1 and O2 amp_level to enable the AMP. accelerator: auto @@ -13,7 +13,7 @@ trainer: val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
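The checkpoint_callback: False -> enable_checkpointing: False edits in the intent/slot configs follow the PTL 1.7 rename of that Trainer flag. Both checkpointing and logging stay disabled on the bare Trainer because NeMo's exp_manager, called right after Trainer creation in these example scripts, attaches its own ModelCheckpoint and logger. A minimal sketch of the renamed flag (the remaining arguments are illustrative):

    import pytorch_lightning as pl

    # PTL 1.7 removed Trainer(checkpoint_callback=...); the switch is now enable_checkpointing.
    trainer = pl.Trainer(
        enable_checkpointing=False,  # exp_manager will add its own ModelCheckpoint
        logger=False,                # exp_manager will add its own logger
        accelerator="cpu",
        devices=1,
        max_epochs=1,
    )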
- checkpoint_callback: false # Provided by exp_manager + enable_checkpointing: false # Provided by exp_manager logger: false # Provided by exp_manager model: @@ -83,7 +83,7 @@ model: sched: name: WarmupAnnealing iters_per_batch: null # computed at runtime - max_steps: null # computed at runtime or explicitly set here + max_steps: -1 # computed at runtime or explicitly set here # pytorch lightning args monitor: val_loss diff --git a/examples/nlp/language_modeling/bert_pretraining.py b/examples/nlp/language_modeling/bert_pretraining.py index 7760b73709da..75d0a1072e69 100644 --- a/examples/nlp/language_modeling/bert_pretraining.py +++ b/examples/nlp/language_modeling/bert_pretraining.py @@ -15,7 +15,7 @@ import pytorch_lightning as pl from omegaconf import DictConfig, OmegaConf -from pytorch_lightning.plugins import DDPPlugin +from pytorch_lightning.strategies import DDPStrategy from nemo.collections.nlp.models.language_modeling import BERTLMModel from nemo.core.config import hydra_runner @@ -26,7 +26,7 @@ @hydra_runner(config_path="conf", config_name="bert_pretraining_from_text_config") def main(cfg: DictConfig) -> None: logging.info(f'Config:\n {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(plugins=[DDPPlugin(find_unused_parameters=True)], **cfg.trainer) + trainer = pl.Trainer(strategy=DDPStrategy(find_unused_parameters=True), **cfg.trainer) exp_manager(trainer, cfg.get("exp_manager", None)) bert_model = BERTLMModel(cfg.model, trainer=trainer) trainer.fit(bert_model) diff --git a/examples/nlp/language_modeling/conf/bert_pretraining_from_text_config.yaml b/examples/nlp/language_modeling/conf/bert_pretraining_from_text_config.yaml index f7aef2e090d6..c29fcb3e912d 100644 --- a/examples/nlp/language_modeling/conf/bert_pretraining_from_text_config.yaml +++ b/examples/nlp/language_modeling/conf/bert_pretraining_from_text_config.yaml @@ -4,7 +4,7 @@ trainer: devices: 1 # the number of gpus, 0 for CPU, or list with gpu indices num_nodes: 1 max_epochs: 2 # the number of training epochs - max_steps: null # precedence over max_epochs + max_steps: -1 # precedence over max_epochs accumulate_grad_batches: 1 # accumulates grads every k batches precision: 16 # 16 to use AMP accelerator: gpu diff --git a/examples/nlp/language_modeling/conf/megatron_bart_config.yaml b/examples/nlp/language_modeling/conf/megatron_bart_config.yaml index 4f8bbdc92388..6084f712499f 100644 --- a/examples/nlp/language_modeling/conf/megatron_bart_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_bart_config.yaml @@ -13,7 +13,7 @@ trainer: logger: False # logger provided by exp_manager enable_checkpointing: False replace_sampler_ddp: False - max_epochs: 1000 # PTL default. In practice, max_steps will be reached first. + max_epochs: -1 # PTL default. In practice, max_steps will be reached first. max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 val_check_interval: 100 diff --git a/examples/nlp/language_modeling/conf/megatron_bert_config.yaml b/examples/nlp/language_modeling/conf/megatron_bert_config.yaml index e93dcbe297e7..b76d713e3022 100644 --- a/examples/nlp/language_modeling/conf/megatron_bert_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_bert_config.yaml @@ -9,7 +9,7 @@ trainer: logger: False # logger provided by exp_manager enable_checkpointing: False replace_sampler_ddp: False - max_epochs: 1000 # PTL default. In practice we don't usually train for more than 1 epoch. + max_epochs: -1 # PTL default. 
In practice we don't usually train for more than 1 epoch. max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 val_check_interval: 100 diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 837d397c0815..d6643f7ea524 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -9,7 +9,7 @@ trainer: logger: False # logger provided by exp_manager enable_checkpointing: False replace_sampler_ddp: False - max_epochs: 1000 # PTL default. In practice, max_steps will be reached first. + max_epochs: -1 # PTL default. In practice, max_steps will be reached first. max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 val_check_interval: 100 diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_prompt_learning_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_prompt_learning_config.yaml index 0891c59da12e..7b000646dcd5 100644 --- a/examples/nlp/language_modeling/conf/megatron_gpt_prompt_learning_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_prompt_learning_config.yaml @@ -9,7 +9,7 @@ trainer: enable_checkpointing: False replace_sampler_ddp: False max_epochs: 3 - max_steps: null # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 val_check_interval: 1.0 accumulate_grad_batches: 1 diff --git a/examples/nlp/language_modeling/conf/megatron_retro_config.yaml b/examples/nlp/language_modeling/conf/megatron_retro_config.yaml index adb5bd787573..8121bef5451b 100644 --- a/examples/nlp/language_modeling/conf/megatron_retro_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_retro_config.yaml @@ -12,7 +12,7 @@ trainer: logger: False # logger provided by exp_manager enable_checkpointing: False replace_sampler_ddp: False - max_epochs: 1000 # PTL default. In practice we don't usually train for more than 1 epoch. + max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch. max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 val_check_interval: 100 diff --git a/examples/nlp/language_modeling/conf/megatron_retro_mutransfer.yaml b/examples/nlp/language_modeling/conf/megatron_retro_mutransfer.yaml index 73c3dab8a3e3..bfcd6798ae21 100644 --- a/examples/nlp/language_modeling/conf/megatron_retro_mutransfer.yaml +++ b/examples/nlp/language_modeling/conf/megatron_retro_mutransfer.yaml @@ -14,7 +14,7 @@ trainer: logger: False # logger provided by exp_manager enable_checkpointing: False replace_sampler_ddp: False - max_epochs: 1000 # PTL default. In practice we don't usually train for more than 1 epoch. + max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch. 
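The bert_pretraining.py hunk above swaps pytorch_lightning.plugins.DDPPlugin for pytorch_lightning.strategies.DDPStrategy and passes it through the strategy argument instead of the plugins list, which is the general 1.6 -> 1.7 pattern for training-type plugins in this patch. A condensed before/after sketch of just that change (Trainer arguments other than strategy are illustrative):

    import pytorch_lightning as pl
    from pytorch_lightning.strategies import DDPStrategy

    # PTL <= 1.6 style, no longer accepted by the 1.7 Trainer:
    #   from pytorch_lightning.plugins import DDPPlugin
    #   trainer = pl.Trainer(plugins=[DDPPlugin(find_unused_parameters=True)], ...)

    # PTL 1.7: the same options move to a Strategy object passed via strategy=.
    trainer = pl.Trainer(
        strategy=DDPStrategy(find_unused_parameters=True),
        accelerator="cpu",
        devices=2,
        max_epochs=1,
    )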
max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 val_check_interval: 100 diff --git a/examples/nlp/language_modeling/conf/megatron_t0_config.yaml b/examples/nlp/language_modeling/conf/megatron_t0_config.yaml index 95f2b3bb5561..3850ce505819 100644 --- a/examples/nlp/language_modeling/conf/megatron_t0_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t0_config.yaml @@ -9,7 +9,7 @@ trainer: enable_checkpointing: False replace_sampler_ddp: False max_epochs: 10 - max_steps: null # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 val_check_interval: 300 accumulate_grad_batches: 1 diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config.yaml index 8f973b059c90..7a93c604366b 100644 --- a/examples/nlp/language_modeling/conf/megatron_t5_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t5_config.yaml @@ -13,7 +13,7 @@ trainer: logger: False # logger provided by exp_manager enable_checkpointing: False replace_sampler_ddp: False - max_epochs: 1000 # PTL default. In practice, max_steps will be reached first. + max_epochs: -1 # PTL default. In practice, max_steps will be reached first. max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 val_check_interval: 100 diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml index bac1bac2ec89..ac68b57e0216 100644 --- a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml @@ -9,7 +9,7 @@ trainer: enable_checkpointing: False replace_sampler_ddp: False max_epochs: 3 - max_steps: null # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 val_check_interval: 300 accumulate_grad_batches: 1 diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml index 10eedd384e79..1b08bc37246e 100644 --- a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml @@ -9,7 +9,7 @@ trainer: enable_checkpointing: False replace_sampler_ddp: False max_epochs: 3 - max_steps: null # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 val_check_interval: 300 accumulate_grad_batches: 1 diff --git a/examples/nlp/language_modeling/conf/megatron_t5_finetune.yaml b/examples/nlp/language_modeling/conf/megatron_t5_finetune.yaml index 91a9730637a3..9a5cf15cfe74 100644 --- a/examples/nlp/language_modeling/conf/megatron_t5_finetune.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t5_finetune.yaml @@ -9,7 +9,7 @@ trainer: enable_checkpointing: 
False replace_sampler_ddp: False max_epochs: 10 - max_steps: null # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 val_check_interval: 300 accumulate_grad_batches: 1 diff --git a/examples/nlp/language_modeling/conf/megatron_t5_lm_adaptation_finetune.yaml b/examples/nlp/language_modeling/conf/megatron_t5_lm_adaptation_finetune.yaml index d3860e9957c0..10baf9d080f0 100644 --- a/examples/nlp/language_modeling/conf/megatron_t5_lm_adaptation_finetune.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t5_lm_adaptation_finetune.yaml @@ -9,7 +9,7 @@ trainer: logger: False # logger provided by exp_manager enable_checkpointing: False replace_sampler_ddp: False - max_epochs: 1000 # PTL default. In practice, max_steps will be reached first. + max_epochs: -1 # PTL default. In practice, max_steps will be reached first. max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 val_check_interval: 100 diff --git a/examples/nlp/language_modeling/conf/megatron_t5_prompt_learning.yaml b/examples/nlp/language_modeling/conf/megatron_t5_prompt_learning.yaml index 55c56855efd3..e8f10d0f5f59 100644 --- a/examples/nlp/language_modeling/conf/megatron_t5_prompt_learning.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t5_prompt_learning.yaml @@ -9,7 +9,7 @@ trainer: enable_checkpointing: False replace_sampler_ddp: False max_epochs: 10 - max_steps: null + max_steps: -1 log_every_n_steps: 10 val_check_interval: 1 accumulate_grad_batches: 1 diff --git a/examples/nlp/language_modeling/conf/megatron_ul2_config.yaml b/examples/nlp/language_modeling/conf/megatron_ul2_config.yaml index 2faf75c651e4..f815feaa65b1 100644 --- a/examples/nlp/language_modeling/conf/megatron_ul2_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_ul2_config.yaml @@ -13,7 +13,7 @@ trainer: logger: False # logger provided by exp_manager enable_checkpointing: False replace_sampler_ddp: False - max_epochs: 1000 # PTL default. In practice, max_steps will be reached first. + max_epochs: -1 # PTL default. In practice, max_steps will be reached first. 
max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 val_check_interval: 100 diff --git a/examples/nlp/language_modeling/megatron_bart_pretraining.py b/examples/nlp/language_modeling/megatron_bart_pretraining.py index c381a192be4e..9a7300656f99 100644 --- a/examples/nlp/language_modeling/megatron_bart_pretraining.py +++ b/examples/nlp/language_modeling/megatron_bart_pretraining.py @@ -24,7 +24,7 @@ from nemo.collections.nlp.parts.nlp_overrides import ( GradScaler, MegatronHalfPrecisionPlugin, - NLPDDPPlugin, + NLPDDPStrategy, PipelineMixedPrecisionPlugin, ) from nemo.core.config import hydra_runner @@ -38,13 +38,12 @@ def main(cfg) -> None: logging.info(f'\n{OmegaConf.to_yaml(cfg)}') megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - plugins = [ - NLPDDPPlugin( - no_ddp_communication_hook=True, - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - ] + plugins = [] + strategy = NLPDDPStrategy( + no_ddp_communication_hook=True, + gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, + find_unused_parameters=False, + ) if cfg.trainer.precision in [16, 'bf16']: scaler = None if cfg.trainer.precision == 16: @@ -61,7 +60,7 @@ def main(cfg) -> None: if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) - trainer = Trainer(plugins=plugins, **cfg.trainer, callbacks=[ModelSummary(max_depth=3)]) + trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer, callbacks=[ModelSummary(max_depth=3)]) exp_manager(trainer, cfg.exp_manager) diff --git a/examples/nlp/language_modeling/megatron_bert_pretraining.py b/examples/nlp/language_modeling/megatron_bert_pretraining.py index f8beea305390..f8239dcd538b 100644 --- a/examples/nlp/language_modeling/megatron_bert_pretraining.py +++ b/examples/nlp/language_modeling/megatron_bert_pretraining.py @@ -20,7 +20,7 @@ from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel -from nemo.collections.nlp.parts.nlp_overrides import GradScaler, NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import GradScaler, NLPDDPStrategy from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import StatelessTimer, exp_manager @@ -31,7 +31,8 @@ def main(cfg) -> None: logging.info("\n\n************** Experiment configuration ***********") logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - plugins = [NLPDDPPlugin(find_unused_parameters=False)] + plugins = [] + strategy = NLPDDPStrategy(find_unused_parameters=False) if cfg.trainer.precision == 16: scaler = GradScaler( init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), @@ -42,7 +43,7 @@ def main(cfg) -> None: if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) - trainer = Trainer(plugins=plugins, **cfg.trainer) + trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer) exp_manager(trainer, cfg.exp_manager) diff --git a/examples/nlp/language_modeling/megatron_change_num_partitions.py b/examples/nlp/language_modeling/megatron_change_num_partitions.py index ea09a85e4611..5bf12f5511d0 100644 --- a/examples/nlp/language_modeling/megatron_change_num_partitions.py +++ b/examples/nlp/language_modeling/megatron_change_num_partitions.py @@ -18,7 +18,7 @@ import torch from pytorch_lightning import Trainer -from 
nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin, NLPSaveRestoreConnector +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector from nemo.utils import logging, model_utils from nemo.utils.app_state import AppState @@ -144,7 +144,7 @@ def main(): tgt_tp_size = args.target_tensor_model_parallel_size cls = model_utils.import_class_by_path(args.model_class) - trainer = Trainer(devices=1, plugins=NLPDDPPlugin(), accelerator="cpu", precision=precision) + trainer = Trainer(devices=1, strategy=NLPDDPStrategy(), accelerator="cpu", precision=precision) app_state = AppState() app_state.data_parallel_rank = 0 app_state.pipeline_model_parallel_size = 1 # not supported yet in this script @@ -168,7 +168,7 @@ def main(): model.cfg.tensor_model_parallel_size = 1 app_state.model_parallel_size = 1 - trainer = Trainer(devices=1, plugins=NLPDDPPlugin(), accelerator="cpu", precision=precision) + trainer = Trainer(devices=1, strategy=NLPDDPStrategy(), accelerator="cpu", precision=precision) model = cls(model.cfg, trainer).to('cpu') model._save_restore_connector = NLPSaveRestoreConnector() @@ -187,7 +187,7 @@ def main(): model.cfg.tensor_model_parallel_size = tgt_tp_size app_state.model_parallel_size = tgt_tp_size - trainer = Trainer(devices=1, plugins=NLPDDPPlugin(), accelerator="cpu", precision=precision) + trainer = Trainer(devices=1, strategy=NLPDDPStrategy(), accelerator="cpu", precision=precision) model = cls(model.cfg, trainer).to('cpu') model._save_restore_connector = NLPSaveRestoreConnector() diff --git a/examples/nlp/language_modeling/megatron_gpt_eval.py b/examples/nlp/language_modeling/megatron_gpt_eval.py index c800dc52a3f0..7e66c3096f33 100644 --- a/examples/nlp/language_modeling/megatron_gpt_eval.py +++ b/examples/nlp/language_modeling/megatron_gpt_eval.py @@ -24,7 +24,7 @@ from nemo.collections.nlp.modules.common.text_generation_server import MegatronServer from nemo.collections.nlp.modules.common.text_generation_utils import generate from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy from nemo.core.config import hydra_runner from nemo.utils.app_state import AppState from nemo.utils.model_utils import inject_model_parallel_rank @@ -153,7 +153,7 @@ def __getitem__(self, idx): def main(cfg) -> None: # trainer required for restoring model parallel models - trainer = Trainer(plugins=NLPDDPPlugin(), **cfg.trainer) + trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) assert ( cfg.trainer.devices * cfg.trainer.num_nodes == cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size diff --git a/examples/nlp/language_modeling/megatron_gpt_pretraining.py b/examples/nlp/language_modeling/megatron_gpt_pretraining.py index f827d9ad92df..7f15bb1daa8a 100644 --- a/examples/nlp/language_modeling/megatron_gpt_pretraining.py +++ b/examples/nlp/language_modeling/megatron_gpt_pretraining.py @@ -23,7 +23,7 @@ from nemo.collections.nlp.parts.nlp_overrides import ( GradScaler, MegatronHalfPrecisionPlugin, - NLPDDPPlugin, + NLPDDPStrategy, PipelineMixedPrecisionPlugin, ) from nemo.core.config import hydra_runner @@ -38,13 +38,12 @@ def main(cfg) -> None: megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - plugins = [ - NLPDDPPlugin( - no_ddp_communication_hook=True, # we don't use DDP for async grad allreduce - 
gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - ] + plugins = [] + strategy = NLPDDPStrategy( + no_ddp_communication_hook=True, # we don't use DDP for async grad allreduce + gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, + find_unused_parameters=False, + ) if cfg.trainer.precision in [16, 'bf16']: scaler = None if cfg.trainer.precision == 16: @@ -61,7 +60,7 @@ def main(cfg) -> None: if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) - trainer = Trainer(plugins=plugins, **cfg.trainer) + trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer) exp_manager(trainer, cfg.exp_manager) diff --git a/examples/nlp/language_modeling/megatron_gpt_prompt_learning.py b/examples/nlp/language_modeling/megatron_gpt_prompt_learning.py index d9b95db2bdf6..243559c44b02 100644 --- a/examples/nlp/language_modeling/megatron_gpt_prompt_learning.py +++ b/examples/nlp/language_modeling/megatron_gpt_prompt_learning.py @@ -22,7 +22,7 @@ ) from nemo.collections.nlp.parts.nlp_overrides import ( GradScaler, - NLPDDPPlugin, + NLPDDPStrategy, NLPSaveRestoreConnector, PipelineMixedPrecisionPlugin, ) @@ -45,7 +45,8 @@ def main(cfg) -> None: logging.info("\n\n************** Experiment configuration ***********") logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - plugins = [NLPDDPPlugin(no_ddp_communication_hook=True, find_unused_parameters=False,)] + plugins = [] + strategy = NLPDDPStrategy(no_ddp_communication_hook=True, find_unused_parameters=False,) if cfg.trainer.precision == 16: scaler = GradScaler( init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), @@ -57,7 +58,7 @@ def main(cfg) -> None: if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) - trainer = Trainer(plugins=plugins, **cfg.trainer) + trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer) exp_manager(trainer, cfg.exp_manager) # Override timer callback to a stateless one diff --git a/examples/nlp/language_modeling/megatron_gpt_prompt_learning_eval.py b/examples/nlp/language_modeling/megatron_gpt_prompt_learning_eval.py index e81f6dcdb7fb..6e7d2d35fbe5 100644 --- a/examples/nlp/language_modeling/megatron_gpt_prompt_learning_eval.py +++ b/examples/nlp/language_modeling/megatron_gpt_prompt_learning_eval.py @@ -21,7 +21,7 @@ MegatronGPTPromptLearningModel, ) from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy from nemo.core.config import hydra_runner @@ -75,7 +75,7 @@ def main(cfg) -> None: raise EnvironmentError("GPU is needed for the inference") # trainer required for restoring model parallel models - trainer = Trainer(plugins=NLPDDPPlugin(), **cfg.trainer) + trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) assert ( cfg.trainer.devices * cfg.trainer.num_nodes == cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size diff --git a/examples/nlp/language_modeling/megatron_gpt_test.py b/examples/nlp/language_modeling/megatron_gpt_test.py index 24770fac0e3c..b53ef375e3a0 100644 --- a/examples/nlp/language_modeling/megatron_gpt_test.py +++ b/examples/nlp/language_modeling/megatron_gpt_test.py @@ -18,7 +18,7 @@ from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel from nemo.collections.nlp.modules.common.megatron.megatron_utils import compute_model_parallel_rank from 
nemo.collections.nlp.parts.nlp_overrides import ( - NLPDDPPlugin, + NLPDDPStrategy, NLPNativeMixedPrecisionPlugin, NLPPrecisionPlugin, NLPSaveRestoreConnector, @@ -37,18 +37,18 @@ def main(cfg) -> None: if cfg.trainer.precision == 16: trainer = Trainer( plugins=[ - NLPDDPPlugin(), NLPNativeMixedPrecisionPlugin( init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), growth_interval=cfg.model.get('native_amp_growth_interval', 1000), ), ], + strategy=NLPDDPStrategy(), **cfg.trainer, ) elif cfg.trainer.precision == 'bf16': - trainer = Trainer(plugins=[NLPDDPPlugin(), NLPNativeBfloat16PrecisionPlugin(),], **cfg.trainer,) + trainer = Trainer(plugins=[NLPNativeBfloat16PrecisionPlugin(),], strategy=NLPDDPStrategy(), **cfg.trainer,) else: - trainer = Trainer(plugins=[NLPDDPPlugin(), NLPPrecisionPlugin()], **cfg.trainer) + trainer = Trainer(plugins=[NLPPrecisionPlugin()], strategy=NLPDDPStrategy(), **cfg.trainer) app_state = AppState() app_state.model_parallel_size = cfg.model.tensor_model_parallel_size diff --git a/examples/nlp/language_modeling/megatron_retro_cal_shape.py b/examples/nlp/language_modeling/megatron_retro_cal_shape.py index 02e0283e7aca..06bec216e925 100644 --- a/examples/nlp/language_modeling/megatron_retro_cal_shape.py +++ b/examples/nlp/language_modeling/megatron_retro_cal_shape.py @@ -19,7 +19,7 @@ from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel from nemo.collections.nlp.modules.common.megatron.mup.shape import make_base_shapes -from nemo.collections.nlp.parts.nlp_overrides import GradScaler, MegatronHalfPrecisionPlugin, NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import GradScaler, MegatronHalfPrecisionPlugin, NLPDDPStrategy from nemo.core.config import hydra_runner from nemo.utils import logging @@ -30,13 +30,12 @@ def main(cfg) -> None: logging.info(f'\n{OmegaConf.to_yaml(cfg)}') megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - plugins = [ - NLPDDPPlugin( - no_ddp_communication_hook=True if megatron_amp_o2 else False, - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - ] + plugins = [] + strategy = NLPDDPStrategy( + no_ddp_communication_hook=True if megatron_amp_o2 else False, + gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, + find_unused_parameters=False, + ) if cfg.trainer.precision in [16, 'bf16']: scaler = None @@ -54,7 +53,7 @@ def main(cfg) -> None: if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) - trainer = Trainer(plugins=plugins, **cfg.trainer) + trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer) # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams with open_dict(cfg): diff --git a/examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py b/examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py index ad43e3d472c1..cf7dc5d747cc 100644 --- a/examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py +++ b/examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py @@ -25,7 +25,7 @@ from nemo.collections.nlp.modules.common.megatron.mup.init import normal_ from nemo.collections.nlp.modules.common.megatron.mup.optim import MuAdam, MuAdamW from nemo.collections.nlp.modules.common.megatron.mup.shape import set_base_shapes -from nemo.collections.nlp.parts.nlp_overrides import GradScaler, MegatronHalfPrecisionPlugin, NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import 
GradScaler, MegatronHalfPrecisionPlugin, NLPDDPStrategy from nemo.core.config import hydra_runner from nemo.core.config.optimizers import AdamParams, AdamWParams from nemo.core.optim.optimizers import register_optimizer @@ -41,13 +41,12 @@ def main(cfg) -> None: logging.info(f'\n{OmegaConf.to_yaml(cfg)}') megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - plugins = [ - NLPDDPPlugin( - no_ddp_communication_hook=True if megatron_amp_o2 else False, - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - ] + plugins = [] + strategy = NLPDDPStrategy( + no_ddp_communication_hook=True if megatron_amp_o2 else False, + gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, + find_unused_parameters=False, + ) if cfg.trainer.precision in [16, 'bf16']: scaler = None @@ -65,7 +64,7 @@ def main(cfg) -> None: if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) - trainer = Trainer(plugins=plugins, **cfg.trainer) + trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer) exp_manager(trainer, cfg.exp_manager) diff --git a/examples/nlp/language_modeling/megatron_retro_pretraining.py b/examples/nlp/language_modeling/megatron_retro_pretraining.py index 8f25646a3b68..374bb938583e 100644 --- a/examples/nlp/language_modeling/megatron_retro_pretraining.py +++ b/examples/nlp/language_modeling/megatron_retro_pretraining.py @@ -20,7 +20,7 @@ from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel -from nemo.collections.nlp.parts.nlp_overrides import GradScaler, MegatronHalfPrecisionPlugin, NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import GradScaler, MegatronHalfPrecisionPlugin, NLPDDPStrategy from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import StatelessTimer, exp_manager @@ -32,13 +32,12 @@ def main(cfg) -> None: logging.info(f'\n{OmegaConf.to_yaml(cfg)}') megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - plugins = [ - NLPDDPPlugin( - no_ddp_communication_hook=True if megatron_amp_o2 else False, - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - ] + plugins = [] + strategy = NLPDDPStrategy( + no_ddp_communication_hook=True if megatron_amp_o2 else False, + gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, + find_unused_parameters=False, + ) if cfg.trainer.precision in [16, 'bf16']: scaler = None @@ -56,7 +55,7 @@ def main(cfg) -> None: if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) - trainer = Trainer(plugins=plugins, **cfg.trainer) + trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer) exp_manager(trainer, cfg.exp_manager) diff --git a/examples/nlp/language_modeling/megatron_t5_eval.py b/examples/nlp/language_modeling/megatron_t5_eval.py index 03abb4132dae..0c205ab65ad0 100644 --- a/examples/nlp/language_modeling/megatron_t5_eval.py +++ b/examples/nlp/language_modeling/megatron_t5_eval.py @@ -22,7 +22,7 @@ from nemo.collections.nlp.data.language_modeling.megatron.request_dataset import T5RequestDataset from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin, NLPSaveRestoreConnector +from 
nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector from nemo.utils.app_state import AppState assert torch.cuda.is_available() @@ -55,7 +55,7 @@ def main(): # trainer required for restoring model parallel models trainer = Trainer( - plugins=NLPDDPPlugin(), + strategy=NLPDDPStrategy(), devices=args.tensor_model_parallel_size * args.pipeline_model_parallel_size, accelerator='gpu', precision=args.precision, diff --git a/examples/nlp/language_modeling/megatron_t5_lm_adaptation_finetune.py b/examples/nlp/language_modeling/megatron_t5_lm_adaptation_finetune.py index 9a78bb6e36fc..063147d66abb 100644 --- a/examples/nlp/language_modeling/megatron_t5_lm_adaptation_finetune.py +++ b/examples/nlp/language_modeling/megatron_t5_lm_adaptation_finetune.py @@ -24,7 +24,7 @@ from nemo.collections.nlp.parts.nlp_overrides import ( GradScaler, MegatronHalfPrecisionPlugin, - NLPDDPPlugin, + NLPDDPStrategy, NLPSaveRestoreConnector, PipelineMixedPrecisionPlugin, ) @@ -39,13 +39,12 @@ def main(cfg) -> None: logging.info(f'\n{OmegaConf.to_yaml(cfg)}') megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - plugins = [ - NLPDDPPlugin( - no_ddp_communication_hook=True, # we don't use DDP for async grad allreduce - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - ] + plugins = [] + strategy = NLPDDPStrategy( + no_ddp_communication_hook=True, # we don't use DDP for async grad allreduce + gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, + find_unused_parameters=False, + ) if cfg.trainer.precision in [16, 'bf16']: scaler = None if cfg.trainer.precision == 16: @@ -62,7 +61,7 @@ def main(cfg) -> None: if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) - trainer = Trainer(plugins=plugins, **cfg.trainer, callbacks=[ModelSummary(max_depth=3)]) + trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer, callbacks=[ModelSummary(max_depth=3)]) exp_manager(trainer, cfg.exp_manager) # update resume from checkpoint found by exp_manager diff --git a/examples/nlp/language_modeling/megatron_t5_pretraining.py b/examples/nlp/language_modeling/megatron_t5_pretraining.py index f566f927e58f..462cc62d28eb 100644 --- a/examples/nlp/language_modeling/megatron_t5_pretraining.py +++ b/examples/nlp/language_modeling/megatron_t5_pretraining.py @@ -24,7 +24,7 @@ from nemo.collections.nlp.parts.nlp_overrides import ( GradScaler, MegatronHalfPrecisionPlugin, - NLPDDPPlugin, + NLPDDPStrategy, PipelineMixedPrecisionPlugin, ) from nemo.core.config import hydra_runner @@ -38,13 +38,12 @@ def main(cfg) -> None: logging.info(f'\n{OmegaConf.to_yaml(cfg)}') megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - plugins = [ - NLPDDPPlugin( - no_ddp_communication_hook=True, # we don't use DDP for async grad allreduce - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - ] + plugins = [] + strategy = NLPDDPStrategy( + no_ddp_communication_hook=True, # we don't use DDP for async grad allreduce + gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, + find_unused_parameters=False, + ) if cfg.trainer.precision in [16, 'bf16']: scaler = None if cfg.trainer.precision == 16: @@ -61,7 +60,7 @@ def main(cfg) -> None: if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) - trainer = Trainer(plugins=plugins, **cfg.trainer, callbacks=[ModelSummary(max_depth=3)]) + trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer, 
callbacks=[ModelSummary(max_depth=3)]) exp_manager(trainer, cfg.exp_manager) # update resume from checkpoint found by exp_manager diff --git a/examples/nlp/language_modeling/megatron_t5_prompt_learning.py b/examples/nlp/language_modeling/megatron_t5_prompt_learning.py index ac31dcbeec8f..0d135fb60b98 100644 --- a/examples/nlp/language_modeling/megatron_t5_prompt_learning.py +++ b/examples/nlp/language_modeling/megatron_t5_prompt_learning.py @@ -22,7 +22,7 @@ ) from nemo.collections.nlp.parts.nlp_overrides import ( GradScaler, - NLPDDPPlugin, + NLPDDPStrategy, NLPSaveRestoreConnector, PipelineMixedPrecisionPlugin, ) @@ -45,7 +45,8 @@ def main(cfg) -> None: logging.info("\n\n************** Experiment configuration ***********") logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - plugins = [NLPDDPPlugin(no_ddp_communication_hook=True, find_unused_parameters=False,)] + plugins = [] + strategy = NLPDDPStrategy(no_ddp_communication_hook=True, find_unused_parameters=False,) if cfg.trainer.precision == 16: scaler = GradScaler( init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), @@ -57,7 +58,7 @@ def main(cfg) -> None: if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) - trainer = Trainer(plugins=plugins, **cfg.trainer) + trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer) exp_manager(trainer, cfg.exp_manager) # Override timer callback to a stateless one diff --git a/examples/nlp/language_modeling/megatron_t5_prompt_learning_eval.py b/examples/nlp/language_modeling/megatron_t5_prompt_learning_eval.py index be88d0205e6c..b1d39141d742 100644 --- a/examples/nlp/language_modeling/megatron_t5_prompt_learning_eval.py +++ b/examples/nlp/language_modeling/megatron_t5_prompt_learning_eval.py @@ -21,7 +21,7 @@ MegatronT5PromptLearningModel, ) from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy from nemo.core.config import hydra_runner from nemo.utils.app_state import AppState @@ -41,7 +41,7 @@ def main(cfg) -> None: # trainer required for restoring model parallel models - trainer = Trainer(plugins=NLPDDPPlugin(), **cfg.trainer) + trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) assert ( cfg.trainer.devices * cfg.trainer.num_nodes == cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size diff --git a/examples/nlp/language_modeling/megatron_t5_ptune.py b/examples/nlp/language_modeling/megatron_t5_ptune.py deleted file mode 100644 index 3ed47986635e..000000000000 --- a/examples/nlp/language_modeling/megatron_t5_ptune.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from pathlib import Path - -import torch -from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.callbacks.timer import Timer -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment -from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin -from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector - -from nemo.collections.nlp.data.glue_benchmark.gpt_ptune_dataset import TemplateProcessor, register_taskdata_processor -from nemo.collections.nlp.models.language_modeling.megatron_ptune_t5_model import MegatronT5PTuneModel -from nemo.collections.nlp.parts.nlp_overrides import GradScaler, NLPDDPPlugin -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import StatelessTimer, exp_manager - - -@hydra_runner(config_path="conf", config_name="megatron_ptune_t5") -def main(cfg) -> None: - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - - # setup the data processor - for processor_config in cfg.model.task_processors: - processor = TemplateProcessor( - template=processor_config.template, limit_length_field=processor_config.limit_length_field - ) - register_taskdata_processor(processor_config.taskname, processor) - - plugins = [NLPDDPPlugin()] - if cfg.trainer.precision == 16: - scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), - growth_interval=cfg.model.get('native_amp_growth_interval', 1000), - ) - plugins.append(NativeMixedPrecisionPlugin(precision=16, device='cuda', scaler=scaler)) - - if cfg.get('cluster_type', None) == 'BCP': - plugins.append(TorchElasticEnvironment()) - - trainer = Trainer(plugins=plugins, **cfg.trainer) - - exp_manager(trainer, cfg.exp_manager) - - # update resume from checkpoint found by exp_manager - resume_from_checkpoint = trainer._checkpoint_connector.resume_from_checkpoint_fit_path - logging.info(f'Resuming training from checkpoint: {resume_from_checkpoint}') - - trainer._checkpoint_connector = CheckpointConnector(trainer, resume_from_checkpoint=resume_from_checkpoint) - # Override timer callback to a stateless one - for idx, callback in enumerate(trainer.callbacks): - if isinstance(callback, Timer): - trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time,) - - # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams - with open_dict(cfg): - cfg.model.precision = cfg.trainer.precision - model = MegatronT5PTuneModel(cfg.model, trainer) - trainer.fit(model) - - if cfg.model.data.test_ds.file_path: - logging.info("===========================================================================================") - logging.info("Starting the testing of the trained model on test set...") - trainer.test(model) - logging.info("Testing finished!") - logging.info("===========================================================================================") - # extract the path of the best checkpoint from the training, you may update it to any checkpoint - checkpoint_path = trainer.checkpoint_callback.best_model_path - tensor_parallel_size = cfg.model.tensor_model_parallel_size - pathobj = Path(checkpoint_path) - checkpoint_folder = str(pathobj.parent) - checkpoint_name = str(pathobj.name) - - rank = trainer.accelerator.training_type_plugin.local_rank - if tensor_parallel_size > 1: - # inject model parallel 
rank - checkpoint_path = os.path.join(checkpoint_folder, f'mp_rank_{rank:02d}', checkpoint_name) - else: - checkpoint_path = os.path.join(checkpoint_folder, checkpoint_name) - - # Load the checkpoint - best_eval_model = MegatronT5PTuneModel.load_from_checkpoint( - checkpoint_path=checkpoint_path, strict=False, trainer=trainer - ) - logging.info(f'Best checkpoint path: {checkpoint_path}') - logging.info("Running Test with best EVAL checkpoint!") - # setup the test dataset - # best_eval_model.setup_test_data(test_data_config=cfg.model.data.test_ds) - if torch.distributed.is_initialized(): - torch.distributed.barrier() - trainer.test(model=best_eval_model, ckpt_path=None, verbose=False) - logging.info("Beset EVAL Testing finished!") - logging.info("===========================================================================================") - - if cfg.model.nemo_path: - # '.nemo' file contains the last checkpoint and the params to initialize the model - best_eval_model.save_to(cfg.model.nemo_path) - logging.info(f'Model is saved into `.nemo` file: {cfg.model.nemo_path}') - - # perform inference on a list of queries. - if "infer_samples" in cfg.model and cfg.model.infer_samples: - logging.info("===========================================================================================") - logging.info("Starting the inference on some sample queries...") - - # max_seq_length=512 is the maximum length BERT supports. - results = best_eval_model.cuda().ptune_inference( - queries=cfg.model.infer_samples, batch_size=1, decode_token_len=5 - ) - logging.info('The prediction results of some sample queries with the trained model:') - for query, result in zip(cfg.model.infer_samples, results): - logging.info(f'Query : {query}') - logging.info(f'Predicted label: {result}') - - logging.info("Inference finished!") - logging.info("===========================================================================================") - - -if __name__ == '__main__': - main() diff --git a/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py b/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py index 62fe80a663ed..f51a809654ad 100644 --- a/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py +++ b/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py @@ -20,7 +20,7 @@ from nemo.collections.nlp.models.language_modeling.megatron_finetune_model import MegatronT5FinetuneModel from nemo.collections.nlp.models.language_modeling.megatron_glue_model import MegatronT5GLUEModel -from nemo.collections.nlp.parts.nlp_overrides import GradScaler, MegatronHalfPrecisionPlugin, NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import GradScaler, MegatronHalfPrecisionPlugin, NLPDDPStrategy from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import StatelessTimer, exp_manager @@ -32,13 +32,12 @@ def main(cfg) -> None: logging.info(f'\n{OmegaConf.to_yaml(cfg)}') megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - plugins = [ - NLPDDPPlugin( - no_ddp_communication_hook=True, - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - ] + plugins = [] + strategy = NLPDDPStrategy( + no_ddp_communication_hook=True, + gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, + find_unused_parameters=False, + ) if cfg.trainer.precision in [16, 'bf16']: scaler = None if cfg.trainer.precision == 16: @@ -55,7 +54,7 @@ def main(cfg) -> None: if cfg.get('cluster_type', None) == 'BCP': 
plugins.append(TorchElasticEnvironment()) - trainer = Trainer(plugins=plugins, **cfg.trainer) + trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer) exp_manager(trainer, cfg.exp_manager) diff --git a/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py b/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py index 5f22aa1cd847..6aa3a515d0bd 100644 --- a/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py +++ b/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py @@ -24,7 +24,7 @@ from nemo.collections.nlp.parts.nlp_overrides import ( GradScaler, MegatronHalfPrecisionPlugin, - NLPDDPPlugin, + NLPDDPStrategy, NLPSaveRestoreConnector, PipelineMixedPrecisionPlugin, ) @@ -39,13 +39,12 @@ def main(cfg) -> None: logging.info(f'\n{OmegaConf.to_yaml(cfg)}') megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - plugins = [ - NLPDDPPlugin( - no_ddp_communication_hook=True, - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - ] + plugins = [] + strategy = NLPDDPStrategy( + no_ddp_communication_hook=True, + gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, + find_unused_parameters=False, + ) if cfg.trainer.precision in [16, 'bf16']: scaler = None if cfg.trainer.precision == 16: @@ -62,7 +61,7 @@ def main(cfg) -> None: if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) - trainer = Trainer(plugins=plugins, **cfg.trainer) + trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer) exp_manager(trainer, cfg.exp_manager) diff --git a/examples/nlp/machine_translation/enc_dec_nmt-bottleneck.py b/examples/nlp/machine_translation/enc_dec_nmt-bottleneck.py index 5b2a036736ba..87898da15643 100644 --- a/examples/nlp/machine_translation/enc_dec_nmt-bottleneck.py +++ b/examples/nlp/machine_translation/enc_dec_nmt-bottleneck.py @@ -21,7 +21,7 @@ from nemo.collections.nlp.data.machine_translation.preproc_mt_data import MTDataPreproc from nemo.collections.nlp.models.machine_translation.mt_enc_dec_bottleneck_model import MTBottleneckModel from nemo.collections.nlp.models.machine_translation.mt_enc_dec_config import MTBottleneckModelConfig -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy from nemo.core.config import hydra_runner from nemo.core.config.modelPT import NemoConfig from nemo.core.config.pytorch_lightning import TrainerConfig @@ -116,8 +116,8 @@ def main(cfg: MTBottleneckConfig) -> None: # training is managed by PyTorch Lightning trainer_cfg = OmegaConf.to_container(cfg.trainer) - trainer_cfg.pop('plugins', None) - trainer = Trainer(plugins=[NLPDDPPlugin()], **trainer_cfg) + trainer_cfg.pop('strategy', None) + trainer = Trainer(strategy=NLPDDPStrategy(), **trainer_cfg) # tokenizers will be trained and and tarred training data will be created if needed # model config is then updated diff --git a/examples/nlp/machine_translation/enc_dec_nmt.py b/examples/nlp/machine_translation/enc_dec_nmt.py index e6a93c0bba9e..bdb501d7e7dd 100644 --- a/examples/nlp/machine_translation/enc_dec_nmt.py +++ b/examples/nlp/machine_translation/enc_dec_nmt.py @@ -21,7 +21,7 @@ from nemo.collections.nlp.data.machine_translation.preproc_mt_data import MTDataPreproc from nemo.collections.nlp.models.machine_translation.mt_enc_dec_config import MTEncDecModelConfig from nemo.collections.nlp.models.machine_translation.mt_enc_dec_model import MTEncDecModel -from 
nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy from nemo.core.config import hydra_runner from nemo.core.config.modelPT import NemoConfig from nemo.core.config.pytorch_lightning import TrainerConfig @@ -110,8 +110,8 @@ def main(cfg: MTEncDecConfig) -> None: # training is managed by PyTorch Lightning trainer_cfg = OmegaConf.to_container(cfg.trainer) - trainer_cfg.pop('plugins', None) - trainer = Trainer(plugins=[NLPDDPPlugin()], **trainer_cfg) + trainer_cfg.pop('strategy', None) + trainer = Trainer(strategy=NLPDDPStrategy(), **trainer_cfg) # tokenizers will be trained and and tarred training data will be created if needed # model config is then updated diff --git a/examples/nlp/machine_translation/enc_dec_nmt_finetune.py b/examples/nlp/machine_translation/enc_dec_nmt_finetune.py index a67067beb455..c5540f72607e 100644 --- a/examples/nlp/machine_translation/enc_dec_nmt_finetune.py +++ b/examples/nlp/machine_translation/enc_dec_nmt_finetune.py @@ -22,7 +22,7 @@ from nemo.collections.nlp.data.machine_translation.preproc_mt_data import MTDataPreproc from nemo.collections.nlp.models.machine_translation.mt_enc_dec_config import MTEncDecModelConfig from nemo.collections.nlp.models.machine_translation.mt_enc_dec_model import MTEncDecModel -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy from nemo.core.config import hydra_runner from nemo.core.config.modelPT import NemoConfig from nemo.core.config.pytorch_lightning import TrainerConfig @@ -78,8 +78,8 @@ def main(cfg: MTFineTuneConfig) -> None: # training is managed by PyTorch Lightning trainer_cfg = OmegaConf.to_container(cfg.trainer) - trainer_cfg.pop('plugins', None) - trainer = Trainer(plugins=[NLPDDPPlugin()], **trainer_cfg) + trainer_cfg.pop('strategy', None) + trainer = Trainer(strategy=NLPDDPStrategy(), **trainer_cfg) # experiment logs, checkpoints, and auto-resume are managed by exp_manager and PyTorch Lightning exp_manager(trainer, cfg.exp_manager) diff --git a/examples/nlp/machine_translation/megatron_nmt_training.py b/examples/nlp/machine_translation/megatron_nmt_training.py index 97c09ea00e38..b5dbdfc1de7f 100644 --- a/examples/nlp/machine_translation/megatron_nmt_training.py +++ b/examples/nlp/machine_translation/megatron_nmt_training.py @@ -27,7 +27,7 @@ from nemo.collections.nlp.parts.nlp_overrides import ( GradScaler, MegatronHalfPrecisionPlugin, - NLPDDPPlugin, + NLPDDPStrategy, NLPSaveRestoreConnector, PipelineMixedPrecisionPlugin, ) @@ -42,13 +42,12 @@ def main(cfg) -> None: logging.info(f'\n{OmegaConf.to_yaml(cfg)}') megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - plugins = [ - NLPDDPPlugin( - no_ddp_communication_hook=True, - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - ] + plugins = [] + strategy = NLPDDPStrategy( + no_ddp_communication_hook=True, + gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, + find_unused_parameters=False, + ) if cfg.trainer.precision in [16, 'bf16']: scaler = None if cfg.trainer.precision == 16: @@ -65,7 +64,7 @@ def main(cfg) -> None: if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) - trainer = Trainer(plugins=plugins, **cfg.trainer, callbacks=[ModelSummary(max_depth=3)]) + trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer, callbacks=[ModelSummary(max_depth=3)]) # tokenizers will be trained and and tarred 
training data will be created if needed # model config is then updated diff --git a/examples/nlp/machine_translation/nmt_transformer_infer_megatron.py b/examples/nlp/machine_translation/nmt_transformer_infer_megatron.py index b1c6de1254c8..a8d87f71dcbe 100644 --- a/examples/nlp/machine_translation/nmt_transformer_infer_megatron.py +++ b/examples/nlp/machine_translation/nmt_transformer_infer_megatron.py @@ -29,7 +29,7 @@ from nemo.collections.nlp.models.machine_translation.megatron_nmt_model import MegatronNMTModel from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin, NLPSaveRestoreConnector +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.app_state import AppState @@ -48,7 +48,7 @@ def main(cfg) -> None: # trainer required for restoring model parallel models - trainer = Trainer(plugins=NLPDDPPlugin(), **cfg.trainer) + trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) assert ( cfg.trainer.devices * cfg.trainer.num_nodes == cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size diff --git a/examples/nlp/question_answering/conf/question_answering_squad_config.yaml b/examples/nlp/question_answering/conf/question_answering_squad_config.yaml index f0e677441c5e..2e54b6fecc7e 100644 --- a/examples/nlp/question_answering/conf/question_answering_squad_config.yaml +++ b/examples/nlp/question_answering/conf/question_answering_squad_config.yaml @@ -7,7 +7,7 @@ trainer: devices: 1 # the number of gpus, 0 for CPU, or list with gpu indices num_nodes: 1 max_epochs: 2 # the number of training epochs - max_steps: null # precedence over max_epochs + max_steps: -1 # precedence over max_epochs accumulate_grad_batches: 1 # accumulates grads every k batches precision: 16 # 16 to use AMP accelerator: gpu diff --git a/examples/nlp/text2sparql/conf/text2sparql_config.yaml b/examples/nlp/text2sparql/conf/text2sparql_config.yaml index 21de59b6cb14..b9823e79b050 100644 --- a/examples/nlp/text2sparql/conf/text2sparql_config.yaml +++ b/examples/nlp/text2sparql/conf/text2sparql_config.yaml @@ -6,7 +6,7 @@ trainer: devices: 1 # the number of gpus, 0 for CPU, or list with gpu indices num_nodes: 1 max_epochs: 2 # the number of training epochs - max_steps: null # precedence over max_epochs + max_steps: -1 # precedence over max_epochs accumulate_grad_batches: 1 # accumulates grads every k batches accelerator: gpu strategy: ddp diff --git a/examples/nlp/text_classification/conf/ptune_text_classification_config.yaml b/examples/nlp/text_classification/conf/ptune_text_classification_config.yaml index 9f91620bdf0a..be6499f9a643 100644 --- a/examples/nlp/text_classification/conf/ptune_text_classification_config.yaml +++ b/examples/nlp/text_classification/conf/ptune_text_classification_config.yaml @@ -18,7 +18,7 @@ trainer: devices: 1 # number of GPUs (0 for CPU), or list of the GPUs to use e.g. [0, 1] num_nodes: 1 max_epochs: 100 - max_steps: null # precedence over max_epochs + max_steps: -1 # precedence over max_epochs accumulate_grad_batches: 1 # accumulates grads every k batches gradient_clip_val: 0.0 precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. 
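The script hunks above all apply the same PTL 1.6 -> 1.7 migration, so the following sketch condenses it for reference. It is illustrative only and not part of the patch: build_trainer is a hypothetical helper, and the precision/cluster plugin handling is reduced to a bare list.

# Sketch of the pattern used by the example scripts in this patch:
# DDP customization moves from plugins=[NLPDDPPlugin(...)] to strategy=NLPDDPStrategy(...),
# while precision and cluster-environment plugins remain in the plugins list.
from pytorch_lightning import Trainer
from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy

def build_trainer(cfg):  # hypothetical helper; cfg is a Hydra config shaped like the ones above
    strategy = NLPDDPStrategy(
        no_ddp_communication_hook=True,  # async grad allreduce is handled outside DDP
        gradient_as_bucket_view=cfg.model.gradient_as_bucket_view,
        find_unused_parameters=False,
    )
    plugins = []  # GradScaler-backed precision plugins and TorchElasticEnvironment are appended here as before
    return Trainer(plugins=plugins, strategy=strategy, **cfg.trainer)

Note that the trainer sections of the accompanying configs no longer carry a strategy key, so the explicit strategy= argument does not collide with **cfg.trainer.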
diff --git a/examples/nlp/text_classification/conf/text_classification_config.yaml b/examples/nlp/text_classification/conf/text_classification_config.yaml index af979929edc0..abc81ebdd0d3 100644 --- a/examples/nlp/text_classification/conf/text_classification_config.yaml +++ b/examples/nlp/text_classification/conf/text_classification_config.yaml @@ -18,7 +18,7 @@ trainer: devices: 1 # number of GPUs (0 for CPU), or list of the GPUs to use e.g. [0, 1] num_nodes: 1 max_epochs: 100 - max_steps: null # precedence over max_epochs + max_steps: -1 # precedence over max_epochs accumulate_grad_batches: 1 # accumulates grads every k batches gradient_clip_val: 0.0 precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. diff --git a/examples/nlp/text_classification/model_parallel_text_classification_evaluation.py b/examples/nlp/text_classification/model_parallel_text_classification_evaluation.py index 255a27ee79b4..ab3322f552c1 100644 --- a/examples/nlp/text_classification/model_parallel_text_classification_evaluation.py +++ b/examples/nlp/text_classification/model_parallel_text_classification_evaluation.py @@ -19,7 +19,7 @@ from omegaconf import DictConfig, OmegaConf from nemo.collections.nlp.models.text_classification import TextClassificationModel -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager @@ -28,7 +28,7 @@ @hydra_runner(config_path="conf", config_name="text_classification_config") def main(cfg: DictConfig) -> None: logging.info(f'\nConfig Params:\n{OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(plugins=[NLPDDPPlugin()], **cfg.trainer) + trainer = pl.Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) exp_manager(trainer, cfg.get("exp_manager", None)) # TODO: can we drop strict=False model = TextClassificationModel.restore_from(cfg.model.nemo_path, trainer=trainer, strict=False) diff --git a/examples/nlp/text_classification/text_classification_with_bert.py b/examples/nlp/text_classification/text_classification_with_bert.py index 2ae9469fb998..db3d263cb1ab 100644 --- a/examples/nlp/text_classification/text_classification_with_bert.py +++ b/examples/nlp/text_classification/text_classification_with_bert.py @@ -99,7 +99,7 @@ from omegaconf import DictConfig, OmegaConf from nemo.collections.nlp.models.text_classification import TextClassificationModel -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager @@ -109,11 +109,11 @@ def main(cfg: DictConfig) -> None: logging.info(f'\nConfig Params:\n{OmegaConf.to_yaml(cfg)}') try: - plugin = NLPDDPPlugin() + strategy = NLPDDPStrategy() except (ImportError, ModuleNotFoundError): - plugin = None + strategy = None - trainer = pl.Trainer(plugins=plugin, **cfg.trainer) + trainer = pl.Trainer(strategy=strategy, **cfg.trainer) exp_manager(trainer, cfg.get("exp_manager", None)) if not cfg.model.train_ds.file_path: diff --git a/examples/nlp/text_normalization_as_tagging/conf/thutmose_tagger_itn_config.yaml b/examples/nlp/text_normalization_as_tagging/conf/thutmose_tagger_itn_config.yaml index 37ee85e7c53b..a95947b4aff3 100644 --- a/examples/nlp/text_normalization_as_tagging/conf/thutmose_tagger_itn_config.yaml +++ 
b/examples/nlp/text_normalization_as_tagging/conf/thutmose_tagger_itn_config.yaml @@ -8,7 +8,7 @@ trainer: devices: 1 # the number of gpus, 0 for CPU num_nodes: 1 max_epochs: 3 # the number of training epochs - checkpoint_callback: false # provided by exp_manager + enable_checkpointing: false # provided by exp_manager logger: false # provided by exp_manager accumulate_grad_batches: 1 # accumulates grads every k batches gradient_clip_val: 0.0 diff --git a/examples/nlp/token_classification/conf/punctuation_capitalization_config.yaml b/examples/nlp/token_classification/conf/punctuation_capitalization_config.yaml index 0cb51f87dcc5..76f5fac1db7a 100644 --- a/examples/nlp/token_classification/conf/punctuation_capitalization_config.yaml +++ b/examples/nlp/token_classification/conf/punctuation_capitalization_config.yaml @@ -21,7 +21,7 @@ trainer: devices: 1 # the number of gpus, 0 for CPU num_nodes: 1 max_epochs: 3 - max_steps: null # precedence over max_epochs + max_steps: -1 # precedence over max_epochs accumulate_grad_batches: 1 # accumulates grads every k batches gradient_clip_val: 0.0 precision: 16 # Should be set to 16 for O1 and O2, default is 16 as PT ignores it when am_level is O0 diff --git a/examples/nlp/token_classification/conf/token_classification_config.yaml b/examples/nlp/token_classification/conf/token_classification_config.yaml index 1a5135dfc31d..b79baf4372de 100644 --- a/examples/nlp/token_classification/conf/token_classification_config.yaml +++ b/examples/nlp/token_classification/conf/token_classification_config.yaml @@ -20,7 +20,7 @@ trainer: devices: 1 # the number of gpus, 0 for CPU num_nodes: 1 max_epochs: 5 - max_steps: null # precedence over max_epochs + max_steps: -1 # precedence over max_epochs accumulate_grad_batches: 1 # accumulates grads every k batches gradient_clip_val: 0.0 precision: 16 # Should be set to 16 for O1 and O2, default is 16 as PT ignores it when am_level is O0 diff --git a/examples/nlp/token_classification/token_classification_train.py b/examples/nlp/token_classification/token_classification_train.py index 3e3b0a245737..9b18d10b24e6 100644 --- a/examples/nlp/token_classification/token_classification_train.py +++ b/examples/nlp/token_classification/token_classification_train.py @@ -18,7 +18,7 @@ from omegaconf import DictConfig, OmegaConf from nemo.collections.nlp.models import TokenClassificationModel -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager @@ -103,11 +103,11 @@ @hydra_runner(config_path="conf", config_name="token_classification_config") def main(cfg: DictConfig) -> None: try: - plugin = NLPDDPPlugin() + strategy = NLPDDPStrategy() except (ImportError, ModuleNotFoundError): - plugin = None + strategy = None - trainer = pl.Trainer(plugins=plugin, **cfg.trainer) + trainer = pl.Trainer(strategy=strategy, **cfg.trainer) exp_manager(trainer, cfg.get("exp_manager", None)) if not cfg.pretrained_model: diff --git a/examples/nlp/zero_shot_intent_recognition/conf/zero_shot_intent_config.yaml b/examples/nlp/zero_shot_intent_recognition/conf/zero_shot_intent_config.yaml index a3f80d9cccbd..0817c486a089 100644 --- a/examples/nlp/zero_shot_intent_recognition/conf/zero_shot_intent_config.yaml +++ b/examples/nlp/zero_shot_intent_recognition/conf/zero_shot_intent_config.yaml @@ -17,7 +17,7 @@ trainer: devices: 1 # the number of gpus, 0 for CPU num_nodes: 
1 max_epochs: 1 - max_steps: null # precedence over max_epochs + max_steps: -1 # precedence over max_epochs accumulate_grad_batches: 1 # accumulates grads every k batches precision: 16 accelerator: gpu diff --git a/examples/speaker_tasks/recognition/conf/SpeakerNet_recognition_3x2x512.yaml b/examples/speaker_tasks/recognition/conf/SpeakerNet_recognition_3x2x512.yaml index a4ec0a19b76e..bf7aa7d00aff 100644 --- a/examples/speaker_tasks/recognition/conf/SpeakerNet_recognition_3x2x512.yaml +++ b/examples/speaker_tasks/recognition/conf/SpeakerNet_recognition_3x2x512.yaml @@ -122,7 +122,7 @@ model: sched: name: CosineAnnealing iters_per_batch: 1 # computed at runtime - max_steps: null # computed at runtime or explicitly set here + max_steps: -1 # computed at runtime or explicitly set here # scheduler config override args: @@ -136,7 +136,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 200 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu strategy: ddp diff --git a/nemo/collections/asr/modules/conv_asr.py b/nemo/collections/asr/modules/conv_asr.py index 85431604a883..e4a798dd5fae 100644 --- a/nemo/collections/asr/modules/conv_asr.py +++ b/nemo/collections/asr/modules/conv_asr.py @@ -763,7 +763,7 @@ class SpeakerDecoder(NeuralModule, Exportable): num_classes (int): Number of unique speakers in dataset emb_sizes (list) : shapes of intermediate embedding layers (we consider speaker embbeddings from 1st of this layers) Defaults to [1024,1024] - pool_mode (str) : Pooling stratergy type. options are 'xvector','tap', 'attention' + pool_mode (str) : Pooling strategy type. options are 'xvector','tap', 'attention' Defaults to 'xvector (mean and variance)' tap (temporal average pooling: just mean) attention (attention based pooling) diff --git a/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py b/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py index 4acffb63f4b0..27da5bd1812c 100644 --- a/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py +++ b/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py @@ -603,10 +603,6 @@ def build_memmap_dataset_from_config(self, cfg: DictConfig): data_prefix.append(weight) data_prefix.append(prefix) - if self.trainer.max_steps is None: - raise ValueError( - f"trainer.max_steps must be set to use blendable memmap datasets. Found {self.trainer.max_steps}." 
- ) num_train_samples = [self.trainer.max_steps * self._cfg.global_batch_size] _, _, num_train_samples_per_dataset = get_datasets_weights_and_num_samples(data_prefix, num_train_samples) num_train_samples_after_blend = sum([x[0] for x in num_train_samples_per_dataset]) diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index e826947e3755..2738041ecf85 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -29,7 +29,7 @@ from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import NativeMixedPrecisionPlugin from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin -from pytorch_lightning.plugins.training_type.ddp import DDPPlugin +from pytorch_lightning.strategies.ddp import DDPStrategy from pytorch_lightning.trainer.trainer import Trainer from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.fetching import DataFetcher @@ -54,7 +54,7 @@ HAVE_APEX = False -class NLPDDPPlugin(DDPPlugin): +class NLPDDPStrategy(DDPStrategy): """ DDP plugin for Pytorch Lightning. Needed to customize DDP for model parallel models. Args: @@ -219,7 +219,7 @@ def distributed_sampler_kwargs(self): return distributed_sampler_kwargs else: - return super(NLPDDPPlugin, self).distributed_sampler_kwargs + return super(NLPDDPStrategy, self).distributed_sampler_kwargs class NLPSaveRestoreConnector(SaveRestoreConnector): diff --git a/nemo/core/classes/modelPT.py b/nemo/core/classes/modelPT.py index 8bab7c573ac1..032ce63c2dc3 100644 --- a/nemo/core/classes/modelPT.py +++ b/nemo/core/classes/modelPT.py @@ -483,7 +483,7 @@ def setup_optimization(self, optim_config: Optional[Union[DictConfig, Dict]] = N if not isinstance(self._trainer.accumulate_grad_batches, int): raise ValueError("We do not currently support gradient acculumation that is not an integer.") - if self._trainer.max_steps is None or self.trainer.max_steps < 0: + if self.trainer.max_steps < 0: # Store information needed to calculate max_steps optim_config['sched']['t_max_epochs'] = self._trainer.max_epochs optim_config['sched']['t_accumulate_grad_batches'] = self._trainer.accumulate_grad_batches @@ -1355,6 +1355,10 @@ def cfg(self): """ return self._cfg + @LightningModule.trainer.getter + def trainer(self): + return self._trainer + @cfg.setter def cfg(self, cfg): """ @@ -1469,3 +1473,34 @@ def on_train_batch_end(self, outputs, batch: Any, batch_idx: int, unused: int = if batch_idx == self._nsys_profile_end_step and get_rank() in self._nsys_profile_ranks: logging.info("====== End nsys profiling ======") torch.cuda.cudart().cudaProfilerStop() + + # TODO: Remove in PTL 1.7.2 + def cuda(self, device=None): + """ PTL is overriding this method and changing the pytorch behavior of a module. + The PTL LightingModule override will move the module to device 0 if device is None. + See the PTL method here: https://github.com/Lightning-AI/lightning/blob/master/src/pytorch_lightning/core/mixins/device_dtype_mixin.py#L113 + + Here we are overriding this to maintain the default Pytorch nn.module behavior: + https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/module.py#L728 + + Moves all model parameters and buffers to the GPU. + + This also makes associated parameters and buffers different objects. So + it should be called before constructing optimizer if the module will + live on GPU while being optimized. + + .. 
note:: + This method modifies the module in-place. + + Args: + device (int, optional): if specified, all parameters will be + copied to that device + + Returns: + Module: self + """ + if device is None: + device = torch.device("cuda", torch.cuda.current_device()) + elif isinstance(device, int): + device = torch.device("cuda", index=device) + return super().cuda(device=device) diff --git a/nemo/core/config/pytorch_lightning.py b/nemo/core/config/pytorch_lightning.py index 873a70ad4e0c..8ce3bdf236a0 100644 --- a/nemo/core/config/pytorch_lightning.py +++ b/nemo/core/config/pytorch_lightning.py @@ -36,17 +36,13 @@ class TrainerConfig: """ logger: Any = True - checkpoint_callback: Any = True callbacks: Optional[Any] = None default_root_dir: Optional[str] = None gradient_clip_val: float = 0 - process_position: int = 0 num_nodes: int = 1 gpus: Optional[Any] = None auto_select_gpus: bool = False tpu_cores: Optional[Any] = None - log_gpu_memory: Optional[str] = None - progress_bar_refresh_rate: int = 1 enable_progress_bar: bool = True overfit_batches: Any = 0.0 track_grad_norm: Any = -1 @@ -55,18 +51,16 @@ class TrainerConfig: accumulate_grad_batches: Any = 1 max_epochs: int = 1000 min_epochs: int = 1 - max_steps: Optional[int] = None + max_steps: Optional[int] = -1 min_steps: Optional[int] = None limit_train_batches: Any = 1.0 limit_val_batches: Any = 1.0 limit_test_batches: Any = 1.0 val_check_interval: Any = 1.0 - flush_logs_every_n_steps: int = 100 log_every_n_steps: int = 50 accelerator: Optional[str] = None sync_batchnorm: bool = False precision: Any = 32 - weights_summary: Optional[str] = "full" # ModelSummary.MODE_DEFAULT weights_save_path: Optional[str] = None num_sanity_val_steps: int = 2 resume_from_checkpoint: Optional[str] = None @@ -76,23 +70,20 @@ class TrainerConfig: auto_lr_find: Any = False replace_sampler_ddp: bool = True detect_anomaly: bool = False - terminate_on_nan: bool = False auto_scale_batch_size: Any = False - prepare_data_per_node: bool = True amp_backend: str = 'native' amp_level: Optional[str] = None plugins: Optional[Any] = None # Optional[Union[str, list]] move_metrics_to_cpu: bool = False multiple_trainloader_mode: str = 'max_size_cycle' limit_predict_batches: float = 1.0 - stochastic_weight_avg: bool = False gradient_clip_algorithm: str = 'norm' max_time: Optional[Any] = None # can be one of Union[str, timedelta, Dict[str, int], None] reload_dataloaders_every_n_epochs: int = 0 ipus: Optional[int] = None devices: Any = None strategy: Any = None - enable_checkpointing: bool = True + enable_checkpointing: bool = False enable_model_summary: bool = True diff --git a/nemo/core/optim/lr_scheduler.py b/nemo/core/optim/lr_scheduler.py index 269a80e2f536..922dc8636fe8 100644 --- a/nemo/core/optim/lr_scheduler.py +++ b/nemo/core/optim/lr_scheduler.py @@ -704,7 +704,7 @@ def prepare_lr_scheduler( sched: name: iters_per_batch: null # computed at runtime; mandatory to have - max_steps: null # computed at runtime or explicitly set here; mandatory to have + max_steps: -1 # computed at runtime or explicitly set here; mandatory to have # pytorch lightning args monitor: val_loss diff --git a/requirements/requirements_lightning.txt b/requirements/requirements_lightning.txt index d828e4796222..83df00e8e3e1 100644 --- a/requirements/requirements_lightning.txt +++ b/requirements/requirements_lightning.txt @@ -1,4 +1,4 @@ -pytorch-lightning>=1.6.1,<1.7.0 +pytorch-lightning>=1.7.0 torchmetrics>=0.4.1rc0 transformers>=4.0.1 webdataset>=0.1.48,<=0.1.62 diff --git 
a/scripts/checkpoint_averaging/megatron_checkpoint_averaging.py b/scripts/checkpoint_averaging/megatron_checkpoint_averaging.py index a48e1f3fd10f..4ac6a6546fde 100755 --- a/scripts/checkpoint_averaging/megatron_checkpoint_averaging.py +++ b/scripts/checkpoint_averaging/megatron_checkpoint_averaging.py @@ -37,7 +37,7 @@ import torch from pytorch_lightning.trainer.trainer import Trainer -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin, NLPSaveRestoreConnector +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector from nemo.core import ModelPT from nemo.utils import logging, model_utils @@ -71,7 +71,7 @@ def main(): device = torch.device("cpu") - trainer = Trainer(plugins=NLPDDPPlugin(), devices=1, num_nodes=1, precision=16, accelerator='gpu') + trainer = Trainer(strategy=NLPDDPStrategy(), devices=1, num_nodes=1, precision=16, accelerator='gpu') # loop over all folders with .nemo files (or .nemo files) for model_fname_i, model_fname in enumerate(args.model_fname_list): if not model_fname.endswith(".nemo"): diff --git a/scripts/export.py b/scripts/export.py index f8cd98d7d7b6..5c0bc2a0a6f1 100644 --- a/scripts/export.py +++ b/scripts/export.py @@ -90,7 +90,7 @@ def nemo_export(argv): num_nodes=1, # Need to set the following two to False as ExpManager will take care of them differently. logger=False, - checkpoint_callback=False, + enable_checkpointing=False, ) trainer = Trainer(cfg_trainer) diff --git a/scripts/nemo_legacy_import/nlp_checkpoint_port.py b/scripts/nemo_legacy_import/nlp_checkpoint_port.py index 55dc3e63984b..a0a83d761f92 100644 --- a/scripts/nemo_legacy_import/nlp_checkpoint_port.py +++ b/scripts/nemo_legacy_import/nlp_checkpoint_port.py @@ -88,7 +88,7 @@ def nemo_convert(argv): num_nodes=1, # Need to set the following two to False as ExpManager will take care of them differently. 
logger=False, - checkpoint_callback=False, + enable_checkpointing=False, ) trainer = pl.Trainer(cfg_trainer) diff --git a/scripts/speaker_tasks/filelist_to_manifest.py b/scripts/speaker_tasks/filelist_to_manifest.py index 49e4b97bb3e8..4369c177aef0 100644 --- a/scripts/speaker_tasks/filelist_to_manifest.py +++ b/scripts/speaker_tasks/filelist_to_manifest.py @@ -46,7 +46,7 @@ --min_spkrs_count: min number of samples per speaker to consider and ignore otherwise, defaults to 0 (all speakers) """ -DURATIONS = sorted([1, 2, 3, 4], reverse=True) +DURATIONS = sorted([3], reverse=True) MIN_ENERGY = 0.01 CWD = os.getcwd() diff --git a/tests/collections/asr/numba/rnnt_loss/test_rnnt_pytorch.py b/tests/collections/asr/numba/rnnt_loss/test_rnnt_pytorch.py index dc08d41bbd60..d303e5355bf9 100644 --- a/tests/collections/asr/numba/rnnt_loss/test_rnnt_pytorch.py +++ b/tests/collections/asr/numba/rnnt_loss/test_rnnt_pytorch.py @@ -431,7 +431,7 @@ def zero_grad(): pt_cost2, _ = wrap_and_call(fn_pt, acts2, labels2, device) pt_grads1_p_2 = base_layer.grad.clone().cpu().numpy() - assert np.allclose(pt_grads1_p_2, np_grads1 + np_grads2, atol=1e-6) + assert np.allclose(pt_grads1_p_2, np_grads1 + np_grads2, atol=1e-5) if __name__ == "__main__": diff --git a/tests/collections/nlp/test_gpt_eval.py b/tests/collections/nlp/test_gpt_eval.py index 2871a35ed8f8..0e64b989176f 100644 --- a/tests/collections/nlp/test_gpt_eval.py +++ b/tests/collections/nlp/test_gpt_eval.py @@ -20,7 +20,7 @@ from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy class TestGPTEval: @@ -38,7 +38,7 @@ def setup_method(self, test_method): model_file = '/home/TestData/nlp/megatron_gpt/125M/megatron_gpt.nemo' # trainer required for restoring model parallel models - trainer = Trainer(plugins=NLPDDPPlugin(), **trainer_config) + trainer = Trainer(strategy=NLPDDPStrategy(), **trainer_config) assert ( trainer_config["devices"] * trainer_config['num_nodes'] == tensor_model_parallel_size * pipeline_model_parallel_size diff --git a/tests/collections/nlp/test_gpt_model.py b/tests/collections/nlp/test_gpt_model.py index 9e125c1a1a4e..0c9104d06245 100644 --- a/tests/collections/nlp/test_gpt_model.py +++ b/tests/collections/nlp/test_gpt_model.py @@ -22,7 +22,7 @@ from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy DEVICE_CAPABILITY = None if torch.cuda.is_available(): @@ -133,9 +133,9 @@ def gpt_model(model_cfg, trainer_cfg, precision): model_cfg['precision'] = precision trainer_cfg['precision'] = precision - plugins = [NLPDDPPlugin()] + strategy = NLPDDPStrategy() - trainer = Trainer(plugins=plugins, **trainer_cfg) + trainer = Trainer(strategy=strategy, **trainer_cfg) cfg = DictConfig(model_cfg) diff --git a/tests/collections/nlp/test_nlp_exportables.py b/tests/collections/nlp/test_nlp_exportables.py index 21f65ec5d94b..da7afa90213e 100644 --- a/tests/collections/nlp/test_nlp_exportables.py +++ 
b/tests/collections/nlp/test_nlp_exportables.py @@ -87,6 +87,11 @@ def test_IntentSlotClassificationModel_export_to_onnx(self, dummy_data): config.trainer.devices = 1 config.trainer.precision = 32 config.trainer.strategy = None + config.trainer.max_steps = -1 + if ( + 'checkpoint_callback' in config.trainer + ): # TODO: Update this to create deafult config rather than pulling from git + del config.trainer.checkpoint_callback trainer = pl.Trainer(**config.trainer) model = IntentSlotClassificationModel(config.model, trainer=trainer) filename = os.path.join(tmpdir, 'isc.onnx') diff --git a/tests/collections/nlp/test_retrieval_module.py b/tests/collections/nlp/test_retrieval_module.py index 60fe81356a0d..b5da4e20085f 100644 --- a/tests/collections/nlp/test_retrieval_module.py +++ b/tests/collections/nlp/test_retrieval_module.py @@ -34,7 +34,7 @@ init_method_normal, scaled_init_method_normal, ) -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy try: from apex.transformer.enums import AttnMaskType @@ -52,15 +52,12 @@ def setup_class(cls): if not torch.cuda.is_available(): return GPUS = 1 - plugins = [NLPDDPPlugin()] TP_SIZE = GPUS PP_SIZE = 1 MB_SIZE = 4 GB_SIZE = 8 SEED = 1234 - trainer = Trainer( - plugins=plugins, devices=GPUS, accelerator='gpu', num_nodes=1, logger=None, log_gpu_memory=None - ) + trainer = Trainer(strategy=NLPDDPStrategy(), devices=GPUS, accelerator='gpu', num_nodes=1, logger=None,) initialize_model_parallel_for_nemo( world_size=trainer.world_size, diff --git a/tests/collections/nlp/test_retrieval_module_inference.py b/tests/collections/nlp/test_retrieval_module_inference.py index 437679c37478..fa74f8b14df7 100644 --- a/tests/collections/nlp/test_retrieval_module_inference.py +++ b/tests/collections/nlp/test_retrieval_module_inference.py @@ -35,7 +35,7 @@ init_method_normal, scaled_init_method_normal, ) -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy try: from apex.transformer.enums import AttnMaskType @@ -53,15 +53,12 @@ def setup_class(cls): if not torch.cuda.is_available(): return GPUS = 1 - plugins = [NLPDDPPlugin()] TP_SIZE = GPUS PP_SIZE = 1 MB_SIZE = 4 GB_SIZE = 8 SEED = 1234 - trainer = Trainer( - plugins=plugins, devices=GPUS, accelerator='gpu', num_nodes=1, logger=None, log_gpu_memory=None - ) + trainer = Trainer(strategy=NLPDDPStrategy(), devices=GPUS, accelerator='gpu', num_nodes=1, logger=None,) initialize_model_parallel_for_nemo( world_size=trainer.world_size, diff --git a/tests/core/test_optimizers_schedulers.py b/tests/core/test_optimizers_schedulers.py index 2bf827f61283..b0e0eee17216 100644 --- a/tests/core/test_optimizers_schedulers.py +++ b/tests/core/test_optimizers_schedulers.py @@ -876,8 +876,7 @@ def train( accumulate_grad_batches=accumulate_grad_batches, limit_train_batches=limit_train_batches, enable_checkpointing=False, - progress_bar_refresh_rate=0, - weights_summary=None, + enable_progress_bar=False, ) max_steps = optim.lr_scheduler.compute_max_steps( max_epochs, accumulate_grad_batches, limit_train_batches, devices, dataset_len, batch_size, drop_last, @@ -952,8 +951,7 @@ def train( accumulate_grad_batches=accumulate_grad_batches, limit_train_batches=limit_train_batches, enable_checkpointing=False, - progress_bar_refresh_rate=0, - weights_summary=None, + enable_progress_bar=False, ) model = ExampleModel(batch_size, dataset_len, drop_last, max_steps) 
        trainer.callbacks.append(SchedulerNoOpCallback())
diff --git a/tests/core_ptl/test_ptl_stateless_timer.py b/tests/core_ptl/test_ptl_stateless_timer.py
index c8ef019fcf9d..c78ee18edb87 100644
--- a/tests/core_ptl/test_ptl_stateless_timer.py
+++ b/tests/core_ptl/test_ptl_stateless_timer.py
@@ -96,8 +96,8 @@ def setup_model(self):
             accelerator='gpu',
             strategy='ddp',
             logger=None,
+            enable_checkpointing=False,
             callbacks=[StatelessTimer('00:00:00:03')],
-            checkpoint_callback=False,
         )
         exp_manager_cfg = ExpManagerConfig(
             explicit_log_dir='./ptl_stateless_timer_check/',
diff --git a/tutorials/00_NeMo_Primer.ipynb b/tutorials/00_NeMo_Primer.ipynb
index 761dc2748791..0923f87ee517 100644
--- a/tutorials/00_NeMo_Primer.ipynb
+++ b/tutorials/00_NeMo_Primer.ipynb
@@ -652,7 +652,7 @@
     "      name: CosineAnnealing\n",
     "\n",
     "      # Optional arguments\n",
-    "      max_steps: null # computed at runtime or explicitly set here\n",
+    "      max_steps: -1 # computed at runtime or explicitly set here\n",
     "      monitor: val_loss\n",
     "      reduce_on_plateau: false\n",
     "\n",
diff --git a/tutorials/asr/Multilang_ASR.ipynb b/tutorials/asr/Multilang_ASR.ipynb
index 96c73b668536..a6bc8adfe4c9 100644
--- a/tutorials/asr/Multilang_ASR.ipynb
+++ b/tutorials/asr/Multilang_ASR.ipynb
@@ -1550,10 +1550,10 @@
     "                     max_epochs=MAX_EPOCHS, \n",
     "                     accumulate_grad_batches=GRAD_ACCUM,\n",
     "                     precision=16,\n",
-    "                     checkpoint_callback=False,\n",
+    "                     enable_checkpointing=False,\n",
     "                     logger=False,\n",
     "                     log_every_n_steps=LOG_EVERY_N_STEPS,\n",
-    "                     progress_bar_refresh_rate=1,\n",
+    "                     enable_progress_bar=True,\n",
     "                     check_val_every_n_epoch=1)"
    ]
   },
@@ -5032,4 +5032,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 4
-}
\ No newline at end of file
+}
diff --git a/tutorials/asr/Speech_Commands.ipynb b/tutorials/asr/Speech_Commands.ipynb
index c82f2cfab549..5687b4f1600f 100644
--- a/tutorials/asr/Speech_Commands.ipynb
+++ b/tutorials/asr/Speech_Commands.ipynb
@@ -1323,7 +1323,7 @@
     "    devices=1,\n",
     "    accelerator=accelerator,\n",
     "    max_epochs=5,\n",
-    "    max_steps=None, # computed at runtime if not set\n",
+    "    max_steps=-1, # computed at runtime if not set\n",
     "    num_nodes=1,\n",
     "    accumulate_grad_batches=1,\n",
     "    enable_checkpointing=False, # Provided by exp_manager\n",
@@ -1623,4 +1623,4 @@
   ]
  }
 ]
-}
\ No newline at end of file
+}
diff --git a/tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb b/tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb
index f4b8e66b3892..abbd1b09537f 100644
--- a/tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb
+++ b/tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb
@@ -823,7 +823,7 @@
    "source": [
     "import torch\n",
     "import pytorch_lightning as pl\n",
-    "from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin\n",
+    "from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy\n",
     "from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment\n",
     "\n",
     "# lets modify some trainer configs\n",
@@ -848,8 +848,8 @@
     "os.environ[\"RANK\"] = '0'\n",
     "os.environ[\"WORLD_SIZE\"] = '1'\n",
     "\n",
-    "plugins = [NLPDDPPlugin(find_unused_parameters=False, no_ddp_communication_hook=True), TorchElasticEnvironment()]\n",
-    "trainer = pl.Trainer(plugins=plugins, **config.trainer)\n",
+    "strategy = NLPDDPStrategy(find_unused_parameters=False, no_ddp_communication_hook=True)\n",
+    "trainer = pl.Trainer(strategy=strategy, plugins=[TorchElasticEnvironment()], **config.trainer)\n",
     "\n",
     "print(\"Trainer config - \\n\")\n",
     "print(OmegaConf.to_yaml(config.trainer))"
@@ -1136,8 +1136,8 @@
     "config.model.optim.lr = 5e-4\n",
     "\n",
     "# Reset the trainer\n",
-    "plugins = [NLPDDPPlugin(find_unused_parameters=False, no_ddp_communication_hook=True), TorchElasticEnvironment()]\n",
-    "trainer = pl.Trainer(plugins=plugins, **config.trainer)\n",
+    "strategy = NLPDDPStrategy(find_unused_parameters=False, no_ddp_communication_hook=True)\n",
+    "trainer = pl.Trainer(strategy=strategy, plugins=[TorchElasticEnvironment()], **config.trainer)\n",
     "\n",
     "print(\"Trainer config - \\n\")\n",
     "print(OmegaConf.to_yaml(config.trainer))"
diff --git a/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb b/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb
index 00f99db6e243..9d0ae82c3ebf 100644
--- a/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb
+++ b/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb
@@ -826,7 +826,7 @@
     "    devices=1,\n",
     "    accelerator=accelerator,\n",
     "    max_epochs=5,\n",
-    "    max_steps=None, # computed at runtime if not set\n",
+    "    max_steps=-1, # computed at runtime if not set\n",
     "    num_nodes=1,\n",
     "    accumulate_grad_batches=1,\n",
     "    enable_checkpointing=False, # Provided by exp_manager\n",
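
For reference, the Trainer-construction pattern that the hunks above apply piecemeal can be summarized in one place. The following is an illustrative sketch only, not part of the patch: it assumes a NeMo install pinned to pytorch-lightning>=1.7.0 as in requirements_lightning.txt, and the helper name build_trainer as well as the devices/precision values are placeholders rather than anything taken from a specific file in this commit.

import pytorch_lightning as pl

from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy


def build_trainer() -> pl.Trainer:
    # NLPDDPStrategy replaces the removed NLPDDPPlugin and is passed via `strategy=`,
    # not `plugins=`; cluster-environment plugins still go through `plugins=`.
    strategy = NLPDDPStrategy(find_unused_parameters=False, no_ddp_communication_hook=True)
    return pl.Trainer(
        devices=1,
        num_nodes=1,
        accelerator='gpu',
        precision=16,
        strategy=strategy,
        max_steps=-1,                # -1 is the new "not set" sentinel that replaces max_steps=None
        enable_checkpointing=False,  # replaces checkpoint_callback=False; exp_manager owns checkpointing
        enable_progress_bar=True,    # replaces progress_bar_refresh_rate
        logger=False,
    )

The same renames drive the TrainerConfig changes above: arguments dropped in PTL 1.7 (checkpoint_callback, progress_bar_refresh_rate, weights_summary, and the other removed flags) disappear from the dataclass, and only their 1.7 counterparts such as enable_progress_bar and enable_checkpointing remain.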