From 838674af60a9779174d9b04a8022d852ff932c19 Mon Sep 17 00:00:00 2001 From: Nithin Rao Date: Thu, 11 Aug 2022 08:20:29 -0700 Subject: [PATCH] upgrade to PTL 1.7 (#4672) * upgrade to PTL 1.7 Signed-off-by: nithinraok * min version Signed-off-by: nithinraok * replace progressbar_refresh_rate with enable progressbar, this is callback now Signed-off-by: nithinraok * progressbar Signed-off-by: nithinraok * replace removed PTL 1.7 args, fix cpu tests, remove p-tune older script Signed-off-by: nithinraok * revert ssl test fixes Signed-off-by: nithinraok * override trainer property and fix numba grad check Signed-off-by: nithinraok * NLPDDPlugin -> NLPDDPStrategy Signed-off-by: nithinraok * style fix Signed-off-by: nithinraok * set max_steps default as -1 Signed-off-by: nithinraok * fix maxsteps in notebooks Signed-off-by: nithinraok * update trainer config Signed-off-by: nithinraok * fix speech2label jenkins Signed-off-by: nithinraok * fix speech2text jenkins Signed-off-by: nithinraok * DDPPlugin -> DDPStrategy Signed-off-by: nithinraok * remove provided strategy keys from trainer config nlp Signed-off-by: nithinraok * check other examples Signed-off-by: nithinraok * override LightningModule .cuda call to maintain pytorch default behavior Signed-off-by: ericharper * revert gpt eval jenkins test Signed-off-by: nithinraok * overwrite cuda class to PTL Signed-off-by: nithinraok * review feedback Signed-off-by: nithinraok * remove checkpoint callback from main config Signed-off-by: nithinraok * patch fix for intentslot classification test Signed-off-by: nithinraok * style fix Signed-off-by: nithinraok Signed-off-by: nithinraok Signed-off-by: ericharper Co-authored-by: ericharper Signed-off-by: Hainan Xu --- Jenkinsfile | 4 +- docs/source/core/core.rst | 2 +- docs/source/nlp/megatron.rst | 10 +- .../asr/conf/asr_adapters/asr_adaptation.yaml | 2 +- .../asr/conf/carnelinet/carnelinet_384.yaml | 2 +- examples/asr/conf/citrinet/citrinet_1024.yaml | 2 +- examples/asr/conf/citrinet/citrinet_384.yaml | 2 +- examples/asr/conf/citrinet/citrinet_512.yaml | 2 +- examples/asr/conf/citrinet/config_bpe.yaml | 2 +- examples/asr/conf/config.yaml | 2 +- .../asr/conf/conformer/conformer_ctc_bpe.yaml | 4 +- .../conf/conformer/conformer_ctc_char.yaml | 4 +- .../conformer/conformer_transducer_bpe.yaml | 4 +- .../conformer/conformer_transducer_char.yaml | 4 +- .../conformer_ctc_bpe_multilang.yaml | 4 +- .../conformer_transducer_bpe_multilang.yaml | 4 +- .../conformer_ctc_bpe_streaming.yaml | 4 +- .../conformer_transducer_bpe_streaming.yaml | 4 +- .../asr/conf/contextnet_rnnt/config_rnnt.yaml | 2 +- .../conf/contextnet_rnnt/config_rnnt_bpe.yaml | 2 +- .../conf/contextnet_rnnt/contextnet_rnnt.yaml | 2 +- .../contextnet_rnnt/contextnet_rnnt_char.yaml | 2 +- .../contextnet_rnnt_multilang.yaml | 2 +- examples/asr/conf/jasper/jasper_10x5dr.yaml | 2 +- examples/asr/conf/lstm/lstm_ctc_bpe.yaml | 4 +- .../asr/conf/lstm/lstm_transducer_bpe.yaml | 4 +- .../asr/conf/marblenet/marblenet_3x2x64.yaml | 2 +- .../matchboxnet/matchboxnet_3x1x64_v1.yaml | 2 +- .../matchboxnet/matchboxnet_3x1x64_v2.yaml | 2 +- .../asr/conf/quartznet/quartznet_15x5.yaml | 2 +- .../conf/quartznet/quartznet_15x5_aug.yaml | 2 +- .../asr/conf/quartznet/quartznet_15x5_ru.yaml | 2 +- .../asr/conf/quartznet/quartznet_15x5_zh.yaml | 2 +- .../squeezeformer/squeezeformer_ctc_bpe.yaml | 4 +- .../squeezeformer/squeezeformer_ctc_char.yaml | 4 +- .../conf/ssl/citrinet/citrinet_ssl_1024.yaml | 4 +- .../conf/ssl/citrinet/citrinet_ssl_ci.yaml | 2 +- 
.../asr/conf/ssl/conformer/conformer_ssl.yaml | 4 +- .../conf/ssl/contextnet/contextnet_ssl.yaml | 4 +- examples/asr/conf/ssl/wav2vec/wav2vec_ci.yaml | 2 +- examples/asr/conf/wav2vec_ctc/wav2vecCTC.yaml | 2 +- .../k2/conf/citrinet/citrinet_mmi_1024.yaml | 2 +- .../nlp/dialogue/conf/dialogue_config.yaml | 2 +- examples/nlp/dialogue/dialogue.py | 10 +- .../tiny_example_entity_linking_config.yaml | 2 +- .../umls_medical_entity_linking_config.yaml | 2 +- .../glue_benchmark/glue_benchmark_config.yaml | 2 +- .../conf/bert_ir_config.yaml | 2 +- .../intent_slot_classification_config.yaml | 8 +- ...bel_intent_slot_classification_config.yaml | 6 +- .../nlp/language_modeling/bert_pretraining.py | 4 +- .../bert_pretraining_from_text_config.yaml | 2 +- .../conf/megatron_bart_config.yaml | 2 +- .../conf/megatron_bert_config.yaml | 2 +- .../conf/megatron_gpt_config.yaml | 2 +- .../megatron_gpt_prompt_learning_config.yaml | 2 +- .../conf/megatron_retro_config.yaml | 2 +- .../conf/megatron_retro_mutransfer.yaml | 2 +- .../conf/megatron_t0_config.yaml | 2 +- .../conf/megatron_t5_config.yaml | 2 +- ...megatron_t5_config_finetune_glue_mnli.yaml | 2 +- ...megatron_t5_config_finetune_glue_xnli.yaml | 2 +- .../conf/megatron_t5_finetune.yaml | 2 +- .../megatron_t5_lm_adaptation_finetune.yaml | 2 +- .../conf/megatron_t5_prompt_learning.yaml | 2 +- .../conf/megatron_ul2_config.yaml | 2 +- .../megatron_bart_pretraining.py | 17 ++- .../megatron_bert_pretraining.py | 7 +- .../megatron_change_num_partitions.py | 8 +- .../language_modeling/megatron_gpt_eval.py | 4 +- .../megatron_gpt_pretraining.py | 17 ++- .../megatron_gpt_prompt_learning.py | 7 +- .../megatron_gpt_prompt_learning_eval.py | 4 +- .../language_modeling/megatron_gpt_test.py | 8 +- .../megatron_retro_cal_shape.py | 17 ++- .../megatron_retro_mutransfer_pretrain.py | 17 ++- .../megatron_retro_pretraining.py | 17 ++- .../nlp/language_modeling/megatron_t5_eval.py | 4 +- .../megatron_t5_lm_adaptation_finetune.py | 17 ++- .../megatron_t5_pretraining.py | 17 ++- .../megatron_t5_prompt_learning.py | 7 +- .../megatron_t5_prompt_learning_eval.py | 4 +- .../language_modeling/megatron_t5_ptune.py | 135 ------------------ .../megatron_t5_seq2seq_eval.py | 17 ++- .../megatron_t5_seq2seq_finetune.py | 17 ++- .../enc_dec_nmt-bottleneck.py | 6 +- .../nlp/machine_translation/enc_dec_nmt.py | 6 +- .../enc_dec_nmt_finetune.py | 6 +- .../megatron_nmt_training.py | 17 ++- .../nmt_transformer_infer_megatron.py | 4 +- .../conf/question_answering_squad_config.yaml | 2 +- .../text2sparql/conf/text2sparql_config.yaml | 2 +- .../ptune_text_classification_config.yaml | 2 +- .../conf/text_classification_config.yaml | 2 +- ...parallel_text_classification_evaluation.py | 4 +- .../text_classification_with_bert.py | 8 +- .../conf/thutmose_tagger_itn_config.yaml | 2 +- .../punctuation_capitalization_config.yaml | 2 +- .../conf/token_classification_config.yaml | 2 +- .../token_classification_train.py | 8 +- .../conf/zero_shot_intent_config.yaml | 2 +- .../conf/SpeakerNet_recognition_3x2x512.yaml | 4 +- nemo/collections/asr/modules/conv_asr.py | 2 +- .../machine_translation/megatron_nmt_model.py | 4 - nemo/collections/nlp/parts/nlp_overrides.py | 6 +- nemo/core/classes/modelPT.py | 37 ++++- nemo/core/config/pytorch_lightning.py | 13 +- nemo/core/optim/lr_scheduler.py | 2 +- requirements/requirements_lightning.txt | 2 +- .../megatron_checkpoint_averaging.py | 4 +- scripts/export.py | 2 +- .../nemo_legacy_import/nlp_checkpoint_port.py | 2 +- scripts/speaker_tasks/filelist_to_manifest.py | 2 +- 
.../asr/numba/rnnt_loss/test_rnnt_pytorch.py | 2 +- tests/collections/nlp/test_gpt_eval.py | 4 +- tests/collections/nlp/test_gpt_model.py | 6 +- tests/collections/nlp/test_nlp_exportables.py | 5 + .../collections/nlp/test_retrieval_module.py | 7 +- .../nlp/test_retrieval_module_inference.py | 7 +- tests/core/test_optimizers_schedulers.py | 6 +- tests/core_ptl/test_ptl_stateless_timer.py | 2 +- tutorials/00_NeMo_Primer.ipynb | 2 +- tutorials/asr/Multilang_ASR.ipynb | 6 +- tutorials/asr/Speech_Commands.ipynb | 4 +- .../nlp/Multitask_Prompt_and_PTuning.ipynb | 10 +- .../Speaker_Identification_Verification.ipynb | 2 +- 126 files changed, 314 insertions(+), 437 deletions(-) delete mode 100644 examples/nlp/language_modeling/megatron_t5_ptune.py diff --git a/Jenkinsfile b/Jenkinsfile index f88d93fef38e..68c8e7fb8413 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -597,7 +597,7 @@ pipeline { trainer.devices=[0] \ trainer.accelerator="gpu" \ trainer.max_epochs=1 \ - +trainer.max_steps=1 \ + trainer.max_steps=1 \ +trainer.num_sanity_val_steps=1 \ exp_manager.exp_dir=examples/asr/speech_to_text_results' sh 'rm -rf examples/asr/speech_to_text_results' @@ -612,7 +612,7 @@ pipeline { trainer.devices=[1] \ trainer.accelerator="gpu" \ trainer.max_epochs=1 \ - +trainer.max_steps=1 \ + trainer.max_steps=1 \ +trainer.num_sanity_val_steps=1 \ model.preprocessor._target_=nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor \ ~model.preprocessor.window_size \ diff --git a/docs/source/core/core.rst b/docs/source/core/core.rst index 098f3e04c643..32a36a4fbb6c 100644 --- a/docs/source/core/core.rst +++ b/docs/source/core/core.rst @@ -399,7 +399,7 @@ configuration for a Novograd optimizer with Cosine Annealing learning rate sched name: CosineAnnealing # Optional arguments - max_steps: null # computed at runtime or explicitly set here + max_steps: -1 # computed at runtime or explicitly set here monitor: val_loss reduce_on_plateau: false diff --git a/docs/source/nlp/megatron.rst b/docs/source/nlp/megatron.rst index b6ef3c293063..743aa2f84b53 100644 --- a/docs/source/nlp/megatron.rst +++ b/docs/source/nlp/megatron.rst @@ -30,15 +30,15 @@ the same features as other NeMo Models. Training ^^^^^^^^ -All of the necessary logic to train model parallel models in NeMo with PyTorch Lightning is contained in the ``NLPDDPPlugin``. -The ``NLPDDPPlugin`` subclasses the PyTorch Lightning training type plugin ``DDPPlugin``. -See `plugins `_ for more information on PyTorch Lightning Plugins. +All of the necessary logic to train model parallel models in NeMo with PyTorch Lightning is contained in the ``NLPDDPStrategy``. +The ``NLPDDPStrategy`` subclasses the PyTorch Lightning strategy type ``DDPStrategy``. +See `strategies `_ for more information on PyTorch Lightning Strategies To enable model parallel training in NeMo: .. code-block:: python - trainer = Trainer(plugins=[NLPDDPPlugin()], **cfg.trainer) + trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) Megatron-LM checkpoints have a specific format. One checkpoint is saved for each model parallel rank: @@ -157,7 +157,7 @@ Since model parallel models always require more than one GPU, the ``Trainer`` is .. 
code-block:: python - trainer = pl.Trainer(plugins=[NLPDDPPlugin()], **cfg.trainer) + trainer = pl.Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) model = TextClassificationModel.restore_from(cfg.model.nemo_path, trainer=trainer) model.setup_test_data(test_data_config=cfg.model.test_ds) diff --git a/examples/asr/conf/asr_adapters/asr_adaptation.yaml b/examples/asr/conf/asr_adapters/asr_adaptation.yaml index 7584e2220d10..59df7ee41ca7 100644 --- a/examples/asr/conf/asr_adapters/asr_adaptation.yaml +++ b/examples/asr/conf/asr_adapters/asr_adaptation.yaml @@ -164,7 +164,7 @@ trainer: gradient_clip_val: null precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. log_every_n_steps: 10 # Interval of logging. - progress_bar_refresh_rate: 10 + enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs diff --git a/examples/asr/conf/carnelinet/carnelinet_384.yaml b/examples/asr/conf/carnelinet/carnelinet_384.yaml index 2d3d567be510..6693247ab340 100644 --- a/examples/asr/conf/carnelinet/carnelinet_384.yaml +++ b/examples/asr/conf/carnelinet/carnelinet_384.yaml @@ -238,7 +238,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 100 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu strategy: ddp diff --git a/examples/asr/conf/citrinet/citrinet_1024.yaml b/examples/asr/conf/citrinet/citrinet_1024.yaml index 324623c5fd88..0722a7ec740a 100644 --- a/examples/asr/conf/citrinet/citrinet_1024.yaml +++ b/examples/asr/conf/citrinet/citrinet_1024.yaml @@ -448,7 +448,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 100 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu strategy: ddp diff --git a/examples/asr/conf/citrinet/citrinet_384.yaml b/examples/asr/conf/citrinet/citrinet_384.yaml index b49ab1f5aee5..f2ceb5f45f6c 100644 --- a/examples/asr/conf/citrinet/citrinet_384.yaml +++ b/examples/asr/conf/citrinet/citrinet_384.yaml @@ -403,7 +403,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 100 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu strategy: ddp diff --git a/examples/asr/conf/citrinet/citrinet_512.yaml b/examples/asr/conf/citrinet/citrinet_512.yaml index f5dc5ecd229b..a36cb1df7375 100644 --- a/examples/asr/conf/citrinet/citrinet_512.yaml +++ b/examples/asr/conf/citrinet/citrinet_512.yaml @@ -402,7 +402,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 100 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu strategy: ddp diff --git a/examples/asr/conf/citrinet/config_bpe.yaml b/examples/asr/conf/citrinet/config_bpe.yaml index 2cb2768793c0..887160142c1c 100644 --- a/examples/asr/conf/citrinet/config_bpe.yaml +++ b/examples/asr/conf/citrinet/config_bpe.yaml @@ -165,7 +165,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 5 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu 
strategy: ddp diff --git a/examples/asr/conf/config.yaml b/examples/asr/conf/config.yaml index 2b2163b57474..6ab764c0907b 100644 --- a/examples/asr/conf/config.yaml +++ b/examples/asr/conf/config.yaml @@ -168,7 +168,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 5 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu strategy: ddp diff --git a/examples/asr/conf/conformer/conformer_ctc_bpe.yaml b/examples/asr/conf/conformer/conformer_ctc_bpe.yaml index 1870d3069631..b0ec44a6a424 100644 --- a/examples/asr/conf/conformer/conformer_ctc_bpe.yaml +++ b/examples/asr/conf/conformer/conformer_ctc_bpe.yaml @@ -165,7 +165,7 @@ trainer: devices: -1 # number of GPUs, -1 would use all available GPUs num_nodes: 1 max_epochs: 1000 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: ddp @@ -173,7 +173,7 @@ trainer: gradient_clip_val: 0.0 precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. log_every_n_steps: 10 # Interval of logging. - progress_bar_refresh_rate: 10 + enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs diff --git a/examples/asr/conf/conformer/conformer_ctc_char.yaml b/examples/asr/conf/conformer/conformer_ctc_char.yaml index a7c6e3f1e916..ad6152311f6d 100644 --- a/examples/asr/conf/conformer/conformer_ctc_char.yaml +++ b/examples/asr/conf/conformer/conformer_ctc_char.yaml @@ -140,7 +140,7 @@ trainer: devices: -1 # number of GPUs, -1 would use all available GPUs num_nodes: 1 max_epochs: 1000 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: ddp @@ -148,7 +148,7 @@ trainer: gradient_clip_val: 0.0 precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. log_every_n_steps: 10 # Interval of logging. - progress_bar_refresh_rate: 10 + enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
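The trainer-config hunks above and below drop the removed progress_bar_refresh_rate key in favour of enable_progress_bar, matching the commit note that the refresh rate "is callback now": in PTL 1.7 the boolean switch stays on the Trainer while the rate moves to the TQDMProgressBar callback. A minimal sketch of that setup; the refresh-rate value of 10 mirrors the old configs, the other Trainer arguments are purely illustrative:

    import pytorch_lightning as pl
    from pytorch_lightning.callbacks import TQDMProgressBar

    # PTL 1.7: Trainer(progress_bar_refresh_rate=10) is gone.
    trainer = pl.Trainer(
        enable_progress_bar=True,                      # replaces the removed YAML key
        callbacks=[TQDMProgressBar(refresh_rate=10)],  # the refresh rate now lives on the callback
        max_epochs=1,
        accelerator="cpu",
        devices=1,
        logger=False,
        enable_checkpointing=False,
    )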
num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs diff --git a/examples/asr/conf/conformer/conformer_transducer_bpe.yaml b/examples/asr/conf/conformer/conformer_transducer_bpe.yaml index aa2f9ea2edeb..35cfe811953b 100644 --- a/examples/asr/conf/conformer/conformer_transducer_bpe.yaml +++ b/examples/asr/conf/conformer/conformer_transducer_bpe.yaml @@ -215,7 +215,7 @@ trainer: devices: -1 # number of GPUs, -1 would use all available GPUs num_nodes: 1 max_epochs: 500 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: ddp @@ -223,7 +223,7 @@ trainer: gradient_clip_val: 0.0 precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. log_every_n_steps: 10 # Interval of logging. - progress_bar_refresh_rate: 10 + enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs diff --git a/examples/asr/conf/conformer/conformer_transducer_char.yaml b/examples/asr/conf/conformer/conformer_transducer_char.yaml index 1d4c4d04db7e..24b28204176a 100644 --- a/examples/asr/conf/conformer/conformer_transducer_char.yaml +++ b/examples/asr/conf/conformer/conformer_transducer_char.yaml @@ -210,7 +210,7 @@ trainer: devices: -1 # number of GPUs, -1 would use all available GPUs num_nodes: 1 max_epochs: 500 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: ddp @@ -218,7 +218,7 @@ trainer: gradient_clip_val: 0.0 precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. log_every_n_steps: 10 # Interval of logging. - progress_bar_refresh_rate: 10 + enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs diff --git a/examples/asr/conf/conformer/multilang/conformer_ctc_bpe_multilang.yaml b/examples/asr/conf/conformer/multilang/conformer_ctc_bpe_multilang.yaml index 6be1461686f0..3a999e8819a0 100644 --- a/examples/asr/conf/conformer/multilang/conformer_ctc_bpe_multilang.yaml +++ b/examples/asr/conf/conformer/multilang/conformer_ctc_bpe_multilang.yaml @@ -166,7 +166,7 @@ trainer: devices: -1 # number of GPUs, -1 would use all available GPUs num_nodes: 1 max_epochs: 1000 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: ddp @@ -174,7 +174,7 @@ trainer: gradient_clip_val: 0.0 precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. log_every_n_steps: 10 # Interval of logging. - progress_bar_refresh_rate: 10 + enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs diff --git a/examples/asr/conf/conformer/multilang/conformer_transducer_bpe_multilang.yaml b/examples/asr/conf/conformer/multilang/conformer_transducer_bpe_multilang.yaml index 61bb35247494..0e73d166d2be 100644 --- a/examples/asr/conf/conformer/multilang/conformer_transducer_bpe_multilang.yaml +++ b/examples/asr/conf/conformer/multilang/conformer_transducer_bpe_multilang.yaml @@ -216,7 +216,7 @@ trainer: devices: -1 # number of GPUs, -1 would use all available GPUs num_nodes: 1 max_epochs: 1000 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: ddp @@ -224,7 +224,7 @@ trainer: gradient_clip_val: 0.0 precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. log_every_n_steps: 10 # Interval of logging. - progress_bar_refresh_rate: 10 + enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
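The repeated max_steps: null -> max_steps: -1 edits in these trainer sections track the PTL 1.7 behaviour where the Trainer no longer accepts null/None for max_steps and -1 is the "no step limit" sentinel. A small sketch of the two stopping modes, assuming nothing beyond stock Lightning (all values illustrative):

    import pytorch_lightning as pl

    # -1 disables the step limit, so max_epochs decides when training stops
    # (this is what the updated ASR configs rely on).
    epoch_bound = pl.Trainer(max_epochs=100, max_steps=-1, accelerator="cpu", devices=1)

    # With a positive max_steps, training stops at whichever of the two limits is hit first.
    step_bound = pl.Trainer(max_epochs=-1, max_steps=100_000, accelerator="cpu", devices=1)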
num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs diff --git a/examples/asr/conf/conformer/streaming/conformer_ctc_bpe_streaming.yaml b/examples/asr/conf/conformer/streaming/conformer_ctc_bpe_streaming.yaml index dc2eb4cdf131..f6563787ca55 100644 --- a/examples/asr/conf/conformer/streaming/conformer_ctc_bpe_streaming.yaml +++ b/examples/asr/conf/conformer/streaming/conformer_ctc_bpe_streaming.yaml @@ -153,7 +153,7 @@ trainer: devices: -1 # number of GPUs, -1 would use all available GPUs num_nodes: 1 max_epochs: 1000 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: ddp @@ -161,7 +161,7 @@ trainer: gradient_clip_val: 1.0 precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. log_every_n_steps: 10 # Interval of logging. - progress_bar_refresh_rate: 10 + enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs diff --git a/examples/asr/conf/conformer/streaming/conformer_transducer_bpe_streaming.yaml b/examples/asr/conf/conformer/streaming/conformer_transducer_bpe_streaming.yaml index 0034a0b194a2..1f75ddb265e2 100644 --- a/examples/asr/conf/conformer/streaming/conformer_transducer_bpe_streaming.yaml +++ b/examples/asr/conf/conformer/streaming/conformer_transducer_bpe_streaming.yaml @@ -213,7 +213,7 @@ trainer: devices: -1 # number of GPUs, -1 would use all available GPUs num_nodes: 1 max_epochs: 1000 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: ddp @@ -221,7 +221,7 @@ trainer: gradient_clip_val: 0.0 precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. log_every_n_steps: 10 # Interval of logging. - progress_bar_refresh_rate: 10 + enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs diff --git a/examples/asr/conf/contextnet_rnnt/config_rnnt.yaml b/examples/asr/conf/contextnet_rnnt/config_rnnt.yaml index a58c467b8110..ba8eca6833f3 100644 --- a/examples/asr/conf/contextnet_rnnt/config_rnnt.yaml +++ b/examples/asr/conf/contextnet_rnnt/config_rnnt.yaml @@ -235,7 +235,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 5 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu strategy: ddp diff --git a/examples/asr/conf/contextnet_rnnt/config_rnnt_bpe.yaml b/examples/asr/conf/contextnet_rnnt/config_rnnt_bpe.yaml index 1f4dd0e954c9..74cb0c92f194 100644 --- a/examples/asr/conf/contextnet_rnnt/config_rnnt_bpe.yaml +++ b/examples/asr/conf/contextnet_rnnt/config_rnnt_bpe.yaml @@ -235,7 +235,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 5 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu strategy: ddp diff --git a/examples/asr/conf/contextnet_rnnt/contextnet_rnnt.yaml b/examples/asr/conf/contextnet_rnnt/contextnet_rnnt.yaml index 2ec7590d8840..596434cd8a79 100644 --- a/examples/asr/conf/contextnet_rnnt/contextnet_rnnt.yaml +++ b/examples/asr/conf/contextnet_rnnt/contextnet_rnnt.yaml @@ -474,7 +474,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 100 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 # Should be set via SLURM variable `SLURM_JOB_NUM_NODES` accelerator: gpu strategy: ddp diff --git a/examples/asr/conf/contextnet_rnnt/contextnet_rnnt_char.yaml b/examples/asr/conf/contextnet_rnnt/contextnet_rnnt_char.yaml index 490394676acd..b190d7159529 100644 --- a/examples/asr/conf/contextnet_rnnt/contextnet_rnnt_char.yaml +++ b/examples/asr/conf/contextnet_rnnt/contextnet_rnnt_char.yaml @@ -476,7 +476,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 100 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 # Should be set via SLURM variable `SLURM_JOB_NUM_NODES` accelerator: gpu strategy: ddp diff --git a/examples/asr/conf/contextnet_rnnt/contextnet_rnnt_multilang.yaml b/examples/asr/conf/contextnet_rnnt/contextnet_rnnt_multilang.yaml index 972c3cd9a761..34ffdc923efb 100644 --- a/examples/asr/conf/contextnet_rnnt/contextnet_rnnt_multilang.yaml +++ b/examples/asr/conf/contextnet_rnnt/contextnet_rnnt_multilang.yaml @@ -481,7 +481,7 @@ model: trainer: devices: -1 # number of GPUs, -1 would use all available GPUs max_epochs: 100 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 # Should be set via SLURM variable `SLURM_JOB_NUM_NODES` accelerator: auto strategy: ddp diff --git a/examples/asr/conf/jasper/jasper_10x5dr.yaml b/examples/asr/conf/jasper/jasper_10x5dr.yaml index ad2f0536c133..e93b8b6043c8 100644 --- a/examples/asr/conf/jasper/jasper_10x5dr.yaml +++ b/examples/asr/conf/jasper/jasper_10x5dr.yaml @@ -190,7 +190,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 5 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu strategy: ddp diff --git 
a/examples/asr/conf/lstm/lstm_ctc_bpe.yaml b/examples/asr/conf/lstm/lstm_ctc_bpe.yaml index f81cb43ecc07..e899f44f97ef 100644 --- a/examples/asr/conf/lstm/lstm_ctc_bpe.yaml +++ b/examples/asr/conf/lstm/lstm_ctc_bpe.yaml @@ -123,7 +123,7 @@ trainer: devices: -1 # number of GPUs, -1 would use all available GPUs num_nodes: 1 max_epochs: 500 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: gpu strategy: ddp @@ -131,7 +131,7 @@ trainer: gradient_clip_val: 0.3 precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. log_every_n_steps: 10 # Interval of logging. - progress_bar_refresh_rate: 10 + enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs diff --git a/examples/asr/conf/lstm/lstm_transducer_bpe.yaml b/examples/asr/conf/lstm/lstm_transducer_bpe.yaml index 2ac1408c508c..e35bb95d291f 100644 --- a/examples/asr/conf/lstm/lstm_transducer_bpe.yaml +++ b/examples/asr/conf/lstm/lstm_transducer_bpe.yaml @@ -186,7 +186,7 @@ trainer: devices: -1 # number of GPUs, -1 would use all available GPUs num_nodes: 1 max_epochs: 500 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: ddp @@ -194,7 +194,7 @@ trainer: gradient_clip_val: 0.3 precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. log_every_n_steps: 10 # Interval of logging. - progress_bar_refresh_rate: 10 + enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
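These YAML trainer sections are unpacked directly into the Lightning Trainer by the example scripts (pl.Trainer(**cfg.trainer), as in dialogue.py later in this patch), which is why every key removed in PTL 1.7 has to be renamed here rather than silently ignored. A hedged sketch of that flow, with an inline config standing in for one of the files above:

    import pytorch_lightning as pl
    from omegaconf import OmegaConf

    # Inline stand-in for a trainer: block from one of the YAMLs in this patch.
    cfg = OmegaConf.create(
        {
            "trainer": {
                "devices": 1,
                "accelerator": "cpu",
                "max_epochs": 1,
                "max_steps": -1,
                "log_every_n_steps": 10,
                "enable_progress_bar": True,
                "num_sanity_val_steps": 0,
                "check_val_every_n_epoch": 1,
            }
        }
    )

    # The example scripts unpack the section directly; a leftover 1.6-only key such as
    # progress_bar_refresh_rate would now fail at construction time with a TypeError.
    trainer = pl.Trainer(**cfg.trainer)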
num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs diff --git a/examples/asr/conf/marblenet/marblenet_3x2x64.yaml b/examples/asr/conf/marblenet/marblenet_3x2x64.yaml index fe4dcc537f06..f9b3f26d114c 100644 --- a/examples/asr/conf/marblenet/marblenet_3x2x64.yaml +++ b/examples/asr/conf/marblenet/marblenet_3x2x64.yaml @@ -165,7 +165,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 150 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu strategy: ddp diff --git a/examples/asr/conf/matchboxnet/matchboxnet_3x1x64_v1.yaml b/examples/asr/conf/matchboxnet/matchboxnet_3x1x64_v1.yaml index ac462a273d96..af054aac2aba 100644 --- a/examples/asr/conf/matchboxnet/matchboxnet_3x1x64_v1.yaml +++ b/examples/asr/conf/matchboxnet/matchboxnet_3x1x64_v1.yaml @@ -177,7 +177,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 200 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu strategy: ddp diff --git a/examples/asr/conf/matchboxnet/matchboxnet_3x1x64_v2.yaml b/examples/asr/conf/matchboxnet/matchboxnet_3x1x64_v2.yaml index a7d4974ed7f3..f3f4639766c5 100644 --- a/examples/asr/conf/matchboxnet/matchboxnet_3x1x64_v2.yaml +++ b/examples/asr/conf/matchboxnet/matchboxnet_3x1x64_v2.yaml @@ -177,7 +177,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 200 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu strategy: ddp diff --git a/examples/asr/conf/quartznet/quartznet_15x5.yaml b/examples/asr/conf/quartznet/quartznet_15x5.yaml index 269be113e7be..d5f225365017 100644 --- a/examples/asr/conf/quartznet/quartznet_15x5.yaml +++ b/examples/asr/conf/quartznet/quartznet_15x5.yaml @@ -261,7 +261,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 5 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu strategy: ddp diff --git a/examples/asr/conf/quartznet/quartznet_15x5_aug.yaml b/examples/asr/conf/quartznet/quartznet_15x5_aug.yaml index 93487d86dff9..4daec79ff1dd 100644 --- a/examples/asr/conf/quartznet/quartznet_15x5_aug.yaml +++ b/examples/asr/conf/quartznet/quartznet_15x5_aug.yaml @@ -267,7 +267,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 5 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu strategy: ddp diff --git a/examples/asr/conf/quartznet/quartznet_15x5_ru.yaml b/examples/asr/conf/quartznet/quartznet_15x5_ru.yaml index 1a96811640d9..37f58f08ecf8 100644 --- a/examples/asr/conf/quartznet/quartznet_15x5_ru.yaml +++ b/examples/asr/conf/quartznet/quartznet_15x5_ru.yaml @@ -258,7 +258,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 5 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu strategy: ddp diff --git a/examples/asr/conf/quartznet/quartznet_15x5_zh.yaml b/examples/asr/conf/quartznet/quartznet_15x5_zh.yaml index b10c922db7fd..c26b63b2a23b 100644 --- a/examples/asr/conf/quartznet/quartznet_15x5_zh.yaml +++ b/examples/asr/conf/quartznet/quartznet_15x5_zh.yaml @@ -457,7 +457,7 @@ model: trainer: 
devices: 1 # number of gpus max_epochs: 5 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu strategy: ddp diff --git a/examples/asr/conf/squeezeformer/squeezeformer_ctc_bpe.yaml b/examples/asr/conf/squeezeformer/squeezeformer_ctc_bpe.yaml index c5d66043536f..430414994cb3 100644 --- a/examples/asr/conf/squeezeformer/squeezeformer_ctc_bpe.yaml +++ b/examples/asr/conf/squeezeformer/squeezeformer_ctc_bpe.yaml @@ -161,7 +161,7 @@ trainer: devices: -1 # number of GPUs, -1 would use all available GPUs num_nodes: 1 max_epochs: 1000 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: ddp @@ -169,7 +169,7 @@ trainer: gradient_clip_val: 0.0 precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. log_every_n_steps: 10 # Interval of logging. - progress_bar_refresh_rate: 10 + enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs diff --git a/examples/asr/conf/squeezeformer/squeezeformer_ctc_char.yaml b/examples/asr/conf/squeezeformer/squeezeformer_ctc_char.yaml index 8fd06e24ee26..eb1abafe74eb 100644 --- a/examples/asr/conf/squeezeformer/squeezeformer_ctc_char.yaml +++ b/examples/asr/conf/squeezeformer/squeezeformer_ctc_char.yaml @@ -146,7 +146,7 @@ trainer: devices: -1 # number of GPUs, -1 would use all available GPUs num_nodes: 1 max_epochs: 1000 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: ddp @@ -154,7 +154,7 @@ trainer: gradient_clip_val: 0.0 precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. log_every_n_steps: 10 # Interval of logging. - progress_bar_refresh_rate: 10 + enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs diff --git a/examples/asr/conf/ssl/citrinet/citrinet_ssl_1024.yaml b/examples/asr/conf/ssl/citrinet/citrinet_ssl_1024.yaml index bc6fc7536972..2579b9777199 100644 --- a/examples/asr/conf/ssl/citrinet/citrinet_ssl_1024.yaml +++ b/examples/asr/conf/ssl/citrinet/citrinet_ssl_1024.yaml @@ -472,7 +472,7 @@ trainer: devices: -1 # number of GPUs, -1 would use all available GPUs num_nodes: 1 max_epochs: 1000 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: ddp @@ -480,7 +480,7 @@ trainer: gradient_clip_val: 1.0 precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. 
log_every_n_steps: 10 # Interval of logging. - progress_bar_refresh_rate: 10 + enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs diff --git a/examples/asr/conf/ssl/citrinet/citrinet_ssl_ci.yaml b/examples/asr/conf/ssl/citrinet/citrinet_ssl_ci.yaml index ac3e1bc8dffe..749b97587814 100644 --- a/examples/asr/conf/ssl/citrinet/citrinet_ssl_ci.yaml +++ b/examples/asr/conf/ssl/citrinet/citrinet_ssl_ci.yaml @@ -431,7 +431,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 100 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu strategy: ddp diff --git a/examples/asr/conf/ssl/conformer/conformer_ssl.yaml b/examples/asr/conf/ssl/conformer/conformer_ssl.yaml index 6a200a3ba0f9..cb3843cdcdd5 100644 --- a/examples/asr/conf/ssl/conformer/conformer_ssl.yaml +++ b/examples/asr/conf/ssl/conformer/conformer_ssl.yaml @@ -181,7 +181,7 @@ trainer: devices: -1 # number of GPUs, -1 would use all available GPUs num_nodes: 1 max_epochs: 1000 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: ddp @@ -189,7 +189,7 @@ trainer: gradient_clip_val: 1.0 precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. log_every_n_steps: 10 # Interval of logging. - progress_bar_refresh_rate: 10 + enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs diff --git a/examples/asr/conf/ssl/contextnet/contextnet_ssl.yaml b/examples/asr/conf/ssl/contextnet/contextnet_ssl.yaml index e62d28511ed9..54e73213ae45 100644 --- a/examples/asr/conf/ssl/contextnet/contextnet_ssl.yaml +++ b/examples/asr/conf/ssl/contextnet/contextnet_ssl.yaml @@ -436,7 +436,7 @@ trainer: devices: -1 # number of GPUs, -1 would use all available GPUs num_nodes: 1 max_epochs: 1000 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations accelerator: auto strategy: ddp @@ -444,7 +444,7 @@ trainer: gradient_clip_val: 1.0 precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. log_every_n_steps: 10 # Interval of logging. - progress_bar_refresh_rate: 10 + enable_progress_bar: True resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
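The "precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP" comments repeated through these configs refer to mixed precision. With PTL 1.7, precision=16 enables native AMP by default, and the O1/O2 levels mentioned in the comments belong to the apex backend, which 1.7 still exposes through amp_backend/amp_level. A small, hedged sketch; it assumes a CUDA GPU is available, and the apex variant is shown only as a comment because it additionally requires NVIDIA apex:

    import pytorch_lightning as pl

    # Native AMP: flip the config's precision from 32 to 16.
    trainer = pl.Trainer(precision=16, accelerator="gpu", devices=1, max_epochs=1)

    # Apex O1/O2, still available in PTL 1.7 (requires apex to be installed):
    # trainer = pl.Trainer(precision=16, amp_backend="apex", amp_level="O2",
    #                      accelerator="gpu", devices=1, max_epochs=1)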
num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs diff --git a/examples/asr/conf/ssl/wav2vec/wav2vec_ci.yaml b/examples/asr/conf/ssl/wav2vec/wav2vec_ci.yaml index 2f9cb76db760..7746b9a17e59 100644 --- a/examples/asr/conf/ssl/wav2vec/wav2vec_ci.yaml +++ b/examples/asr/conf/ssl/wav2vec/wav2vec_ci.yaml @@ -131,7 +131,7 @@ model: trainer: devices: 1 # number of gpus num_nodes: 1 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set accelerator: gpu strategy: ddp accumulate_grad_batches: 1 diff --git a/examples/asr/conf/wav2vec_ctc/wav2vecCTC.yaml b/examples/asr/conf/wav2vec_ctc/wav2vecCTC.yaml index 11c9576e6f6d..02c6eb2a6273 100644 --- a/examples/asr/conf/wav2vec_ctc/wav2vecCTC.yaml +++ b/examples/asr/conf/wav2vec_ctc/wav2vecCTC.yaml @@ -126,7 +126,7 @@ trainer: devices: 1 # number of gpus num_nodes: 1 max_epochs: 100 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set accelerator: gpu strategy: ddp accumulate_grad_batches: 1 diff --git a/examples/asr/experimental/k2/conf/citrinet/citrinet_mmi_1024.yaml b/examples/asr/experimental/k2/conf/citrinet/citrinet_mmi_1024.yaml index 60d5c2bfd95d..1c1be351ca35 100644 --- a/examples/asr/experimental/k2/conf/citrinet/citrinet_mmi_1024.yaml +++ b/examples/asr/experimental/k2/conf/citrinet/citrinet_mmi_1024.yaml @@ -460,7 +460,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 100 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu strategy: ddp diff --git a/examples/nlp/dialogue/conf/dialogue_config.yaml b/examples/nlp/dialogue/conf/dialogue_config.yaml index 11844a41f144..733f2f7fe7e4 100644 --- a/examples/nlp/dialogue/conf/dialogue_config.yaml +++ b/examples/nlp/dialogue/conf/dialogue_config.yaml @@ -18,7 +18,7 @@ trainer: devices: 1 # number of GPUs (0 for CPU), or list of the GPUs to use e.g. [0, 1] num_nodes: 1 max_epochs: 3 - max_steps: null # precedence over max_epochs + max_steps: -1 # precedence over max_epochs accumulate_grad_batches: 1 # accumulates grads every k batches gradient_clip_val: 1.0 precision: 16 # Should be set to 16 for O1 and O2 to enable the AMP. 
diff --git a/examples/nlp/dialogue/dialogue.py b/examples/nlp/dialogue/dialogue.py index e3ce88e33275..3f31185fdcaa 100644 --- a/examples/nlp/dialogue/dialogue.py +++ b/examples/nlp/dialogue/dialogue.py @@ -53,7 +53,7 @@ from nemo.collections.nlp.models.dialogue.intent_slot_classification_model import IntentSlotClassificationModel from nemo.collections.nlp.models.dialogue.sgdqa_model import SGDQAModel from nemo.collections.nlp.modules.common.megatron.megatron_utils import compute_model_parallel_rank -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.app_state import AppState @@ -66,11 +66,11 @@ def main(cfg: DictConfig) -> None: logging.info(f'Config: {OmegaConf.to_yaml(cfg)}') try: - plugins = NLPDDPPlugin() + strategy = NLPDDPStrategy() except (ImportError, ModuleNotFoundError): - plugins = None + strategy = None - trainer = pl.Trainer(**cfg.trainer, plugins=plugins) + trainer = pl.Trainer(**cfg.trainer, strategy=strategy) exp_manager(trainer, cfg.get("exp_manager", None)) @@ -139,7 +139,7 @@ def main(cfg: DictConfig) -> None: if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.ds_item is not None: eval_device = [cfg.trainer.devices[0]] if isinstance(cfg.trainer.devices, list) else 1 trainer = pl.Trainer( - devices=eval_device, accelerator=cfg.trainer.accelerator, precision=16, plugins=NLPDDPPlugin() + devices=eval_device, accelerator=cfg.trainer.accelerator, precision=16, strategy=NLPDDPStrategy() ) model.setup_multiple_test_data(test_data_config=cfg.model.test_ds) if model.prepare_test(trainer): diff --git a/examples/nlp/entity_linking/conf/tiny_example_entity_linking_config.yaml b/examples/nlp/entity_linking/conf/tiny_example_entity_linking_config.yaml index a08500e0467e..b7f538ccd68f 100644 --- a/examples/nlp/entity_linking/conf/tiny_example_entity_linking_config.yaml +++ b/examples/nlp/entity_linking/conf/tiny_example_entity_linking_config.yaml @@ -4,7 +4,7 @@ trainer: devices: 1 num_nodes: 1 max_epochs: 2 - max_steps: null + max_steps: -1 accumulate_grad_batches: 1 precision: 16 accelerator: gpu diff --git a/examples/nlp/entity_linking/conf/umls_medical_entity_linking_config.yaml b/examples/nlp/entity_linking/conf/umls_medical_entity_linking_config.yaml index cfc3442be87c..ad636ef23e18 100644 --- a/examples/nlp/entity_linking/conf/umls_medical_entity_linking_config.yaml +++ b/examples/nlp/entity_linking/conf/umls_medical_entity_linking_config.yaml @@ -4,7 +4,7 @@ trainer: devices: 1 num_nodes: 1 max_epochs: 2 - max_steps: null + max_steps: -1 accumulate_grad_batches: 1 precision: 16 accelerator: gpu diff --git a/examples/nlp/glue_benchmark/glue_benchmark_config.yaml b/examples/nlp/glue_benchmark/glue_benchmark_config.yaml index d94aa2fcfd2d..21cdc04db22f 100644 --- a/examples/nlp/glue_benchmark/glue_benchmark_config.yaml +++ b/examples/nlp/glue_benchmark/glue_benchmark_config.yaml @@ -5,7 +5,7 @@ trainer: devices: 1 # the number of gpus, 0 for CPU num_nodes: 1 max_epochs: 3 - max_steps: null # precedence over max_epochs + max_steps: -1 # precedence over max_epochs accumulate_grad_batches: 1 # accumulates grads every k batches precision: 16 accelerator: gpu diff --git a/examples/nlp/information_retrieval/conf/bert_ir_config.yaml b/examples/nlp/information_retrieval/conf/bert_ir_config.yaml index ba745134f89b..56e573e0bcf6 100644 --- a/examples/nlp/information_retrieval/conf/bert_ir_config.yaml +++ 
b/examples/nlp/information_retrieval/conf/bert_ir_config.yaml @@ -4,7 +4,7 @@ trainer: devices: 1 # the number of gpus, 0 for CPU, or list with gpu indices num_nodes: 1 max_epochs: 2 # the number of training epochs - max_steps: null # precedence over max_epochs + max_steps: -1 # precedence over max_epochs accumulate_grad_batches: 1 # accumulates grads every k batches precision: 16 # 16 to use AMP accelerator: gpu diff --git a/examples/nlp/intent_slot_classification/conf/intent_slot_classification_config.yaml b/examples/nlp/intent_slot_classification/conf/intent_slot_classification_config.yaml index beb88cd905d7..da6bc3c2579a 100644 --- a/examples/nlp/intent_slot_classification/conf/intent_slot_classification_config.yaml +++ b/examples/nlp/intent_slot_classification/conf/intent_slot_classification_config.yaml @@ -4,7 +4,7 @@ trainer: devices: 1 # the number of gpus, 0 for CPU num_nodes: 1 max_epochs: 50 - max_steps: null # precedence over max_epochs + max_steps: -1 # precedence over max_epochs accumulate_grad_batches: 1 # accumulates grads every k batches precision: 32 # Should be set to 16 for O1 and O2 amp_level to enable the AMP. accelerator: gpu @@ -13,7 +13,7 @@ trainer: val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. - checkpoint_callback: False + enable_checkpointing: False logger: false # Provided by exp_manager model: @@ -83,7 +83,7 @@ model: sched: name: WarmupAnnealing iters_per_batch: null # computed at runtime - max_steps: null # computed at runtime or explicitly set here + max_steps: -1 # computed at runtime or explicitly set here # pytorch lightning args monitor: val_loss @@ -108,4 +108,4 @@ hydra: dir: . job_logging: root: - handlers: null \ No newline at end of file + handlers: null diff --git a/examples/nlp/intent_slot_classification/conf/multi_label_intent_slot_classification_config.yaml b/examples/nlp/intent_slot_classification/conf/multi_label_intent_slot_classification_config.yaml index 442b0d3c72bd..7d534ca7f216 100644 --- a/examples/nlp/intent_slot_classification/conf/multi_label_intent_slot_classification_config.yaml +++ b/examples/nlp/intent_slot_classification/conf/multi_label_intent_slot_classification_config.yaml @@ -4,7 +4,7 @@ trainer: devices: -1 # number of GPUs, -1 would use all available GPUs num_nodes: 1 max_epochs: 5 - max_steps: null # precedence over max_epochs + max_steps: -1 # precedence over max_epochs accumulate_grad_batches: 1 # accumulates grads every k batches precision: 32 # Should be set to 16 for O1 and O2 amp_level to enable the AMP. accelerator: auto @@ -13,7 +13,7 @@ trainer: val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
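The checkpoint_callback: False -> enable_checkpointing: False edits in the intent/slot configs follow the PTL 1.7 rename of that Trainer flag. Both checkpointing and logging stay disabled on the bare Trainer because NeMo's exp_manager, called right after Trainer creation in these example scripts, attaches its own ModelCheckpoint and logger. A minimal sketch of the renamed flag (the remaining arguments are illustrative):

    import pytorch_lightning as pl

    # PTL 1.7 removed Trainer(checkpoint_callback=...); the switch is now enable_checkpointing.
    trainer = pl.Trainer(
        enable_checkpointing=False,  # exp_manager will add its own ModelCheckpoint
        logger=False,                # exp_manager will add its own logger
        accelerator="cpu",
        devices=1,
        max_epochs=1,
    )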
- checkpoint_callback: false # Provided by exp_manager + enable_checkpointing: false # Provided by exp_manager logger: false # Provided by exp_manager model: @@ -83,7 +83,7 @@ model: sched: name: WarmupAnnealing iters_per_batch: null # computed at runtime - max_steps: null # computed at runtime or explicitly set here + max_steps: -1 # computed at runtime or explicitly set here # pytorch lightning args monitor: val_loss diff --git a/examples/nlp/language_modeling/bert_pretraining.py b/examples/nlp/language_modeling/bert_pretraining.py index 7760b73709da..75d0a1072e69 100644 --- a/examples/nlp/language_modeling/bert_pretraining.py +++ b/examples/nlp/language_modeling/bert_pretraining.py @@ -15,7 +15,7 @@ import pytorch_lightning as pl from omegaconf import DictConfig, OmegaConf -from pytorch_lightning.plugins import DDPPlugin +from pytorch_lightning.strategies import DDPStrategy from nemo.collections.nlp.models.language_modeling import BERTLMModel from nemo.core.config import hydra_runner @@ -26,7 +26,7 @@ @hydra_runner(config_path="conf", config_name="bert_pretraining_from_text_config") def main(cfg: DictConfig) -> None: logging.info(f'Config:\n {OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(plugins=[DDPPlugin(find_unused_parameters=True)], **cfg.trainer) + trainer = pl.Trainer(strategy=DDPStrategy(find_unused_parameters=True), **cfg.trainer) exp_manager(trainer, cfg.get("exp_manager", None)) bert_model = BERTLMModel(cfg.model, trainer=trainer) trainer.fit(bert_model) diff --git a/examples/nlp/language_modeling/conf/bert_pretraining_from_text_config.yaml b/examples/nlp/language_modeling/conf/bert_pretraining_from_text_config.yaml index f7aef2e090d6..c29fcb3e912d 100644 --- a/examples/nlp/language_modeling/conf/bert_pretraining_from_text_config.yaml +++ b/examples/nlp/language_modeling/conf/bert_pretraining_from_text_config.yaml @@ -4,7 +4,7 @@ trainer: devices: 1 # the number of gpus, 0 for CPU, or list with gpu indices num_nodes: 1 max_epochs: 2 # the number of training epochs - max_steps: null # precedence over max_epochs + max_steps: -1 # precedence over max_epochs accumulate_grad_batches: 1 # accumulates grads every k batches precision: 16 # 16 to use AMP accelerator: gpu diff --git a/examples/nlp/language_modeling/conf/megatron_bart_config.yaml b/examples/nlp/language_modeling/conf/megatron_bart_config.yaml index 4f8bbdc92388..6084f712499f 100644 --- a/examples/nlp/language_modeling/conf/megatron_bart_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_bart_config.yaml @@ -13,7 +13,7 @@ trainer: logger: False # logger provided by exp_manager enable_checkpointing: False replace_sampler_ddp: False - max_epochs: 1000 # PTL default. In practice, max_steps will be reached first. + max_epochs: -1 # PTL default. In practice, max_steps will be reached first. max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 val_check_interval: 100 diff --git a/examples/nlp/language_modeling/conf/megatron_bert_config.yaml b/examples/nlp/language_modeling/conf/megatron_bert_config.yaml index e93dcbe297e7..b76d713e3022 100644 --- a/examples/nlp/language_modeling/conf/megatron_bert_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_bert_config.yaml @@ -9,7 +9,7 @@ trainer: logger: False # logger provided by exp_manager enable_checkpointing: False replace_sampler_ddp: False - max_epochs: 1000 # PTL default. In practice we don't usually train for more than 1 epoch. + max_epochs: -1 # PTL default. 
In practice we don't usually train for more than 1 epoch. max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 val_check_interval: 100 diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 837d397c0815..d6643f7ea524 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -9,7 +9,7 @@ trainer: logger: False # logger provided by exp_manager enable_checkpointing: False replace_sampler_ddp: False - max_epochs: 1000 # PTL default. In practice, max_steps will be reached first. + max_epochs: -1 # PTL default. In practice, max_steps will be reached first. max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 val_check_interval: 100 diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_prompt_learning_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_prompt_learning_config.yaml index 0891c59da12e..7b000646dcd5 100644 --- a/examples/nlp/language_modeling/conf/megatron_gpt_prompt_learning_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_prompt_learning_config.yaml @@ -9,7 +9,7 @@ trainer: enable_checkpointing: False replace_sampler_ddp: False max_epochs: 3 - max_steps: null # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 val_check_interval: 1.0 accumulate_grad_batches: 1 diff --git a/examples/nlp/language_modeling/conf/megatron_retro_config.yaml b/examples/nlp/language_modeling/conf/megatron_retro_config.yaml index adb5bd787573..8121bef5451b 100644 --- a/examples/nlp/language_modeling/conf/megatron_retro_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_retro_config.yaml @@ -12,7 +12,7 @@ trainer: logger: False # logger provided by exp_manager enable_checkpointing: False replace_sampler_ddp: False - max_epochs: 1000 # PTL default. In practice we don't usually train for more than 1 epoch. + max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch. max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 val_check_interval: 100 diff --git a/examples/nlp/language_modeling/conf/megatron_retro_mutransfer.yaml b/examples/nlp/language_modeling/conf/megatron_retro_mutransfer.yaml index 73c3dab8a3e3..bfcd6798ae21 100644 --- a/examples/nlp/language_modeling/conf/megatron_retro_mutransfer.yaml +++ b/examples/nlp/language_modeling/conf/megatron_retro_mutransfer.yaml @@ -14,7 +14,7 @@ trainer: logger: False # logger provided by exp_manager enable_checkpointing: False replace_sampler_ddp: False - max_epochs: 1000 # PTL default. In practice we don't usually train for more than 1 epoch. + max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch. 
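The bert_pretraining.py hunk above swaps pytorch_lightning.plugins.DDPPlugin for pytorch_lightning.strategies.DDPStrategy and passes it through the strategy argument instead of the plugins list, which is the general 1.6 -> 1.7 pattern for training-type plugins in this patch. A condensed before/after sketch of just that change (Trainer arguments other than strategy are illustrative):

    import pytorch_lightning as pl
    from pytorch_lightning.strategies import DDPStrategy

    # PTL <= 1.6 style, no longer accepted by the 1.7 Trainer:
    #   from pytorch_lightning.plugins import DDPPlugin
    #   trainer = pl.Trainer(plugins=[DDPPlugin(find_unused_parameters=True)], ...)

    # PTL 1.7: the same options move to a Strategy object passed via strategy=.
    trainer = pl.Trainer(
        strategy=DDPStrategy(find_unused_parameters=True),
        accelerator="cpu",
        devices=2,
        max_epochs=1,
    )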
max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 val_check_interval: 100 diff --git a/examples/nlp/language_modeling/conf/megatron_t0_config.yaml b/examples/nlp/language_modeling/conf/megatron_t0_config.yaml index 95f2b3bb5561..3850ce505819 100644 --- a/examples/nlp/language_modeling/conf/megatron_t0_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t0_config.yaml @@ -9,7 +9,7 @@ trainer: enable_checkpointing: False replace_sampler_ddp: False max_epochs: 10 - max_steps: null # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 val_check_interval: 300 accumulate_grad_batches: 1 diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config.yaml index 8f973b059c90..7a93c604366b 100644 --- a/examples/nlp/language_modeling/conf/megatron_t5_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t5_config.yaml @@ -13,7 +13,7 @@ trainer: logger: False # logger provided by exp_manager enable_checkpointing: False replace_sampler_ddp: False - max_epochs: 1000 # PTL default. In practice, max_steps will be reached first. + max_epochs: -1 # PTL default. In practice, max_steps will be reached first. max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 val_check_interval: 100 diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml index bac1bac2ec89..ac68b57e0216 100644 --- a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml @@ -9,7 +9,7 @@ trainer: enable_checkpointing: False replace_sampler_ddp: False max_epochs: 3 - max_steps: null # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 val_check_interval: 300 accumulate_grad_batches: 1 diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml index 10eedd384e79..1b08bc37246e 100644 --- a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml @@ -9,7 +9,7 @@ trainer: enable_checkpointing: False replace_sampler_ddp: False max_epochs: 3 - max_steps: null # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 val_check_interval: 300 accumulate_grad_batches: 1 diff --git a/examples/nlp/language_modeling/conf/megatron_t5_finetune.yaml b/examples/nlp/language_modeling/conf/megatron_t5_finetune.yaml index 91a9730637a3..9a5cf15cfe74 100644 --- a/examples/nlp/language_modeling/conf/megatron_t5_finetune.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t5_finetune.yaml @@ -9,7 +9,7 @@ trainer: enable_checkpointing: 
False replace_sampler_ddp: False max_epochs: 10 - max_steps: null # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 val_check_interval: 300 accumulate_grad_batches: 1 diff --git a/examples/nlp/language_modeling/conf/megatron_t5_lm_adaptation_finetune.yaml b/examples/nlp/language_modeling/conf/megatron_t5_lm_adaptation_finetune.yaml index d3860e9957c0..10baf9d080f0 100644 --- a/examples/nlp/language_modeling/conf/megatron_t5_lm_adaptation_finetune.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t5_lm_adaptation_finetune.yaml @@ -9,7 +9,7 @@ trainer: logger: False # logger provided by exp_manager enable_checkpointing: False replace_sampler_ddp: False - max_epochs: 1000 # PTL default. In practice, max_steps will be reached first. + max_epochs: -1 # PTL default. In practice, max_steps will be reached first. max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 val_check_interval: 100 diff --git a/examples/nlp/language_modeling/conf/megatron_t5_prompt_learning.yaml b/examples/nlp/language_modeling/conf/megatron_t5_prompt_learning.yaml index 55c56855efd3..e8f10d0f5f59 100644 --- a/examples/nlp/language_modeling/conf/megatron_t5_prompt_learning.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t5_prompt_learning.yaml @@ -9,7 +9,7 @@ trainer: enable_checkpointing: False replace_sampler_ddp: False max_epochs: 10 - max_steps: null + max_steps: -1 log_every_n_steps: 10 val_check_interval: 1 accumulate_grad_batches: 1 diff --git a/examples/nlp/language_modeling/conf/megatron_ul2_config.yaml b/examples/nlp/language_modeling/conf/megatron_ul2_config.yaml index 2faf75c651e4..f815feaa65b1 100644 --- a/examples/nlp/language_modeling/conf/megatron_ul2_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_ul2_config.yaml @@ -13,7 +13,7 @@ trainer: logger: False # logger provided by exp_manager enable_checkpointing: False replace_sampler_ddp: False - max_epochs: 1000 # PTL default. In practice, max_steps will be reached first. + max_epochs: -1 # PTL default. In practice, max_steps will be reached first. 
max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 val_check_interval: 100 diff --git a/examples/nlp/language_modeling/megatron_bart_pretraining.py b/examples/nlp/language_modeling/megatron_bart_pretraining.py index c381a192be4e..9a7300656f99 100644 --- a/examples/nlp/language_modeling/megatron_bart_pretraining.py +++ b/examples/nlp/language_modeling/megatron_bart_pretraining.py @@ -24,7 +24,7 @@ from nemo.collections.nlp.parts.nlp_overrides import ( GradScaler, MegatronHalfPrecisionPlugin, - NLPDDPPlugin, + NLPDDPStrategy, PipelineMixedPrecisionPlugin, ) from nemo.core.config import hydra_runner @@ -38,13 +38,12 @@ def main(cfg) -> None: logging.info(f'\n{OmegaConf.to_yaml(cfg)}') megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - plugins = [ - NLPDDPPlugin( - no_ddp_communication_hook=True, - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - ] + plugins = [] + strategy = NLPDDPStrategy( + no_ddp_communication_hook=True, + gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, + find_unused_parameters=False, + ) if cfg.trainer.precision in [16, 'bf16']: scaler = None if cfg.trainer.precision == 16: @@ -61,7 +60,7 @@ def main(cfg) -> None: if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) - trainer = Trainer(plugins=plugins, **cfg.trainer, callbacks=[ModelSummary(max_depth=3)]) + trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer, callbacks=[ModelSummary(max_depth=3)]) exp_manager(trainer, cfg.exp_manager) diff --git a/examples/nlp/language_modeling/megatron_bert_pretraining.py b/examples/nlp/language_modeling/megatron_bert_pretraining.py index f8beea305390..f8239dcd538b 100644 --- a/examples/nlp/language_modeling/megatron_bert_pretraining.py +++ b/examples/nlp/language_modeling/megatron_bert_pretraining.py @@ -20,7 +20,7 @@ from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel -from nemo.collections.nlp.parts.nlp_overrides import GradScaler, NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import GradScaler, NLPDDPStrategy from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import StatelessTimer, exp_manager @@ -31,7 +31,8 @@ def main(cfg) -> None: logging.info("\n\n************** Experiment configuration ***********") logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - plugins = [NLPDDPPlugin(find_unused_parameters=False)] + plugins = [] + strategy = NLPDDPStrategy(find_unused_parameters=False) if cfg.trainer.precision == 16: scaler = GradScaler( init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), @@ -42,7 +43,7 @@ def main(cfg) -> None: if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) - trainer = Trainer(plugins=plugins, **cfg.trainer) + trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer) exp_manager(trainer, cfg.exp_manager) diff --git a/examples/nlp/language_modeling/megatron_change_num_partitions.py b/examples/nlp/language_modeling/megatron_change_num_partitions.py index ea09a85e4611..5bf12f5511d0 100644 --- a/examples/nlp/language_modeling/megatron_change_num_partitions.py +++ b/examples/nlp/language_modeling/megatron_change_num_partitions.py @@ -18,7 +18,7 @@ import torch from pytorch_lightning import Trainer -from 
nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin, NLPSaveRestoreConnector +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector from nemo.utils import logging, model_utils from nemo.utils.app_state import AppState @@ -144,7 +144,7 @@ def main(): tgt_tp_size = args.target_tensor_model_parallel_size cls = model_utils.import_class_by_path(args.model_class) - trainer = Trainer(devices=1, plugins=NLPDDPPlugin(), accelerator="cpu", precision=precision) + trainer = Trainer(devices=1, strategy=NLPDDPStrategy(), accelerator="cpu", precision=precision) app_state = AppState() app_state.data_parallel_rank = 0 app_state.pipeline_model_parallel_size = 1 # not supported yet in this script @@ -168,7 +168,7 @@ def main(): model.cfg.tensor_model_parallel_size = 1 app_state.model_parallel_size = 1 - trainer = Trainer(devices=1, plugins=NLPDDPPlugin(), accelerator="cpu", precision=precision) + trainer = Trainer(devices=1, strategy=NLPDDPStrategy(), accelerator="cpu", precision=precision) model = cls(model.cfg, trainer).to('cpu') model._save_restore_connector = NLPSaveRestoreConnector() @@ -187,7 +187,7 @@ def main(): model.cfg.tensor_model_parallel_size = tgt_tp_size app_state.model_parallel_size = tgt_tp_size - trainer = Trainer(devices=1, plugins=NLPDDPPlugin(), accelerator="cpu", precision=precision) + trainer = Trainer(devices=1, strategy=NLPDDPStrategy(), accelerator="cpu", precision=precision) model = cls(model.cfg, trainer).to('cpu') model._save_restore_connector = NLPSaveRestoreConnector() diff --git a/examples/nlp/language_modeling/megatron_gpt_eval.py b/examples/nlp/language_modeling/megatron_gpt_eval.py index c800dc52a3f0..7e66c3096f33 100644 --- a/examples/nlp/language_modeling/megatron_gpt_eval.py +++ b/examples/nlp/language_modeling/megatron_gpt_eval.py @@ -24,7 +24,7 @@ from nemo.collections.nlp.modules.common.text_generation_server import MegatronServer from nemo.collections.nlp.modules.common.text_generation_utils import generate from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy from nemo.core.config import hydra_runner from nemo.utils.app_state import AppState from nemo.utils.model_utils import inject_model_parallel_rank @@ -153,7 +153,7 @@ def __getitem__(self, idx): def main(cfg) -> None: # trainer required for restoring model parallel models - trainer = Trainer(plugins=NLPDDPPlugin(), **cfg.trainer) + trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) assert ( cfg.trainer.devices * cfg.trainer.num_nodes == cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size diff --git a/examples/nlp/language_modeling/megatron_gpt_pretraining.py b/examples/nlp/language_modeling/megatron_gpt_pretraining.py index f827d9ad92df..7f15bb1daa8a 100644 --- a/examples/nlp/language_modeling/megatron_gpt_pretraining.py +++ b/examples/nlp/language_modeling/megatron_gpt_pretraining.py @@ -23,7 +23,7 @@ from nemo.collections.nlp.parts.nlp_overrides import ( GradScaler, MegatronHalfPrecisionPlugin, - NLPDDPPlugin, + NLPDDPStrategy, PipelineMixedPrecisionPlugin, ) from nemo.core.config import hydra_runner @@ -38,13 +38,12 @@ def main(cfg) -> None: megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - plugins = [ - NLPDDPPlugin( - no_ddp_communication_hook=True, # we don't use DDP for async grad allreduce - 
gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - ] + plugins = [] + strategy = NLPDDPStrategy( + no_ddp_communication_hook=True, # we don't use DDP for async grad allreduce + gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, + find_unused_parameters=False, + ) if cfg.trainer.precision in [16, 'bf16']: scaler = None if cfg.trainer.precision == 16: @@ -61,7 +60,7 @@ def main(cfg) -> None: if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) - trainer = Trainer(plugins=plugins, **cfg.trainer) + trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer) exp_manager(trainer, cfg.exp_manager) diff --git a/examples/nlp/language_modeling/megatron_gpt_prompt_learning.py b/examples/nlp/language_modeling/megatron_gpt_prompt_learning.py index d9b95db2bdf6..243559c44b02 100644 --- a/examples/nlp/language_modeling/megatron_gpt_prompt_learning.py +++ b/examples/nlp/language_modeling/megatron_gpt_prompt_learning.py @@ -22,7 +22,7 @@ ) from nemo.collections.nlp.parts.nlp_overrides import ( GradScaler, - NLPDDPPlugin, + NLPDDPStrategy, NLPSaveRestoreConnector, PipelineMixedPrecisionPlugin, ) @@ -45,7 +45,8 @@ def main(cfg) -> None: logging.info("\n\n************** Experiment configuration ***********") logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - plugins = [NLPDDPPlugin(no_ddp_communication_hook=True, find_unused_parameters=False,)] + plugins = [] + strategy = NLPDDPStrategy(no_ddp_communication_hook=True, find_unused_parameters=False,) if cfg.trainer.precision == 16: scaler = GradScaler( init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), @@ -57,7 +58,7 @@ def main(cfg) -> None: if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) - trainer = Trainer(plugins=plugins, **cfg.trainer) + trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer) exp_manager(trainer, cfg.exp_manager) # Override timer callback to a stateless one diff --git a/examples/nlp/language_modeling/megatron_gpt_prompt_learning_eval.py b/examples/nlp/language_modeling/megatron_gpt_prompt_learning_eval.py index e81f6dcdb7fb..6e7d2d35fbe5 100644 --- a/examples/nlp/language_modeling/megatron_gpt_prompt_learning_eval.py +++ b/examples/nlp/language_modeling/megatron_gpt_prompt_learning_eval.py @@ -21,7 +21,7 @@ MegatronGPTPromptLearningModel, ) from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy from nemo.core.config import hydra_runner @@ -75,7 +75,7 @@ def main(cfg) -> None: raise EnvironmentError("GPU is needed for the inference") # trainer required for restoring model parallel models - trainer = Trainer(plugins=NLPDDPPlugin(), **cfg.trainer) + trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) assert ( cfg.trainer.devices * cfg.trainer.num_nodes == cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size diff --git a/examples/nlp/language_modeling/megatron_gpt_test.py b/examples/nlp/language_modeling/megatron_gpt_test.py index 24770fac0e3c..b53ef375e3a0 100644 --- a/examples/nlp/language_modeling/megatron_gpt_test.py +++ b/examples/nlp/language_modeling/megatron_gpt_test.py @@ -18,7 +18,7 @@ from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel from nemo.collections.nlp.modules.common.megatron.megatron_utils import compute_model_parallel_rank from 
nemo.collections.nlp.parts.nlp_overrides import ( - NLPDDPPlugin, + NLPDDPStrategy, NLPNativeMixedPrecisionPlugin, NLPPrecisionPlugin, NLPSaveRestoreConnector, @@ -37,18 +37,18 @@ def main(cfg) -> None: if cfg.trainer.precision == 16: trainer = Trainer( plugins=[ - NLPDDPPlugin(), NLPNativeMixedPrecisionPlugin( init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), growth_interval=cfg.model.get('native_amp_growth_interval', 1000), ), ], + strategy=NLPDDPStrategy(), **cfg.trainer, ) elif cfg.trainer.precision == 'bf16': - trainer = Trainer(plugins=[NLPDDPPlugin(), NLPNativeBfloat16PrecisionPlugin(),], **cfg.trainer,) + trainer = Trainer(plugins=[NLPNativeBfloat16PrecisionPlugin(),], strategy=NLPDDPStrategy(), **cfg.trainer,) else: - trainer = Trainer(plugins=[NLPDDPPlugin(), NLPPrecisionPlugin()], **cfg.trainer) + trainer = Trainer(plugins=[NLPPrecisionPlugin()], strategy=NLPDDPStrategy(), **cfg.trainer) app_state = AppState() app_state.model_parallel_size = cfg.model.tensor_model_parallel_size diff --git a/examples/nlp/language_modeling/megatron_retro_cal_shape.py b/examples/nlp/language_modeling/megatron_retro_cal_shape.py index 02e0283e7aca..06bec216e925 100644 --- a/examples/nlp/language_modeling/megatron_retro_cal_shape.py +++ b/examples/nlp/language_modeling/megatron_retro_cal_shape.py @@ -19,7 +19,7 @@ from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel from nemo.collections.nlp.modules.common.megatron.mup.shape import make_base_shapes -from nemo.collections.nlp.parts.nlp_overrides import GradScaler, MegatronHalfPrecisionPlugin, NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import GradScaler, MegatronHalfPrecisionPlugin, NLPDDPStrategy from nemo.core.config import hydra_runner from nemo.utils import logging @@ -30,13 +30,12 @@ def main(cfg) -> None: logging.info(f'\n{OmegaConf.to_yaml(cfg)}') megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - plugins = [ - NLPDDPPlugin( - no_ddp_communication_hook=True if megatron_amp_o2 else False, - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - ] + plugins = [] + strategy = NLPDDPStrategy( + no_ddp_communication_hook=True if megatron_amp_o2 else False, + gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, + find_unused_parameters=False, + ) if cfg.trainer.precision in [16, 'bf16']: scaler = None @@ -54,7 +53,7 @@ def main(cfg) -> None: if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) - trainer = Trainer(plugins=plugins, **cfg.trainer) + trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer) # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams with open_dict(cfg): diff --git a/examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py b/examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py index ad43e3d472c1..cf7dc5d747cc 100644 --- a/examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py +++ b/examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py @@ -25,7 +25,7 @@ from nemo.collections.nlp.modules.common.megatron.mup.init import normal_ from nemo.collections.nlp.modules.common.megatron.mup.optim import MuAdam, MuAdamW from nemo.collections.nlp.modules.common.megatron.mup.shape import set_base_shapes -from nemo.collections.nlp.parts.nlp_overrides import GradScaler, MegatronHalfPrecisionPlugin, NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import 
GradScaler, MegatronHalfPrecisionPlugin, NLPDDPStrategy from nemo.core.config import hydra_runner from nemo.core.config.optimizers import AdamParams, AdamWParams from nemo.core.optim.optimizers import register_optimizer @@ -41,13 +41,12 @@ def main(cfg) -> None: logging.info(f'\n{OmegaConf.to_yaml(cfg)}') megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - plugins = [ - NLPDDPPlugin( - no_ddp_communication_hook=True if megatron_amp_o2 else False, - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - ] + plugins = [] + strategy = NLPDDPStrategy( + no_ddp_communication_hook=True if megatron_amp_o2 else False, + gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, + find_unused_parameters=False, + ) if cfg.trainer.precision in [16, 'bf16']: scaler = None @@ -65,7 +64,7 @@ def main(cfg) -> None: if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) - trainer = Trainer(plugins=plugins, **cfg.trainer) + trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer) exp_manager(trainer, cfg.exp_manager) diff --git a/examples/nlp/language_modeling/megatron_retro_pretraining.py b/examples/nlp/language_modeling/megatron_retro_pretraining.py index 8f25646a3b68..374bb938583e 100644 --- a/examples/nlp/language_modeling/megatron_retro_pretraining.py +++ b/examples/nlp/language_modeling/megatron_retro_pretraining.py @@ -20,7 +20,7 @@ from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel -from nemo.collections.nlp.parts.nlp_overrides import GradScaler, MegatronHalfPrecisionPlugin, NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import GradScaler, MegatronHalfPrecisionPlugin, NLPDDPStrategy from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import StatelessTimer, exp_manager @@ -32,13 +32,12 @@ def main(cfg) -> None: logging.info(f'\n{OmegaConf.to_yaml(cfg)}') megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - plugins = [ - NLPDDPPlugin( - no_ddp_communication_hook=True if megatron_amp_o2 else False, - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - ] + plugins = [] + strategy = NLPDDPStrategy( + no_ddp_communication_hook=True if megatron_amp_o2 else False, + gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, + find_unused_parameters=False, + ) if cfg.trainer.precision in [16, 'bf16']: scaler = None @@ -56,7 +55,7 @@ def main(cfg) -> None: if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) - trainer = Trainer(plugins=plugins, **cfg.trainer) + trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer) exp_manager(trainer, cfg.exp_manager) diff --git a/examples/nlp/language_modeling/megatron_t5_eval.py b/examples/nlp/language_modeling/megatron_t5_eval.py index 03abb4132dae..0c205ab65ad0 100644 --- a/examples/nlp/language_modeling/megatron_t5_eval.py +++ b/examples/nlp/language_modeling/megatron_t5_eval.py @@ -22,7 +22,7 @@ from nemo.collections.nlp.data.language_modeling.megatron.request_dataset import T5RequestDataset from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin, NLPSaveRestoreConnector +from 
nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector from nemo.utils.app_state import AppState assert torch.cuda.is_available() @@ -55,7 +55,7 @@ def main(): # trainer required for restoring model parallel models trainer = Trainer( - plugins=NLPDDPPlugin(), + strategy=NLPDDPStrategy(), devices=args.tensor_model_parallel_size * args.pipeline_model_parallel_size, accelerator='gpu', precision=args.precision, diff --git a/examples/nlp/language_modeling/megatron_t5_lm_adaptation_finetune.py b/examples/nlp/language_modeling/megatron_t5_lm_adaptation_finetune.py index 9a78bb6e36fc..063147d66abb 100644 --- a/examples/nlp/language_modeling/megatron_t5_lm_adaptation_finetune.py +++ b/examples/nlp/language_modeling/megatron_t5_lm_adaptation_finetune.py @@ -24,7 +24,7 @@ from nemo.collections.nlp.parts.nlp_overrides import ( GradScaler, MegatronHalfPrecisionPlugin, - NLPDDPPlugin, + NLPDDPStrategy, NLPSaveRestoreConnector, PipelineMixedPrecisionPlugin, ) @@ -39,13 +39,12 @@ def main(cfg) -> None: logging.info(f'\n{OmegaConf.to_yaml(cfg)}') megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - plugins = [ - NLPDDPPlugin( - no_ddp_communication_hook=True, # we don't use DDP for async grad allreduce - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - ] + plugins = [] + strategy = NLPDDPStrategy( + no_ddp_communication_hook=True, # we don't use DDP for async grad allreduce + gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, + find_unused_parameters=False, + ) if cfg.trainer.precision in [16, 'bf16']: scaler = None if cfg.trainer.precision == 16: @@ -62,7 +61,7 @@ def main(cfg) -> None: if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) - trainer = Trainer(plugins=plugins, **cfg.trainer, callbacks=[ModelSummary(max_depth=3)]) + trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer, callbacks=[ModelSummary(max_depth=3)]) exp_manager(trainer, cfg.exp_manager) # update resume from checkpoint found by exp_manager diff --git a/examples/nlp/language_modeling/megatron_t5_pretraining.py b/examples/nlp/language_modeling/megatron_t5_pretraining.py index f566f927e58f..462cc62d28eb 100644 --- a/examples/nlp/language_modeling/megatron_t5_pretraining.py +++ b/examples/nlp/language_modeling/megatron_t5_pretraining.py @@ -24,7 +24,7 @@ from nemo.collections.nlp.parts.nlp_overrides import ( GradScaler, MegatronHalfPrecisionPlugin, - NLPDDPPlugin, + NLPDDPStrategy, PipelineMixedPrecisionPlugin, ) from nemo.core.config import hydra_runner @@ -38,13 +38,12 @@ def main(cfg) -> None: logging.info(f'\n{OmegaConf.to_yaml(cfg)}') megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - plugins = [ - NLPDDPPlugin( - no_ddp_communication_hook=True, # we don't use DDP for async grad allreduce - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - ] + plugins = [] + strategy = NLPDDPStrategy( + no_ddp_communication_hook=True, # we don't use DDP for async grad allreduce + gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, + find_unused_parameters=False, + ) if cfg.trainer.precision in [16, 'bf16']: scaler = None if cfg.trainer.precision == 16: @@ -61,7 +60,7 @@ def main(cfg) -> None: if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) - trainer = Trainer(plugins=plugins, **cfg.trainer, callbacks=[ModelSummary(max_depth=3)]) + trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer, 
callbacks=[ModelSummary(max_depth=3)]) exp_manager(trainer, cfg.exp_manager) # update resume from checkpoint found by exp_manager diff --git a/examples/nlp/language_modeling/megatron_t5_prompt_learning.py b/examples/nlp/language_modeling/megatron_t5_prompt_learning.py index ac31dcbeec8f..0d135fb60b98 100644 --- a/examples/nlp/language_modeling/megatron_t5_prompt_learning.py +++ b/examples/nlp/language_modeling/megatron_t5_prompt_learning.py @@ -22,7 +22,7 @@ ) from nemo.collections.nlp.parts.nlp_overrides import ( GradScaler, - NLPDDPPlugin, + NLPDDPStrategy, NLPSaveRestoreConnector, PipelineMixedPrecisionPlugin, ) @@ -45,7 +45,8 @@ def main(cfg) -> None: logging.info("\n\n************** Experiment configuration ***********") logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - plugins = [NLPDDPPlugin(no_ddp_communication_hook=True, find_unused_parameters=False,)] + plugins = [] + strategy = NLPDDPStrategy(no_ddp_communication_hook=True, find_unused_parameters=False,) if cfg.trainer.precision == 16: scaler = GradScaler( init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), @@ -57,7 +58,7 @@ def main(cfg) -> None: if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) - trainer = Trainer(plugins=plugins, **cfg.trainer) + trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer) exp_manager(trainer, cfg.exp_manager) # Override timer callback to a stateless one diff --git a/examples/nlp/language_modeling/megatron_t5_prompt_learning_eval.py b/examples/nlp/language_modeling/megatron_t5_prompt_learning_eval.py index be88d0205e6c..b1d39141d742 100644 --- a/examples/nlp/language_modeling/megatron_t5_prompt_learning_eval.py +++ b/examples/nlp/language_modeling/megatron_t5_prompt_learning_eval.py @@ -21,7 +21,7 @@ MegatronT5PromptLearningModel, ) from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy from nemo.core.config import hydra_runner from nemo.utils.app_state import AppState @@ -41,7 +41,7 @@ def main(cfg) -> None: # trainer required for restoring model parallel models - trainer = Trainer(plugins=NLPDDPPlugin(), **cfg.trainer) + trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) assert ( cfg.trainer.devices * cfg.trainer.num_nodes == cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size diff --git a/examples/nlp/language_modeling/megatron_t5_ptune.py b/examples/nlp/language_modeling/megatron_t5_ptune.py deleted file mode 100644 index 3ed47986635e..000000000000 --- a/examples/nlp/language_modeling/megatron_t5_ptune.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from pathlib import Path - -import torch -from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.callbacks.timer import Timer -from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment -from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin -from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector - -from nemo.collections.nlp.data.glue_benchmark.gpt_ptune_dataset import TemplateProcessor, register_taskdata_processor -from nemo.collections.nlp.models.language_modeling.megatron_ptune_t5_model import MegatronT5PTuneModel -from nemo.collections.nlp.parts.nlp_overrides import GradScaler, NLPDDPPlugin -from nemo.core.config import hydra_runner -from nemo.utils import logging -from nemo.utils.exp_manager import StatelessTimer, exp_manager - - -@hydra_runner(config_path="conf", config_name="megatron_ptune_t5") -def main(cfg) -> None: - logging.info("\n\n************** Experiment configuration ***********") - logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - - # setup the data processor - for processor_config in cfg.model.task_processors: - processor = TemplateProcessor( - template=processor_config.template, limit_length_field=processor_config.limit_length_field - ) - register_taskdata_processor(processor_config.taskname, processor) - - plugins = [NLPDDPPlugin()] - if cfg.trainer.precision == 16: - scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), - growth_interval=cfg.model.get('native_amp_growth_interval', 1000), - ) - plugins.append(NativeMixedPrecisionPlugin(precision=16, device='cuda', scaler=scaler)) - - if cfg.get('cluster_type', None) == 'BCP': - plugins.append(TorchElasticEnvironment()) - - trainer = Trainer(plugins=plugins, **cfg.trainer) - - exp_manager(trainer, cfg.exp_manager) - - # update resume from checkpoint found by exp_manager - resume_from_checkpoint = trainer._checkpoint_connector.resume_from_checkpoint_fit_path - logging.info(f'Resuming training from checkpoint: {resume_from_checkpoint}') - - trainer._checkpoint_connector = CheckpointConnector(trainer, resume_from_checkpoint=resume_from_checkpoint) - # Override timer callback to a stateless one - for idx, callback in enumerate(trainer.callbacks): - if isinstance(callback, Timer): - trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time,) - - # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams - with open_dict(cfg): - cfg.model.precision = cfg.trainer.precision - model = MegatronT5PTuneModel(cfg.model, trainer) - trainer.fit(model) - - if cfg.model.data.test_ds.file_path: - logging.info("===========================================================================================") - logging.info("Starting the testing of the trained model on test set...") - trainer.test(model) - logging.info("Testing finished!") - logging.info("===========================================================================================") - # extract the path of the best checkpoint from the training, you may update it to any checkpoint - checkpoint_path = trainer.checkpoint_callback.best_model_path - tensor_parallel_size = cfg.model.tensor_model_parallel_size - pathobj = Path(checkpoint_path) - checkpoint_folder = str(pathobj.parent) - checkpoint_name = str(pathobj.name) - - rank = trainer.accelerator.training_type_plugin.local_rank - if tensor_parallel_size > 1: - # inject model parallel 
rank - checkpoint_path = os.path.join(checkpoint_folder, f'mp_rank_{rank:02d}', checkpoint_name) - else: - checkpoint_path = os.path.join(checkpoint_folder, checkpoint_name) - - # Load the checkpoint - best_eval_model = MegatronT5PTuneModel.load_from_checkpoint( - checkpoint_path=checkpoint_path, strict=False, trainer=trainer - ) - logging.info(f'Best checkpoint path: {checkpoint_path}') - logging.info("Running Test with best EVAL checkpoint!") - # setup the test dataset - # best_eval_model.setup_test_data(test_data_config=cfg.model.data.test_ds) - if torch.distributed.is_initialized(): - torch.distributed.barrier() - trainer.test(model=best_eval_model, ckpt_path=None, verbose=False) - logging.info("Beset EVAL Testing finished!") - logging.info("===========================================================================================") - - if cfg.model.nemo_path: - # '.nemo' file contains the last checkpoint and the params to initialize the model - best_eval_model.save_to(cfg.model.nemo_path) - logging.info(f'Model is saved into `.nemo` file: {cfg.model.nemo_path}') - - # perform inference on a list of queries. - if "infer_samples" in cfg.model and cfg.model.infer_samples: - logging.info("===========================================================================================") - logging.info("Starting the inference on some sample queries...") - - # max_seq_length=512 is the maximum length BERT supports. - results = best_eval_model.cuda().ptune_inference( - queries=cfg.model.infer_samples, batch_size=1, decode_token_len=5 - ) - logging.info('The prediction results of some sample queries with the trained model:') - for query, result in zip(cfg.model.infer_samples, results): - logging.info(f'Query : {query}') - logging.info(f'Predicted label: {result}') - - logging.info("Inference finished!") - logging.info("===========================================================================================") - - -if __name__ == '__main__': - main() diff --git a/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py b/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py index 62fe80a663ed..f51a809654ad 100644 --- a/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py +++ b/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py @@ -20,7 +20,7 @@ from nemo.collections.nlp.models.language_modeling.megatron_finetune_model import MegatronT5FinetuneModel from nemo.collections.nlp.models.language_modeling.megatron_glue_model import MegatronT5GLUEModel -from nemo.collections.nlp.parts.nlp_overrides import GradScaler, MegatronHalfPrecisionPlugin, NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import GradScaler, MegatronHalfPrecisionPlugin, NLPDDPStrategy from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import StatelessTimer, exp_manager @@ -32,13 +32,12 @@ def main(cfg) -> None: logging.info(f'\n{OmegaConf.to_yaml(cfg)}') megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - plugins = [ - NLPDDPPlugin( - no_ddp_communication_hook=True, - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - ] + plugins = [] + strategy = NLPDDPStrategy( + no_ddp_communication_hook=True, + gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, + find_unused_parameters=False, + ) if cfg.trainer.precision in [16, 'bf16']: scaler = None if cfg.trainer.precision == 16: @@ -55,7 +54,7 @@ def main(cfg) -> None: if cfg.get('cluster_type', None) == 'BCP': 
plugins.append(TorchElasticEnvironment()) - trainer = Trainer(plugins=plugins, **cfg.trainer) + trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer) exp_manager(trainer, cfg.exp_manager) diff --git a/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py b/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py index 5f22aa1cd847..6aa3a515d0bd 100644 --- a/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py +++ b/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py @@ -24,7 +24,7 @@ from nemo.collections.nlp.parts.nlp_overrides import ( GradScaler, MegatronHalfPrecisionPlugin, - NLPDDPPlugin, + NLPDDPStrategy, NLPSaveRestoreConnector, PipelineMixedPrecisionPlugin, ) @@ -39,13 +39,12 @@ def main(cfg) -> None: logging.info(f'\n{OmegaConf.to_yaml(cfg)}') megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - plugins = [ - NLPDDPPlugin( - no_ddp_communication_hook=True, - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - ] + plugins = [] + strategy = NLPDDPStrategy( + no_ddp_communication_hook=True, + gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, + find_unused_parameters=False, + ) if cfg.trainer.precision in [16, 'bf16']: scaler = None if cfg.trainer.precision == 16: @@ -62,7 +61,7 @@ def main(cfg) -> None: if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) - trainer = Trainer(plugins=plugins, **cfg.trainer) + trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer) exp_manager(trainer, cfg.exp_manager) diff --git a/examples/nlp/machine_translation/enc_dec_nmt-bottleneck.py b/examples/nlp/machine_translation/enc_dec_nmt-bottleneck.py index 5b2a036736ba..87898da15643 100644 --- a/examples/nlp/machine_translation/enc_dec_nmt-bottleneck.py +++ b/examples/nlp/machine_translation/enc_dec_nmt-bottleneck.py @@ -21,7 +21,7 @@ from nemo.collections.nlp.data.machine_translation.preproc_mt_data import MTDataPreproc from nemo.collections.nlp.models.machine_translation.mt_enc_dec_bottleneck_model import MTBottleneckModel from nemo.collections.nlp.models.machine_translation.mt_enc_dec_config import MTBottleneckModelConfig -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy from nemo.core.config import hydra_runner from nemo.core.config.modelPT import NemoConfig from nemo.core.config.pytorch_lightning import TrainerConfig @@ -116,8 +116,8 @@ def main(cfg: MTBottleneckConfig) -> None: # training is managed by PyTorch Lightning trainer_cfg = OmegaConf.to_container(cfg.trainer) - trainer_cfg.pop('plugins', None) - trainer = Trainer(plugins=[NLPDDPPlugin()], **trainer_cfg) + trainer_cfg.pop('strategy', None) + trainer = Trainer(strategy=NLPDDPStrategy(), **trainer_cfg) # tokenizers will be trained and and tarred training data will be created if needed # model config is then updated diff --git a/examples/nlp/machine_translation/enc_dec_nmt.py b/examples/nlp/machine_translation/enc_dec_nmt.py index e6a93c0bba9e..bdb501d7e7dd 100644 --- a/examples/nlp/machine_translation/enc_dec_nmt.py +++ b/examples/nlp/machine_translation/enc_dec_nmt.py @@ -21,7 +21,7 @@ from nemo.collections.nlp.data.machine_translation.preproc_mt_data import MTDataPreproc from nemo.collections.nlp.models.machine_translation.mt_enc_dec_config import MTEncDecModelConfig from nemo.collections.nlp.models.machine_translation.mt_enc_dec_model import MTEncDecModel -from 
nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy from nemo.core.config import hydra_runner from nemo.core.config.modelPT import NemoConfig from nemo.core.config.pytorch_lightning import TrainerConfig @@ -110,8 +110,8 @@ def main(cfg: MTEncDecConfig) -> None: # training is managed by PyTorch Lightning trainer_cfg = OmegaConf.to_container(cfg.trainer) - trainer_cfg.pop('plugins', None) - trainer = Trainer(plugins=[NLPDDPPlugin()], **trainer_cfg) + trainer_cfg.pop('strategy', None) + trainer = Trainer(strategy=NLPDDPStrategy(), **trainer_cfg) # tokenizers will be trained and and tarred training data will be created if needed # model config is then updated diff --git a/examples/nlp/machine_translation/enc_dec_nmt_finetune.py b/examples/nlp/machine_translation/enc_dec_nmt_finetune.py index a67067beb455..c5540f72607e 100644 --- a/examples/nlp/machine_translation/enc_dec_nmt_finetune.py +++ b/examples/nlp/machine_translation/enc_dec_nmt_finetune.py @@ -22,7 +22,7 @@ from nemo.collections.nlp.data.machine_translation.preproc_mt_data import MTDataPreproc from nemo.collections.nlp.models.machine_translation.mt_enc_dec_config import MTEncDecModelConfig from nemo.collections.nlp.models.machine_translation.mt_enc_dec_model import MTEncDecModel -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy from nemo.core.config import hydra_runner from nemo.core.config.modelPT import NemoConfig from nemo.core.config.pytorch_lightning import TrainerConfig @@ -78,8 +78,8 @@ def main(cfg: MTFineTuneConfig) -> None: # training is managed by PyTorch Lightning trainer_cfg = OmegaConf.to_container(cfg.trainer) - trainer_cfg.pop('plugins', None) - trainer = Trainer(plugins=[NLPDDPPlugin()], **trainer_cfg) + trainer_cfg.pop('strategy', None) + trainer = Trainer(strategy=NLPDDPStrategy(), **trainer_cfg) # experiment logs, checkpoints, and auto-resume are managed by exp_manager and PyTorch Lightning exp_manager(trainer, cfg.exp_manager) diff --git a/examples/nlp/machine_translation/megatron_nmt_training.py b/examples/nlp/machine_translation/megatron_nmt_training.py index 97c09ea00e38..b5dbdfc1de7f 100644 --- a/examples/nlp/machine_translation/megatron_nmt_training.py +++ b/examples/nlp/machine_translation/megatron_nmt_training.py @@ -27,7 +27,7 @@ from nemo.collections.nlp.parts.nlp_overrides import ( GradScaler, MegatronHalfPrecisionPlugin, - NLPDDPPlugin, + NLPDDPStrategy, NLPSaveRestoreConnector, PipelineMixedPrecisionPlugin, ) @@ -42,13 +42,12 @@ def main(cfg) -> None: logging.info(f'\n{OmegaConf.to_yaml(cfg)}') megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - plugins = [ - NLPDDPPlugin( - no_ddp_communication_hook=True, - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - ] + plugins = [] + strategy = NLPDDPStrategy( + no_ddp_communication_hook=True, + gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, + find_unused_parameters=False, + ) if cfg.trainer.precision in [16, 'bf16']: scaler = None if cfg.trainer.precision == 16: @@ -65,7 +64,7 @@ def main(cfg) -> None: if cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) - trainer = Trainer(plugins=plugins, **cfg.trainer, callbacks=[ModelSummary(max_depth=3)]) + trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer, callbacks=[ModelSummary(max_depth=3)]) # tokenizers will be trained and and tarred 
training data will be created if needed # model config is then updated diff --git a/examples/nlp/machine_translation/nmt_transformer_infer_megatron.py b/examples/nlp/machine_translation/nmt_transformer_infer_megatron.py index b1c6de1254c8..a8d87f71dcbe 100644 --- a/examples/nlp/machine_translation/nmt_transformer_infer_megatron.py +++ b/examples/nlp/machine_translation/nmt_transformer_infer_megatron.py @@ -29,7 +29,7 @@ from nemo.collections.nlp.models.machine_translation.megatron_nmt_model import MegatronNMTModel from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin, NLPSaveRestoreConnector +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.app_state import AppState @@ -48,7 +48,7 @@ def main(cfg) -> None: # trainer required for restoring model parallel models - trainer = Trainer(plugins=NLPDDPPlugin(), **cfg.trainer) + trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) assert ( cfg.trainer.devices * cfg.trainer.num_nodes == cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size diff --git a/examples/nlp/question_answering/conf/question_answering_squad_config.yaml b/examples/nlp/question_answering/conf/question_answering_squad_config.yaml index f0e677441c5e..2e54b6fecc7e 100644 --- a/examples/nlp/question_answering/conf/question_answering_squad_config.yaml +++ b/examples/nlp/question_answering/conf/question_answering_squad_config.yaml @@ -7,7 +7,7 @@ trainer: devices: 1 # the number of gpus, 0 for CPU, or list with gpu indices num_nodes: 1 max_epochs: 2 # the number of training epochs - max_steps: null # precedence over max_epochs + max_steps: -1 # precedence over max_epochs accumulate_grad_batches: 1 # accumulates grads every k batches precision: 16 # 16 to use AMP accelerator: gpu diff --git a/examples/nlp/text2sparql/conf/text2sparql_config.yaml b/examples/nlp/text2sparql/conf/text2sparql_config.yaml index 21de59b6cb14..b9823e79b050 100644 --- a/examples/nlp/text2sparql/conf/text2sparql_config.yaml +++ b/examples/nlp/text2sparql/conf/text2sparql_config.yaml @@ -6,7 +6,7 @@ trainer: devices: 1 # the number of gpus, 0 for CPU, or list with gpu indices num_nodes: 1 max_epochs: 2 # the number of training epochs - max_steps: null # precedence over max_epochs + max_steps: -1 # precedence over max_epochs accumulate_grad_batches: 1 # accumulates grads every k batches accelerator: gpu strategy: ddp diff --git a/examples/nlp/text_classification/conf/ptune_text_classification_config.yaml b/examples/nlp/text_classification/conf/ptune_text_classification_config.yaml index 9f91620bdf0a..be6499f9a643 100644 --- a/examples/nlp/text_classification/conf/ptune_text_classification_config.yaml +++ b/examples/nlp/text_classification/conf/ptune_text_classification_config.yaml @@ -18,7 +18,7 @@ trainer: devices: 1 # number of GPUs (0 for CPU), or list of the GPUs to use e.g. [0, 1] num_nodes: 1 max_epochs: 100 - max_steps: null # precedence over max_epochs + max_steps: -1 # precedence over max_epochs accumulate_grad_batches: 1 # accumulates grads every k batches gradient_clip_val: 0.0 precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. 
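The script hunks above all apply the same PTL 1.6 -> 1.7 migration, so the following sketch condenses it for reference. It is illustrative only and not part of the patch: build_trainer is a hypothetical helper, and the precision/cluster plugin handling is reduced to a bare list.

# Sketch of the pattern used by the example scripts in this patch:
# DDP customization moves from plugins=[NLPDDPPlugin(...)] to strategy=NLPDDPStrategy(...),
# while precision and cluster-environment plugins remain in the plugins list.
from pytorch_lightning import Trainer
from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy

def build_trainer(cfg):  # hypothetical helper; cfg is a Hydra config shaped like the ones above
    strategy = NLPDDPStrategy(
        no_ddp_communication_hook=True,  # async grad allreduce is handled outside DDP
        gradient_as_bucket_view=cfg.model.gradient_as_bucket_view,
        find_unused_parameters=False,
    )
    plugins = []  # GradScaler-backed precision plugins and TorchElasticEnvironment are appended here as before
    return Trainer(plugins=plugins, strategy=strategy, **cfg.trainer)

Note that the trainer sections of the accompanying configs no longer carry a strategy key, so the explicit strategy= argument does not collide with **cfg.trainer.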
diff --git a/examples/nlp/text_classification/conf/text_classification_config.yaml b/examples/nlp/text_classification/conf/text_classification_config.yaml index af979929edc0..abc81ebdd0d3 100644 --- a/examples/nlp/text_classification/conf/text_classification_config.yaml +++ b/examples/nlp/text_classification/conf/text_classification_config.yaml @@ -18,7 +18,7 @@ trainer: devices: 1 # number of GPUs (0 for CPU), or list of the GPUs to use e.g. [0, 1] num_nodes: 1 max_epochs: 100 - max_steps: null # precedence over max_epochs + max_steps: -1 # precedence over max_epochs accumulate_grad_batches: 1 # accumulates grads every k batches gradient_clip_val: 0.0 precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. diff --git a/examples/nlp/text_classification/model_parallel_text_classification_evaluation.py b/examples/nlp/text_classification/model_parallel_text_classification_evaluation.py index 255a27ee79b4..ab3322f552c1 100644 --- a/examples/nlp/text_classification/model_parallel_text_classification_evaluation.py +++ b/examples/nlp/text_classification/model_parallel_text_classification_evaluation.py @@ -19,7 +19,7 @@ from omegaconf import DictConfig, OmegaConf from nemo.collections.nlp.models.text_classification import TextClassificationModel -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager @@ -28,7 +28,7 @@ @hydra_runner(config_path="conf", config_name="text_classification_config") def main(cfg: DictConfig) -> None: logging.info(f'\nConfig Params:\n{OmegaConf.to_yaml(cfg)}') - trainer = pl.Trainer(plugins=[NLPDDPPlugin()], **cfg.trainer) + trainer = pl.Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) exp_manager(trainer, cfg.get("exp_manager", None)) # TODO: can we drop strict=False model = TextClassificationModel.restore_from(cfg.model.nemo_path, trainer=trainer, strict=False) diff --git a/examples/nlp/text_classification/text_classification_with_bert.py b/examples/nlp/text_classification/text_classification_with_bert.py index 2ae9469fb998..db3d263cb1ab 100644 --- a/examples/nlp/text_classification/text_classification_with_bert.py +++ b/examples/nlp/text_classification/text_classification_with_bert.py @@ -99,7 +99,7 @@ from omegaconf import DictConfig, OmegaConf from nemo.collections.nlp.models.text_classification import TextClassificationModel -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager @@ -109,11 +109,11 @@ def main(cfg: DictConfig) -> None: logging.info(f'\nConfig Params:\n{OmegaConf.to_yaml(cfg)}') try: - plugin = NLPDDPPlugin() + strategy = NLPDDPStrategy() except (ImportError, ModuleNotFoundError): - plugin = None + strategy = None - trainer = pl.Trainer(plugins=plugin, **cfg.trainer) + trainer = pl.Trainer(strategy=strategy, **cfg.trainer) exp_manager(trainer, cfg.get("exp_manager", None)) if not cfg.model.train_ds.file_path: diff --git a/examples/nlp/text_normalization_as_tagging/conf/thutmose_tagger_itn_config.yaml b/examples/nlp/text_normalization_as_tagging/conf/thutmose_tagger_itn_config.yaml index 37ee85e7c53b..a95947b4aff3 100644 --- a/examples/nlp/text_normalization_as_tagging/conf/thutmose_tagger_itn_config.yaml +++ 
b/examples/nlp/text_normalization_as_tagging/conf/thutmose_tagger_itn_config.yaml @@ -8,7 +8,7 @@ trainer: devices: 1 # the number of gpus, 0 for CPU num_nodes: 1 max_epochs: 3 # the number of training epochs - checkpoint_callback: false # provided by exp_manager + enable_checkpointing: false # provided by exp_manager logger: false # provided by exp_manager accumulate_grad_batches: 1 # accumulates grads every k batches gradient_clip_val: 0.0 diff --git a/examples/nlp/token_classification/conf/punctuation_capitalization_config.yaml b/examples/nlp/token_classification/conf/punctuation_capitalization_config.yaml index 0cb51f87dcc5..76f5fac1db7a 100644 --- a/examples/nlp/token_classification/conf/punctuation_capitalization_config.yaml +++ b/examples/nlp/token_classification/conf/punctuation_capitalization_config.yaml @@ -21,7 +21,7 @@ trainer: devices: 1 # the number of gpus, 0 for CPU num_nodes: 1 max_epochs: 3 - max_steps: null # precedence over max_epochs + max_steps: -1 # precedence over max_epochs accumulate_grad_batches: 1 # accumulates grads every k batches gradient_clip_val: 0.0 precision: 16 # Should be set to 16 for O1 and O2, default is 16 as PT ignores it when am_level is O0 diff --git a/examples/nlp/token_classification/conf/token_classification_config.yaml b/examples/nlp/token_classification/conf/token_classification_config.yaml index 1a5135dfc31d..b79baf4372de 100644 --- a/examples/nlp/token_classification/conf/token_classification_config.yaml +++ b/examples/nlp/token_classification/conf/token_classification_config.yaml @@ -20,7 +20,7 @@ trainer: devices: 1 # the number of gpus, 0 for CPU num_nodes: 1 max_epochs: 5 - max_steps: null # precedence over max_epochs + max_steps: -1 # precedence over max_epochs accumulate_grad_batches: 1 # accumulates grads every k batches gradient_clip_val: 0.0 precision: 16 # Should be set to 16 for O1 and O2, default is 16 as PT ignores it when am_level is O0 diff --git a/examples/nlp/token_classification/token_classification_train.py b/examples/nlp/token_classification/token_classification_train.py index 3e3b0a245737..9b18d10b24e6 100644 --- a/examples/nlp/token_classification/token_classification_train.py +++ b/examples/nlp/token_classification/token_classification_train.py @@ -18,7 +18,7 @@ from omegaconf import DictConfig, OmegaConf from nemo.collections.nlp.models import TokenClassificationModel -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager @@ -103,11 +103,11 @@ @hydra_runner(config_path="conf", config_name="token_classification_config") def main(cfg: DictConfig) -> None: try: - plugin = NLPDDPPlugin() + strategy = NLPDDPStrategy() except (ImportError, ModuleNotFoundError): - plugin = None + strategy = None - trainer = pl.Trainer(plugins=plugin, **cfg.trainer) + trainer = pl.Trainer(strategy=strategy, **cfg.trainer) exp_manager(trainer, cfg.get("exp_manager", None)) if not cfg.pretrained_model: diff --git a/examples/nlp/zero_shot_intent_recognition/conf/zero_shot_intent_config.yaml b/examples/nlp/zero_shot_intent_recognition/conf/zero_shot_intent_config.yaml index a3f80d9cccbd..0817c486a089 100644 --- a/examples/nlp/zero_shot_intent_recognition/conf/zero_shot_intent_config.yaml +++ b/examples/nlp/zero_shot_intent_recognition/conf/zero_shot_intent_config.yaml @@ -17,7 +17,7 @@ trainer: devices: 1 # the number of gpus, 0 for CPU num_nodes: 
1 max_epochs: 1 - max_steps: null # precedence over max_epochs + max_steps: -1 # precedence over max_epochs accumulate_grad_batches: 1 # accumulates grads every k batches precision: 16 accelerator: gpu diff --git a/examples/speaker_tasks/recognition/conf/SpeakerNet_recognition_3x2x512.yaml b/examples/speaker_tasks/recognition/conf/SpeakerNet_recognition_3x2x512.yaml index a4ec0a19b76e..bf7aa7d00aff 100644 --- a/examples/speaker_tasks/recognition/conf/SpeakerNet_recognition_3x2x512.yaml +++ b/examples/speaker_tasks/recognition/conf/SpeakerNet_recognition_3x2x512.yaml @@ -122,7 +122,7 @@ model: sched: name: CosineAnnealing iters_per_batch: 1 # computed at runtime - max_steps: null # computed at runtime or explicitly set here + max_steps: -1 # computed at runtime or explicitly set here # scheduler config override args: @@ -136,7 +136,7 @@ model: trainer: devices: 1 # number of gpus max_epochs: 200 - max_steps: null # computed at runtime if not set + max_steps: -1 # computed at runtime if not set num_nodes: 1 accelerator: gpu strategy: ddp diff --git a/nemo/collections/asr/modules/conv_asr.py b/nemo/collections/asr/modules/conv_asr.py index 85431604a883..e4a798dd5fae 100644 --- a/nemo/collections/asr/modules/conv_asr.py +++ b/nemo/collections/asr/modules/conv_asr.py @@ -763,7 +763,7 @@ class SpeakerDecoder(NeuralModule, Exportable): num_classes (int): Number of unique speakers in dataset emb_sizes (list) : shapes of intermediate embedding layers (we consider speaker embbeddings from 1st of this layers) Defaults to [1024,1024] - pool_mode (str) : Pooling stratergy type. options are 'xvector','tap', 'attention' + pool_mode (str) : Pooling strategy type. options are 'xvector','tap', 'attention' Defaults to 'xvector (mean and variance)' tap (temporal average pooling: just mean) attention (attention based pooling) diff --git a/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py b/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py index 4acffb63f4b0..27da5bd1812c 100644 --- a/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py +++ b/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py @@ -603,10 +603,6 @@ def build_memmap_dataset_from_config(self, cfg: DictConfig): data_prefix.append(weight) data_prefix.append(prefix) - if self.trainer.max_steps is None: - raise ValueError( - f"trainer.max_steps must be set to use blendable memmap datasets. Found {self.trainer.max_steps}." 
- ) num_train_samples = [self.trainer.max_steps * self._cfg.global_batch_size] _, _, num_train_samples_per_dataset = get_datasets_weights_and_num_samples(data_prefix, num_train_samples) num_train_samples_after_blend = sum([x[0] for x in num_train_samples_per_dataset]) diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index e826947e3755..2738041ecf85 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -29,7 +29,7 @@ from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import NativeMixedPrecisionPlugin from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin -from pytorch_lightning.plugins.training_type.ddp import DDPPlugin +from pytorch_lightning.strategies.ddp import DDPStrategy from pytorch_lightning.trainer.trainer import Trainer from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.fetching import DataFetcher @@ -54,7 +54,7 @@ HAVE_APEX = False -class NLPDDPPlugin(DDPPlugin): +class NLPDDPStrategy(DDPStrategy): """ DDP plugin for Pytorch Lightning. Needed to customize DDP for model parallel models. Args: @@ -219,7 +219,7 @@ def distributed_sampler_kwargs(self): return distributed_sampler_kwargs else: - return super(NLPDDPPlugin, self).distributed_sampler_kwargs + return super(NLPDDPStrategy, self).distributed_sampler_kwargs class NLPSaveRestoreConnector(SaveRestoreConnector): diff --git a/nemo/core/classes/modelPT.py b/nemo/core/classes/modelPT.py index 8bab7c573ac1..032ce63c2dc3 100644 --- a/nemo/core/classes/modelPT.py +++ b/nemo/core/classes/modelPT.py @@ -483,7 +483,7 @@ def setup_optimization(self, optim_config: Optional[Union[DictConfig, Dict]] = N if not isinstance(self._trainer.accumulate_grad_batches, int): raise ValueError("We do not currently support gradient acculumation that is not an integer.") - if self._trainer.max_steps is None or self.trainer.max_steps < 0: + if self.trainer.max_steps < 0: # Store information needed to calculate max_steps optim_config['sched']['t_max_epochs'] = self._trainer.max_epochs optim_config['sched']['t_accumulate_grad_batches'] = self._trainer.accumulate_grad_batches @@ -1355,6 +1355,10 @@ def cfg(self): """ return self._cfg + @LightningModule.trainer.getter + def trainer(self): + return self._trainer + @cfg.setter def cfg(self, cfg): """ @@ -1469,3 +1473,34 @@ def on_train_batch_end(self, outputs, batch: Any, batch_idx: int, unused: int = if batch_idx == self._nsys_profile_end_step and get_rank() in self._nsys_profile_ranks: logging.info("====== End nsys profiling ======") torch.cuda.cudart().cudaProfilerStop() + + # TODO: Remove in PTL 1.7.2 + def cuda(self, device=None): + """ PTL is overriding this method and changing the pytorch behavior of a module. + The PTL LightingModule override will move the module to device 0 if device is None. + See the PTL method here: https://github.com/Lightning-AI/lightning/blob/master/src/pytorch_lightning/core/mixins/device_dtype_mixin.py#L113 + + Here we are overriding this to maintain the default Pytorch nn.module behavior: + https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/module.py#L728 + + Moves all model parameters and buffers to the GPU. + + This also makes associated parameters and buffers different objects. So + it should be called before constructing optimizer if the module will + live on GPU while being optimized. + + .. 
note:: + This method modifies the module in-place. + + Args: + device (int, optional): if specified, all parameters will be + copied to that device + + Returns: + Module: self + """ + if device is None: + device = torch.device("cuda", torch.cuda.current_device()) + elif isinstance(device, int): + device = torch.device("cuda", index=device) + return super().cuda(device=device) diff --git a/nemo/core/config/pytorch_lightning.py b/nemo/core/config/pytorch_lightning.py index 873a70ad4e0c..8ce3bdf236a0 100644 --- a/nemo/core/config/pytorch_lightning.py +++ b/nemo/core/config/pytorch_lightning.py @@ -36,17 +36,13 @@ class TrainerConfig: """ logger: Any = True - checkpoint_callback: Any = True callbacks: Optional[Any] = None default_root_dir: Optional[str] = None gradient_clip_val: float = 0 - process_position: int = 0 num_nodes: int = 1 gpus: Optional[Any] = None auto_select_gpus: bool = False tpu_cores: Optional[Any] = None - log_gpu_memory: Optional[str] = None - progress_bar_refresh_rate: int = 1 enable_progress_bar: bool = True overfit_batches: Any = 0.0 track_grad_norm: Any = -1 @@ -55,18 +51,16 @@ class TrainerConfig: accumulate_grad_batches: Any = 1 max_epochs: int = 1000 min_epochs: int = 1 - max_steps: Optional[int] = None + max_steps: Optional[int] = -1 min_steps: Optional[int] = None limit_train_batches: Any = 1.0 limit_val_batches: Any = 1.0 limit_test_batches: Any = 1.0 val_check_interval: Any = 1.0 - flush_logs_every_n_steps: int = 100 log_every_n_steps: int = 50 accelerator: Optional[str] = None sync_batchnorm: bool = False precision: Any = 32 - weights_summary: Optional[str] = "full" # ModelSummary.MODE_DEFAULT weights_save_path: Optional[str] = None num_sanity_val_steps: int = 2 resume_from_checkpoint: Optional[str] = None @@ -76,23 +70,20 @@ class TrainerConfig: auto_lr_find: Any = False replace_sampler_ddp: bool = True detect_anomaly: bool = False - terminate_on_nan: bool = False auto_scale_batch_size: Any = False - prepare_data_per_node: bool = True amp_backend: str = 'native' amp_level: Optional[str] = None plugins: Optional[Any] = None # Optional[Union[str, list]] move_metrics_to_cpu: bool = False multiple_trainloader_mode: str = 'max_size_cycle' limit_predict_batches: float = 1.0 - stochastic_weight_avg: bool = False gradient_clip_algorithm: str = 'norm' max_time: Optional[Any] = None # can be one of Union[str, timedelta, Dict[str, int], None] reload_dataloaders_every_n_epochs: int = 0 ipus: Optional[int] = None devices: Any = None strategy: Any = None - enable_checkpointing: bool = True + enable_checkpointing: bool = False enable_model_summary: bool = True diff --git a/nemo/core/optim/lr_scheduler.py b/nemo/core/optim/lr_scheduler.py index 269a80e2f536..922dc8636fe8 100644 --- a/nemo/core/optim/lr_scheduler.py +++ b/nemo/core/optim/lr_scheduler.py @@ -704,7 +704,7 @@ def prepare_lr_scheduler( sched: name: iters_per_batch: null # computed at runtime; mandatory to have - max_steps: null # computed at runtime or explicitly set here; mandatory to have + max_steps: -1 # computed at runtime or explicitly set here; mandatory to have # pytorch lightning args monitor: val_loss diff --git a/requirements/requirements_lightning.txt b/requirements/requirements_lightning.txt index d828e4796222..83df00e8e3e1 100644 --- a/requirements/requirements_lightning.txt +++ b/requirements/requirements_lightning.txt @@ -1,4 +1,4 @@ -pytorch-lightning>=1.6.1,<1.7.0 +pytorch-lightning>=1.7.0 torchmetrics>=0.4.1rc0 transformers>=4.0.1 webdataset>=0.1.48,<=0.1.62 diff --git 
a/scripts/checkpoint_averaging/megatron_checkpoint_averaging.py b/scripts/checkpoint_averaging/megatron_checkpoint_averaging.py index a48e1f3fd10f..4ac6a6546fde 100755 --- a/scripts/checkpoint_averaging/megatron_checkpoint_averaging.py +++ b/scripts/checkpoint_averaging/megatron_checkpoint_averaging.py @@ -37,7 +37,7 @@ import torch from pytorch_lightning.trainer.trainer import Trainer -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin, NLPSaveRestoreConnector +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector from nemo.core import ModelPT from nemo.utils import logging, model_utils @@ -71,7 +71,7 @@ def main(): device = torch.device("cpu") - trainer = Trainer(plugins=NLPDDPPlugin(), devices=1, num_nodes=1, precision=16, accelerator='gpu') + trainer = Trainer(strategy=NLPDDPStrategy(), devices=1, num_nodes=1, precision=16, accelerator='gpu') # loop over all folders with .nemo files (or .nemo files) for model_fname_i, model_fname in enumerate(args.model_fname_list): if not model_fname.endswith(".nemo"): diff --git a/scripts/export.py b/scripts/export.py index f8cd98d7d7b6..5c0bc2a0a6f1 100644 --- a/scripts/export.py +++ b/scripts/export.py @@ -90,7 +90,7 @@ def nemo_export(argv): num_nodes=1, # Need to set the following two to False as ExpManager will take care of them differently. logger=False, - checkpoint_callback=False, + enable_checkpointing=False, ) trainer = Trainer(cfg_trainer) diff --git a/scripts/nemo_legacy_import/nlp_checkpoint_port.py b/scripts/nemo_legacy_import/nlp_checkpoint_port.py index 55dc3e63984b..a0a83d761f92 100644 --- a/scripts/nemo_legacy_import/nlp_checkpoint_port.py +++ b/scripts/nemo_legacy_import/nlp_checkpoint_port.py @@ -88,7 +88,7 @@ def nemo_convert(argv): num_nodes=1, # Need to set the following two to False as ExpManager will take care of them differently. 
logger=False, - checkpoint_callback=False, + enable_checkpointing=False, ) trainer = pl.Trainer(cfg_trainer) diff --git a/scripts/speaker_tasks/filelist_to_manifest.py b/scripts/speaker_tasks/filelist_to_manifest.py index 49e4b97bb3e8..4369c177aef0 100644 --- a/scripts/speaker_tasks/filelist_to_manifest.py +++ b/scripts/speaker_tasks/filelist_to_manifest.py @@ -46,7 +46,7 @@ --min_spkrs_count: min number of samples per speaker to consider and ignore otherwise, defaults to 0 (all speakers) """ -DURATIONS = sorted([1, 2, 3, 4], reverse=True) +DURATIONS = sorted([3], reverse=True) MIN_ENERGY = 0.01 CWD = os.getcwd() diff --git a/tests/collections/asr/numba/rnnt_loss/test_rnnt_pytorch.py b/tests/collections/asr/numba/rnnt_loss/test_rnnt_pytorch.py index dc08d41bbd60..d303e5355bf9 100644 --- a/tests/collections/asr/numba/rnnt_loss/test_rnnt_pytorch.py +++ b/tests/collections/asr/numba/rnnt_loss/test_rnnt_pytorch.py @@ -431,7 +431,7 @@ def zero_grad(): pt_cost2, _ = wrap_and_call(fn_pt, acts2, labels2, device) pt_grads1_p_2 = base_layer.grad.clone().cpu().numpy() - assert np.allclose(pt_grads1_p_2, np_grads1 + np_grads2, atol=1e-6) + assert np.allclose(pt_grads1_p_2, np_grads1 + np_grads2, atol=1e-5) if __name__ == "__main__": diff --git a/tests/collections/nlp/test_gpt_eval.py b/tests/collections/nlp/test_gpt_eval.py index 2871a35ed8f8..0e64b989176f 100644 --- a/tests/collections/nlp/test_gpt_eval.py +++ b/tests/collections/nlp/test_gpt_eval.py @@ -20,7 +20,7 @@ from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy class TestGPTEval: @@ -38,7 +38,7 @@ def setup_method(self, test_method): model_file = '/home/TestData/nlp/megatron_gpt/125M/megatron_gpt.nemo' # trainer required for restoring model parallel models - trainer = Trainer(plugins=NLPDDPPlugin(), **trainer_config) + trainer = Trainer(strategy=NLPDDPStrategy(), **trainer_config) assert ( trainer_config["devices"] * trainer_config['num_nodes'] == tensor_model_parallel_size * pipeline_model_parallel_size diff --git a/tests/collections/nlp/test_gpt_model.py b/tests/collections/nlp/test_gpt_model.py index 9e125c1a1a4e..0c9104d06245 100644 --- a/tests/collections/nlp/test_gpt_model.py +++ b/tests/collections/nlp/test_gpt_model.py @@ -22,7 +22,7 @@ from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy DEVICE_CAPABILITY = None if torch.cuda.is_available(): @@ -133,9 +133,9 @@ def gpt_model(model_cfg, trainer_cfg, precision): model_cfg['precision'] = precision trainer_cfg['precision'] = precision - plugins = [NLPDDPPlugin()] + strategy = NLPDDPStrategy() - trainer = Trainer(plugins=plugins, **trainer_cfg) + trainer = Trainer(strategy=strategy, **trainer_cfg) cfg = DictConfig(model_cfg) diff --git a/tests/collections/nlp/test_nlp_exportables.py b/tests/collections/nlp/test_nlp_exportables.py index 21f65ec5d94b..da7afa90213e 100644 --- a/tests/collections/nlp/test_nlp_exportables.py +++ 
b/tests/collections/nlp/test_nlp_exportables.py @@ -87,6 +87,11 @@ def test_IntentSlotClassificationModel_export_to_onnx(self, dummy_data): config.trainer.devices = 1 config.trainer.precision = 32 config.trainer.strategy = None + config.trainer.max_steps = -1 + if ( + 'checkpoint_callback' in config.trainer + ): # TODO: Update this to create deafult config rather than pulling from git + del config.trainer.checkpoint_callback trainer = pl.Trainer(**config.trainer) model = IntentSlotClassificationModel(config.model, trainer=trainer) filename = os.path.join(tmpdir, 'isc.onnx') diff --git a/tests/collections/nlp/test_retrieval_module.py b/tests/collections/nlp/test_retrieval_module.py index 60fe81356a0d..b5da4e20085f 100644 --- a/tests/collections/nlp/test_retrieval_module.py +++ b/tests/collections/nlp/test_retrieval_module.py @@ -34,7 +34,7 @@ init_method_normal, scaled_init_method_normal, ) -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy try: from apex.transformer.enums import AttnMaskType @@ -52,15 +52,12 @@ def setup_class(cls): if not torch.cuda.is_available(): return GPUS = 1 - plugins = [NLPDDPPlugin()] TP_SIZE = GPUS PP_SIZE = 1 MB_SIZE = 4 GB_SIZE = 8 SEED = 1234 - trainer = Trainer( - plugins=plugins, devices=GPUS, accelerator='gpu', num_nodes=1, logger=None, log_gpu_memory=None - ) + trainer = Trainer(strategy=NLPDDPStrategy(), devices=GPUS, accelerator='gpu', num_nodes=1, logger=None,) initialize_model_parallel_for_nemo( world_size=trainer.world_size, diff --git a/tests/collections/nlp/test_retrieval_module_inference.py b/tests/collections/nlp/test_retrieval_module_inference.py index 437679c37478..fa74f8b14df7 100644 --- a/tests/collections/nlp/test_retrieval_module_inference.py +++ b/tests/collections/nlp/test_retrieval_module_inference.py @@ -35,7 +35,7 @@ init_method_normal, scaled_init_method_normal, ) -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy try: from apex.transformer.enums import AttnMaskType @@ -53,15 +53,12 @@ def setup_class(cls): if not torch.cuda.is_available(): return GPUS = 1 - plugins = [NLPDDPPlugin()] TP_SIZE = GPUS PP_SIZE = 1 MB_SIZE = 4 GB_SIZE = 8 SEED = 1234 - trainer = Trainer( - plugins=plugins, devices=GPUS, accelerator='gpu', num_nodes=1, logger=None, log_gpu_memory=None - ) + trainer = Trainer(strategy=NLPDDPStrategy(), devices=GPUS, accelerator='gpu', num_nodes=1, logger=None,) initialize_model_parallel_for_nemo( world_size=trainer.world_size, diff --git a/tests/core/test_optimizers_schedulers.py b/tests/core/test_optimizers_schedulers.py index 2bf827f61283..b0e0eee17216 100644 --- a/tests/core/test_optimizers_schedulers.py +++ b/tests/core/test_optimizers_schedulers.py @@ -876,8 +876,7 @@ def train( accumulate_grad_batches=accumulate_grad_batches, limit_train_batches=limit_train_batches, enable_checkpointing=False, - progress_bar_refresh_rate=0, - weights_summary=None, + enable_progress_bar=False, ) max_steps = optim.lr_scheduler.compute_max_steps( max_epochs, accumulate_grad_batches, limit_train_batches, devices, dataset_len, batch_size, drop_last, @@ -952,8 +951,7 @@ def train( accumulate_grad_batches=accumulate_grad_batches, limit_train_batches=limit_train_batches, enable_checkpointing=False, - progress_bar_refresh_rate=0, - weights_summary=None, + enable_progress_bar=False, ) model = ExampleModel(batch_size, dataset_len, drop_last, max_steps) 
        trainer.callbacks.append(SchedulerNoOpCallback())
diff --git a/tests/core_ptl/test_ptl_stateless_timer.py b/tests/core_ptl/test_ptl_stateless_timer.py
index c8ef019fcf9d..c78ee18edb87 100644
--- a/tests/core_ptl/test_ptl_stateless_timer.py
+++ b/tests/core_ptl/test_ptl_stateless_timer.py
@@ -96,8 +96,8 @@ def setup_model(self):
             accelerator='gpu',
             strategy='ddp',
             logger=None,
+            enable_checkpointing=False,
             callbacks=[StatelessTimer('00:00:00:03')],
-            checkpoint_callback=False,
         )
         exp_manager_cfg = ExpManagerConfig(
             explicit_log_dir='./ptl_stateless_timer_check/',
diff --git a/tutorials/00_NeMo_Primer.ipynb b/tutorials/00_NeMo_Primer.ipynb
index 761dc2748791..0923f87ee517 100644
--- a/tutorials/00_NeMo_Primer.ipynb
+++ b/tutorials/00_NeMo_Primer.ipynb
@@ -652,7 +652,7 @@
     "      name: CosineAnnealing\n",
     "\n",
     "      # Optional arguments\n",
-    "      max_steps: null # computed at runtime or explicitly set here\n",
+    "      max_steps: -1 # computed at runtime or explicitly set here\n",
     "      monitor: val_loss\n",
     "      reduce_on_plateau: false\n",
     "\n",
diff --git a/tutorials/asr/Multilang_ASR.ipynb b/tutorials/asr/Multilang_ASR.ipynb
index 96c73b668536..a6bc8adfe4c9 100644
--- a/tutorials/asr/Multilang_ASR.ipynb
+++ b/tutorials/asr/Multilang_ASR.ipynb
@@ -1550,10 +1550,10 @@
     "                     max_epochs=MAX_EPOCHS, \n",
     "                     accumulate_grad_batches=GRAD_ACCUM,\n",
     "                     precision=16,\n",
-    "                     checkpoint_callback=False,\n",
+    "                     enable_checkpointing=False,\n",
     "                     logger=False,\n",
     "                     log_every_n_steps=LOG_EVERY_N_STEPS,\n",
-    "                     progress_bar_refresh_rate=1,\n",
+    "                     enable_progress_bar=True,\n",
     "                     check_val_every_n_epoch=1)"
    ]
   },
@@ -5032,4 +5032,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 4
-}
\ No newline at end of file
+}
diff --git a/tutorials/asr/Speech_Commands.ipynb b/tutorials/asr/Speech_Commands.ipynb
index c82f2cfab549..5687b4f1600f 100644
--- a/tutorials/asr/Speech_Commands.ipynb
+++ b/tutorials/asr/Speech_Commands.ipynb
@@ -1323,7 +1323,7 @@
     "    devices=1,\n",
     "    accelerator=accelerator,\n",
     "    max_epochs=5,\n",
-    "    max_steps=None, # computed at runtime if not set\n",
+    "    max_steps=-1, # computed at runtime if not set\n",
     "    num_nodes=1,\n",
     "    accumulate_grad_batches=1,\n",
     "    enable_checkpointing=False, # Provided by exp_manager\n",
@@ -1623,4 +1623,4 @@
   ]
  }
 ]
-}
\ No newline at end of file
+}
diff --git a/tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb b/tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb
index f4b8e66b3892..abbd1b09537f 100644
--- a/tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb
+++ b/tutorials/nlp/Multitask_Prompt_and_PTuning.ipynb
@@ -823,7 +823,7 @@
    "source": [
     "import torch\n",
     "import pytorch_lightning as pl\n",
-    "from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin\n",
+    "from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy\n",
     "from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment\n",
     "\n",
     "# lets modify some trainer configs\n",
@@ -848,8 +848,8 @@
     "os.environ[\"RANK\"] = '0'\n",
     "os.environ[\"WORLD_SIZE\"] = '1'\n",
     "\n",
-    "plugins = [NLPDDPPlugin(find_unused_parameters=False, no_ddp_communication_hook=True), TorchElasticEnvironment()]\n",
-    "trainer = pl.Trainer(plugins=plugins, **config.trainer)\n",
+    "strategy = NLPDDPStrategy(find_unused_parameters=False, no_ddp_communication_hook=True)\n",
+    "trainer = pl.Trainer(strategy=strategy, plugins=[TorchElasticEnvironment()], **config.trainer)\n",
     "\n",
     "print(\"Trainer config - \\n\")\n",
     "print(OmegaConf.to_yaml(config.trainer))"
@@ -1136,8 +1136,8 @@
     "config.model.optim.lr = 5e-4\n",
     "\n",
     "# Reset the trainer\n",
-    "plugins = [NLPDDPPlugin(find_unused_parameters=False, no_ddp_communication_hook=True), TorchElasticEnvironment()]\n",
-    "trainer = pl.Trainer(plugins=plugins, **config.trainer)\n",
+    "strategy = NLPDDPStrategy(find_unused_parameters=False, no_ddp_communication_hook=True)\n",
+    "trainer = pl.Trainer(strategy=strategy, plugins=[TorchElasticEnvironment()], **config.trainer)\n",
     "\n",
     "print(\"Trainer config - \\n\")\n",
     "print(OmegaConf.to_yaml(config.trainer))"
diff --git a/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb b/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb
index 00f99db6e243..9d0ae82c3ebf 100644
--- a/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb
+++ b/tutorials/speaker_tasks/Speaker_Identification_Verification.ipynb
@@ -826,7 +826,7 @@
     "    devices=1,\n",
     "    accelerator=accelerator,\n",
     "    max_epochs=5,\n",
-    "    max_steps=None, # computed at runtime if not set\n",
+    "    max_steps=-1, # computed at runtime if not set\n",
     "    num_nodes=1,\n",
     "    accumulate_grad_batches=1,\n",
     "    enable_checkpointing=False, # Provided by exp_manager\n",
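
For reference, the Trainer-construction pattern that the hunks above apply piecemeal can be summarized in one place. The following is an illustrative sketch only, not part of the patch: it assumes a NeMo install pinned to pytorch-lightning>=1.7.0 as in requirements_lightning.txt, and the helper name build_trainer as well as the devices/precision values are placeholders rather than anything taken from a specific file in this commit.

import pytorch_lightning as pl

from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy


def build_trainer() -> pl.Trainer:
    # NLPDDPStrategy replaces the removed NLPDDPPlugin and is passed via `strategy=`,
    # not `plugins=`; cluster-environment plugins still go through `plugins=`.
    strategy = NLPDDPStrategy(find_unused_parameters=False, no_ddp_communication_hook=True)
    return pl.Trainer(
        devices=1,
        num_nodes=1,
        accelerator='gpu',
        precision=16,
        strategy=strategy,
        max_steps=-1,                # -1 is the new "not set" sentinel that replaces max_steps=None
        enable_checkpointing=False,  # replaces checkpoint_callback=False; exp_manager owns checkpointing
        enable_progress_bar=True,    # replaces progress_bar_refresh_rate
        logger=False,
    )

The same renames drive the TrainerConfig changes above: arguments dropped in PTL 1.7 (checkpoint_callback, progress_bar_refresh_rate, weights_summary, and the other removed flags) disappear from the dataclass, and only their 1.7 counterparts such as enable_progress_bar and enable_checkpointing remain.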