From e32da155a197af399e109b0e102c8181d6d9da75 Mon Sep 17 00:00:00 2001 From: Shay Aharon <80472096+shaydeci@users.noreply.github.com> Date: Mon, 30 Jan 2023 16:57:35 +0200 Subject: [PATCH] Hotfix/sg 645 regression tests essential fixes (#669) * release tag removed for check * missing ckpt root in recipes fixed * warn instead of error if max batches longer than loader * __version__ update * release tag filter added back --- .circleci/config.yml | 56 +++++++++---------- src/super_gradients/__init__.py | 2 +- .../recipes/imagenet_mobilenetv3_large.yaml | 2 +- .../recipes/imagenet_mobilenetv3_small.yaml | 2 +- .../recipes/imagenet_repvgg.yaml | 2 +- .../recipes/imagenet_vit_base.yaml | 2 +- .../recipes/imagenet_vit_large.yaml | 2 +- .../training/sg_trainer/sg_trainer.py | 22 ++++---- 8 files changed, 45 insertions(+), 45 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 1c29c6b5fe..286e26d331 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -443,12 +443,12 @@ jobs: python3.8 -m pip install -r requirements.txt python3.8 -m pip install . 
python3.8 -m pip install torch==1.12.0+cu116 torchvision==0.13.0+cu116 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu116 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=imagenet_regnetY architecture=regnetY600 dataset_params.train_dataloader_params.batch_size=8 dataset_params.val_dataloader_params.batch_size=16 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=imagenet_regnetY architecture=regnetY800 dataset_params.train_dataloader_params.batch_size=8 dataset_params.val_dataloader_params.batch_size=16 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=imagenet_repvgg dataset_params.train_dataloader_params.batch_size=8 dataset_params.val_dataloader_params.batch_size=16 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=imagenet_resnet50 dataset_params.train_dataloader_params.batch_size=8 dataset_params.val_dataloader_params.batch_size=16 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=imagenet_vit_base dataset_params.train_dataloader_params.batch_size=8 dataset_params.val_dataloader_params.batch_size=16 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 - python3.8 src/super_gradients/examples/train_from_kd_recipe_example/train_from_kd_recipe.py --config-name=imagenet_resnet50_kd 
dataset_params.train_dataloader_params.batch_size=8 dataset_params.val_dataloader_params.batch_size=8 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=imagenet_regnetY architecture=regnetY600 dataset_params.train_dataloader_params.batch_size=8 dataset_params.val_dataloader_params.batch_size=16 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False training_hyperparams.max_train_batches=100 training_hyperparams.max_valid_batches=100 multi_gpu=DDP num_gpus=4 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=imagenet_regnetY architecture=regnetY800 dataset_params.train_dataloader_params.batch_size=8 dataset_params.val_dataloader_params.batch_size=16 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False training_hyperparams.max_train_batches=100 training_hyperparams.max_valid_batches=100 multi_gpu=DDP num_gpus=4 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=imagenet_repvgg dataset_params.train_dataloader_params.batch_size=8 dataset_params.val_dataloader_params.batch_size=16 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False training_hyperparams.max_train_batches=100 training_hyperparams.max_valid_batches=100 multi_gpu=DDP num_gpus=4 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=imagenet_resnet50 dataset_params.train_dataloader_params.batch_size=8 dataset_params.val_dataloader_params.batch_size=16 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False training_hyperparams.max_train_batches=100 training_hyperparams.max_valid_batches=100 multi_gpu=DDP num_gpus=4 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py 
--config-name=imagenet_vit_base dataset_params.train_dataloader_params.batch_size=8 dataset_params.val_dataloader_params.batch_size=16 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False training_hyperparams.max_train_batches=100 training_hyperparams.max_valid_batches=100 multi_gpu=DDP num_gpus=4 + python3.8 src/super_gradients/examples/train_from_kd_recipe_example/train_from_kd_recipe.py --config-name=imagenet_resnet50_kd dataset_params.train_dataloader_params.batch_size=8 dataset_params.val_dataloader_params.batch_size=8 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False training_hyperparams.max_train_batches=100 training_hyperparams.max_valid_batches=100 multi_gpu=DDP num_gpus=4 - run: name: Remove new environment when failed @@ -478,12 +478,12 @@ jobs: python3.8 -m pip install -r requirements.txt python3.8 -m pip install . python3.8 -m pip install torch==1.12.0+cu116 torchvision==0.13.0+cu116 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu116 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=imagenet_efficientnet dataset_params.train_dataloader_params.batch_size=8 dataset_params.val_dataloader_params.batch_size=16 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=imagenet_mobilenetv2 dataset_params.train_dataloader_params.batch_size=8 dataset_params.val_dataloader_params.batch_size=16 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=imagenet_mobilenetv3_large dataset_params.train_dataloader_params.batch_size=8 dataset_params.val_dataloader_params.batch_size=16 training_hyperparams.max_epochs=1 
training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=imagenet_mobilenetv3_small dataset_params.train_dataloader_params.batch_size=8 dataset_params.val_dataloader_params.batch_size=16 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=imagenet_regnetY architecture=regnetY200 dataset_params.train_dataloader_params.batch_size=8 dataset_params.val_dataloader_params.batch_size=16 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=imagenet_regnetY architecture=regnetY400 dataset_params.train_dataloader_params.batch_size=8 dataset_params.val_dataloader_params.batch_size=16 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=imagenet_efficientnet dataset_params.train_dataloader_params.batch_size=8 dataset_params.val_dataloader_params.batch_size=16 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False training_hyperparams.max_train_batches=100 training_hyperparams.max_valid_batches=100 multi_gpu=DDP num_gpus=4 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=imagenet_mobilenetv2 dataset_params.train_dataloader_params.batch_size=8 dataset_params.val_dataloader_params.batch_size=16 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False training_hyperparams.max_train_batches=100 training_hyperparams.max_valid_batches=100 multi_gpu=DDP num_gpus=4 + python3.8 
src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=imagenet_mobilenetv3_large dataset_params.train_dataloader_params.batch_size=8 dataset_params.val_dataloader_params.batch_size=16 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False training_hyperparams.max_train_batches=100 training_hyperparams.max_valid_batches=100 multi_gpu=DDP num_gpus=4 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=imagenet_mobilenetv3_small dataset_params.train_dataloader_params.batch_size=8 dataset_params.val_dataloader_params.batch_size=16 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False training_hyperparams.max_train_batches=100 training_hyperparams.max_valid_batches=100 multi_gpu=DDP num_gpus=4 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=imagenet_regnetY architecture=regnetY200 dataset_params.train_dataloader_params.batch_size=8 dataset_params.val_dataloader_params.batch_size=16 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False training_hyperparams.max_train_batches=100 training_hyperparams.max_valid_batches=100 multi_gpu=DDP num_gpus=4 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=imagenet_regnetY architecture=regnetY400 dataset_params.train_dataloader_params.batch_size=8 dataset_params.val_dataloader_params.batch_size=16 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False training_hyperparams.max_train_batches=100 training_hyperparams.max_valid_batches=100 multi_gpu=DDP num_gpus=4 - run: name: Remove new environment when failed command: "rm -r << parameters.sg_new_env_name >>" @@ -515,18 +515,18 @@ jobs: python3.8 -m pip install torch==1.12.0+cu116 torchvision==0.13.0+cu116 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu116 wget -O 
$(pwd)/checkpoints/ddrnet23_slim_bb_imagenet.pth https://deci-pretrained-models.s3.amazonaws.com/ddrnet/imagenet_pt_backbones/ddrnet23_slim_bb_imagenet.pth wget -O $(pwd)/checkpoints/ddrnet23_bb_imagenet.pth https://deci-pretrained-models.s3.amazonaws.com/ddrnet/imagenet_pt_backbones/ddrnet23_bb_imagenet.pth - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_ddrnet checkpoint_params.checkpoint_path=$(pwd)/checkpoints/ddrnet23_bb_imagenet.pth dataset_params.train_dataloader_params.batch_size=3 dataset_params.val_dataloader_params.batch_size=3 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_ddrnet architecture=ddrnet_23_slim checkpoint_params.checkpoint_path=$(pwd)/checkpoints/ddrnet23_slim_bb_imagenet.pth dataset_params.train_dataloader_params.batch_size=3 dataset_params.val_dataloader_params.batch_size=3 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_ddrnet checkpoint_params.checkpoint_path=$(pwd)/checkpoints/ddrnet23_bb_imagenet.pth dataset_params.train_dataloader_params.batch_size=3 dataset_params.val_dataloader_params.batch_size=3 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False training_hyperparams.max_train_batches=100 training_hyperparams.max_valid_batches=100 multi_gpu=DDP num_gpus=4 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_ddrnet architecture=ddrnet_23_slim checkpoint_params.checkpoint_path=$(pwd)/checkpoints/ddrnet23_slim_bb_imagenet.pth dataset_params.train_dataloader_params.batch_size=3 dataset_params.val_dataloader_params.batch_size=3 
training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False training_hyperparams.max_train_batches=100 training_hyperparams.max_valid_batches=100 multi_gpu=DDP num_gpus=4 wget -O $(pwd)/checkpoints/stdc1_imagenet_pretrained.pth https://deci-pretrained-models.s3.amazonaws.com/stdc_backbones/stdc1_imagenet_pretrained.pth wget -O $(pwd)/checkpoints/stdc2_imagenet_pretrained.pth https://deci-pretrained-models.s3.amazonaws.com/stdc_backbones/stdc2_imagenet_pretrained.pth - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_pplite_seg50 checkpoint_params.checkpoint_path=$(pwd)/checkpoints/stdc1_imagenet_pretrained.pth architecture=pp_lite_t_seg dataset_params.train_dataloader_params.batch_size=3 dataset_params.val_dataloader_params.batch_size=3 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_pplite_seg50 checkpoint_params.checkpoint_path=$(pwd)/checkpoints/stdc2_imagenet_pretrained.pth architecture=pp_lite_b_seg dataset_params.train_dataloader_params.batch_size=3 dataset_params.val_dataloader_params.batch_size=3 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_pplite_seg75 checkpoint_params.checkpoint_path=$(pwd)/checkpoints/stdc1_imagenet_pretrained.pth architecture=pp_lite_t_seg dataset_params.train_dataloader_params.batch_size=3 dataset_params.val_dataloader_params.batch_size=3 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_pplite_seg75 
checkpoint_params.checkpoint_path=$(pwd)/checkpoints/stdc2_imagenet_pretrained.pth architecture=pp_lite_b_seg dataset_params.train_dataloader_params.batch_size=3 dataset_params.val_dataloader_params.batch_size=3 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_stdc_seg50 checkpoint_params.checkpoint_path=$(pwd)/checkpoints/stdc1_imagenet_pretrained.pth dataset_params.train_dataloader_params.batch_size=3 dataset_params.val_dataloader_params.batch_size=3 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_stdc_seg50 checkpoint_params.checkpoint_path=$(pwd)/checkpoints/stdc2_imagenet_pretrained.pth architecture=stdc2_seg dataset_params.train_dataloader_params.batch_size=3 dataset_params.val_dataloader_params.batch_size=3 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_stdc_seg75 checkpoint_params.checkpoint_path=$(pwd)/checkpoints/stdc1_imagenet_pretrained.pth dataset_params.train_dataloader_params.batch_size=3 dataset_params.val_dataloader_params.batch_size=3 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_stdc_seg75 checkpoint_params.checkpoint_path=$(pwd)/checkpoints/stdc2_imagenet_pretrained.pth architecture=stdc2_seg dataset_params.train_dataloader_params.batch_size=3 dataset_params.val_dataloader_params.batch_size=3 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False 
multi_gpu=DDP num_gpus=4 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_pplite_seg50 checkpoint_params.checkpoint_path=$(pwd)/checkpoints/stdc1_imagenet_pretrained.pth architecture=pp_lite_t_seg dataset_params.train_dataloader_params.batch_size=3 dataset_params.val_dataloader_params.batch_size=3 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False training_hyperparams.max_train_batches=100 training_hyperparams.max_valid_batches=100 multi_gpu=DDP num_gpus=4 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_pplite_seg50 checkpoint_params.checkpoint_path=$(pwd)/checkpoints/stdc2_imagenet_pretrained.pth architecture=pp_lite_b_seg dataset_params.train_dataloader_params.batch_size=3 dataset_params.val_dataloader_params.batch_size=3 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False training_hyperparams.max_train_batches=100 training_hyperparams.max_valid_batches=100 multi_gpu=DDP num_gpus=4 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_pplite_seg75 checkpoint_params.checkpoint_path=$(pwd)/checkpoints/stdc1_imagenet_pretrained.pth architecture=pp_lite_t_seg dataset_params.train_dataloader_params.batch_size=3 dataset_params.val_dataloader_params.batch_size=3 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False training_hyperparams.max_train_batches=100 training_hyperparams.max_valid_batches=100 multi_gpu=DDP num_gpus=4 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_pplite_seg75 checkpoint_params.checkpoint_path=$(pwd)/checkpoints/stdc2_imagenet_pretrained.pth architecture=pp_lite_b_seg dataset_params.train_dataloader_params.batch_size=3 dataset_params.val_dataloader_params.batch_size=3 training_hyperparams.max_epochs=1 
training_hyperparams.average_best_models=False training_hyperparams.max_train_batches=100 training_hyperparams.max_valid_batches=100 multi_gpu=DDP num_gpus=4 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_stdc_seg50 checkpoint_params.checkpoint_path=$(pwd)/checkpoints/stdc1_imagenet_pretrained.pth dataset_params.train_dataloader_params.batch_size=3 dataset_params.val_dataloader_params.batch_size=3 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False training_hyperparams.max_train_batches=100 training_hyperparams.max_valid_batches=100 multi_gpu=DDP num_gpus=4 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_stdc_seg50 checkpoint_params.checkpoint_path=$(pwd)/checkpoints/stdc2_imagenet_pretrained.pth architecture=stdc2_seg dataset_params.train_dataloader_params.batch_size=3 dataset_params.val_dataloader_params.batch_size=3 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False training_hyperparams.max_train_batches=100 training_hyperparams.max_valid_batches=100 multi_gpu=DDP num_gpus=4 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_stdc_seg75 checkpoint_params.checkpoint_path=$(pwd)/checkpoints/stdc1_imagenet_pretrained.pth dataset_params.train_dataloader_params.batch_size=3 dataset_params.val_dataloader_params.batch_size=3 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False training_hyperparams.max_train_batches=100 training_hyperparams.max_valid_batches=100 multi_gpu=DDP num_gpus=4 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_stdc_seg75 checkpoint_params.checkpoint_path=$(pwd)/checkpoints/stdc2_imagenet_pretrained.pth architecture=stdc2_seg dataset_params.train_dataloader_params.batch_size=3 
dataset_params.val_dataloader_params.batch_size=3 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False training_hyperparams.max_train_batches=100 training_hyperparams.max_valid_batches=100 multi_gpu=DDP num_gpus=4 - run: name: Remove new environment when failed command: "rm -r << parameters.sg_new_env_name >>" @@ -556,12 +556,12 @@ jobs: python3.8 -m pip install -r requirements.txt python3.8 -m pip install . python3.8 -m pip install torch==1.12.0+cu116 torchvision==0.13.0+cu116 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu116 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ssd_lite_mobilenet_v2 dataset_params.train_dataloader_params.batch_size=8 dataset_params.val_dataloader_params.batch_size=16 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_yolox architecture=yolox_n dataset_params.train_dataloader_params.batch_size=8 dataset_params.val_dataloader_params.batch_size=16 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_yolox architecture=yolox_t dataset_params.train_dataloader_params.batch_size=8 dataset_params.val_dataloader_params.batch_size=16 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_yolox architecture=yolox_s dataset_params.train_dataloader_params.batch_size=8 dataset_params.val_dataloader_params.batch_size=16 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 - python3.8 
src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_yolox architecture=yolox_m dataset_params.train_dataloader_params.batch_size=8 dataset_params.val_dataloader_params.batch_size=8 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 - python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_yolox architecture=yolox_l dataset_params.train_dataloader_params.batch_size=4 dataset_params.val_dataloader_params.batch_size=8 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_ssd_lite_mobilenet_v2 dataset_params.train_dataloader_params.batch_size=8 dataset_params.val_dataloader_params.batch_size=16 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False training_hyperparams.max_train_batches=100 training_hyperparams.max_valid_batches=100 multi_gpu=DDP num_gpus=4 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_yolox architecture=yolox_n dataset_params.train_dataloader_params.batch_size=8 dataset_params.val_dataloader_params.batch_size=16 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False training_hyperparams.max_train_batches=100 training_hyperparams.max_valid_batches=100 multi_gpu=DDP num_gpus=4 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_yolox architecture=yolox_t dataset_params.train_dataloader_params.batch_size=8 dataset_params.val_dataloader_params.batch_size=16 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False training_hyperparams.max_train_batches=100 training_hyperparams.max_valid_batches=100 multi_gpu=DDP num_gpus=4 + python3.8 
src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_yolox architecture=yolox_s dataset_params.train_dataloader_params.batch_size=8 dataset_params.val_dataloader_params.batch_size=16 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False training_hyperparams.max_train_batches=100 training_hyperparams.max_valid_batches=100 multi_gpu=DDP num_gpus=4 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_yolox architecture=yolox_m dataset_params.train_dataloader_params.batch_size=8 dataset_params.val_dataloader_params.batch_size=8 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False training_hyperparams.max_train_batches=100 training_hyperparams.max_valid_batches=100 multi_gpu=DDP num_gpus=4 + python3.8 src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=coco2017_yolox architecture=yolox_l dataset_params.train_dataloader_params.batch_size=4 dataset_params.val_dataloader_params.batch_size=8 training_hyperparams.max_epochs=1 training_hyperparams.average_best_models=False training_hyperparams.max_train_batches=100 training_hyperparams.max_valid_batches=100 multi_gpu=DDP num_gpus=4 - run: name: Remove new environment when failed diff --git a/src/super_gradients/__init__.py b/src/super_gradients/__init__.py index 42870dfa68..a35092dd10 100755 --- a/src/super_gradients/__init__.py +++ b/src/super_gradients/__init__.py @@ -22,6 +22,6 @@ "setup_device", ] -__version__ = "3.0.6" +__version__ = "3.0.7" env_sanity_check() diff --git a/src/super_gradients/recipes/imagenet_mobilenetv3_large.yaml b/src/super_gradients/recipes/imagenet_mobilenetv3_large.yaml index 6a96ccbdb6..99f4bb974a 100644 --- a/src/super_gradients/recipes/imagenet_mobilenetv3_large.yaml +++ b/src/super_gradients/recipes/imagenet_mobilenetv3_large.yaml @@ -20,7 +20,7 @@ arch_params: experiment_name: 
mobileNetv3_large_training architecture: mobilenet_v3_large - +ckpt_root_dir: # THE FOLLOWING PARAMS ARE DIRECTLY USED BY HYDRA hydra: diff --git a/src/super_gradients/recipes/imagenet_mobilenetv3_small.yaml b/src/super_gradients/recipes/imagenet_mobilenetv3_small.yaml index aae9e01b89..1526f1c8d6 100644 --- a/src/super_gradients/recipes/imagenet_mobilenetv3_small.yaml +++ b/src/super_gradients/recipes/imagenet_mobilenetv3_small.yaml @@ -20,7 +20,7 @@ arch_params: experiment_name: mobileNetv3_small_training architecture: mobilenet_v3_small - +ckpt_root_dir: # THE FOLLOWING PARAMS ARE DIRECTLY USED BY HYDRA hydra: diff --git a/src/super_gradients/recipes/imagenet_repvgg.yaml b/src/super_gradients/recipes/imagenet_repvgg.yaml index 048629998e..7667351c7e 100644 --- a/src/super_gradients/recipes/imagenet_repvgg.yaml +++ b/src/super_gradients/recipes/imagenet_repvgg.yaml @@ -37,7 +37,7 @@ multi_gpu: DDP num_gpus: 4 architecture: repvgg_a0 - +ckpt_root_dir: # THE FOLLOWING PARAMS ARE DIRECTLY USED BY HYDRA hydra: diff --git a/src/super_gradients/recipes/imagenet_vit_base.yaml b/src/super_gradients/recipes/imagenet_vit_base.yaml index d9ccde0918..88666dff7f 100644 --- a/src/super_gradients/recipes/imagenet_vit_base.yaml +++ b/src/super_gradients/recipes/imagenet_vit_base.yaml @@ -33,7 +33,7 @@ experiment_name: vit_base_imagenet1k architecture: vit_base multi_gpu: DDP num_gpus: 8 - +ckpt_root_dir: # THE FOLLOWING PARAMS ARE DIRECTLY USED BY HYDRA hydra: diff --git a/src/super_gradients/recipes/imagenet_vit_large.yaml b/src/super_gradients/recipes/imagenet_vit_large.yaml index 918a2ef83e..aa4801c442 100644 --- a/src/super_gradients/recipes/imagenet_vit_large.yaml +++ b/src/super_gradients/recipes/imagenet_vit_large.yaml @@ -29,7 +29,7 @@ architecture: vit_large experiment_name: vit_large_imagenet1k multi_gpu: DDP num_gpus: 8 - +ckpt_root_dir: # THE FOLLOWING PARAMS ARE DIRECTLY USED BY HYDRA hydra: diff --git a/src/super_gradients/training/sg_trainer/sg_trainer.py 
b/src/super_gradients/training/sg_trainer/sg_trainer.py index d30464a96c..675a4b0222 100755 --- a/src/super_gradients/training/sg_trainer/sg_trainer.py +++ b/src/super_gradients/training/sg_trainer/sg_trainer.py @@ -1192,19 +1192,19 @@ def forward(self, inputs, targets): self.ckpt_best_name = self.training_params.ckpt_best_name - if self.training_params.max_train_batches is not None and ( - self.training_params.max_train_batches > len(self.train_loader) or self.training_params.max_train_batches <= 0 - ): - - raise ValueError("max_train_batches must be positive and smaller then len(train_loader).") + if self.training_params.max_train_batches is not None: + if self.training_params.max_train_batches > len(self.train_loader): + logger.warning("max_train_batches is greater than len(self.train_loader) and will have no effect.") + elif self.training_params.max_train_batches <= 0: + raise ValueError("max_train_batches must be positive.") + + if self.training_params.max_valid_batches is not None: + if self.training_params.max_valid_batches > len(self.valid_loader): + logger.warning("max_valid_batches is greater than len(self.valid_loader) and will have no effect.") + elif self.training_params.max_valid_batches <= 0: + raise ValueError("max_valid_batches must be positive.") self.max_train_batches = self.training_params.max_train_batches - - if self.training_params.max_valid_batches is not None and ( - self.training_params.max_valid_batches > len(self.valid_loader) or self.training_params.max_valid_batches <= 0 - ): - - raise ValueError("max_valid_batches must be positive and smaller then len(valid_loader).") self.max_valid_batches = self.training_params.max_valid_batches # STATE ATTRIBUTE SET HERE FOR SUBSEQUENT TRAIN() CALLS