Feature/sg 1041 rename object names (#1446)
* first draft of registries with deprecation - still need to change how we register all the classes one at a time

* docstring

* wip - still need to change recipe

* update LR scheduler

* remove deprecated names from .py and .md files

* update losses

* undo unwanted change

* go over the losses again

* fix

* fix LRWarmups

* leftover

* minor change

* fix

* LabelSmoothingCrossEntropyLoss -> CrossEntropyLoss

* add deprecation to LRSchedulers

* rename LRSchedulers by replacing Callback with Scheduler in their names

* add deprecation to LRWarmups

* rename LRWarmups to be simpler

* removed double registration of linear epoch LR warmup

* fixed some failures caused by breaking changes

* fixed changed CELoss ref

---------

Co-authored-by: Eugene Khvedchenya <ekhvedchenya@gmail.com>
Co-authored-by: shayaharon <shay.aharon@deci.ai>
Co-authored-by: Shay Aharon <80472096+shaydeci@users.noreply.github.com>
4 people committed Sep 19, 2023
1 parent 4536d2d commit 1b558ed
Showing 118 changed files with 439 additions and 376 deletions.
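
In practice, the rename replaces the old snake_case registry strings with the class names themselves: loss aliases such as `cross_entropy` and `yolox_fast_loss` become `CrossEntropyLoss` and `YoloXFastDetectionLoss`, `lr_mode` values such as `step` and `cosine` become `StepLRScheduler` and `CosineLRScheduler`, and `warmup_mode` values such as `linear_epoch_step` become `LinearEpochLRWarmup`. The sketch below is illustrative only (model and dataloaders elided, experiment name hypothetical); the parameter names are taken from the documentation snippets changed in this commit, and the note that old strings still resolve with a `DeprecationWarning` follows from the registry change further down.

```python
from super_gradients import Trainer
from super_gradients.common.object_names import Losses

trainer = Trainer(experiment_name="rename_demo")  # hypothetical experiment name

train_params = {
    "max_epochs": 250,
    "initial_lr": 0.1,
    "lr_mode": "StepLRScheduler",          # was "step"
    "lr_updates": [100, 150, 200],
    "lr_decay_factor": 0.1,
    "warmup_mode": "LinearEpochLRWarmup",  # was "linear_epoch_step"
    "loss": Losses.CROSS_ENTROPY,          # now "CrossEntropyLoss"; "cross_entropy" still resolves but warns
    "optimizer": "SGD",
}

# trainer.train(model=model, training_params=train_params,
#               train_loader=train_loader, valid_loader=valid_loader)
```
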
2 changes: 1 addition & 1 deletion .circleci/config.yml
@@ -547,7 +547,7 @@ jobs:
python3.8 src/super_gradients/train_from_recipe.py --config-name=coco2017_pose_dekr_w32_no_dc experiment_name=shortened_coco2017_pose_dekr_w32_ap_test batch_size=4 val_batch_size=8 epochs=1 training_hyperparams.lr_warmup_steps=0 training_hyperparams.average_best_models=False training_hyperparams.max_train_batches=1000 training_hyperparams.max_valid_batches=100 multi_gpu=DDP num_gpus=4
python3.8 src/super_gradients/train_from_recipe.py --config-name=cifar10_resnet experiment_name=shortened_cifar10_resnet_accuracy_test epochs=100 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4
python3.8 src/super_gradients/examples/convert_recipe_example/convert_recipe_example.py --config-name=cifar10_conversion_params experiment_name=shortened_cifar10_resnet_accuracy_test
python3.8 src/super_gradients/train_from_recipe.py --config-name=coco2017_yolox experiment_name=shortened_coco2017_yolox_n_map_test architecture=yolox_n training_hyperparams.loss=yolox_fast_loss epochs=10 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4
python3.8 src/super_gradients/train_from_recipe.py --config-name=coco2017_yolox experiment_name=shortened_coco2017_yolox_n_map_test architecture=yolox_n training_hyperparams.loss=YoloXFastDetectionLoss epochs=10 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4
python3.8 src/super_gradients/train_from_recipe.py --config-name=cityscapes_regseg48 experiment_name=shortened_cityscapes_regseg48_iou_test epochs=10 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4
coverage run --source=super_gradients -m unittest tests/deci_core_recipe_test_suite_runner.py
2 changes: 1 addition & 1 deletion Makefile
@@ -10,7 +10,7 @@ yolo_nas_integration_tests:
recipe_accuracy_tests:
python src/super_gradients/train_from_recipe.py --config-name=coco2017_pose_dekr_w32_no_dc experiment_name=shortened_coco2017_pose_dekr_w32_ap_test epochs=1 batch_size=4 val_batch_size=8 training_hyperparams.lr_warmup_steps=0 training_hyperparams.average_best_models=False training_hyperparams.max_train_batches=1000 training_hyperparams.max_valid_batches=100 multi_gpu=DDP num_gpus=4
python src/super_gradients/train_from_recipe.py --config-name=cifar10_resnet experiment_name=shortened_cifar10_resnet_accuracy_test epochs=100 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4
python src/super_gradients/train_from_recipe.py --config-name=coco2017_yolox experiment_name=shortened_coco2017_yolox_n_map_test epochs=10 architecture=yolox_n training_hyperparams.loss=yolox_fast_loss training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4
python src/super_gradients/train_from_recipe.py --config-name=coco2017_yolox experiment_name=shortened_coco2017_yolox_n_map_test epochs=10 architecture=yolox_n training_hyperparams.loss=YoloXFastDetectionLoss training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4
python src/super_gradients/train_from_recipe.py --config-name=cityscapes_regseg48 experiment_name=shortened_cityscapes_regseg48_iou_test epochs=10 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4
python src/super_gradients/examples/convert_recipe_example/convert_recipe_example.py --config-name=cifar10_conversion_params experiment_name=shortened_cifar10_resnet_accuracy_test
coverage run --source=super_gradients -m unittest tests/deci_core_recipe_test_suite_runner.py
2 changes: 1 addition & 1 deletion documentation/source/Checkpoints.md
@@ -79,7 +79,7 @@ model = models.get(model_name=Models.RESNET18, num_classes=10)

train_params = {
...
"loss": "cross_entropy",
"loss": "LabelSmoothingCrossEntropyLoss",
"criterion_params": {},
"save_ckpt_epoch_list": [10,15]
...
6 changes: 3 additions & 3 deletions documentation/source/Example_Classification.md
@@ -318,10 +318,10 @@ Output (Training parameters):
'launch_tensorboard': False,
'load_opt_params': True,
'log_installed_packages': True,
'loss': 'cross_entropy',
'loss': "LabelSmoothingCrossEntropyLoss",
'lr_cooldown_epochs': 0,
'lr_decay_factor': 0.1,
'lr_mode': 'step',
'lr_mode': 'StepLRScheduler',
'lr_schedule_function': None,
'lr_updates': array([100, 150, 200]),
'lr_warmup_epochs': 0,
@@ -355,7 +355,7 @@ Output (Training parameters):
'train_metrics_list': ['Accuracy', 'Top5'],
'valid_metrics_list': ['Accuracy', 'Top5'],
'warmup_initial_lr': None,
'warmup_mode': 'linear_epoch_step',
'warmup_mode': 'LinearEpochLRWarmup',
'zero_weight_decay_on_bias_and_bn': False
}
```
2 changes: 1 addition & 1 deletion documentation/source/Example_Training-an-external-model.md
@@ -640,7 +640,7 @@ And lastly, we need to define the training hyperparameters:
```python
train_params = {
"max_epochs": 100,
"lr_mode": "cosine",
"lr_mode": "CosineLRScheduler",
"initial_lr": 0.001,
"optimizer": "Adam",
"loss": CustomSegLoss(),
16 changes: 8 additions & 8 deletions documentation/source/LRScheduling.md
@@ -7,15 +7,15 @@ Learning rate scheduling type is controlled by the training parameter `lr_mode`.

When str:

Learning rate scheduling policy, one of ['step','poly','cosine','function'].
Learning rate scheduling policy, one of ['StepLRScheduler','PolyLRScheduler','CosineLRScheduler','FunctionLRScheduler'].

'step' refers to constant updates at epoch numbers passed through `lr_updates`. Each update decays the learning rate by `lr_decay_factor`.
'StepLRScheduler' refers to constant updates at epoch numbers passed through `lr_updates`. Each update decays the learning rate by `lr_decay_factor`.

'cosine' refers to the Cosine Anealing policy as mentioned in https://arxiv.org/abs/1608.03983. The final learning rate ratio is controlled by `cosine_final_lr_ratio` training parameter.
'CosineLRScheduler' refers to the Cosine Anealing policy as mentioned in https://arxiv.org/abs/1608.03983. The final learning rate ratio is controlled by `cosine_final_lr_ratio` training parameter.

'poly' refers to the polynomial decrease: in each epoch iteration `self.lr = self.initial_lr * pow((1.0 - (current_iter / max_iter)), 0.9)`
'PolyLRScheduler' refers to the polynomial decrease: in each epoch iteration `self.lr = self.initial_lr * pow((1.0 - (current_iter / max_iter)), 0.9)`

'function' refers to a user-defined learning rate scheduling function, that is passed through `lr_schedule_function`.
'FunctionLRScheduler' refers to a user-defined learning rate scheduling function, that is passed through `lr_schedule_function`.
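
`FunctionLRScheduler` is the only policy above that this page does not demonstrate, so a rough sketch follows. The exact keyword arguments the scheduler passes to `lr_schedule_function` are an assumption here (check the `FunctionLRScheduler` implementation for the real call convention); accepting `**kwargs` keeps the toy function tolerant of whatever else is passed.

```python
def halve_every_30_epochs(initial_lr, epoch, **kwargs):
    # Assumed call convention: at least `initial_lr` and `epoch` are provided;
    # any extra arguments (iteration counters, max_epoch, ...) land in **kwargs.
    return initial_lr * (0.5 ** (epoch // 30))

train_params = {
    "initial_lr": 0.1,
    "lr_mode": "FunctionLRScheduler",
    "lr_schedule_function": halve_every_30_epochs,
    # ...
}
```
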

For example, the training code below will start with an initial learning rate of 0.1 and decay by 0.1 at epochs 100,150 and 200:

@@ -30,7 +30,7 @@ valid_dataloader = ...
model = ...
train_params = {
"initial_lr": 0.1,
"lr_mode":"step",
"lr_mode":"StepLRScheduler",
"lr_updates": [100, 150, 200],
"lr_decay_factor": 0.1,
...,
@@ -45,7 +45,7 @@ trainer.train(model=model, training_params=train_params, train_loader=train_data
```yaml
training_hyperparams:
initial_lr: 0.1
lr_mode: step
lr_mode: StepLRScheduler
user_lr_updates:
- 100
- 150
@@ -66,7 +66,7 @@ Prerequisites: [phase callbacks](PhaseCallbacks.md), [training with configuratio
In SG, learning rate schedulers are implemented as [phase callbacks](PhaseCallbacks.md).
They read the learning rate from the `PhaseContext` in their `__call__` method, calculate the new learning rate according to the current state of training, and update the optimizer's param groups.

For example, the code snippet from the previous section translates "lr_mode":"step" to a `super_gradients.training.utils.callbacks.callbacks.StepLRCallback` instance, which is added to the phase callbacks list.
For example, the code snippet from the previous section translates "lr_mode":"StepLRScheduler" to a `super_gradients.training.utils.callbacks.callbacks.StepLRScheduler` instance, which is added to the phase callbacks list.
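
To make the mechanism concrete, here is a toy phase callback that halves the learning rate every 10 epochs. It is a minimal sketch, not one of SG's schedulers: the `context.epoch` and `context.optimizer` attribute names are assumptions based on the description above, and real schedulers inherit from `LRCallbackBase`, as the next section explains.

```python
from super_gradients.training.utils.callbacks import Phase, PhaseCallback


class HalveLREvery10Epochs(PhaseCallback):
    def __init__(self):
        super().__init__(phase=Phase.TRAIN_EPOCH_END)

    def __call__(self, context):
        # Assumed PhaseContext attributes: `epoch` and `optimizer`.
        if context.epoch > 0 and context.epoch % 10 == 0:
            for param_group in context.optimizer.param_groups:
                param_group["lr"] *= 0.5
```
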

### Implementing Your Own Scheduler
A custom learning rate scheduler should inherit from `LRCallbackBase`, so let's take a look at it:
32 changes: 16 additions & 16 deletions documentation/source/Losses.md
@@ -2,18 +2,18 @@

SuperGradients can support any PyTorch-based loss function. Additionally, multiple Loss function implementations for various tasks are also supported:

cross_entropy
mse
r_squared_loss
shelfnet_ohem_loss
shelfnet_se_loss
yolox_loss
yolox_fast_loss
ssd_loss
stdc_loss
bce_dice_loss
kd_loss
dice_ce_edge_loss
LabelSmoothingCrossEntropyLoss
MSE
RSquaredLoss
ShelfNetOHEMLoss
ShelfNetSemanticEncodingLoss
YoloXDetectionLoss
YoloXFastDetectionLoss
SSDLoss
STDCLoss
BCEDiceLoss
KDLogitsLoss
DiceCEEdgeLoss

All the above, are just string aliases for the underlying torch.nn.Module classes, implementing the specified loss functions.

@@ -31,7 +31,7 @@ model = ...

train_params = {
...
"loss": "cross_entropy",
"loss": "LabelSmoothingCrossEntropyLoss",
"criterion_params": {}
...
}
@@ -42,7 +42,7 @@ Since most IDEs support auto-completion, for your convenience, you can use our o
```python
from super_gradients.common.object_names import Losses
```
Then simply instead of "cross_entropy", use
Then simply instead of "LabelSmoothingCrossEntropyLoss", use
```python
Losses.CROSS_ENTROPY
```
@@ -54,14 +54,14 @@ When doing so, in your `my_training_hyperparams.yaml` file:
```yaml
...

loss: yolox_loss
loss: YoloXDetectionLoss

criterion_params:
strides: [8, 16, 32] # output strides of all yolo outputs
num_classes: 80
```

Note that two `training_params` parameters define the loss function: `loss` which defines the type of the loss, and`criterion_params` dictionary which will be unpacked to the underlying `yolox_loss` class constructor.
Note that two `training_params` parameters define the loss function: `loss` which defines the type of the loss, and`criterion_params` dictionary which will be unpacked to the underlying `YoloXDetectionLoss` class constructor.
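
Roughly equivalently, you can construct the loss yourself and pass the object, in which case `criterion_params` is not needed; the next section covers this in detail. In the sketch below the import path and constructor arguments are assumptions inferred from the recipe keys above, so treat it as illustrative.

```python
from super_gradients.training.losses import YoloXDetectionLoss

train_params = {
    # Instantiated loss object instead of the registered name + criterion_params.
    "loss": YoloXDetectionLoss(strides=[8, 16, 32], num_classes=80),
    # ...
}
```
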

## Passing Instantiated nn.Module Objects as Loss Functions

18 changes: 9 additions & 9 deletions documentation/source/PhaseCallbacks.md
@@ -8,13 +8,13 @@ SG's `super_gradients.training.utils.callbacks` module implements some common us

ModelConversionCheckCallback
LRCallbackBase
EpochStepWarmupLRCallback
BatchStepLinearWarmupLRCallback
StepLRCallback
ExponentialLRCallback
PolyLRCallback
CosineLRCallback
FunctionLRCallback
LinearEpochLRWarmup
LinearBatchLRWarmup
StepLRScheduler
ExponentialLRScheduler
PolyLRScheduler
CosineLRScheduler
FunctionLRScheduler
LRSchedulerCallback
DetectionVisualizationCallback
BinarySegmentationVisualizationCallback
@@ -30,7 +30,7 @@ off augmentations and incorporate L1 loss starting from epoch 285:
max_epochs: 300
...

loss: yolox_loss
loss: YoloXDetectionLoss

...

@@ -237,7 +237,7 @@ valid_dataloader = ...
model = ...

train_params = {
"loss": "cross_entropy",
"loss": "LabelSmoothingCrossEntropyLoss",
"criterion_params": {},
"phase_callbacks": [SaveFirstBatchCallback()],
...
2 changes: 1 addition & 1 deletion documentation/source/QuickstartBasicToolkit.md
@@ -61,7 +61,7 @@ model = models.get(Models.RESNET18, num_classes=10)
training_params = {
"max_epochs": 20,
"initial_lr": 0.1,
"loss": "cross_entropy",
"loss": "LabelSmoothingCrossEntropyLoss",
"train_metrics_list": [Accuracy(), Top5()],
"valid_metrics_list": [Accuracy(), Top5()],
"metric_to_watch": "Accuracy",
4 changes: 2 additions & 2 deletions documentation/source/Segmentation.md
@@ -143,12 +143,12 @@ from super_gradients.training.metrics.segmentation_metrics import BinaryIOU

train_params = {
"max_epochs": 30,
"lr_mode": "cosine",
"lr_mode": "CosineLRScheduler",
"initial_lr": 0.005,
"lr_warmup_epochs": 5,
"multiply_head_lr": 10,
"optimizer": "SGD",
"loss": "bce_dice_loss",
"loss": "BCEDiceLoss",
"ema": True,
"zero_weight_decay_on_bias_and_bn": True,
"average_best_models": True,
2 changes: 1 addition & 1 deletion documentation/source/configuration_files.md
@@ -28,7 +28,7 @@ lr_decay_factor: 0.1
lr_mode: step
lr_warmup_epochs: 0
initial_lr: 0.1
loss: cross_entropy
loss: LabelSmoothingCrossEntropyLoss
optimizer: SGD
criterion_params: {}

46 changes: 23 additions & 23 deletions src/super_gradients/common/object_names.py
@@ -1,21 +1,21 @@
class Losses:
"""Static class holding all the supported loss names"""

CROSS_ENTROPY = "cross_entropy"
MSE = "mse"
R_SQUARED_LOSS = "r_squared_loss"
SHELFNET_OHEM_LOSS = "shelfnet_ohem_loss"
SHELFNET_SE_LOSS = "shelfnet_se_loss"
YOLOX_LOSS = "yolox_loss"
PPYOLOE_LOSS = "ppyoloe_loss"
YOLOX_FAST_LOSS = "yolox_fast_loss"
SSD_LOSS = "ssd_loss"
STDC_LOSS = "stdc_loss"
BCE_DICE_LOSS = "bce_dice_loss"
KD_LOSS = "kd_loss"
DICE_CE_EDGE_LOSS = "dice_ce_edge_loss"
DEKR_LOSS = "dekr_loss"
RESCORING_LOSS = "rescoring_loss"
CROSS_ENTROPY = "CrossEntropyLoss"
MSE = "MSE"
R_SQUARED_LOSS = "RSquaredLoss"
SHELFNET_OHEM_LOSS = "ShelfNetOHEMLoss"
SHELFNET_SE_LOSS = "ShelfNetSemanticEncodingLoss"
YOLOX_LOSS = "YoloXDetectionLoss"
PPYOLOE_LOSS = "PPYoloELoss"
YOLOX_FAST_LOSS = "YoloXFastDetectionLoss"
SSD_LOSS = "SSDLoss"
STDC_LOSS = "STDCLoss"
BCE_DICE_LOSS = "BCEDiceLoss"
KD_LOSS = "KDLogitsLoss"
DICE_CE_EDGE_LOSS = "DiceCEEdgeLoss"
DEKR_LOSS = "DEKRLoss"
RESCORING_LOSS = "RescoringLoss"


class Metrics:
@@ -154,19 +154,19 @@ class Callbacks:
class LRSchedulers:
"""Static class to hold all the supported LR Scheduler names"""

STEP = "step"
POLY = "poly"
COSINE = "cosine"
EXP = "exp"
FUNCTION = "function"
STEP = "StepLRScheduler"
POLY = "PolyLRScheduler"
COSINE = "CosineLRScheduler"
EXP = "ExponentialLRScheduler"
FUNCTION = "FunctionLRScheduler"


class LRWarmups:
"""Static class to hold all the supported LR Warmup names"""

LINEAR_STEP = "linear_step"
LINEAR_EPOCH_STEP = "linear_epoch_step"
LINEAR_BATCH_STEP = "linear_batch_step"
LINEAR_STEP = "LinearEpochLRWarmup"
LINEAR_EPOCH_STEP = "LinearEpochLRWarmup"
LINEAR_BATCH_STEP = "LinearBatchLRWarmup"


class Samplers:
7 changes: 4 additions & 3 deletions src/super_gradients/common/registry/registry.py
@@ -68,7 +68,7 @@ def warn_if_deprecated(name: str, registry: dict):
"""
deprecated_names = registry.get(_DEPRECATED_KEY, {})
if name in deprecated_names:
warnings.warn(f"Using `{name}` in the recipe has been deprecated. Please use `{deprecated_names[name]}`", DeprecationWarning)
warnings.simplefilter("once", DeprecationWarning) # Required, otherwise the warning may never be displayed.
warnings.warn(f"Object name `{name}` is now deprecated. Please replace it with `{deprecated_names[name]}`.", DeprecationWarning)


ARCHITECTURES = {}
Expand All @@ -83,9 +84,9 @@ def warn_if_deprecated(name: str, registry: dict):
METRICS = {}
register_metric = create_register_decorator(registry=METRICS)

LOSSES = {Losses.MSE: nn.MSELoss}
LOSSES = {}
register_loss = create_register_decorator(registry=LOSSES)

register_loss(name=Losses.MSE, deprecated_name="mse")(nn.MSELoss) # Register manually to benefit from deprecated logic

ALL_DATALOADERS = {}
register_dataloader = create_register_decorator(registry=ALL_DATALOADERS)
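
The net effect of the registry change is that a class is registered under its new name while the old string survives as a deprecated alias. A rough usage sketch, with the decorator and its `deprecated_name` keyword taken from the lines above and the custom loss itself purely hypothetical:

```python
import torch.nn as nn

from super_gradients.common.registry.registry import register_loss


@register_loss(name="MyCustomLoss", deprecated_name="my_custom_loss")
class MyCustomLoss(nn.Module):
    def forward(self, predictions, targets):
        return nn.functional.mse_loss(predictions, targets)

# Recipes that still reference "my_custom_loss" keep working, but resolving the
# old name now emits a DeprecationWarning pointing at "MyCustomLoss".
```
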
@@ -39,13 +39,13 @@

train_params_ddr = {
"max_epochs": args.max_epochs,
"lr_mode": "step",
"lr_mode": "StepLRScheduler",
"lr_updates": [30, 60, 90],
"lr_decay_factor": 0.1,
"initial_lr": 0.1 * devices,
"optimizer": "SGD",
"optimizer_params": {"weight_decay": 0.0001, "momentum": 0.9, "nesterov": True},
"loss": "cross_entropy",
"loss": "CrossEntropyLoss",
"train_metrics_list": [Accuracy(), Top5()],
"valid_metrics_list": [Accuracy(), Top5()],
"metric_to_watch": "Accuracy",
@@ -56,10 +56,10 @@ def main(architecture_name: str):
"max_epochs": 2,
"lr_updates": [1],
"lr_decay_factor": 0.1,
"lr_mode": "step",
"lr_mode": "StepLRScheduler",
"lr_warmup_epochs": 0,
"initial_lr": 0.1,
"loss": "cross_entropy",
"loss": "CrossEntropyLoss",
"optimizer": "SGD",
"criterion_params": {},
"train_metrics_list": [Accuracy(), Top5()],
6 changes: 3 additions & 3 deletions src/super_gradients/examples/early_stop/early_stop_example.py
@@ -12,16 +12,16 @@
super_gradients.init_trainer()

early_stop_acc = EarlyStop(Phase.VALIDATION_EPOCH_END, monitor="Accuracy", mode="max", patience=3, verbose=True)
early_stop_val_loss = EarlyStop(Phase.VALIDATION_EPOCH_END, monitor="LabelSmoothingCrossEntropyLoss", mode="min", patience=3, verbose=True)
early_stop_val_loss = EarlyStop(Phase.VALIDATION_EPOCH_END, monitor="CrossEntropyLoss", mode="min", patience=3, verbose=True)

train_params = {
"max_epochs": 250,
"lr_updates": [100, 150, 200],
"lr_decay_factor": 0.1,
"lr_mode": "step",
"lr_mode": "StepLRScheduler",
"lr_warmup_epochs": 0,
"initial_lr": 0.1,
"loss": "cross_entropy",
"loss": "CrossEntropyLoss",
"optimizer": "SGD",
"criterion_params": {},
"optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9},
@@ -11,9 +11,9 @@
"max_epochs": 20,
"lr_updates": [5, 10, 15],
"lr_decay_factor": 0.1,
"lr_mode": "step",
"lr_mode": "StepLRScheduler",
"initial_lr": 0.1,
"loss": "cross_entropy",
"loss": "CrossEntropyLoss",
"optimizer": "SGD",
"optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9},
"train_metrics_list": [Accuracy(), Top5()],