From d996fa27c971289c4cb2a5cbc1f0721d61c73838 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Sun, 3 Sep 2023 15:42:01 +0300 Subject: [PATCH 01/21] first draft of registries with deprecate - still need to change how we register all the classes one at a time/// --- .../common/factories/base_factory.py | 3 ++ .../common/factories/type_factory.py | 5 +- .../common/registry/registry.py | 42 +++++++++++++---- tests/unit_tests/test_registry.py | 47 +++++++++++++++++++ 4 files changed, 88 insertions(+), 9 deletions(-) create mode 100644 tests/unit_tests/test_registry.py diff --git a/src/super_gradients/common/factories/base_factory.py b/src/super_gradients/common/factories/base_factory.py index 6edab85fed..ab81ff97ee 100644 --- a/src/super_gradients/common/factories/base_factory.py +++ b/src/super_gradients/common/factories/base_factory.py @@ -1,6 +1,7 @@ from typing import Union, Mapping, Dict from super_gradients.common.exceptions.factory_exceptions import UnknownTypeException +from super_gradients.common.registry.registry import warn_if_deprecated from super_gradients.training.utils.utils import fuzzy_str, fuzzy_keys, get_fuzzy_mapping_param @@ -43,6 +44,7 @@ def get(self, conf: Union[str, dict]): If provided value is not one of the three above, the value will be returned as is """ if isinstance(conf, str): + warn_if_deprecated(name=conf, registry=self.type_dict) if conf in self.type_dict: return self.type_dict[conf]() elif fuzzy_str(conf) in fuzzy_keys(self.type_dict): @@ -60,6 +62,7 @@ def get(self, conf: Union[str, dict]): _type = list(conf.keys())[0] # THE TYPE NAME _params = list(conf.values())[0] # A DICT CONTAINING THE PARAMETERS FOR INIT if _type in self.type_dict: + warn_if_deprecated(name=_type, registry=self.type_dict) return self.type_dict[_type](**_params) elif fuzzy_str(_type) in fuzzy_keys(self.type_dict): return get_fuzzy_mapping_param(_type, self.type_dict)(**_params) diff --git a/src/super_gradients/common/factories/type_factory.py b/src/super_gradients/common/factories/type_factory.py index aa4f368463..222509c080 100644 --- a/src/super_gradients/common/factories/type_factory.py +++ b/src/super_gradients/common/factories/type_factory.py @@ -3,7 +3,7 @@ import importlib from super_gradients.common.exceptions.factory_exceptions import UnknownTypeException -from super_gradients.common.factories.base_factory import AbstractFactory +from super_gradients.common.factories.base_factory import AbstractFactory, warn_if_deprecated from super_gradients.training.utils import get_param @@ -32,6 +32,9 @@ def get(self, conf: Union[str, type]): If provided value is already a class type, the value will be returned as is. 
""" if isinstance(conf, str) or isinstance(conf, bool): + if isinstance(conf, str): + warn_if_deprecated(name=conf, registry=self.type_dict) + if conf in self.type_dict: return self.type_dict[conf] elif isinstance(conf, str) and get_param(self.type_dict, conf) is not None: diff --git a/src/super_gradients/common/registry/registry.py b/src/super_gradients/common/registry/registry.py index 95da751cb6..e86189e693 100644 --- a/src/super_gradients/common/registry/registry.py +++ b/src/super_gradients/common/registry/registry.py @@ -1,5 +1,6 @@ import inspect from typing import Callable, Dict, Optional +import warnings import torch from torch import nn, optim @@ -7,6 +8,8 @@ from super_gradients.common.object_names import Losses, Transforms, Samplers, Optimizers +_DEPRECATED_KEY = "_deprecated_objects" + def create_register_decorator(registry: Dict[str, Callable]) -> Callable: """ @@ -16,23 +19,40 @@ def create_register_decorator(registry: Dict[str, Callable]) -> Callable: :return: Register function """ - def register(name: Optional[str] = None) -> Callable: + def register(name: Optional[str] = None, deprecated_name: str = False) -> Callable: """ Set up a register decorator. - :param name: If specified, the decorated object will be registered with this name. - :return: Decorator that registers the callable. + :param name: If specified, the decorated object will be registered with this name. + :param deprecated_name: If specified, the decorated object will be registered with this name but deprecate it. + :return: Decorator that registers the callable. """ def decorator(cls: Callable) -> Callable: """Register the decorated callable""" - cls_name = name if name is not None else cls.__name__ - if cls_name in registry: - ref = registry[cls_name] - raise Exception(f"`{cls_name}` is already registered and points to `{inspect.getmodule(ref).__name__}.{ref.__name__}") + def _registered_cls(registration_name: str): + if registration_name in registry: + registered_cls = registry[registration_name] + if registered_cls != cls: + raise Exception( + f"`{registration_name}` is already registered and points to `{inspect.getmodule(registered_cls).__name__}.{registered_cls.__name__}" + ) + registry[registration_name] = cls + + registration_name = name or cls.__name__ + _registered_cls(registration_name=registration_name) + + if deprecated_name: + # Deprecated objects are still registered in order to let users use them + _registered_cls(registration_name=deprecated_name) + + # But deprecated objects are also listed in the _deprecated_objects key. + # This can later be used in the factories to know if a name is deprecated and how it should be named instead. + deprecated_registered_objects = registry.get(_DEPRECATED_KEY, {}) + deprecated_registered_objects[deprecated_name] = registration_name # Keep the information about how it should be named. + registry[_DEPRECATED_KEY] = deprecated_registered_objects - registry[cls_name] = cls return cls return decorator @@ -40,6 +60,12 @@ def decorator(cls: Callable) -> Callable: return register +def warn_if_deprecated(name: str, registry: dict): + deprecated_names = registry.get(_DEPRECATED_KEY, {}) + if name in deprecated_names: + warnings.warn(f"Using `{name}` in the recipe has been deprecated. 
Please use `{deprecated_names[name]}`", DeprecationWarning) + + ARCHITECTURES = {} register_model = create_register_decorator(registry=ARCHITECTURES) diff --git a/tests/unit_tests/test_registry.py b/tests/unit_tests/test_registry.py new file mode 100644 index 0000000000..ac8f237d26 --- /dev/null +++ b/tests/unit_tests/test_registry.py @@ -0,0 +1,47 @@ +import unittest +from typing import List + +from super_gradients.common.registry.registry import create_register_decorator +from super_gradients.common.factories.base_factory import BaseFactory, UnknownTypeException + + +class RegistryTest(unittest.TestCase): + def setUp(self) -> None: + # We do all the registration in `setUp` to avoid having registration ran on import + _DUMMY_REGISTRY = {} + register_class = create_register_decorator(registry=_DUMMY_REGISTRY) + + @register_class("good_object_name") + class Class1: + def __init__(self, values: List[float]): + self.values = values + + @register_class(deprecated_name="deprecated_object_name") + class Class2: + def __init__(self, values: List[float]): + self.values = values + + self.Class1 = Class1 # Save classes, not instances + self.Class2 = Class2 + self.factory = BaseFactory(type_dict=_DUMMY_REGISTRY) + + def test_instantiate_from_name(self): + instance = self.factory.get({"good_object_name": {"values": [1.0, 2.0]}}) + self.assertIsInstance(instance, self.Class1) + + def test_instantiate_from_classname_when_name_set(self): + with self.assertRaises(UnknownTypeException): + self.factory.get({"Class1": {"values": [1.0, 2.0]}}) + + def test_instantiate_from_classname_when_no_name_set(self): + instance = self.factory.get({"Class2": {"values": [1.0, 2.0]}}) + self.assertIsInstance(instance, self.Class2) + + def test_instantiate_from_deprecated_name(self): + with self.assertWarns(DeprecationWarning): + instance = self.factory.get({"deprecated_object_name": {"values": [1.0, 2.0]}}) + self.assertIsInstance(instance, self.Class2) + + +if __name__ == "__main__": + unittest.main() From 1dbca6fde2d47ed8e1465121859bf2bb259c870e Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Sun, 3 Sep 2023 15:55:40 +0300 Subject: [PATCH 02/21] docstring --- src/super_gradients/common/registry/registry.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/super_gradients/common/registry/registry.py b/src/super_gradients/common/registry/registry.py index e86189e693..f00f0a3193 100644 --- a/src/super_gradients/common/registry/registry.py +++ b/src/super_gradients/common/registry/registry.py @@ -19,13 +19,14 @@ def create_register_decorator(registry: Dict[str, Callable]) -> Callable: :return: Register function """ - def register(name: Optional[str] = None, deprecated_name: str = False) -> Callable: + def register(name: Optional[str] = None, deprecated_name: Optional[str] = None) -> Callable: """ Set up a register decorator. - :param name: If specified, the decorated object will be registered with this name. - :param deprecated_name: If specified, the decorated object will be registered with this name but deprecate it. - :return: Decorator that registers the callable. + :param name: If specified, the decorated object will be registered with this name. Otherwise, the class name will be used to register. + :param deprecated_name: If specified, the decorated object will be registered with this name. + This is done on top of the `official` registration which is done by setting the `name` argument. + :return: Decorator that registers the callable. 
""" def decorator(cls: Callable) -> Callable: @@ -44,7 +45,7 @@ def _registered_cls(registration_name: str): _registered_cls(registration_name=registration_name) if deprecated_name: - # Deprecated objects are still registered in order to let users use them + # Deprecated objects like other objects - This is meant to avoid any breaking change. _registered_cls(registration_name=deprecated_name) # But deprecated objects are also listed in the _deprecated_objects key. @@ -61,6 +62,10 @@ def _registered_cls(registration_name: str): def warn_if_deprecated(name: str, registry: dict): + """If the name is deprecated, warn the user about it. + :param name: The name of the object that we want to check if it is deprecated. + :param registry: The registry that may or may not include deprecated objects. + """ deprecated_names = registry.get(_DEPRECATED_KEY, {}) if name in deprecated_names: warnings.warn(f"Using `{name}` in the recipe has been deprecated. Please use `{deprecated_names[name]}`", DeprecationWarning) From b20764bdb109ed634e2306d9b39a02e58968384b Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Sun, 3 Sep 2023 17:21:28 +0300 Subject: [PATCH 03/21] wip - still need to change recipe --- src/super_gradients/common/object_names.py | 72 +++++++++++++------ .../common/registry/registry.py | 5 +- .../training/losses/bce_dice_loss.py | 2 +- .../training/losses/dekr_loss.py | 2 +- .../training/losses/dice_ce_edge_loss.py | 2 +- .../training/losses/kd_losses.py | 2 +- .../label_smoothing_cross_entropy_loss.py | 2 +- .../training/losses/ppyolo_loss.py | 2 +- .../training/losses/r_squared_loss.py | 2 +- .../training/losses/rescoring_loss.py | 2 +- .../training/losses/shelfnet_ohem_loss.py | 2 +- .../losses/shelfnet_semantic_encoding_loss.py | 2 +- .../training/losses/ssd_loss.py | 2 +- .../training/losses/stdc_loss.py | 2 +- .../training/losses/yolox_loss.py | 4 +- .../training/sg_trainer/sg_trainer.py | 4 ++ .../training/utils/callbacks/callbacks.py | 16 ++--- 17 files changed, 78 insertions(+), 47 deletions(-) diff --git a/src/super_gradients/common/object_names.py b/src/super_gradients/common/object_names.py index 26bd48f890..7ff406830f 100644 --- a/src/super_gradients/common/object_names.py +++ b/src/super_gradients/common/object_names.py @@ -1,21 +1,37 @@ class Losses: """Static class holding all the supported loss names""" - CROSS_ENTROPY = "cross_entropy" - MSE = "mse" - R_SQUARED_LOSS = "r_squared_loss" - SHELFNET_OHEM_LOSS = "shelfnet_ohem_loss" - SHELFNET_SE_LOSS = "shelfnet_se_loss" - YOLOX_LOSS = "yolox_loss" - PPYOLOE_LOSS = "ppyoloe_loss" - YOLOX_FAST_LOSS = "yolox_fast_loss" - SSD_LOSS = "ssd_loss" - STDC_LOSS = "stdc_loss" - BCE_DICE_LOSS = "bce_dice_loss" - KD_LOSS = "kd_loss" - DICE_CE_EDGE_LOSS = "dice_ce_edge_loss" - DEKR_LOSS = "dekr_loss" - RESCORING_LOSS = "rescoring_loss" + CROSS_ENTROPY = "LabelSmoothingCrossEntropyLoss" + MSE = "MSE" + R_SQUARED_LOSS = "RSquaredLoss" + SHELFNET_OHEM_LOSS = "ShelfNetOHEMLoss" + SHELFNET_SE_LOSS = "ShelfNetSemanticEncodingLoss" + YOLOX_LOSS = "YoloXDetectionLoss" + PPYOLOE_LOSS = "PPYoloELoss" + YOLOX_FAST_LOSS = "YoloXFastDetectionLoss" + SSD_LOSS = "SSDLoss" + STDC_LOSS = "STDCLoss" + BCE_DICE_LOSS = "BCEDiceLoss" + KD_LOSS = "KDLogitsLoss" + DICE_CE_EDGE_LOSS = "DiceCEEdgeLoss" + DEKR_LOSS = "DEKRLoss" + RESCORING_LOSS = "RescoringLoss" + + _DEPRECATED_CROSS_ENTROPY = "cross_entropy" + _DEPRECATED_MSE = "mse" + _DEPRECATED_R_SQUARED_LOSS = "r_squared_loss" + _DEPRECATED_SHELFNET_OHEM_LOSS = "shelfnet_ohem_loss" + _DEPRECATED_SHELFNET_SE_LOSS 
= "shelfnet_se_loss" + _DEPRECATED_YOLOX_LOSS = "yolox_loss" + _DEPRECATED_PPYOLOE_LOSS = "ppyoloe_loss" + _DEPRECATED_YOLOX_FAST_LOSS = "yolox_fast_loss" + _DEPRECATED_SSD_LOSS = "ssd_loss" + _DEPRECATED_STDC_LOSS = "stdc_loss" + _DEPRECATED_BCE_DICE_LOSS = "bce_dice_loss" + _DEPRECATED_KD_LOSS = "kd_loss" + _DEPRECATED_DICE_CE_EDGE_LOSS = "dice_ce_edge_loss" + _DEPRECATED_DEKR_LOSS = "dekr_loss" + _DEPRECATED_RESCORING_LOSS = "rescoring_loss" class Metrics: @@ -154,19 +170,29 @@ class Callbacks: class LRSchedulers: """Static class to hold all the supported LR Scheduler names""" - STEP = "step" - POLY = "poly" - COSINE = "cosine" - EXP = "exp" - FUNCTION = "function" + STEP = "StepLRCallback" + POLY = "PolyLRCallback" + COSINE = "CosineLRCallback" + EXP = "ExponentialLRCallback" + FUNCTION = "FunctionLRCallback" + + _DEPRECATED_STEP = "step" + _DEPRECATED_POLY = "poly" + _DEPRECATED_COSINE = "cosine" + _DEPRECATED_EXP = "exp" + _DEPRECATED_FUNCTION = "function" class LRWarmups: """Static class to hold all the supported LR Warmup names""" - LINEAR_STEP = "linear_step" - LINEAR_EPOCH_STEP = "linear_epoch_step" - LINEAR_BATCH_STEP = "linear_batch_step" + LINEAR_STEP = "LinearStepWarmupLRCallback" + LINEAR_EPOCH_STEP = "EpochStepWarmupLRCallback" + LINEAR_BATCH_STEP = "BatchStepLinearWarmupLRCallback" + + _DEPRECATED_LINEAR_STEP = "linear_step" + _DEPRECATED_LINEAR_EPOCH_STEP = "linear_epoch_step" + _DEPRECATED_LINEAR_BATCH_STEP = "linear_batch_step" class Samplers: diff --git a/src/super_gradients/common/registry/registry.py b/src/super_gradients/common/registry/registry.py index f00f0a3193..4c154dc302 100644 --- a/src/super_gradients/common/registry/registry.py +++ b/src/super_gradients/common/registry/registry.py @@ -68,7 +68,8 @@ def warn_if_deprecated(name: str, registry: dict): """ deprecated_names = registry.get(_DEPRECATED_KEY, {}) if name in deprecated_names: - warnings.warn(f"Using `{name}` in the recipe has been deprecated. Please use `{deprecated_names[name]}`", DeprecationWarning) + warnings.simplefilter("once", DeprecationWarning) # Required, otherwise the warning may never be displayed. + warnings.warn(f"Object name `{name}` is now deprecated. 
Please replace it with `{deprecated_names[name]}`.", DeprecationWarning) ARCHITECTURES = {} @@ -85,7 +86,7 @@ def warn_if_deprecated(name: str, registry: dict): LOSSES = {Losses.MSE: nn.MSELoss} register_loss = create_register_decorator(registry=LOSSES) - +register_loss(name=Losses.MSE, deprecated_name="mse")(nn.MSELoss) # Register manually to benefit from deprecated logic ALL_DATALOADERS = {} register_dataloader = create_register_decorator(registry=ALL_DATALOADERS) diff --git a/src/super_gradients/training/losses/bce_dice_loss.py b/src/super_gradients/training/losses/bce_dice_loss.py index 973dd9b2ca..7539120111 100644 --- a/src/super_gradients/training/losses/bce_dice_loss.py +++ b/src/super_gradients/training/losses/bce_dice_loss.py @@ -7,7 +7,7 @@ from super_gradients.training.losses.dice_loss import BinaryDiceLoss -@register_loss(Losses.BCE_DICE_LOSS) +@register_loss(name=Losses.BCE_DICE_LOSS, deprecated_name="bce_dice_loss") class BCEDiceLoss(torch.nn.Module): """ Binary Cross Entropy + Dice Loss diff --git a/src/super_gradients/training/losses/dekr_loss.py b/src/super_gradients/training/losses/dekr_loss.py index 26698db494..8b2a8ea8b5 100644 --- a/src/super_gradients/training/losses/dekr_loss.py +++ b/src/super_gradients/training/losses/dekr_loss.py @@ -7,7 +7,7 @@ from super_gradients.common.registry.registry import register_loss -@register_loss(Losses.DEKR_LOSS) +@register_loss(name=Losses.DEKR_LOSS, deprecated_name="dekr_loss") class DEKRLoss(nn.Module): """ Implementation of the loss function from the "Bottom-Up Human Pose Estimation Via Disentangled Keypoint Regression" diff --git a/src/super_gradients/training/losses/dice_ce_edge_loss.py b/src/super_gradients/training/losses/dice_ce_edge_loss.py index f7cec313a5..0a0550188e 100644 --- a/src/super_gradients/training/losses/dice_ce_edge_loss.py +++ b/src/super_gradients/training/losses/dice_ce_edge_loss.py @@ -11,7 +11,7 @@ from super_gradients.training.losses.mask_loss import MaskAttentionLoss -@register_loss(Losses.DICE_CE_EDGE_LOSS) +@register_loss(name=Losses.DICE_CE_EDGE_LOSS, deprecated_name="dice_ce_edge_loss") class DiceCEEdgeLoss(_Loss): def __init__( self, diff --git a/src/super_gradients/training/losses/kd_losses.py b/src/super_gradients/training/losses/kd_losses.py index 2d3c1908c9..156245bc8b 100644 --- a/src/super_gradients/training/losses/kd_losses.py +++ b/src/super_gradients/training/losses/kd_losses.py @@ -15,7 +15,7 @@ def forward(self, student_output, teacher_output): return super(KDklDivLoss, self).forward(torch.log_softmax(student_output, dim=1), torch.softmax(teacher_output, dim=1)) -@register_loss(Losses.KD_LOSS) +@register_loss(name=Losses.KD_LOSS, deprecated_name="kd_loss") class KDLogitsLoss(_Loss): """Knowledge distillation loss, wraps the task loss and distillation loss""" diff --git a/src/super_gradients/training/losses/label_smoothing_cross_entropy_loss.py b/src/super_gradients/training/losses/label_smoothing_cross_entropy_loss.py index affcbdb6db..c203c22af4 100755 --- a/src/super_gradients/training/losses/label_smoothing_cross_entropy_loss.py +++ b/src/super_gradients/training/losses/label_smoothing_cross_entropy_loss.py @@ -83,7 +83,7 @@ def cross_entropy(inputs, target, weight=None, ignore_index=-100, reduction="mea return loss -@register_loss(Losses.CROSS_ENTROPY) +@register_loss(name=Losses.CROSS_ENTROPY, deprecated_name="cross_entropy") class LabelSmoothingCrossEntropyLoss(nn.CrossEntropyLoss): """CrossEntropyLoss - with ability to recieve distrbution as targets, and optional label 
smoothing""" diff --git a/src/super_gradients/training/losses/ppyolo_loss.py b/src/super_gradients/training/losses/ppyolo_loss.py index e42588f2e7..8fc851896c 100644 --- a/src/super_gradients/training/losses/ppyolo_loss.py +++ b/src/super_gradients/training/losses/ppyolo_loss.py @@ -628,7 +628,7 @@ def __call__(self, pbox: Tensor, gbox: Tensor, iou_weight=1.0, loc_reweight=None return loss * self.loss_weight -@register_loss(Losses.PPYOLOE_LOSS) +@register_loss(name=Losses.PPYOLOE_LOSS, deprecated_name="ppyoloe_loss") class PPYoloELoss(nn.Module): def __init__( self, diff --git a/src/super_gradients/training/losses/r_squared_loss.py b/src/super_gradients/training/losses/r_squared_loss.py index ece6baa63c..d5c15fad82 100755 --- a/src/super_gradients/training/losses/r_squared_loss.py +++ b/src/super_gradients/training/losses/r_squared_loss.py @@ -9,7 +9,7 @@ from super_gradients.training.utils import convert_to_tensor -@register_loss(Losses.R_SQUARED_LOSS) +@register_loss(name=Losses.R_SQUARED_LOSS, deprecated_name="r_squared_loss") class RSquaredLoss(_Loss): def forward(self, output, target): # FIXME - THIS NEEDS TO BE CHANGED SUCH THAT THIS CLASS INHERETS FROM _Loss (TAKE A LOOK AT YoLoV3DetectionLoss) diff --git a/src/super_gradients/training/losses/rescoring_loss.py b/src/super_gradients/training/losses/rescoring_loss.py index c27acef5b2..3ead9958bb 100644 --- a/src/super_gradients/training/losses/rescoring_loss.py +++ b/src/super_gradients/training/losses/rescoring_loss.py @@ -7,7 +7,7 @@ from super_gradients.common.registry import register_loss -@register_loss(Losses.RESCORING_LOSS) +@register_loss(name=Losses.RESCORING_LOSS, deprecated_name="rescoring_loss") class RescoringLoss(nn.Module): def __init__(self): super().__init__() diff --git a/src/super_gradients/training/losses/shelfnet_ohem_loss.py b/src/super_gradients/training/losses/shelfnet_ohem_loss.py index f699bf83a8..61a06dde52 100755 --- a/src/super_gradients/training/losses/shelfnet_ohem_loss.py +++ b/src/super_gradients/training/losses/shelfnet_ohem_loss.py @@ -5,7 +5,7 @@ from super_gradients.training.losses.ohem_ce_loss import OhemCELoss -@register_loss(Losses.SHELFNET_OHEM_LOSS) +@register_loss(name=Losses.SHELFNET_OHEM_LOSS, deprecated_name="shelfnet_ohem_loss") class ShelfNetOHEMLoss(OhemCELoss): def __init__(self, threshold: float = 0.7, mining_percent: float = 1e-4, ignore_lb: int = 255): """ diff --git a/src/super_gradients/training/losses/shelfnet_semantic_encoding_loss.py b/src/super_gradients/training/losses/shelfnet_semantic_encoding_loss.py index 864788bf26..c630ebb973 100755 --- a/src/super_gradients/training/losses/shelfnet_semantic_encoding_loss.py +++ b/src/super_gradients/training/losses/shelfnet_semantic_encoding_loss.py @@ -6,7 +6,7 @@ from super_gradients.common.registry.registry import register_loss -@register_loss(Losses.SHELFNET_SE_LOSS) +@register_loss(name=Losses.SHELFNET_SE_LOSS, deprecated_name="shelfnet_se_loss") class ShelfNetSemanticEncodingLoss(nn.CrossEntropyLoss): """2D Cross Entropy Loss with Auxilary Loss""" diff --git a/src/super_gradients/training/losses/ssd_loss.py b/src/super_gradients/training/losses/ssd_loss.py index c183c745ce..be1b27e906 100755 --- a/src/super_gradients/training/losses/ssd_loss.py +++ b/src/super_gradients/training/losses/ssd_loss.py @@ -52,7 +52,7 @@ def forward(self, pred_labels, target_labels): return closs -@register_loss(Losses.SSD_LOSS) +@register_loss(name=Losses.SSD_LOSS, deprecated_name="ssd_loss") class SSDLoss(_Loss): """ Implements the loss as the sum of 
the followings: diff --git a/src/super_gradients/training/losses/stdc_loss.py b/src/super_gradients/training/losses/stdc_loss.py index ad0fa44182..6b0a3375e0 100644 --- a/src/super_gradients/training/losses/stdc_loss.py +++ b/src/super_gradients/training/losses/stdc_loss.py @@ -111,7 +111,7 @@ def forward(self, detail_out: torch.Tensor, detail_target: torch.Tensor): return self.weights[0] * bce_loss + self.weights[1] * dice_loss -@register_loss(Losses.STDC_LOSS) +@register_loss(name=Losses.STDC_LOSS, deprecated_name="stdc_loss") class STDCLoss(_Loss): """ Loss class of STDC-Seg training. diff --git a/src/super_gradients/training/losses/yolox_loss.py b/src/super_gradients/training/losses/yolox_loss.py index f5d6696f1f..b8ffe022d7 100644 --- a/src/super_gradients/training/losses/yolox_loss.py +++ b/src/super_gradients/training/losses/yolox_loss.py @@ -81,7 +81,7 @@ def forward(self, pred, target): return loss -@register_loss(Losses.YOLOX_LOSS) +@register_loss(name=Losses.YOLOX_LOSS, deprecated_name="yolox_loss") class YoloXDetectionLoss(_Loss): """ Calculate YOLOX loss: @@ -626,7 +626,7 @@ def dynamic_k_matching(self, cost, pair_wise_ious, gt_classes, num_gt, fg_mask): return num_fg, gt_matched_classes, pred_ious_this_matching, matched_gt_inds -@register_loss(Losses.YOLOX_FAST_LOSS) +@register_loss(name=Losses.YOLOX_FAST_LOSS, deprecated_name="yolox_fast_loss") class YoloXFastDetectionLoss(YoloXDetectionLoss): """ A completely new implementation of YOLOX loss. diff --git a/src/super_gradients/training/sg_trainer/sg_trainer.py b/src/super_gradients/training/sg_trainer/sg_trainer.py index a6017c646e..7a6d541bb9 100755 --- a/src/super_gradients/training/sg_trainer/sg_trainer.py +++ b/src/super_gradients/training/sg_trainer/sg_trainer.py @@ -1233,6 +1233,10 @@ def forward(self, inputs, targets): warmup_mode = self.training_params.warmup_mode warmup_callback_cls = None if isinstance(warmup_mode, str): + from super_gradients.common.registry.registry import warn_if_deprecated + + warn_if_deprecated(warmup_mode, LR_WARMUP_CLS_DICT) + warmup_callback_cls = LR_WARMUP_CLS_DICT[warmup_mode] elif isinstance(warmup_mode, type) and issubclass(warmup_mode, LRCallbackBase): warmup_callback_cls = warmup_mode diff --git a/src/super_gradients/training/utils/callbacks/callbacks.py b/src/super_gradients/training/utils/callbacks/callbacks.py index 11796e31ab..5f53cb7e9c 100644 --- a/src/super_gradients/training/utils/callbacks/callbacks.py +++ b/src/super_gradients/training/utils/callbacks/callbacks.py @@ -276,7 +276,7 @@ def update_lr(self, optimizer, epoch, batch_idx=None): param_group["lr"] = self.lr -@register_lr_warmup(LRWarmups.LINEAR_EPOCH_STEP) +@register_lr_warmup(LRWarmups.LINEAR_EPOCH_STEP, deprecated_name="linear_epoch_step") class EpochStepWarmupLRCallback(LRCallbackBase): """ LR scheduling callback for linear step warmup. This scheduler uses a whole epoch as single step. 
@@ -300,7 +300,7 @@ def is_lr_scheduling_enabled(self, context): return self.training_params.lr_warmup_epochs > 0 and self.training_params.lr_warmup_epochs >= context.epoch -@register_lr_warmup(LRWarmups.LINEAR_STEP) +@register_lr_warmup(LRWarmups.LINEAR_STEP, deprecated_name="linear_step") class LinearStepWarmupLRCallback(EpochStepWarmupLRCallback): """Deprecated, use EpochStepWarmupLRCallback instead""" @@ -312,7 +312,7 @@ def __init__(self, **kwargs): super(LinearStepWarmupLRCallback, self).__init__(**kwargs) -@register_lr_warmup(LRWarmups.LINEAR_BATCH_STEP) +@register_lr_warmup(LRWarmups.LINEAR_BATCH_STEP, deprecated_name="linear_batch_step") class BatchStepLinearWarmupLRCallback(Callback): """ LR scheduling callback for linear step warmup on each batch step. @@ -384,7 +384,7 @@ def update_lr(self, optimizer, epoch, batch_idx=None): param_group["lr"] = self.lr -@register_lr_scheduler(LRSchedulers.STEP) +@register_lr_scheduler(LRSchedulers.STEP, deprecated_name="step") class StepLRCallback(LRCallbackBase): """ Hard coded step learning rate scheduling (i.e at specific milestones). @@ -415,7 +415,7 @@ def is_lr_scheduling_enabled(self, context): return self.training_params.lr_warmup_epochs <= context.epoch -@register_lr_scheduler(LRSchedulers.EXP) +@register_lr_scheduler(LRSchedulers.EXP, deprecated_name="exp") class ExponentialLRCallback(LRCallbackBase): """ Exponential decay learning rate scheduling. Decays the learning rate by `lr_decay_factor` every epoch. @@ -436,7 +436,7 @@ def is_lr_scheduling_enabled(self, context): return self.training_params.lr_warmup_epochs <= context.epoch < post_warmup_epochs -@register_lr_scheduler(LRSchedulers.POLY) +@register_lr_scheduler(LRSchedulers.POLY, deprecated_name="poly") class PolyLRCallback(LRCallbackBase): """ Hard coded polynomial decay learning rate scheduling (i.e at specific milestones). @@ -459,7 +459,7 @@ def is_lr_scheduling_enabled(self, context): return self.training_params.lr_warmup_epochs <= context.epoch < post_warmup_epochs -@register_lr_scheduler(LRSchedulers.COSINE) +@register_lr_scheduler(LRSchedulers.COSINE, deprecated_name="cosine") class CosineLRCallback(LRCallbackBase): """ Hard coded step Cosine anealing learning rate scheduling. @@ -497,7 +497,7 @@ def compute_learning_rate(cls, step: Union[float, np.ndarray], total_steps: floa return lr * (1 - final_lr_ratio) + (initial_lr * final_lr_ratio) -@register_lr_scheduler(LRSchedulers.FUNCTION) +@register_lr_scheduler(LRSchedulers.FUNCTION, deprecated_name="function") class FunctionLRCallback(LRCallbackBase): """ Hard coded rate scheduling for user defined lr scheduling function. 
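A minimal sketch of how the deprecation-aware registration introduced in the three patches above is meant to be used. The `MY_REGISTRY` dict, `register_my_object`, and `MyLoss` names are placeholders for illustration only; the imports and call pattern follow the patched `registry.py` / `base_factory.py` and mirror `tests/unit_tests/test_registry.py`:

```python
from super_gradients.common.registry.registry import create_register_decorator
from super_gradients.common.factories.base_factory import BaseFactory

# Hypothetical registry used only for this illustration.
MY_REGISTRY = {}
register_my_object = create_register_decorator(registry=MY_REGISTRY)


@register_my_object(name="MyLoss", deprecated_name="my_loss")
class MyLoss:
    def __init__(self, weight: float = 1.0):
        self.weight = weight


factory = BaseFactory(type_dict=MY_REGISTRY)
factory.get({"MyLoss": {"weight": 0.5}})   # resolves under the new name, no warning
factory.get({"my_loss": {"weight": 0.5}})  # still resolves, but emits a DeprecationWarning
```

The deprecated name stays registered so that existing configs keep working, while the `_deprecated_objects` entry lets the factories warn and point users to the new name.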
From 838ac9144c1be4a256eb569ccc2d6803450456ad Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Mon, 4 Sep 2023 14:46:11 +0300 Subject: [PATCH 04/21] update LR scheduler --- src/super_gradients/recipes/cityscapes_regseg48.yaml | 2 +- src/super_gradients/recipes/cityscapes_segformer_b0.yaml | 2 +- src/super_gradients/recipes/cityscapes_segformer_b1.yaml | 2 +- src/super_gradients/recipes/cityscapes_segformer_b2.yaml | 2 +- src/super_gradients/recipes/cityscapes_segformer_b3.yaml | 2 +- src/super_gradients/recipes/cityscapes_segformer_b4.yaml | 2 +- src/super_gradients/recipes/cityscapes_segformer_b5.yaml | 2 +- .../training_hyperparams/cifar10_resnet_train_params.yaml | 3 +-- .../training_hyperparams/cityscapes_default_train_params.yaml | 2 +- .../training_hyperparams/coco2017_dekr_pose_train_params.yaml | 2 +- .../training_hyperparams/coco2017_ppyoloe_train_params.yaml | 2 +- .../training_hyperparams/coco2017_rescoring_train_params.yaml | 2 +- .../coco2017_ssd_lite_mobilenet_v2_train_params.yaml | 2 +- .../training_hyperparams/coco2017_yolo_nas_train_params.yaml | 2 +- .../training_hyperparams/coco2017_yolox_train_params.yaml | 2 +- .../coco_segmentation_shelfnet_lw_train_params.yaml | 2 +- .../recipes/training_hyperparams/default_train_params.yaml | 2 +- .../imagenet_efficientnet_train_params.yaml | 2 +- .../imagenet_mobilenetv2_train_params.yaml | 2 +- .../imagenet_mobilenetv3_train_params.yaml | 2 +- .../training_hyperparams/imagenet_regnetY_train_params.yaml | 2 +- .../training_hyperparams/imagenet_repvgg_train_params.yaml | 2 +- .../imagenet_resnet50_kd_train_params.yaml | 4 ++-- .../training_hyperparams/imagenet_resnet50_train_params.yaml | 4 ++-- .../training_hyperparams/imagenet_vit_train_params.yaml | 2 +- .../supervisely_default_train_params.yaml | 2 +- 26 files changed, 28 insertions(+), 29 deletions(-) diff --git a/src/super_gradients/recipes/cityscapes_regseg48.yaml b/src/super_gradients/recipes/cityscapes_regseg48.yaml index 4e426100af..a9e8b0e393 100644 --- a/src/super_gradients/recipes/cityscapes_regseg48.yaml +++ b/src/super_gradients/recipes/cityscapes_regseg48.yaml @@ -52,7 +52,7 @@ training_hyperparams: sync_bn: True resume: ${resume} max_epochs: 800 - lr_mode: poly + lr_mode: PolyLRCallback initial_lr: 0.02 # for effective batch_size=16 lr_warmup_epochs: 0 optimizer: SGD diff --git a/src/super_gradients/recipes/cityscapes_segformer_b0.yaml b/src/super_gradients/recipes/cityscapes_segformer_b0.yaml index 2415776c9e..865732a5a3 100644 --- a/src/super_gradients/recipes/cityscapes_segformer_b0.yaml +++ b/src/super_gradients/recipes/cityscapes_segformer_b0.yaml @@ -18,7 +18,7 @@ experiment_name: ${architecture}_cityscapes training_hyperparams: max_epochs: 2 - lr_mode: poly + lr_mode: PolyLRCallback initial_lr: 0.00006 # for effective batch_size=8 multi_gpu: DDP diff --git a/src/super_gradients/recipes/cityscapes_segformer_b1.yaml b/src/super_gradients/recipes/cityscapes_segformer_b1.yaml index 0bb8b2fef0..d0347f8cb7 100644 --- a/src/super_gradients/recipes/cityscapes_segformer_b1.yaml +++ b/src/super_gradients/recipes/cityscapes_segformer_b1.yaml @@ -18,7 +18,7 @@ experiment_name: ${architecture}_cityscapes training_hyperparams: max_epochs: 2 - lr_mode: poly + lr_mode: PolyLRCallback initial_lr: 0.00006 # for effective batch_size=8 multi_gpu: DDP diff --git a/src/super_gradients/recipes/cityscapes_segformer_b2.yaml b/src/super_gradients/recipes/cityscapes_segformer_b2.yaml index 0a0a05ea90..683c8b4966 100644 --- a/src/super_gradients/recipes/cityscapes_segformer_b2.yaml +++ 
b/src/super_gradients/recipes/cityscapes_segformer_b2.yaml @@ -18,7 +18,7 @@ experiment_name: ${architecture}_cityscapes training_hyperparams: max_epochs: 2 - lr_mode: poly + lr_mode: PolyLRCallback initial_lr: 0.00006 # for effective batch_size=8 multi_gpu: DDP diff --git a/src/super_gradients/recipes/cityscapes_segformer_b3.yaml b/src/super_gradients/recipes/cityscapes_segformer_b3.yaml index dfde685aa3..682817388c 100644 --- a/src/super_gradients/recipes/cityscapes_segformer_b3.yaml +++ b/src/super_gradients/recipes/cityscapes_segformer_b3.yaml @@ -18,7 +18,7 @@ experiment_name: ${architecture}_cityscapes training_hyperparams: max_epochs: 2 - lr_mode: poly + lr_mode: PolyLRCallback initial_lr: 0.00006 # for effective batch_size=8 multi_gpu: DDP diff --git a/src/super_gradients/recipes/cityscapes_segformer_b4.yaml b/src/super_gradients/recipes/cityscapes_segformer_b4.yaml index 1c40dcef90..5a42f3bec3 100644 --- a/src/super_gradients/recipes/cityscapes_segformer_b4.yaml +++ b/src/super_gradients/recipes/cityscapes_segformer_b4.yaml @@ -18,7 +18,7 @@ experiment_name: ${architecture}_cityscapes training_hyperparams: max_epochs: 2 - lr_mode: poly + lr_mode: PolyLRCallback initial_lr: 0.00006 # for effective batch_size=8 mixed_precision: True diff --git a/src/super_gradients/recipes/cityscapes_segformer_b5.yaml b/src/super_gradients/recipes/cityscapes_segformer_b5.yaml index eba8aaab15..03cd95e87b 100644 --- a/src/super_gradients/recipes/cityscapes_segformer_b5.yaml +++ b/src/super_gradients/recipes/cityscapes_segformer_b5.yaml @@ -18,7 +18,7 @@ experiment_name: ${architecture}_cityscapes training_hyperparams: max_epochs: 2 - lr_mode: poly + lr_mode: PolyLRCallback initial_lr: 0.00006 # for effective batch_size=8 mixed_precision: True diff --git a/src/super_gradients/recipes/training_hyperparams/cifar10_resnet_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/cifar10_resnet_train_params.yaml index 55c65dc639..38fb211a75 100644 --- a/src/super_gradients/recipes/training_hyperparams/cifar10_resnet_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/cifar10_resnet_train_params.yaml @@ -10,7 +10,7 @@ lr_updates: step: 50 lr_decay_factor: 0.1 -lr_mode: step +lr_mode: StepLRCallback lr_warmup_epochs: 0 initial_lr: 0.1 loss: cross_entropy @@ -34,4 +34,3 @@ valid_metrics_list: # metrics for evaluation - Top5 _convert_: all - diff --git a/src/super_gradients/recipes/training_hyperparams/cityscapes_default_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/cityscapes_default_train_params.yaml index 47f4baaa03..dcccdde62a 100644 --- a/src/super_gradients/recipes/training_hyperparams/cityscapes_default_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/cityscapes_default_train_params.yaml @@ -2,7 +2,7 @@ defaults: - default_train_params max_epochs: 800 -lr_mode: poly +lr_mode: PolyLRCallback initial_lr: 0.01 # for effective batch_size=32 lr_warmup_epochs: 10 multiply_head_lr: 10. 
diff --git a/src/super_gradients/recipes/training_hyperparams/coco2017_dekr_pose_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/coco2017_dekr_pose_train_params.yaml index 045d74986d..03bfc50de0 100644 --- a/src/super_gradients/recipes/training_hyperparams/coco2017_dekr_pose_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/coco2017_dekr_pose_train_params.yaml @@ -8,7 +8,7 @@ ema_params: beta: 20 max_epochs: 150 -lr_mode: cosine +lr_mode: CosineLRCallback cosine_final_lr_ratio: 0.1 batch_accumulate: 1 initial_lr: 1e-3 diff --git a/src/super_gradients/recipes/training_hyperparams/coco2017_ppyoloe_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/coco2017_ppyoloe_train_params.yaml index 21f24ec157..bae2cda7f7 100644 --- a/src/super_gradients/recipes/training_hyperparams/coco2017_ppyoloe_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/coco2017_ppyoloe_train_params.yaml @@ -10,7 +10,7 @@ lr_warmup_steps: 1000 lr_warmup_epochs: 0 initial_lr: 2e-3 -lr_mode: cosine +lr_mode: CosineLRCallback cosine_final_lr_ratio: 0.1 zero_weight_decay_on_bias_and_bn: False diff --git a/src/super_gradients/recipes/training_hyperparams/coco2017_rescoring_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/coco2017_rescoring_train_params.yaml index 8257edd78e..f440564436 100644 --- a/src/super_gradients/recipes/training_hyperparams/coco2017_rescoring_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/coco2017_rescoring_train_params.yaml @@ -8,7 +8,7 @@ ema_params: beta: 20 max_epochs: 50 -lr_mode: cosine +lr_mode: CosineLRCallback cosine_final_lr_ratio: 0.1 batch_accumulate: 1 initial_lr: 0.001 diff --git a/src/super_gradients/recipes/training_hyperparams/coco2017_ssd_lite_mobilenet_v2_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/coco2017_ssd_lite_mobilenet_v2_train_params.yaml index f3e29743cb..1a6b39d4c1 100644 --- a/src/super_gradients/recipes/training_hyperparams/coco2017_ssd_lite_mobilenet_v2_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/coco2017_ssd_lite_mobilenet_v2_train_params.yaml @@ -3,7 +3,7 @@ defaults: ema: True max_epochs: 400 -lr_mode: cosine +lr_mode: CosineLRCallback cosine_final_lr_ratio: 0.01 batch_accumulate: 1 initial_lr: 0.01 diff --git a/src/super_gradients/recipes/training_hyperparams/coco2017_yolo_nas_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/coco2017_yolo_nas_train_params.yaml index 83d6ec799b..5885da0983 100644 --- a/src/super_gradients/recipes/training_hyperparams/coco2017_yolo_nas_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/coco2017_yolo_nas_train_params.yaml @@ -9,7 +9,7 @@ lr_warmup_steps: 1000 lr_warmup_epochs: 0 initial_lr: 2e-4 -lr_mode: cosine +lr_mode: CosineLRCallback cosine_final_lr_ratio: 0.1 zero_weight_decay_on_bias_and_bn: True diff --git a/src/super_gradients/recipes/training_hyperparams/coco2017_yolox_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/coco2017_yolox_train_params.yaml index cb0df61965..cd16842c37 100644 --- a/src/super_gradients/recipes/training_hyperparams/coco2017_yolox_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/coco2017_yolox_train_params.yaml @@ -2,7 +2,7 @@ defaults: - default_train_params max_epochs: 300 -lr_mode: cosine +lr_mode: CosineLRCallback cosine_final_lr_ratio: 0.05 lr_warmup_epochs: 5 lr_cooldown_epochs: 15 diff --git 
a/src/super_gradients/recipes/training_hyperparams/coco_segmentation_shelfnet_lw_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/coco_segmentation_shelfnet_lw_train_params.yaml index fd644b0913..ca6f3782c9 100644 --- a/src/super_gradients/recipes/training_hyperparams/coco_segmentation_shelfnet_lw_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/coco_segmentation_shelfnet_lw_train_params.yaml @@ -7,7 +7,7 @@ loss: shelfnet_ohem_loss optimizer: SGD mixed_precision: True batch_accumulate: 3 -lr_mode: poly +lr_mode: PolyLRCallback optimizer_params: momentum: 0.9 weight_decay: 1e-4 diff --git a/src/super_gradients/recipes/training_hyperparams/default_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/default_train_params.yaml index a37e6c05f5..0749ba7b11 100644 --- a/src/super_gradients/recipes/training_hyperparams/default_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/default_train_params.yaml @@ -13,7 +13,7 @@ resume_from_remote_sg_logger: False # bool (default=False), When true, ckpt_name ckpt_name: ckpt_latest.pth # The checkpoint (.pth file) filename in CKPT_ROOT_DIR/EXPERIMENT_NAME/ to use when resume=True and resume_path=None lr_mode: # Union[str, Mapping] - # when str: Learning rate scheduling policy, one of ['step','poly','cosine','function'] + # when str: Learning rate scheduling policy, one of ["StepLRCallback", "PolyLRCallback", "CosineLRCallback", "ExponentialLRCallback", "FunctionLRCallback"] # when Mapping: refers to a torch.optim.lr_scheduler._LRScheduler, following the below API: lr_mode = {LR_SCHEDULER_CLASS_NAME: {**LR_SCHEDULER_KWARGS, "phase": XXX, "metric_name": XXX) lr_schedule_function: # Learning rate scheduling function to be used when `lr_mode` is 'function'. 
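The renamed scheduler strings are not recipe-only: the same names apply wherever `training_hyperparams` are passed as a plain Python dict (see the updated examples and tests later in this series). A small illustrative dict with arbitrary values; the old lowercase names remain registered as deprecated aliases, so existing configs still resolve:

```python
train_params = {
    "max_epochs": 300,
    "initial_lr": 0.1,
    "lr_mode": "CosineLRCallback",               # previously "cosine"
    "cosine_final_lr_ratio": 0.01,
    "warmup_mode": "EpochStepWarmupLRCallback",  # previously "linear_epoch_step"
    "lr_warmup_epochs": 3,
}
```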
diff --git a/src/super_gradients/recipes/training_hyperparams/imagenet_efficientnet_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/imagenet_efficientnet_train_params.yaml index 3f8c1b122c..31a96c0f2f 100644 --- a/src/super_gradients/recipes/training_hyperparams/imagenet_efficientnet_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/imagenet_efficientnet_train_params.yaml @@ -2,7 +2,7 @@ defaults: - default_train_params max_epochs: 450 -lr_mode: step +lr_mode: StepLRCallback step_lr_update_freq: 2.4 initial_lr: 0.016 lr_warmup_epochs: 3 diff --git a/src/super_gradients/recipes/training_hyperparams/imagenet_mobilenetv2_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/imagenet_mobilenetv2_train_params.yaml index a0703b43b1..56a5a8e665 100644 --- a/src/super_gradients/recipes/training_hyperparams/imagenet_mobilenetv2_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/imagenet_mobilenetv2_train_params.yaml @@ -3,7 +3,7 @@ defaults: max_epochs: 450 -lr_mode: step +lr_mode: StepLRCallback initial_lr: 0.032 # for total batch-size of 512 lr_decay_factor: 0.973 lr_updates: diff --git a/src/super_gradients/recipes/training_hyperparams/imagenet_mobilenetv3_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/imagenet_mobilenetv3_train_params.yaml index b7aa565199..c6c5305660 100644 --- a/src/super_gradients/recipes/training_hyperparams/imagenet_mobilenetv3_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/imagenet_mobilenetv3_train_params.yaml @@ -2,7 +2,7 @@ defaults: - default_train_params max_epochs: 150 -lr_mode: cosine +lr_mode: CosineLRCallback initial_lr: 0.1 optimizer: SGD diff --git a/src/super_gradients/recipes/training_hyperparams/imagenet_regnetY_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/imagenet_regnetY_train_params.yaml index ad8d2f498c..d0b9add9de 100644 --- a/src/super_gradients/recipes/training_hyperparams/imagenet_regnetY_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/imagenet_regnetY_train_params.yaml @@ -2,7 +2,7 @@ defaults: - default_train_params max_epochs: 450 -lr_mode: step +lr_mode: StepLRCallback step_lr_update_freq: 2.4 initial_lr: 0.016 lr_warmup_epochs: 3 diff --git a/src/super_gradients/recipes/training_hyperparams/imagenet_repvgg_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/imagenet_repvgg_train_params.yaml index 215923583e..a219060350 100644 --- a/src/super_gradients/recipes/training_hyperparams/imagenet_repvgg_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/imagenet_repvgg_train_params.yaml @@ -2,7 +2,7 @@ defaults: - default_train_params max_epochs: 120 -lr_mode: cosine +lr_mode: CosineLRCallback initial_lr: 0.1 cosine_final_lr_ratio: 0 diff --git a/src/super_gradients/recipes/training_hyperparams/imagenet_resnet50_kd_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/imagenet_resnet50_kd_train_params.yaml index 84c1a09501..f36c971229 100644 --- a/src/super_gradients/recipes/training_hyperparams/imagenet_resnet50_kd_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/imagenet_resnet50_kd_train_params.yaml @@ -3,7 +3,7 @@ defaults: max_epochs: 610 initial_lr: 5e-3 -lr_mode: cosine +lr_mode: CosineLRCallback lr_warmup_epochs: 5 lr_cooldown_epochs: 10 ema: True @@ -21,4 +21,4 @@ valid_metrics_list: # metrics for evaluation - Top5 -_convert_: all \ No newline at end of file +_convert_: all diff --git 
a/src/super_gradients/recipes/training_hyperparams/imagenet_resnet50_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/imagenet_resnet50_train_params.yaml index 11df8abe3c..61948dbdaa 100644 --- a/src/super_gradients/recipes/training_hyperparams/imagenet_resnet50_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/imagenet_resnet50_train_params.yaml @@ -3,7 +3,7 @@ defaults: max_epochs: 400 initial_lr: 0.1 -lr_mode: cosine +lr_mode: CosineLRCallback lr_warmup_epochs: 5 ema: False save_ckpt_epoch_list: [ 50, 100, 150, 200, 300 ] @@ -21,4 +21,4 @@ valid_metrics_list: # metrics for evaluation metric_to_watch: Accuracy greater_metric_to_watch_is_better: True -_convert_: all \ No newline at end of file +_convert_: all diff --git a/src/super_gradients/recipes/training_hyperparams/imagenet_vit_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/imagenet_vit_train_params.yaml index 749a1dc921..126a6c6a41 100644 --- a/src/super_gradients/recipes/training_hyperparams/imagenet_vit_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/imagenet_vit_train_params.yaml @@ -3,7 +3,7 @@ defaults: max_epochs: 10 initial_lr: 0.03 -lr_mode: cosine +lr_mode: CosineLRCallback cosine_final_lr_ratio: 0 lr_warmup_epochs: 1 warmup_initial_lr: 0 diff --git a/src/super_gradients/recipes/training_hyperparams/supervisely_default_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/supervisely_default_train_params.yaml index 7a770dd305..3a5507e52c 100644 --- a/src/super_gradients/recipes/training_hyperparams/supervisely_default_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/supervisely_default_train_params.yaml @@ -2,7 +2,7 @@ defaults: - default_train_params max_epochs: 100 -lr_mode: cosine +lr_mode: CosineLRCallback cosine_final_lr_ratio: 0.01 initial_lr: 0.1 lr_warmup_epochs: 0 From 894e2eb75eb798a2d188f48c3ca39090adeea11e Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Mon, 4 Sep 2023 15:01:02 +0300 Subject: [PATCH 05/21] remove deprecated for .py and /md --- .../Example_Training-an-external-model.md | 2 +- documentation/source/LRScheduling.md | 4 ++-- documentation/source/Segmentation.md | 2 +- src/super_gradients/common/object_names.py | 10 ---------- .../ddrnet_classification_example.py | 2 +- .../deci_lab_export_example.py | 2 +- .../examples/early_stop/early_stop_example.py | 2 +- .../loggers_examples/clearml_logger_example.py | 2 +- .../deci_platform_logger_example.py | 2 +- .../regseg_transfer_learning_example.py | 2 +- .../recipes/roboflow_yolo_nas_m.yaml | 2 +- .../recipes/roboflow_yolo_nas_s.yaml | 2 +- .../coco2017_ppyoloe_train_params.yaml | 2 +- .../coco2017_yolo_nas_train_params.yaml | 2 +- .../default_train_params.yaml | 12 ++++++------ .../imagenet_vit_train_params.yaml | 2 +- src/super_gradients/training/params.py | 4 ++-- .../pre_launch_callbacks.py | 4 ++-- tests/end_to_end_tests/trainer_test.py | 2 +- .../conversion_callback_test.py | 4 ++-- .../integration_tests/deci_lab_export_test.py | 2 +- .../ema_train_integration_test.py | 2 +- tests/integration_tests/lr_test.py | 8 ++++---- .../pretrained_models_test.py | 12 ++++++------ .../coded_qat_launch_test.py | 4 ++-- tests/unit_tests/dataset_statistics_test.py | 2 +- tests/unit_tests/detection_dataset_test.py | 2 +- tests/unit_tests/double_training_test.py | 2 +- tests/unit_tests/early_stop_test.py | 2 +- tests/unit_tests/extreme_batch_cb_test.py | 4 ++-- tests/unit_tests/factories_test.py | 4 ++-- 
tests/unit_tests/forward_pass_prep_fn_test.py | 2 +- .../initialize_with_dataloaders_test.py | 2 +- tests/unit_tests/kd_ema_test.py | 2 +- tests/unit_tests/kd_trainer_test.py | 2 +- tests/unit_tests/load_ema_ckpt_test.py | 2 +- .../local_ckpt_head_replacement_test.py | 2 +- tests/unit_tests/loss_loggings_test.py | 6 +++--- tests/unit_tests/lr_cooldown_test.py | 2 +- tests/unit_tests/lr_warmup_test.py | 18 +++++++++--------- .../unit_tests/max_batches_loop_break_test.py | 4 ++-- .../optimizer_params_override_test.py | 4 ++-- tests/unit_tests/phase_context_test.py | 2 +- tests/unit_tests/preprocessing_unit_test.py | 4 ++-- tests/unit_tests/resume_training_test.py | 8 ++++---- tests/unit_tests/save_ckpt_test.py | 2 +- tests/unit_tests/train_after_test_test.py | 2 +- tests/unit_tests/train_logging_test.py | 2 +- .../train_with_intialized_param_args_test.py | 8 ++++---- tests/unit_tests/train_with_precise_bn_test.py | 4 ++-- .../update_param_groups_unit_test.py | 2 +- tests/unit_tests/vit_unit_test.py | 2 +- 52 files changed, 91 insertions(+), 101 deletions(-) diff --git a/documentation/source/Example_Training-an-external-model.md b/documentation/source/Example_Training-an-external-model.md index 4978ef0d88..c1bbc51919 100644 --- a/documentation/source/Example_Training-an-external-model.md +++ b/documentation/source/Example_Training-an-external-model.md @@ -640,7 +640,7 @@ And lastly, we need to define the training hyperparameters: ```python train_params = { "max_epochs": 100, - "lr_mode": "cosine", + "lr_mode": "CosineLRCallback", "initial_lr": 0.001, "optimizer": "Adam", "loss": CustomSegLoss(), diff --git a/documentation/source/LRScheduling.md b/documentation/source/LRScheduling.md index 4912ffccb0..e1992762c2 100644 --- a/documentation/source/LRScheduling.md +++ b/documentation/source/LRScheduling.md @@ -30,7 +30,7 @@ valid_dataloader = ... model = ... train_params = { "initial_lr": 0.1, - "lr_mode":"step", + "lr_mode":"StepLRCallback", "lr_updates": [100, 150, 200], "lr_decay_factor": 0.1, ..., @@ -66,7 +66,7 @@ Prerequisites: [phase callbacks](PhaseCallbacks.md), [training with configuratio In SG, learning rate schedulers are implemented as [phase callbacks](PhaseCallbacks.md). They read the learning rate from the `PhaseContext` in their `__call__` method, calculate the new learning rate according to the current state of training, and update the optimizer's param groups. -For example, the code snippet from the previous section translates "lr_mode":"step" to a `super_gradients.training.utils.callbacks.callbacks.StepLRCallback` instance, which is added to the phase callbacks list. +For example, the code snippet from the previous section translates "lr_mode":"StepLRCallback" to a `super_gradients.training.utils.callbacks.callbacks.StepLRCallback` instance, which is added to the phase callbacks list. 
### Implementing Your Own Scheduler A custom learning rate scheduler should inherit from `LRCallbackBase`, so let's take a look at it: diff --git a/documentation/source/Segmentation.md b/documentation/source/Segmentation.md index 6876615567..7529dce3b1 100644 --- a/documentation/source/Segmentation.md +++ b/documentation/source/Segmentation.md @@ -143,7 +143,7 @@ from super_gradients.training.metrics.segmentation_metrics import BinaryIOU train_params = { "max_epochs": 30, - "lr_mode": "cosine", + "lr_mode": "CosineLRCallback", "initial_lr": 0.005, "lr_warmup_epochs": 5, "multiply_head_lr": 10, diff --git a/src/super_gradients/common/object_names.py b/src/super_gradients/common/object_names.py index 7ff406830f..120b68d854 100644 --- a/src/super_gradients/common/object_names.py +++ b/src/super_gradients/common/object_names.py @@ -176,12 +176,6 @@ class LRSchedulers: EXP = "ExponentialLRCallback" FUNCTION = "FunctionLRCallback" - _DEPRECATED_STEP = "step" - _DEPRECATED_POLY = "poly" - _DEPRECATED_COSINE = "cosine" - _DEPRECATED_EXP = "exp" - _DEPRECATED_FUNCTION = "function" - class LRWarmups: """Static class to hold all the supported LR Warmup names""" @@ -190,10 +184,6 @@ class LRWarmups: LINEAR_EPOCH_STEP = "EpochStepWarmupLRCallback" LINEAR_BATCH_STEP = "BatchStepLinearWarmupLRCallback" - _DEPRECATED_LINEAR_STEP = "linear_step" - _DEPRECATED_LINEAR_EPOCH_STEP = "linear_epoch_step" - _DEPRECATED_LINEAR_BATCH_STEP = "linear_batch_step" - class Samplers: """Static class to hold all the supported Samplers names""" diff --git a/src/super_gradients/examples/ddrnet_imagenet/ddrnet_classification_example.py b/src/super_gradients/examples/ddrnet_imagenet/ddrnet_classification_example.py index eba1463bdf..f1a0e33242 100644 --- a/src/super_gradients/examples/ddrnet_imagenet/ddrnet_classification_example.py +++ b/src/super_gradients/examples/ddrnet_imagenet/ddrnet_classification_example.py @@ -39,7 +39,7 @@ train_params_ddr = { "max_epochs": args.max_epochs, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_updates": [30, 60, 90], "lr_decay_factor": 0.1, "initial_lr": 0.1 * devices, diff --git a/src/super_gradients/examples/deci_lab_export_example/deci_lab_export_example.py b/src/super_gradients/examples/deci_lab_export_example/deci_lab_export_example.py index 8080cf2ae8..1a7ce24eb7 100644 --- a/src/super_gradients/examples/deci_lab_export_example/deci_lab_export_example.py +++ b/src/super_gradients/examples/deci_lab_export_example/deci_lab_export_example.py @@ -56,7 +56,7 @@ def main(architecture_name: str): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "cross_entropy", diff --git a/src/super_gradients/examples/early_stop/early_stop_example.py b/src/super_gradients/examples/early_stop/early_stop_example.py index e2cbb782cf..ca6553e407 100644 --- a/src/super_gradients/examples/early_stop/early_stop_example.py +++ b/src/super_gradients/examples/early_stop/early_stop_example.py @@ -18,7 +18,7 @@ "max_epochs": 250, "lr_updates": [100, 150, 200], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "cross_entropy", diff --git a/src/super_gradients/examples/loggers_examples/clearml_logger_example.py b/src/super_gradients/examples/loggers_examples/clearml_logger_example.py index 700bb6f584..5727d9fcf2 100644 --- a/src/super_gradients/examples/loggers_examples/clearml_logger_example.py +++ 
b/src/super_gradients/examples/loggers_examples/clearml_logger_example.py @@ -11,7 +11,7 @@ "max_epochs": 20, "lr_updates": [5, 10, 15], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "initial_lr": 0.1, "loss": "cross_entropy", "optimizer": "SGD", diff --git a/src/super_gradients/examples/loggers_examples/deci_platform_logger_example.py b/src/super_gradients/examples/loggers_examples/deci_platform_logger_example.py index 988373e9ee..3eed37ed00 100644 --- a/src/super_gradients/examples/loggers_examples/deci_platform_logger_example.py +++ b/src/super_gradients/examples/loggers_examples/deci_platform_logger_example.py @@ -14,7 +14,7 @@ "max_epochs": 20, "lr_updates": [5, 10, 15], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "initial_lr": 0.1, "loss": "cross_entropy", "optimizer": "SGD", diff --git a/src/super_gradients/examples/regseg_transfer_learning_example/regseg_transfer_learning_example.py b/src/super_gradients/examples/regseg_transfer_learning_example/regseg_transfer_learning_example.py index e8bd1b167e..0265a31a41 100644 --- a/src/super_gradients/examples/regseg_transfer_learning_example/regseg_transfer_learning_example.py +++ b/src/super_gradients/examples/regseg_transfer_learning_example/regseg_transfer_learning_example.py @@ -39,7 +39,7 @@ # DEFINE TRAINING PARAMS. SEE DOCS FOR THE FULL LIST. train_params = { "max_epochs": 50, - "lr_mode": "cosine", + "lr_mode": "CosineLRCallback", "initial_lr": 0.0064, # for batch_size=16 "optimizer_params": {"momentum": 0.843, "weight_decay": 0.00036, "nesterov": True}, "cosine_final_lr_ratio": 0.1, diff --git a/src/super_gradients/recipes/roboflow_yolo_nas_m.yaml b/src/super_gradients/recipes/roboflow_yolo_nas_m.yaml index 0c0c43c569..79582bab0b 100644 --- a/src/super_gradients/recipes/roboflow_yolo_nas_m.yaml +++ b/src/super_gradients/recipes/roboflow_yolo_nas_m.yaml @@ -46,7 +46,7 @@ training_hyperparams: zero_weight_decay_on_bias_and_bn: True lr_warmup_epochs: 3 - warmup_mode: linear_epoch_step + warmup_mode: EpochStepWarmupLRCallback initial_lr: 4e-4 cosine_final_lr_ratio: 0.1 diff --git a/src/super_gradients/recipes/roboflow_yolo_nas_s.yaml b/src/super_gradients/recipes/roboflow_yolo_nas_s.yaml index cf94960756..a3a54f5140 100644 --- a/src/super_gradients/recipes/roboflow_yolo_nas_s.yaml +++ b/src/super_gradients/recipes/roboflow_yolo_nas_s.yaml @@ -46,7 +46,7 @@ training_hyperparams: zero_weight_decay_on_bias_and_bn: True lr_warmup_epochs: 3 - warmup_mode: linear_epoch_step + warmup_mode: EpochStepWarmupLRCallback initial_lr: 5e-4 cosine_final_lr_ratio: 0.1 diff --git a/src/super_gradients/recipes/training_hyperparams/coco2017_ppyoloe_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/coco2017_ppyoloe_train_params.yaml index bae2cda7f7..53d6fcf965 100644 --- a/src/super_gradients/recipes/training_hyperparams/coco2017_ppyoloe_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/coco2017_ppyoloe_train_params.yaml @@ -4,7 +4,7 @@ defaults: max_epochs: 500 static_assigner_end_epoch: 150 -warmup_mode: "linear_batch_step" +warmup_mode: BatchStepLinearWarmupLRCallback warmup_initial_lr: 1e-6 lr_warmup_steps: 1000 lr_warmup_epochs: 0 diff --git a/src/super_gradients/recipes/training_hyperparams/coco2017_yolo_nas_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/coco2017_yolo_nas_train_params.yaml index 5885da0983..9eea40f750 100644 --- a/src/super_gradients/recipes/training_hyperparams/coco2017_yolo_nas_train_params.yaml +++ 
b/src/super_gradients/recipes/training_hyperparams/coco2017_yolo_nas_train_params.yaml @@ -3,7 +3,7 @@ defaults: max_epochs: 300 -warmup_mode: "linear_batch_step" +warmup_mode: BatchStepLinearWarmupLRCallback warmup_initial_lr: 1e-6 lr_warmup_steps: 1000 lr_warmup_epochs: 0 diff --git a/src/super_gradients/recipes/training_hyperparams/default_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/default_train_params.yaml index 0749ba7b11..c3de57c2d4 100644 --- a/src/super_gradients/recipes/training_hyperparams/default_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/default_train_params.yaml @@ -16,14 +16,14 @@ lr_mode: # Union[str, Mapping] # when str: Learning rate scheduling policy, one of ["StepLRCallback", "PolyLRCallback", "CosineLRCallback", "ExponentialLRCallback", "FunctionLRCallback"] # when Mapping: refers to a torch.optim.lr_scheduler._LRScheduler, following the below API: lr_mode = {LR_SCHEDULER_CLASS_NAME: {**LR_SCHEDULER_KWARGS, "phase": XXX, "metric_name": XXX) -lr_schedule_function: # Learning rate scheduling function to be used when `lr_mode` is 'function'. +lr_schedule_function: # Learning rate scheduling function to be used when `lr_mode` is 'FunctionLRCallback'. lr_warmup_epochs: 0 # number of epochs for learning rate warm up - see https://arxiv.org/pdf/1706.02677.pdf (Section 2.2). -lr_warmup_steps: 0 # number of warmup steps (Used when warmup_mode=linear_batch_step) +lr_warmup_steps: 0 # number of warmup steps (Used when warmup_mode=BatchStepLinearWarmupLRCallback) lr_cooldown_epochs: 0 # epochs to cooldown LR (i.e the last epoch from scheduling view point=max_epochs-cooldown) -warmup_initial_lr: # Initial lr for linear_epoch_step/linear_batch_step. When none is given, initial_lr/(warmup_epochs+1) will be used. -step_lr_update_freq: # (float) update frequency in epoch units for computing lr_updates when lr_mode=`step`. -cosine_final_lr_ratio: 0.01 # final learning rate ratio (only relevant when `lr_mode`='cosine') -warmup_mode: linear_epoch_step # learning rate warmup scheme, currently 'linear_epoch_step' and 'linear_batch_step' are supported +warmup_initial_lr: # Initial lr for EpochStepWarmupLRCallback/BatchStepLinearWarmupLRCallback. When none is given, initial_lr/(warmup_epochs+1) will be used. +step_lr_update_freq: # (float) update frequency in epoch units for computing lr_updates when lr_mode=`StepLRCallback`. +cosine_final_lr_ratio: 0.01 # final learning rate ratio (only relevant when `lr_mode`='CosineLRCallback') +warmup_mode: EpochStepWarmupLRCallback # learning rate warmup scheme, currently ['LinearStepWarmupLRCallback', 'EpochStepWarmupLRCallback', 'BatchStepLinearWarmupLRCallback'] are supported lr_updates: _target_: super_gradients.training.utils.utils.empty_list # This is a workaround to instantiate a list using _target_. 
If we would instantiate as "lr_updates: []", diff --git a/src/super_gradients/recipes/training_hyperparams/imagenet_vit_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/imagenet_vit_train_params.yaml index 126a6c6a41..97c19b5347 100644 --- a/src/super_gradients/recipes/training_hyperparams/imagenet_vit_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/imagenet_vit_train_params.yaml @@ -7,7 +7,7 @@ lr_mode: CosineLRCallback cosine_final_lr_ratio: 0 lr_warmup_epochs: 1 warmup_initial_lr: 0 -warmup_mode: linear_epoch_step +warmup_mode: EpochStepWarmupLRCallback ema: False loss: cross_entropy clip_grad_norm: 1 diff --git a/src/super_gradients/training/params.py b/src/super_gradients/training/params.py index 21c18d05c6..907d3ffaa4 100755 --- a/src/super_gradients/training/params.py +++ b/src/super_gradients/training/params.py @@ -48,7 +48,7 @@ "save_tensorboard_remote": False, # upload tensorboard files to s3 "save_logs_remote": False, }, # upload log files to s3 - "warmup_mode": "linear_step", + "warmup_mode": "LinearStepWarmupLRCallback", "step_lr_update_freq": None, "lr_updates": [], "clip_grad_norm": None, @@ -100,7 +100,7 @@ "lr_warmup_epochs": {"type": "number", "minimum": 0, "maximum": 10}, "initial_lr": {"type": "number", "exclusiveMinimum": 0, "maximum": 10}, }, - "if": {"properties": {"lr_mode": {"const": "step"}}}, + "if": {"properties": {"lr_mode": {"const": "StepLRCallback"}}}, "then": {"required": ["lr_updates", "lr_decay_factor"]}, "required": ["max_epochs", "lr_mode", "initial_lr", "loss"], } diff --git a/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py b/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py index 486db3f085..e8420ba62e 100644 --- a/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py +++ b/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py @@ -300,8 +300,8 @@ def modify_params_for_qat( logger.warning(f"New learning rate: {training_hyperparams['initial_lr']}") logger.warning(f"New weight decay: {training_hyperparams['optimizer_params']['weight_decay']}") # as recommended by pytorch-quantization docs - if get_param(training_hyperparams, "lr_mode") != "cosine": - training_hyperparams["lr_mode"] = "cosine" + if get_param(training_hyperparams, "lr_mode") != "CosineLRCallback": + training_hyperparams["lr_mode"] = "CosineLRCallback" training_hyperparams["cosine_final_lr_ratio"] = cosine_final_lr_ratio logger.warning( f"lr_mode will be set to cosine for QAT run instead of {get_param(training_hyperparams, 'lr_mode')} with " diff --git a/tests/end_to_end_tests/trainer_test.py b/tests/end_to_end_tests/trainer_test.py index 122f1e7cca..f1f57eee86 100644 --- a/tests/end_to_end_tests/trainer_test.py +++ b/tests/end_to_end_tests/trainer_test.py @@ -25,7 +25,7 @@ def setUp(cls): "lr_decay_factor": 0.1, "initial_lr": 0.1, "lr_updates": [4], - "lr_mode": "step", + "lr_mode": "StepLRCallback", "loss": "cross_entropy", "train_metrics_list": [Accuracy(), Top5()], "valid_metrics_list": [Accuracy(), Top5()], diff --git a/tests/integration_tests/conversion_callback_test.py b/tests/integration_tests/conversion_callback_test.py index 709287b4fc..fe2d6bc4cb 100644 --- a/tests/integration_tests/conversion_callback_test.py +++ b/tests/integration_tests/conversion_callback_test.py @@ -54,7 +54,7 @@ def test_classification_architectures(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", 
"lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "cross_entropy", @@ -107,7 +107,7 @@ def get_architecture_custom_config(architecture_name: str): train_params = { "max_epochs": 3, "initial_lr": 1e-2, - "lr_mode": "poly", + "lr_mode": "PolyLRCallback", "ema": True, # unlike the paper (not specified in paper) "optimizer": "SGD", "optimizer_params": {"weight_decay": 5e-4, "momentum": 0.9}, diff --git a/tests/integration_tests/deci_lab_export_test.py b/tests/integration_tests/deci_lab_export_test.py index 3130b9d785..8f18dfcd29 100644 --- a/tests/integration_tests/deci_lab_export_test.py +++ b/tests/integration_tests/deci_lab_export_test.py @@ -44,7 +44,7 @@ def test_train_with_deci_lab_integration(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "cross_entropy", diff --git a/tests/integration_tests/ema_train_integration_test.py b/tests/integration_tests/ema_train_integration_test.py index 777e5b319c..50c778f5c5 100644 --- a/tests/integration_tests/ema_train_integration_test.py +++ b/tests/integration_tests/ema_train_integration_test.py @@ -49,7 +49,7 @@ def _train(self, ema_params): training_params = { "max_epochs": 4, "lr_updates": [4], - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_decay_factor": 0.1, "lr_warmup_epochs": 0, "initial_lr": 0.1, diff --git a/tests/integration_tests/lr_test.py b/tests/integration_tests/lr_test.py index 82bdaec94d..20a9afb2a6 100644 --- a/tests/integration_tests/lr_test.py +++ b/tests/integration_tests/lr_test.py @@ -45,12 +45,12 @@ def test_lr_function(initial_lr, epoch, iter, max_epoch, iters_per_epoch, **kwar return initial_lr * (1 - ((epoch * iters_per_epoch + iter) / (max_epoch * iters_per_epoch))) # test if we are able that lr_function supports functions with this structure - training_params = {**self.training_params, "lr_mode": "function", "lr_schedule_function": test_lr_function} + training_params = {**self.training_params, "lr_mode": "FunctionLRCallback", "lr_schedule_function": test_lr_function} trainer.train( model=model, training_params=training_params, train_loader=classification_test_dataloader(), valid_loader=classification_test_dataloader() ) # test that we assert lr_function is callable - training_params = {**self.training_params, "lr_mode": "function"} + training_params = {**self.training_params, "lr_mode": "FunctionLRCallback"} with self.assertRaises(AssertionError): trainer.train( model=model, training_params=training_params, train_loader=classification_test_dataloader(), valid_loader=classification_test_dataloader() @@ -58,14 +58,14 @@ def test_lr_function(initial_lr, epoch, iter, max_epoch, iters_per_epoch, **kwar def test_cosine_lr(self): trainer, model = self.get_trainer(self.folder_name) - training_params = {**self.training_params, "lr_mode": "cosine", "cosine_final_lr_ratio": 0.01} + training_params = {**self.training_params, "lr_mode": "CosineLRCallback", "cosine_final_lr_ratio": 0.01} trainer.train( model=model, training_params=training_params, train_loader=classification_test_dataloader(), valid_loader=classification_test_dataloader() ) def test_step_lr(self): trainer, model = self.get_trainer(self.folder_name) - training_params = {**self.training_params, "lr_mode": "step", "lr_decay_factor": 0.1, "lr_updates": [4]} + training_params = {**self.training_params, "lr_mode": "StepLRCallback", "lr_decay_factor": 0.1, "lr_updates": [4]} trainer.train( model=model, training_params=training_params, 
train_loader=classification_test_dataloader(), valid_loader=classification_test_dataloader() ) diff --git a/tests/integration_tests/pretrained_models_test.py b/tests/integration_tests/pretrained_models_test.py index fbf7abd96f..dfe37846cc 100644 --- a/tests/integration_tests/pretrained_models_test.py +++ b/tests/integration_tests/pretrained_models_test.py @@ -87,7 +87,7 @@ def setUp(self) -> None: "lr_decay_factor": 0.1, "initial_lr": 0.6, "loss": "cross_entropy", - "lr_mode": "step", + "lr_mode": "StepLRCallback", "optimizer_params": {"weight_decay": 0.000, "momentum": 0.9}, "train_metrics_list": [Accuracy()], "valid_metrics_list": [Accuracy()], @@ -128,7 +128,7 @@ def setUp(self) -> None: ssd_dboxes = DEFAULT_SSD_LITE_MOBILENET_V2_ARCH_PARAMS["heads"]["SSDHead"]["anchors"] self.transfer_detection_train_params_ssd = { "max_epochs": 3, - "lr_mode": "cosine", + "lr_mode": "CosineLRCallback", "initial_lr": 0.01, "cosine_final_lr_ratio": 0.01, "lr_warmup_epochs": 3, @@ -145,7 +145,7 @@ def setUp(self) -> None: } self.transfer_detection_train_params_yolox = { "max_epochs": 3, - "lr_mode": "cosine", + "lr_mode": "CosineLRCallback", "cosine_final_lr_ratio": 0.05, "warmup_bias_lr": 0.0, "warmup_momentum": 0.9, @@ -215,7 +215,7 @@ def setUp(self) -> None: "max_epochs": 3, "initial_lr": 1e-2, "loss": DDRNetLoss(), - "lr_mode": "poly", + "lr_mode": "PolyLRCallback", "ema": True, # unlike the paper (not specified in paper) "average_best_models": True, "optimizer": "SGD", @@ -232,7 +232,7 @@ def setUp(self) -> None: "max_epochs": 3, "initial_lr": 1e-2, "loss": STDCLoss(num_classes=5), - "lr_mode": "poly", + "lr_mode": "PolyLRCallback", "ema": True, # unlike the paper (not specified in paper) "optimizer": "SGD", "optimizer_params": {"weight_decay": 5e-4, "momentum": 0.9}, @@ -247,7 +247,7 @@ def setUp(self) -> None: "max_epochs": 3, "initial_lr": 1e-2, "loss": "cross_entropy", - "lr_mode": "poly", + "lr_mode": "PolyLRCallback", "ema": True, # unlike the paper (not specified in paper) "optimizer": "SGD", "optimizer_params": {"weight_decay": 5e-4, "momentum": 0.9}, diff --git a/tests/recipe_training_tests/coded_qat_launch_test.py b/tests/recipe_training_tests/coded_qat_launch_test.py index e5bb8531c1..87728c7dd1 100644 --- a/tests/recipe_training_tests/coded_qat_launch_test.py +++ b/tests/recipe_training_tests/coded_qat_launch_test.py @@ -17,7 +17,7 @@ def test_qat_launch(self): "max_epochs": 10, "lr_updates": [], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "cross_entropy", @@ -78,7 +78,7 @@ def test_ptq_launch(self): "max_epochs": 10, "lr_updates": [], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "cross_entropy", diff --git a/tests/unit_tests/dataset_statistics_test.py b/tests/unit_tests/dataset_statistics_test.py index b3881b4c9a..a475bc1277 100644 --- a/tests/unit_tests/dataset_statistics_test.py +++ b/tests/unit_tests/dataset_statistics_test.py @@ -24,7 +24,7 @@ def test_dataset_statistics_tensorboard_logger(self): training_params = { "max_epochs": 1, # we dont really need the actual training to run - "lr_mode": "cosine", + "lr_mode": "CosineLRCallback", "initial_lr": 0.01, "loss": "yolox_loss", "criterion_params": {"strides": [8, 16, 32], "num_classes": 80}, diff --git a/tests/unit_tests/detection_dataset_test.py b/tests/unit_tests/detection_dataset_test.py index 6e6efdd523..5f418635ed 100644 --- 
a/tests/unit_tests/detection_dataset_test.py +++ b/tests/unit_tests/detection_dataset_test.py @@ -168,7 +168,7 @@ def test_coco_detection_metrics_with_classwise_ap(self): detection_train_params_yolox = { "max_epochs": 5, - "lr_mode": "cosine", + "lr_mode": "CosineLRCallback", "cosine_final_lr_ratio": 0.05, "warmup_bias_lr": 0.0, "warmup_momentum": 0.9, diff --git a/tests/unit_tests/double_training_test.py b/tests/unit_tests/double_training_test.py index b556aaabc2..44bb373565 100644 --- a/tests/unit_tests/double_training_test.py +++ b/tests/unit_tests/double_training_test.py @@ -24,7 +24,7 @@ def test_call_train_twice(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": torch.nn.CrossEntropyLoss(), diff --git a/tests/unit_tests/early_stop_test.py b/tests/unit_tests/early_stop_test.py index 1feeb6a9df..22d1777637 100644 --- a/tests/unit_tests/early_stop_test.py +++ b/tests/unit_tests/early_stop_test.py @@ -49,7 +49,7 @@ def setUp(self) -> None: "max_epochs": self.max_epochs, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "cross_entropy", diff --git a/tests/unit_tests/extreme_batch_cb_test.py b/tests/unit_tests/extreme_batch_cb_test.py index b4692b6274..db767f3b36 100644 --- a/tests/unit_tests/extreme_batch_cb_test.py +++ b/tests/unit_tests/extreme_batch_cb_test.py @@ -40,7 +40,7 @@ def setUpClass(cls): "max_epochs": 3, "initial_lr": 1e-2, "loss": DDRNetLoss(), - "lr_mode": "poly", + "lr_mode": "PolyLRCallback", "ema": True, "optimizer": "SGD", "mixed_precision": False, @@ -56,7 +56,7 @@ def setUpClass(cls): "max_epochs": 3, "initial_lr": 1e-2, "loss": PPYoloELoss(num_classes=1, use_static_assigner=False, reg_max=16), - "lr_mode": "poly", + "lr_mode": "PolyLRCallback", "ema": True, "optimizer": "SGD", "mixed_precision": False, diff --git a/tests/unit_tests/factories_test.py b/tests/unit_tests/factories_test.py index c0def96302..22a44e469d 100644 --- a/tests/unit_tests/factories_test.py +++ b/tests/unit_tests/factories_test.py @@ -21,7 +21,7 @@ def test_training_with_factories(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "cross_entropy", @@ -47,7 +47,7 @@ def test_training_with_factories_with_typos(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "crossEnt_ropy", diff --git a/tests/unit_tests/forward_pass_prep_fn_test.py b/tests/unit_tests/forward_pass_prep_fn_test.py index 97c8a27af7..b15c0fffeb 100644 --- a/tests/unit_tests/forward_pass_prep_fn_test.py +++ b/tests/unit_tests/forward_pass_prep_fn_test.py @@ -38,7 +38,7 @@ def test_resizing_with_forward_pass_prep_fn(self): train_params = { "max_epochs": 2, "cosine_final_lr_ratio": 0.2, - "lr_mode": "cosine", + "lr_mode": "CosineLRCallback", "lr_cooldown_epochs": 2, "lr_warmup_epochs": 3, "initial_lr": 1, diff --git a/tests/unit_tests/initialize_with_dataloaders_test.py b/tests/unit_tests/initialize_with_dataloaders_test.py index 6d9c14034d..cfecb17110 100644 --- a/tests/unit_tests/initialize_with_dataloaders_test.py +++ b/tests/unit_tests/initialize_with_dataloaders_test.py @@ -35,7 +35,7 @@ def test_train_with_dataloaders(self): "max_epochs": 2, "lr_updates": [5, 6, 12], "lr_decay_factor": 
0.01, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "initial_lr": 0.01, "loss": "cross_entropy", "optimizer": "SGD", diff --git a/tests/unit_tests/kd_ema_test.py b/tests/unit_tests/kd_ema_test.py index 1f59084fe7..ebbd642a8c 100644 --- a/tests/unit_tests/kd_ema_test.py +++ b/tests/unit_tests/kd_ema_test.py @@ -20,7 +20,7 @@ def setUp(cls): "max_epochs": 3, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": KDLogitsLoss(torch.nn.CrossEntropyLoss()), diff --git a/tests/unit_tests/kd_trainer_test.py b/tests/unit_tests/kd_trainer_test.py index 3b866e7b2a..454078eb94 100644 --- a/tests/unit_tests/kd_trainer_test.py +++ b/tests/unit_tests/kd_trainer_test.py @@ -42,7 +42,7 @@ def setUp(cls): "max_epochs": 3, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": KDLogitsLoss(torch.nn.CrossEntropyLoss()), diff --git a/tests/unit_tests/load_ema_ckpt_test.py b/tests/unit_tests/load_ema_ckpt_test.py index b070c8d862..d310e1ced0 100644 --- a/tests/unit_tests/load_ema_ckpt_test.py +++ b/tests/unit_tests/load_ema_ckpt_test.py @@ -23,7 +23,7 @@ def setUp(self) -> None: "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "cross_entropy", diff --git a/tests/unit_tests/local_ckpt_head_replacement_test.py b/tests/unit_tests/local_ckpt_head_replacement_test.py index 8ba7371683..6f8053c2d7 100644 --- a/tests/unit_tests/local_ckpt_head_replacement_test.py +++ b/tests/unit_tests/local_ckpt_head_replacement_test.py @@ -14,7 +14,7 @@ def test_local_ckpt_head_replacement(self): "max_epochs": 1, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "cross_entropy", diff --git a/tests/unit_tests/loss_loggings_test.py b/tests/unit_tests/loss_loggings_test.py index 54d476f0ad..151cff0475 100644 --- a/tests/unit_tests/loss_loggings_test.py +++ b/tests/unit_tests/loss_loggings_test.py @@ -35,7 +35,7 @@ def test_single_item_logging(self): "max_epochs": 1, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": torch.nn.CrossEntropyLoss(), @@ -59,7 +59,7 @@ def test_multiple_unnamed_components_loss_logging(self): "max_epochs": 1, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": CriterionWithUnnamedComponents(), @@ -83,7 +83,7 @@ def test_multiple_named_components_loss_logging(self): "max_epochs": 1, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": CriterionWithNamedComponents(), diff --git a/tests/unit_tests/lr_cooldown_test.py b/tests/unit_tests/lr_cooldown_test.py index 2f04d56d5b..b8125b4da5 100644 --- a/tests/unit_tests/lr_cooldown_test.py +++ b/tests/unit_tests/lr_cooldown_test.py @@ -19,7 +19,7 @@ def test_lr_cooldown_with_lr_scheduling(self): train_params = { "max_epochs": 7, "cosine_final_lr_ratio": 0.2, - "lr_mode": "cosine", + "lr_mode": "CosineLRCallback", "lr_cooldown_epochs": 2, "lr_warmup_epochs": 3, "initial_lr": 1, diff --git a/tests/unit_tests/lr_warmup_test.py b/tests/unit_tests/lr_warmup_test.py index c8473cdb53..a1a95b82c3 100644 --- 
a/tests/unit_tests/lr_warmup_test.py +++ b/tests/unit_tests/lr_warmup_test.py @@ -58,7 +58,7 @@ def test_lr_warmup(self): "max_epochs": 5, "lr_updates": [], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 3, "initial_lr": 1, "loss": "cross_entropy", @@ -71,7 +71,7 @@ def test_lr_warmup(self): "greater_metric_to_watch_is_better": True, "ema": False, "phase_callbacks": phase_callbacks, - "warmup_mode": "linear_epoch_step", + "warmup_mode": "EpochStepWarmupLRCallback", } expected_lrs = [0.25, 0.5, 0.75, 1.0, 1.0] @@ -94,7 +94,7 @@ def test_lr_warmup_with_lr_scheduling(self): train_params = { "max_epochs": 5, "cosine_final_lr_ratio": 0.2, - "lr_mode": "cosine", + "lr_mode": "CosineLRCallback", "lr_warmup_epochs": 3, "initial_lr": 1, "loss": "cross_entropy", @@ -107,7 +107,7 @@ def test_lr_warmup_with_lr_scheduling(self): "greater_metric_to_watch_is_better": True, "ema": False, "phase_callbacks": phase_callbacks, - "warmup_mode": "linear_epoch_step", + "warmup_mode": "EpochStepWarmupLRCallback", } expected_lrs = [0.25, 0.5, 0.75, 0.9236067977499791, 0.4763932022500211] @@ -137,10 +137,10 @@ def test_warmup_linear_batch_step(self): train_params = { "max_epochs": max_epochs, - "lr_mode": "cosine", + "lr_mode": "CosineLRCallback", "cosine_final_lr_ratio": cosine_final_lr_ratio, "warmup_initial_lr": warmup_initial_lr, - "warmup_mode": "linear_batch_step", + "warmup_mode": "BatchStepLinearWarmupLRCallback", "lr_warmup_steps": lr_warmup_steps, "initial_lr": 1, "loss": "cross_entropy", @@ -186,7 +186,7 @@ def test_warmup_linear_epoch_step(self): "max_epochs": 5, "lr_updates": [], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 3, "initial_lr": 1, "warmup_initial_lr": 4.0, @@ -200,7 +200,7 @@ def test_warmup_linear_epoch_step(self): "greater_metric_to_watch_is_better": True, "ema": False, "phase_callbacks": [collect_lr_callback], - "warmup_mode": "linear_epoch_step", + "warmup_mode": "EpochStepWarmupLRCallback", } expected_lrs = [4.0, 3.0, 2.0, 1.0, 1.0] @@ -224,7 +224,7 @@ def test_custom_lr_warmup(self): "max_epochs": 5, "lr_updates": [], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 3, "loss": "cross_entropy", "optimizer": "SGD", diff --git a/tests/unit_tests/max_batches_loop_break_test.py b/tests/unit_tests/max_batches_loop_break_test.py index 075b5a590b..2c71a02911 100644 --- a/tests/unit_tests/max_batches_loop_break_test.py +++ b/tests/unit_tests/max_batches_loop_break_test.py @@ -23,7 +23,7 @@ def test_max_train_batches_loop_break(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "cross_entropy", @@ -58,7 +58,7 @@ def test_max_valid_batches_loop_break(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "cross_entropy", diff --git a/tests/unit_tests/optimizer_params_override_test.py b/tests/unit_tests/optimizer_params_override_test.py index a3bcf9789c..d63fa3677e 100644 --- a/tests/unit_tests/optimizer_params_override_test.py +++ b/tests/unit_tests/optimizer_params_override_test.py @@ -16,7 +16,7 @@ def test_optimizer_params_partial_override(self): "max_epochs": 1, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": 
"cross_entropy", @@ -45,7 +45,7 @@ def test_optimizer_params_full_override(self): "max_epochs": 1, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "cross_entropy", diff --git a/tests/unit_tests/phase_context_test.py b/tests/unit_tests/phase_context_test.py index a9d37f7f6f..2f8199dead 100644 --- a/tests/unit_tests/phase_context_test.py +++ b/tests/unit_tests/phase_context_test.py @@ -28,7 +28,7 @@ def context_information_in_train_test(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "cross_entropy", diff --git a/tests/unit_tests/preprocessing_unit_test.py b/tests/unit_tests/preprocessing_unit_test.py index 9416f309ba..60df92f286 100644 --- a/tests/unit_tests/preprocessing_unit_test.py +++ b/tests/unit_tests/preprocessing_unit_test.py @@ -97,7 +97,7 @@ def test_setting_preprocessing_params_from_validation_set(self): detection_train_params_yolox = { "max_epochs": 1, - "lr_mode": "cosine", + "lr_mode": "CosineLRCallback", "cosine_final_lr_ratio": 0.05, "warmup_bias_lr": 0.0, "warmup_momentum": 0.9, @@ -168,7 +168,7 @@ def test_setting_preprocessing_params_from_checkpoint(self): detection_train_params_yolox = { "max_epochs": 1, - "lr_mode": "cosine", + "lr_mode": "CosineLRCallback", "cosine_final_lr_ratio": 0.05, "warmup_bias_lr": 0.0, "warmup_momentum": 0.9, diff --git a/tests/unit_tests/resume_training_test.py b/tests/unit_tests/resume_training_test.py index 0c3bf69abb..6d6a47c546 100644 --- a/tests/unit_tests/resume_training_test.py +++ b/tests/unit_tests/resume_training_test.py @@ -31,7 +31,7 @@ def test_resume_training(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "cross_entropy", @@ -77,7 +77,7 @@ def test_resume_run_id_training(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "cross_entropy", @@ -142,7 +142,7 @@ def test_resume_external_training(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "cross_entropy", @@ -190,7 +190,7 @@ def test_resume_external_training_same_dir(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "cross_entropy", diff --git a/tests/unit_tests/save_ckpt_test.py b/tests/unit_tests/save_ckpt_test.py index dcb5208744..c292640af9 100644 --- a/tests/unit_tests/save_ckpt_test.py +++ b/tests/unit_tests/save_ckpt_test.py @@ -13,7 +13,7 @@ def setUp(self): "max_epochs": 4, "lr_decay_factor": 0.1, "lr_updates": [4], - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "cross_entropy", diff --git a/tests/unit_tests/train_after_test_test.py b/tests/unit_tests/train_after_test_test.py index 870fa072bf..b2e3158e7e 100644 --- a/tests/unit_tests/train_after_test_test.py +++ b/tests/unit_tests/train_after_test_test.py @@ -20,7 +20,7 @@ def setUp(self) -> None: "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": 
torch.nn.CrossEntropyLoss(), diff --git a/tests/unit_tests/train_logging_test.py b/tests/unit_tests/train_logging_test.py index 759af58988..23451d69d0 100644 --- a/tests/unit_tests/train_logging_test.py +++ b/tests/unit_tests/train_logging_test.py @@ -19,7 +19,7 @@ def test_train_logging(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "cross_entropy", diff --git a/tests/unit_tests/train_with_intialized_param_args_test.py b/tests/unit_tests/train_with_intialized_param_args_test.py index d1ed21f175..40c43bf577 100644 --- a/tests/unit_tests/train_with_intialized_param_args_test.py +++ b/tests/unit_tests/train_with_intialized_param_args_test.py @@ -28,7 +28,7 @@ def test_train_with_external_criterion(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": torch.nn.CrossEntropyLoss(), @@ -52,7 +52,7 @@ def test_train_with_external_optimizer(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "cross_entropy", @@ -148,7 +148,7 @@ def test_train_with_external_metric(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "cross_entropy", @@ -178,7 +178,7 @@ def test_train_with_external_dataloaders(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "cross_entropy", diff --git a/tests/unit_tests/train_with_precise_bn_test.py b/tests/unit_tests/train_with_precise_bn_test.py index 7a2eff2a99..07cd75e6bb 100644 --- a/tests/unit_tests/train_with_precise_bn_test.py +++ b/tests/unit_tests/train_with_precise_bn_test.py @@ -18,7 +18,7 @@ def test_train_with_precise_bn_explicit_size(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "cross_entropy", @@ -47,7 +47,7 @@ def test_train_with_precise_bn_implicit_size(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "cross_entropy", diff --git a/tests/unit_tests/update_param_groups_unit_test.py b/tests/unit_tests/update_param_groups_unit_test.py index f0c85c71a2..bc3a5fc490 100644 --- a/tests/unit_tests/update_param_groups_unit_test.py +++ b/tests/unit_tests/update_param_groups_unit_test.py @@ -34,7 +34,7 @@ def test_lr_scheduling_with_update_param_groups(self): train_params = { "max_epochs": 3, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_updates": [0, 1, 2], "initial_lr": 0.1, "lr_decay_factor": 1, diff --git a/tests/unit_tests/vit_unit_test.py b/tests/unit_tests/vit_unit_test.py index a943671abb..3d34e8e4e4 100644 --- a/tests/unit_tests/vit_unit_test.py +++ b/tests/unit_tests/vit_unit_test.py @@ -15,7 +15,7 @@ def setUp(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "step", + "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "cross_entropy", From 45941b7085c4069b0fcc520646bca9e2b4c8ce37 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Mon, 4 Sep 2023 15:11:11 +0300 Subject: [PATCH 
06/21] update losses --- documentation/source/Checkpoints.md | 2 +- documentation/source/Losses.md | 4 ++-- documentation/source/PhaseCallbacks.md | 2 +- documentation/source/QuickstartBasicToolkit.md | 2 +- documentation/source/Segmentation.md | 2 +- src/super_gradients/common/object_names.py | 16 ---------------- .../ddrnet_classification_example.py | 2 +- .../deci_lab_export_example.py | 2 +- .../examples/early_stop/early_stop_example.py | 2 +- .../loggers_examples/clearml_logger_example.py | 2 +- .../deci_platform_logger_example.py | 2 +- .../examples/quantization/resnet_qat_example.py | 2 +- .../regseg_transfer_learning_example.py | 2 +- .../train_with_test_example.py | 2 +- src/super_gradients/training/losses/dekr_loss.py | 4 ++-- .../training/sg_trainer/sg_trainer.py | 14 +++++++------- tests/end_to_end_tests/cifar_trainer_test.py | 4 ++-- tests/end_to_end_tests/trainer_test.py | 2 +- .../conversion_callback_test.py | 4 ++-- tests/integration_tests/deci_lab_export_test.py | 2 +- .../ema_train_integration_test.py | 2 +- tests/integration_tests/lr_test.py | 2 +- .../integration_tests/pretrained_models_test.py | 8 ++++---- .../coded_qat_launch_test.py | 4 ++-- tests/unit_tests/dataset_statistics_test.py | 2 +- tests/unit_tests/detection_dataset_test.py | 2 +- tests/unit_tests/early_stop_test.py | 2 +- tests/unit_tests/factories_test.py | 2 +- tests/unit_tests/forward_pass_prep_fn_test.py | 2 +- .../initialize_with_dataloaders_test.py | 2 +- tests/unit_tests/load_ema_ckpt_test.py | 2 +- .../local_ckpt_head_replacement_test.py | 2 +- tests/unit_tests/lr_cooldown_test.py | 2 +- tests/unit_tests/lr_warmup_test.py | 10 +++++----- tests/unit_tests/max_batches_loop_break_test.py | 4 ++-- .../unit_tests/optimizer_params_override_test.py | 4 ++-- tests/unit_tests/phase_context_test.py | 2 +- tests/unit_tests/preprocessing_unit_test.py | 4 ++-- tests/unit_tests/resume_training_test.py | 8 ++++---- tests/unit_tests/save_ckpt_test.py | 4 ++-- tests/unit_tests/train_logging_test.py | 2 +- .../train_with_intialized_param_args_test.py | 12 ++++++------ tests/unit_tests/train_with_precise_bn_test.py | 4 ++-- tests/unit_tests/training_params_factory_test.py | 4 ++-- .../unit_tests/update_param_groups_unit_test.py | 2 +- tests/unit_tests/vit_unit_test.py | 2 +- 46 files changed, 77 insertions(+), 93 deletions(-) diff --git a/documentation/source/Checkpoints.md b/documentation/source/Checkpoints.md index 295ab687ca..48847ca871 100644 --- a/documentation/source/Checkpoints.md +++ b/documentation/source/Checkpoints.md @@ -79,7 +79,7 @@ model = models.get(model_name=Models.RESNET18, num_classes=10) train_params = { ... - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "criterion_params": {}, "save_ckpt_epoch_list": [10,15] ... diff --git a/documentation/source/Losses.md b/documentation/source/Losses.md index 3def77c7e5..c5dda6dd1e 100644 --- a/documentation/source/Losses.md +++ b/documentation/source/Losses.md @@ -31,7 +31,7 @@ model = ... train_params = { ... - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "criterion_params": {} ... 
} @@ -42,7 +42,7 @@ Since most IDEs support auto-completion, for your convenience, you can use our o ```python from super_gradients.common.object_names import Losses ``` -Then simply instead of "cross_entropy", use +Then simply instead of "LabelSmoothingCrossEntropyLoss", use ```python Losses.CROSS_ENTROPY ``` diff --git a/documentation/source/PhaseCallbacks.md b/documentation/source/PhaseCallbacks.md index cd8512ef2c..a447843d62 100644 --- a/documentation/source/PhaseCallbacks.md +++ b/documentation/source/PhaseCallbacks.md @@ -237,7 +237,7 @@ valid_dataloader = ... model = ... train_params = { - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "criterion_params": {}, "phase_callbacks": [SaveFirstBatchCallback()], ... diff --git a/documentation/source/QuickstartBasicToolkit.md b/documentation/source/QuickstartBasicToolkit.md index a13ec89cee..4968d72000 100644 --- a/documentation/source/QuickstartBasicToolkit.md +++ b/documentation/source/QuickstartBasicToolkit.md @@ -61,7 +61,7 @@ model = models.get(Models.RESNET18, num_classes=10) training_params = { "max_epochs": 20, "initial_lr": 0.1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "train_metrics_list": [Accuracy(), Top5()], "valid_metrics_list": [Accuracy(), Top5()], "metric_to_watch": "Accuracy", diff --git a/documentation/source/Segmentation.md b/documentation/source/Segmentation.md index 7529dce3b1..3e6ba3a940 100644 --- a/documentation/source/Segmentation.md +++ b/documentation/source/Segmentation.md @@ -148,7 +148,7 @@ train_params = { "lr_warmup_epochs": 5, "multiply_head_lr": 10, "optimizer": "SGD", - "loss": "bce_dice_loss", + "loss": "BCEDiceLoss", "ema": True, "zero_weight_decay_on_bias_and_bn": True, "average_best_models": True, diff --git a/src/super_gradients/common/object_names.py b/src/super_gradients/common/object_names.py index 120b68d854..bc510657a3 100644 --- a/src/super_gradients/common/object_names.py +++ b/src/super_gradients/common/object_names.py @@ -17,22 +17,6 @@ class Losses: DEKR_LOSS = "DEKRLoss" RESCORING_LOSS = "RescoringLoss" - _DEPRECATED_CROSS_ENTROPY = "cross_entropy" - _DEPRECATED_MSE = "mse" - _DEPRECATED_R_SQUARED_LOSS = "r_squared_loss" - _DEPRECATED_SHELFNET_OHEM_LOSS = "shelfnet_ohem_loss" - _DEPRECATED_SHELFNET_SE_LOSS = "shelfnet_se_loss" - _DEPRECATED_YOLOX_LOSS = "yolox_loss" - _DEPRECATED_PPYOLOE_LOSS = "ppyoloe_loss" - _DEPRECATED_YOLOX_FAST_LOSS = "yolox_fast_loss" - _DEPRECATED_SSD_LOSS = "ssd_loss" - _DEPRECATED_STDC_LOSS = "stdc_loss" - _DEPRECATED_BCE_DICE_LOSS = "bce_dice_loss" - _DEPRECATED_KD_LOSS = "kd_loss" - _DEPRECATED_DICE_CE_EDGE_LOSS = "dice_ce_edge_loss" - _DEPRECATED_DEKR_LOSS = "dekr_loss" - _DEPRECATED_RESCORING_LOSS = "rescoring_loss" - class Metrics: """Static class holding all the supported metric names""" diff --git a/src/super_gradients/examples/ddrnet_imagenet/ddrnet_classification_example.py b/src/super_gradients/examples/ddrnet_imagenet/ddrnet_classification_example.py index f1a0e33242..20ceb0ede5 100644 --- a/src/super_gradients/examples/ddrnet_imagenet/ddrnet_classification_example.py +++ b/src/super_gradients/examples/ddrnet_imagenet/ddrnet_classification_example.py @@ -45,7 +45,7 @@ "initial_lr": 0.1 * devices, "optimizer": "SGD", "optimizer_params": {"weight_decay": 0.0001, "momentum": 0.9, "nesterov": True}, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "train_metrics_list": [Accuracy(), Top5()], "valid_metrics_list": [Accuracy(), Top5()], "metric_to_watch": "Accuracy", diff --git 
a/src/super_gradients/examples/deci_lab_export_example/deci_lab_export_example.py b/src/super_gradients/examples/deci_lab_export_example/deci_lab_export_example.py index 1a7ce24eb7..d5dc9cb0b6 100644 --- a/src/super_gradients/examples/deci_lab_export_example/deci_lab_export_example.py +++ b/src/super_gradients/examples/deci_lab_export_example/deci_lab_export_example.py @@ -59,7 +59,7 @@ def main(architecture_name: str): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "train_metrics_list": [Accuracy(), Top5()], diff --git a/src/super_gradients/examples/early_stop/early_stop_example.py b/src/super_gradients/examples/early_stop/early_stop_example.py index ca6553e407..93c691808a 100644 --- a/src/super_gradients/examples/early_stop/early_stop_example.py +++ b/src/super_gradients/examples/early_stop/early_stop_example.py @@ -21,7 +21,7 @@ "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, diff --git a/src/super_gradients/examples/loggers_examples/clearml_logger_example.py b/src/super_gradients/examples/loggers_examples/clearml_logger_example.py index 5727d9fcf2..94228bb964 100644 --- a/src/super_gradients/examples/loggers_examples/clearml_logger_example.py +++ b/src/super_gradients/examples/loggers_examples/clearml_logger_example.py @@ -13,7 +13,7 @@ "lr_decay_factor": 0.1, "lr_mode": "StepLRCallback", "initial_lr": 0.1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "SGD", "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, "train_metrics_list": [Accuracy(), Top5()], diff --git a/src/super_gradients/examples/loggers_examples/deci_platform_logger_example.py b/src/super_gradients/examples/loggers_examples/deci_platform_logger_example.py index 3eed37ed00..981c93ca55 100644 --- a/src/super_gradients/examples/loggers_examples/deci_platform_logger_example.py +++ b/src/super_gradients/examples/loggers_examples/deci_platform_logger_example.py @@ -16,7 +16,7 @@ "lr_decay_factor": 0.1, "lr_mode": "StepLRCallback", "initial_lr": 0.1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "SGD", "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, "train_metrics_list": [Accuracy(), Top5()], diff --git a/src/super_gradients/examples/quantization/resnet_qat_example.py b/src/super_gradients/examples/quantization/resnet_qat_example.py index d9512c74c5..5b213e4359 100644 --- a/src/super_gradients/examples/quantization/resnet_qat_example.py +++ b/src/super_gradients/examples/quantization/resnet_qat_example.py @@ -89,7 +89,7 @@ def sg_selective_qdq_resnet50(): "initial_lr": args.lr, "optimizer": "SGD", "optimizer_params": {"weight_decay": 0.0001, "momentum": 0.9, "nesterov": True}, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "train_metrics_list": [Accuracy(), Top5()], "valid_metrics_list": [Accuracy(), Top5()], "test_metrics_list": [Accuracy(), Top5()], diff --git a/src/super_gradients/examples/regseg_transfer_learning_example/regseg_transfer_learning_example.py b/src/super_gradients/examples/regseg_transfer_learning_example/regseg_transfer_learning_example.py index 0265a31a41..f04adea84c 100644 --- 
a/src/super_gradients/examples/regseg_transfer_learning_example/regseg_transfer_learning_example.py +++ b/src/super_gradients/examples/regseg_transfer_learning_example/regseg_transfer_learning_example.py @@ -45,7 +45,7 @@ "cosine_final_lr_ratio": 0.1, "multiply_head_lr": 10, "optimizer": "SGD", - "loss": "bce_dice_loss", + "loss": "BCEDiceLoss", "ema": True, "zero_weight_decay_on_bias_and_bn": True, "average_best_models": True, diff --git a/src/super_gradients/examples/train_with_test_set/train_with_test_example.py b/src/super_gradients/examples/train_with_test_set/train_with_test_example.py index 6d8eb4c77b..ceb3b6cf42 100644 --- a/src/super_gradients/examples/train_with_test_set/train_with_test_example.py +++ b/src/super_gradients/examples/train_with_test_set/train_with_test_example.py @@ -12,7 +12,7 @@ "lr_decay_factor": 0.1, "lr_mode": "step", "initial_lr": 0.1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "SGD", "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, "train_metrics_list": [Accuracy(), Top5()], diff --git a/src/super_gradients/training/losses/dekr_loss.py b/src/super_gradients/training/losses/dekr_loss.py index 8b2a8ea8b5..c94708e81d 100644 --- a/src/super_gradients/training/losses/dekr_loss.py +++ b/src/super_gradients/training/losses/dekr_loss.py @@ -16,14 +16,14 @@ class DEKRLoss(nn.Module): This loss should be used in conjunction with DEKRTargetsGenerator. """ - def __init__(self, heatmap_loss_factor: float = 1.0, offset_loss_factor: float = 0.1, heatmap_loss: str = "mse"): + def __init__(self, heatmap_loss_factor: float = 1.0, offset_loss_factor: float = 0.1, heatmap_loss: str = "MSE"): """ Instantiate the DEKR loss function. It is two-component loss function, consisting of a heatmap (MSE) loss and an offset (Smooth L1) losses. The total loss is the sum of the two individual losses, weighted by the corresponding factors. :param heatmap_loss_factor: Weighting factor for heatmap loss :param offset_loss_factor: Weighting factor for offset loss - :param heatmap_loss: Type of heatmap loss to use. Can be "mse" (Used in DEKR paper) or "qfl" (Quality Focal Loss). + :param heatmap_loss: Type of heatmap loss to use. Can be "MSE" (Used in DEKR paper) or "qfl" (Quality Focal Loss). We use QFL in our recipe as it produces better results. """ super().__init__() diff --git a/src/super_gradients/training/sg_trainer/sg_trainer.py b/src/super_gradients/training/sg_trainer/sg_trainer.py index 7a6d541bb9..7417a44082 100755 --- a/src/super_gradients/training/sg_trainer/sg_trainer.py +++ b/src/super_gradients/training/sg_trainer/sg_trainer.py @@ -856,13 +856,13 @@ def train( Loss function for training. One of SuperGradient's built in options: - "cross_entropy": LabelSmoothingCrossEntropyLoss, - "mse": MSELoss, - "r_squared_loss": RSquaredLoss, - "detection_loss": YoLoV3DetectionLoss, - "shelfnet_ohem_loss": ShelfNetOHEMLoss, - "shelfnet_se_loss": ShelfNetSemanticEncodingLoss, - "ssd_loss": SSDLoss, + - LabelSmoothingCrossEntropyLoss, + - MSELoss, + - RSquaredLoss, + - YoLoV3DetectionLoss, + - ShelfNetOHEMLoss, + - ShelfNetSemanticEncodingLoss, + - SSDLoss, or user defined nn.module loss function. 
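For context, here is a minimal sketch (illustrative only, not part of this patch) of a `train_params` dictionary written against the renamed identifiers listed in the docstring above; the model, metrics, and experiment name are placeholder assumptions, and the dataloaders are deliberately left elided:

```python
from super_gradients import Trainer
from super_gradients.training import models
from super_gradients.training.metrics import Accuracy, Top5

# Placeholder setup for illustration; any classification model and dataloaders would do.
trainer = Trainer(experiment_name="renamed_objects_example")
model = models.get("resnet18", num_classes=10)

train_params = {
    "max_epochs": 2,
    "lr_mode": "StepLRCallback",                 # previously "step"
    "lr_updates": [1],
    "lr_decay_factor": 0.1,
    "warmup_mode": "EpochStepWarmupLRCallback",  # previously "linear_epoch_step"
    "lr_warmup_epochs": 0,
    "initial_lr": 0.1,
    "loss": "LabelSmoothingCrossEntropyLoss",    # previously "cross_entropy"
    "optimizer": "SGD",
    "train_metrics_list": [Accuracy(), Top5()],
    "valid_metrics_list": [Accuracy(), Top5()],
    "metric_to_watch": "Accuracy",
    "greater_metric_to_watch_is_better": True,
}

# trainer.train(model=model, training_params=train_params,
#               train_loader=..., valid_loader=...)  # dataloaders omitted here
```

As the hunks below show, existing recipes and tests only need the string values swapped; the surrounding keys and structure are unchanged.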
diff --git a/tests/end_to_end_tests/cifar_trainer_test.py b/tests/end_to_end_tests/cifar_trainer_test.py index 7b91a59cba..d95f28fe68 100644 --- a/tests/end_to_end_tests/cifar_trainer_test.py +++ b/tests/end_to_end_tests/cifar_trainer_test.py @@ -25,7 +25,7 @@ def test_train_cifar10_dataloader(self): training_params={ "max_epochs": 1, "initial_lr": 0.1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "train_metrics_list": ["Accuracy"], "valid_metrics_list": ["Accuracy"], "metric_to_watch": "Accuracy", @@ -44,7 +44,7 @@ def test_train_cifar100_dataloader(self): training_params={ "max_epochs": 1, "initial_lr": 0.1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "train_metrics_list": ["Accuracy"], "valid_metrics_list": ["Accuracy"], "metric_to_watch": "Accuracy", diff --git a/tests/end_to_end_tests/trainer_test.py b/tests/end_to_end_tests/trainer_test.py index f1f57eee86..228e9ec372 100644 --- a/tests/end_to_end_tests/trainer_test.py +++ b/tests/end_to_end_tests/trainer_test.py @@ -26,7 +26,7 @@ def setUp(cls): "initial_lr": 0.1, "lr_updates": [4], "lr_mode": "StepLRCallback", - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "train_metrics_list": [Accuracy(), Top5()], "valid_metrics_list": [Accuracy(), Top5()], "metric_to_watch": "Accuracy", diff --git a/tests/integration_tests/conversion_callback_test.py b/tests/integration_tests/conversion_callback_test.py index fe2d6bc4cb..c172318f93 100644 --- a/tests/integration_tests/conversion_callback_test.py +++ b/tests/integration_tests/conversion_callback_test.py @@ -57,7 +57,7 @@ def test_classification_architectures(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "train_metrics_list": [Accuracy(), Top5()], @@ -90,7 +90,7 @@ def get_architecture_custom_config(architecture_name: str): } elif re.search(r"regseg", architecture_name): return { - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", } else: raise Exception("You tried to run a conversion test on an unknown architecture") diff --git a/tests/integration_tests/deci_lab_export_test.py b/tests/integration_tests/deci_lab_export_test.py index 8f18dfcd29..a5a116aaee 100644 --- a/tests/integration_tests/deci_lab_export_test.py +++ b/tests/integration_tests/deci_lab_export_test.py @@ -47,7 +47,7 @@ def test_train_with_deci_lab_integration(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": self.optimizer, "criterion_params": {}, "train_metrics_list": [Accuracy(), Top5()], diff --git a/tests/integration_tests/ema_train_integration_test.py b/tests/integration_tests/ema_train_integration_test.py index 50c778f5c5..990a698915 100644 --- a/tests/integration_tests/ema_train_integration_test.py +++ b/tests/integration_tests/ema_train_integration_test.py @@ -53,7 +53,7 @@ def _train(self, ema_params): "lr_decay_factor": 0.1, "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "ema": True, diff --git a/tests/integration_tests/lr_test.py b/tests/integration_tests/lr_test.py index 20a9afb2a6..836b053cae 100644 --- a/tests/integration_tests/lr_test.py +++ b/tests/integration_tests/lr_test.py @@ -19,7 +19,7 @@ def setUp(cls): "max_epochs": 1, "silent_mode": True, 
"initial_lr": 0.1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "train_metrics_list": [Accuracy(), Top5()], "valid_metrics_list": [Accuracy(), Top5()], "metric_to_watch": "Accuracy", diff --git a/tests/integration_tests/pretrained_models_test.py b/tests/integration_tests/pretrained_models_test.py index dfe37846cc..269ec484f0 100644 --- a/tests/integration_tests/pretrained_models_test.py +++ b/tests/integration_tests/pretrained_models_test.py @@ -86,7 +86,7 @@ def setUp(self) -> None: "lr_updates": [1], "lr_decay_factor": 0.1, "initial_lr": 0.6, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "lr_mode": "StepLRCallback", "optimizer_params": {"weight_decay": 0.000, "momentum": 0.9}, "train_metrics_list": [Accuracy()], @@ -133,7 +133,7 @@ def setUp(self) -> None: "cosine_final_lr_ratio": 0.01, "lr_warmup_epochs": 3, "batch_accumulate": 1, - "loss": "ssd_loss", + "loss": "SSDLoss", "criterion_params": {"dboxes": ssd_dboxes}, "optimizer": "SGD", "warmup_momentum": 0.8, @@ -150,7 +150,7 @@ def setUp(self) -> None: "warmup_bias_lr": 0.0, "warmup_momentum": 0.9, "initial_lr": 0.02, - "loss": "yolox_loss", + "loss": "YoloXDetectionLoss", "criterion_params": {"strides": [8, 16, 32], "num_classes": 5}, # output strides of all yolo outputs "train_metrics_list": [], "valid_metrics_list": [DetectionMetrics(post_prediction_callback=YoloXPostPredictionCallback(), normalize_targets=True, num_cls=5)], @@ -246,7 +246,7 @@ def setUp(self) -> None: self.regseg_transfer_segmentation_train_params = { "max_epochs": 3, "initial_lr": 1e-2, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "lr_mode": "PolyLRCallback", "ema": True, # unlike the paper (not specified in paper) "optimizer": "SGD", diff --git a/tests/recipe_training_tests/coded_qat_launch_test.py b/tests/recipe_training_tests/coded_qat_launch_test.py index 87728c7dd1..f6abb8bac2 100644 --- a/tests/recipe_training_tests/coded_qat_launch_test.py +++ b/tests/recipe_training_tests/coded_qat_launch_test.py @@ -20,7 +20,7 @@ def test_qat_launch(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, @@ -81,7 +81,7 @@ def test_ptq_launch(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, diff --git a/tests/unit_tests/dataset_statistics_test.py b/tests/unit_tests/dataset_statistics_test.py index a475bc1277..875ab475cf 100644 --- a/tests/unit_tests/dataset_statistics_test.py +++ b/tests/unit_tests/dataset_statistics_test.py @@ -26,7 +26,7 @@ def test_dataset_statistics_tensorboard_logger(self): "max_epochs": 1, # we dont really need the actual training to run "lr_mode": "CosineLRCallback", "initial_lr": 0.01, - "loss": "yolox_loss", + "loss": "YoloXDetectionLoss", "criterion_params": {"strides": [8, 16, 32], "num_classes": 80}, "dataset_statistics": True, "launch_tensorboard": True, diff --git a/tests/unit_tests/detection_dataset_test.py b/tests/unit_tests/detection_dataset_test.py index 5f418635ed..55bd3767db 100644 --- a/tests/unit_tests/detection_dataset_test.py +++ b/tests/unit_tests/detection_dataset_test.py @@ -173,7 +173,7 @@ def test_coco_detection_metrics_with_classwise_ap(self): 
"warmup_bias_lr": 0.0, "warmup_momentum": 0.9, "initial_lr": 0.02, - "loss": "yolox_loss", + "loss": "YoloXDetectionLoss", "mixed_precision": False, "criterion_params": {"strides": [8, 16, 32], "num_classes": 80}, # output strides of all yolo outputs "train_metrics_list": [], diff --git a/tests/unit_tests/early_stop_test.py b/tests/unit_tests/early_stop_test.py index 22d1777637..fa73207f19 100644 --- a/tests/unit_tests/early_stop_test.py +++ b/tests/unit_tests/early_stop_test.py @@ -52,7 +52,7 @@ def setUp(self) -> None: "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, diff --git a/tests/unit_tests/factories_test.py b/tests/unit_tests/factories_test.py index 22a44e469d..0897ddb3ab 100644 --- a/tests/unit_tests/factories_test.py +++ b/tests/unit_tests/factories_test.py @@ -24,7 +24,7 @@ def test_training_with_factories(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "torch.optim.ASGD", # use an optimizer by factory "criterion_params": {}, "optimizer_params": {"lambd": 0.0001, "alpha": 0.75}, diff --git a/tests/unit_tests/forward_pass_prep_fn_test.py b/tests/unit_tests/forward_pass_prep_fn_test.py index b15c0fffeb..2832686d25 100644 --- a/tests/unit_tests/forward_pass_prep_fn_test.py +++ b/tests/unit_tests/forward_pass_prep_fn_test.py @@ -42,7 +42,7 @@ def test_resizing_with_forward_pass_prep_fn(self): "lr_cooldown_epochs": 2, "lr_warmup_epochs": 3, "initial_lr": 1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, diff --git a/tests/unit_tests/initialize_with_dataloaders_test.py b/tests/unit_tests/initialize_with_dataloaders_test.py index cfecb17110..eb21132a06 100644 --- a/tests/unit_tests/initialize_with_dataloaders_test.py +++ b/tests/unit_tests/initialize_with_dataloaders_test.py @@ -37,7 +37,7 @@ def test_train_with_dataloaders(self): "lr_decay_factor": 0.01, "lr_mode": "StepLRCallback", "initial_lr": 0.01, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "SGD", "optimizer_params": {"weight_decay": 1e-5, "momentum": 0.9}, "train_metrics_list": [Accuracy()], diff --git a/tests/unit_tests/load_ema_ckpt_test.py b/tests/unit_tests/load_ema_ckpt_test.py index d310e1ced0..c49f38f988 100644 --- a/tests/unit_tests/load_ema_ckpt_test.py +++ b/tests/unit_tests/load_ema_ckpt_test.py @@ -26,7 +26,7 @@ def setUp(self) -> None: "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, diff --git a/tests/unit_tests/local_ckpt_head_replacement_test.py b/tests/unit_tests/local_ckpt_head_replacement_test.py index 6f8053c2d7..e45788860e 100644 --- a/tests/unit_tests/local_ckpt_head_replacement_test.py +++ b/tests/unit_tests/local_ckpt_head_replacement_test.py @@ -17,7 +17,7 @@ def test_local_ckpt_head_replacement(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, 
"momentum": 0.9}, diff --git a/tests/unit_tests/lr_cooldown_test.py b/tests/unit_tests/lr_cooldown_test.py index b8125b4da5..ee0aedc0eb 100644 --- a/tests/unit_tests/lr_cooldown_test.py +++ b/tests/unit_tests/lr_cooldown_test.py @@ -23,7 +23,7 @@ def test_lr_cooldown_with_lr_scheduling(self): "lr_cooldown_epochs": 2, "lr_warmup_epochs": 3, "initial_lr": 1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, diff --git a/tests/unit_tests/lr_warmup_test.py b/tests/unit_tests/lr_warmup_test.py index a1a95b82c3..36fbfd570d 100644 --- a/tests/unit_tests/lr_warmup_test.py +++ b/tests/unit_tests/lr_warmup_test.py @@ -61,7 +61,7 @@ def test_lr_warmup(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 3, "initial_lr": 1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, @@ -97,7 +97,7 @@ def test_lr_warmup_with_lr_scheduling(self): "lr_mode": "CosineLRCallback", "lr_warmup_epochs": 3, "initial_lr": 1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, @@ -143,7 +143,7 @@ def test_warmup_linear_batch_step(self): "warmup_mode": "BatchStepLinearWarmupLRCallback", "lr_warmup_steps": lr_warmup_steps, "initial_lr": 1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, @@ -190,7 +190,7 @@ def test_warmup_linear_epoch_step(self): "lr_warmup_epochs": 3, "initial_lr": 1, "warmup_initial_lr": 4.0, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, @@ -226,7 +226,7 @@ def test_custom_lr_warmup(self): "lr_decay_factor": 0.1, "lr_mode": "StepLRCallback", "lr_warmup_epochs": 3, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, diff --git a/tests/unit_tests/max_batches_loop_break_test.py b/tests/unit_tests/max_batches_loop_break_test.py index 2c71a02911..c00ca8fe4c 100644 --- a/tests/unit_tests/max_batches_loop_break_test.py +++ b/tests/unit_tests/max_batches_loop_break_test.py @@ -26,7 +26,7 @@ def test_max_train_batches_loop_break(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, @@ -61,7 +61,7 @@ def test_max_valid_batches_loop_break(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, diff --git a/tests/unit_tests/optimizer_params_override_test.py b/tests/unit_tests/optimizer_params_override_test.py index d63fa3677e..b8aac3dd46 100644 --- a/tests/unit_tests/optimizer_params_override_test.py +++ b/tests/unit_tests/optimizer_params_override_test.py @@ -19,7 +19,7 @@ def test_optimizer_params_partial_override(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, 
"initial_lr": 0.1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"momentum": 0.9}, @@ -48,7 +48,7 @@ def test_optimizer_params_full_override(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "zero_weight_decay_on_bias_and_bn": True, diff --git a/tests/unit_tests/phase_context_test.py b/tests/unit_tests/phase_context_test.py index 2f8199dead..ba680f3822 100644 --- a/tests/unit_tests/phase_context_test.py +++ b/tests/unit_tests/phase_context_test.py @@ -31,7 +31,7 @@ def context_information_in_train_test(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, diff --git a/tests/unit_tests/preprocessing_unit_test.py b/tests/unit_tests/preprocessing_unit_test.py index 60df92f286..1e98d694c9 100644 --- a/tests/unit_tests/preprocessing_unit_test.py +++ b/tests/unit_tests/preprocessing_unit_test.py @@ -102,7 +102,7 @@ def test_setting_preprocessing_params_from_validation_set(self): "warmup_bias_lr": 0.0, "warmup_momentum": 0.9, "initial_lr": 0.02, - "loss": "yolox_loss", + "loss": "YoloXDetectionLoss", "criterion_params": {"strides": [8, 16, 32], "num_classes": 80}, # output strides of all yolo outputs "train_metrics_list": [], "valid_metrics_list": [DetectionMetrics(post_prediction_callback=YoloXPostPredictionCallback(), normalize_targets=True, num_cls=80)], @@ -173,7 +173,7 @@ def test_setting_preprocessing_params_from_checkpoint(self): "warmup_bias_lr": 0.0, "warmup_momentum": 0.9, "initial_lr": 0.02, - "loss": "yolox_loss", + "loss": "YoloXDetectionLoss", "criterion_params": {"strides": [8, 16, 32], "num_classes": 80}, # output strides of all yolo outputs "train_metrics_list": [], "valid_metrics_list": [DetectionMetrics(post_prediction_callback=YoloXPostPredictionCallback(), normalize_targets=True, num_cls=80)], diff --git a/tests/unit_tests/resume_training_test.py b/tests/unit_tests/resume_training_test.py index 6d6a47c546..ee7bab7076 100644 --- a/tests/unit_tests/resume_training_test.py +++ b/tests/unit_tests/resume_training_test.py @@ -34,7 +34,7 @@ def test_resume_training(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, @@ -80,7 +80,7 @@ def test_resume_run_id_training(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, @@ -145,7 +145,7 @@ def test_resume_external_training(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, @@ -193,7 +193,7 @@ def test_resume_external_training_same_dir(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "SGD", 
"criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, diff --git a/tests/unit_tests/save_ckpt_test.py b/tests/unit_tests/save_ckpt_test.py index c292640af9..06d4c539c6 100644 --- a/tests/unit_tests/save_ckpt_test.py +++ b/tests/unit_tests/save_ckpt_test.py @@ -16,12 +16,12 @@ def setUp(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, "save_ckpt_epoch_list": [1, 3], - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "train_metrics_list": [Accuracy(), Top5()], "valid_metrics_list": [Accuracy(), Top5()], "metric_to_watch": "Accuracy", diff --git a/tests/unit_tests/train_logging_test.py b/tests/unit_tests/train_logging_test.py index 23451d69d0..19cb4fe488 100644 --- a/tests/unit_tests/train_logging_test.py +++ b/tests/unit_tests/train_logging_test.py @@ -22,7 +22,7 @@ def test_train_logging(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, diff --git a/tests/unit_tests/train_with_intialized_param_args_test.py b/tests/unit_tests/train_with_intialized_param_args_test.py index 40c43bf577..52e18de2d4 100644 --- a/tests/unit_tests/train_with_intialized_param_args_test.py +++ b/tests/unit_tests/train_with_intialized_param_args_test.py @@ -55,7 +55,7 @@ def test_train_with_external_optimizer(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": optimizer, "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, @@ -81,7 +81,7 @@ def test_train_with_external_scheduler(self): "phase_callbacks": phase_callbacks, "lr_warmup_epochs": 0, "initial_lr": lr, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": optimizer, "criterion_params": {}, "train_metrics_list": [Accuracy(), Top5()], @@ -103,7 +103,7 @@ def test_train_with_external_scheduler_class(self): "max_epochs": 2, "lr_warmup_epochs": 0, "initial_lr": 0.3, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": optimizer, "criterion_params": {}, "train_metrics_list": [Accuracy(), Top5()], @@ -128,7 +128,7 @@ def test_train_with_reduce_on_plateau(self): "phase_callbacks": phase_callbacks, "lr_warmup_epochs": 0, "initial_lr": lr, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": optimizer, "criterion_params": {}, "train_metrics_list": [Accuracy(), Top5()], @@ -151,7 +151,7 @@ def test_train_with_external_metric(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, @@ -181,7 +181,7 @@ def test_train_with_external_dataloaders(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, diff --git a/tests/unit_tests/train_with_precise_bn_test.py b/tests/unit_tests/train_with_precise_bn_test.py 
index 07cd75e6bb..20c2974c2e 100644 --- a/tests/unit_tests/train_with_precise_bn_test.py +++ b/tests/unit_tests/train_with_precise_bn_test.py @@ -21,7 +21,7 @@ def test_train_with_precise_bn_explicit_size(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, @@ -50,7 +50,7 @@ def test_train_with_precise_bn_implicit_size(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, diff --git a/tests/unit_tests/training_params_factory_test.py b/tests/unit_tests/training_params_factory_test.py index b574cce8a2..5e30984841 100644 --- a/tests/unit_tests/training_params_factory_test.py +++ b/tests/unit_tests/training_params_factory_test.py @@ -5,12 +5,12 @@ class TrainingParamsTest(unittest.TestCase): def test_get_train_params(self): train_params = training_hyperparams.coco2017_yolox_train_params() - self.assertTrue(train_params["loss"] == "yolox_loss") + self.assertTrue(train_params["loss"] == "YoloXDetectionLoss") self.assertTrue(train_params["max_epochs"] == 300) def test_get_train_params_with_overrides(self): train_params = training_hyperparams.coco2017_yolox_train_params(overriding_params={"max_epochs": 5}) - self.assertTrue(train_params["loss"] == "yolox_loss") + self.assertTrue(train_params["loss"] == "YoloXDetectionLoss") self.assertTrue(train_params["max_epochs"] == 5) diff --git a/tests/unit_tests/update_param_groups_unit_test.py b/tests/unit_tests/update_param_groups_unit_test.py index bc3a5fc490..3a483de818 100644 --- a/tests/unit_tests/update_param_groups_unit_test.py +++ b/tests/unit_tests/update_param_groups_unit_test.py @@ -38,7 +38,7 @@ def test_lr_scheduling_with_update_param_groups(self): "lr_updates": [0, 1, 2], "initial_lr": 0.1, "lr_decay_factor": 1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, diff --git a/tests/unit_tests/vit_unit_test.py b/tests/unit_tests/vit_unit_test.py index 3d34e8e4e4..b005653438 100644 --- a/tests/unit_tests/vit_unit_test.py +++ b/tests/unit_tests/vit_unit_test.py @@ -18,7 +18,7 @@ def setUp(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "cross_entropy", + "loss": "LabelSmoothingCrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, From 9713d6f9c4fc5653e932348d7199ca87a8a28311 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Mon, 4 Sep 2023 15:29:39 +0300 Subject: [PATCH 07/21] undo unwanted change --- src/super_gradients/common/registry/registry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/super_gradients/common/registry/registry.py b/src/super_gradients/common/registry/registry.py index 4c154dc302..e303f3766f 100644 --- a/src/super_gradients/common/registry/registry.py +++ b/src/super_gradients/common/registry/registry.py @@ -84,7 +84,7 @@ def warn_if_deprecated(name: str, registry: dict): METRICS = {} register_metric = create_register_decorator(registry=METRICS) -LOSSES = {Losses.MSE: nn.MSELoss} +LOSSES = {} register_loss = create_register_decorator(registry=LOSSES)
register_loss(name=Losses.MSE, deprecated_name="mse")(nn.MSELoss) # Register manually to benefit from deprecated logic From 964a8bd1e89c2e9ce80869f4af135f6004d5660b Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Mon, 4 Sep 2023 16:11:17 +0300 Subject: [PATCH 08/21] go over the losses again --- .circleci/config.yml | 2 +- Makefile | 2 +- .../source/Example_Classification.md | 2 +- documentation/source/Losses.md | 28 +++++++++---------- documentation/source/ObjectDetection.md | 4 +-- documentation/source/PhaseCallbacks.md | 2 +- documentation/source/PoseEstimation.md | 2 +- documentation/source/Segmentation.md | 6 ++-- documentation/source/configuration_files.md | 2 +- documentation/source/ptq_qat.md | 2 +- .../recipes/cityscapes_al_ddrnet.yaml | 2 +- .../recipes/cityscapes_ddrnet.yaml | 2 +- .../recipes/cityscapes_kd_base.yaml | 2 +- .../recipes/cityscapes_pplite_seg50.yaml | 2 +- .../recipes/cityscapes_pplite_seg75.yaml | 2 +- .../recipes/cityscapes_regseg48.yaml | 2 +- .../recipes/cityscapes_segformer.yaml | 2 +- .../recipes/cityscapes_stdc_seg50.yaml | 2 +- .../recipes/cityscapes_stdc_seg75.yaml | 2 +- .../recipes/imagenet_resnet50_kd.yaml | 2 +- .../default_quantization_params.yaml | 2 +- .../recipes/roboflow_ppyoloe.yaml | 2 +- .../recipes/roboflow_yolo_nas_m.yaml | 2 +- .../recipes/roboflow_yolo_nas_s.yaml | 2 +- .../recipes/supervisely_unet.yaml | 2 +- .../cifar10_resnet_train_params.yaml | 2 +- .../coco2017_dekr_pose_train_params.yaml | 2 +- .../coco2017_ppyoloe_train_params.yaml | 2 +- .../coco2017_rescoring_train_params.yaml | 2 +- ...17_ssd_lite_mobilenet_v2_train_params.yaml | 2 +- .../coco2017_yolo_nas_train_params.yaml | 2 +- .../coco2017_yolox_train_params.yaml | 2 +- ...segmentation_shelfnet_lw_train_params.yaml | 2 +- .../imagenet_efficientnet_train_params.yaml | 2 +- .../imagenet_mobilenetv2_train_params.yaml | 2 +- .../imagenet_mobilenetv3_train_params.yaml | 2 +- .../imagenet_regnetY_train_params.yaml | 2 +- .../imagenet_repvgg_train_params.yaml | 2 +- .../imagenet_resnet50_kd_train_params.yaml | 2 +- .../imagenet_resnet50_train_params.yaml | 2 +- .../imagenet_vit_train_params.yaml | 2 +- .../training/losses/seg_kd_loss.py | 2 +- 42 files changed, 58 insertions(+), 58 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index de0afea275..1d27ab4fd2 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -547,7 +547,7 @@ jobs: python3.8 src/super_gradients/train_from_recipe.py --config-name=coco2017_pose_dekr_w32_no_dc experiment_name=shortened_coco2017_pose_dekr_w32_ap_test batch_size=4 val_batch_size=8 epochs=1 training_hyperparams.lr_warmup_steps=0 training_hyperparams.average_best_models=False training_hyperparams.max_train_batches=1000 training_hyperparams.max_valid_batches=100 multi_gpu=DDP num_gpus=4 python3.8 src/super_gradients/train_from_recipe.py --config-name=cifar10_resnet experiment_name=shortened_cifar10_resnet_accuracy_test epochs=100 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 python3.8 src/super_gradients/examples/convert_recipe_example/convert_recipe_example.py --config-name=cifar10_conversion_params experiment_name=shortened_cifar10_resnet_accuracy_test - python3.8 src/super_gradients/train_from_recipe.py --config-name=coco2017_yolox experiment_name=shortened_coco2017_yolox_n_map_test architecture=yolox_n training_hyperparams.loss=yolox_fast_loss epochs=10 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 + python3.8 src/super_gradients/train_from_recipe.py 
--config-name=coco2017_yolox experiment_name=shortened_coco2017_yolox_n_map_test architecture=yolox_n training_hyperparams.loss=YoloXFastDetectionLoss epochs=10 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 python3.8 src/super_gradients/train_from_recipe.py --config-name=cityscapes_regseg48 experiment_name=shortened_cityscapes_regseg48_iou_test epochs=10 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 coverage run --source=super_gradients -m unittest tests/deci_core_recipe_test_suite_runner.py diff --git a/Makefile b/Makefile index 255938e51d..6cf22249de 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,7 @@ yolo_nas_integration_tests: recipe_accuracy_tests: python src/super_gradients/train_from_recipe.py --config-name=coco2017_pose_dekr_w32_no_dc experiment_name=shortened_coco2017_pose_dekr_w32_ap_test epochs=1 batch_size=4 val_batch_size=8 training_hyperparams.lr_warmup_steps=0 training_hyperparams.average_best_models=False training_hyperparams.max_train_batches=1000 training_hyperparams.max_valid_batches=100 multi_gpu=DDP num_gpus=4 python src/super_gradients/train_from_recipe.py --config-name=cifar10_resnet experiment_name=shortened_cifar10_resnet_accuracy_test epochs=100 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 - python src/super_gradients/train_from_recipe.py --config-name=coco2017_yolox experiment_name=shortened_coco2017_yolox_n_map_test epochs=10 architecture=yolox_n training_hyperparams.loss=yolox_fast_loss training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 + python src/super_gradients/train_from_recipe.py --config-name=coco2017_yolox experiment_name=shortened_coco2017_yolox_n_map_test epochs=10 architecture=yolox_n training_hyperparams.loss=YoloXFastDetectionLoss training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 python src/super_gradients/train_from_recipe.py --config-name=cityscapes_regseg48 experiment_name=shortened_cityscapes_regseg48_iou_test epochs=10 training_hyperparams.average_best_models=False multi_gpu=DDP num_gpus=4 python src/super_gradients/examples/convert_recipe_example/convert_recipe_example.py --config-name=cifar10_conversion_params experiment_name=shortened_cifar10_resnet_accuracy_test coverage run --source=super_gradients -m unittest tests/deci_core_recipe_test_suite_runner.py diff --git a/documentation/source/Example_Classification.md b/documentation/source/Example_Classification.md index b332177134..699b80bf70 100644 --- a/documentation/source/Example_Classification.md +++ b/documentation/source/Example_Classification.md @@ -318,7 +318,7 @@ Output (Training parameters): 'launch_tensorboard': False, 'load_opt_params': True, 'log_installed_packages': True, - 'loss': 'cross_entropy', + 'loss': "LabelSmoothingCrossEntropyLoss", 'lr_cooldown_epochs': 0, 'lr_decay_factor': 0.1, 'lr_mode': 'step', diff --git a/documentation/source/Losses.md b/documentation/source/Losses.md index c5dda6dd1e..e7fd837566 100644 --- a/documentation/source/Losses.md +++ b/documentation/source/Losses.md @@ -2,18 +2,18 @@ SuperGradients can support any PyTorch-based loss function. 
Additionally, multiple Loss function implementations for various tasks are also supported: - cross_entropy - mse - r_squared_loss - shelfnet_ohem_loss - shelfnet_se_loss - yolox_loss - yolox_fast_loss - ssd_loss - stdc_loss - bce_dice_loss - kd_loss - dice_ce_edge_loss + LabelSmoothingCrossEntropyLoss + MSE + RSquaredLoss + ShelfNetOHEMLoss + ShelfNetSemanticEncodingLoss + YoloXDetectionLoss + YoloXFastDetectionLoss + SSDLoss + STDCLoss + BCEDiceLoss + KDLogitsLoss + DiceCEEdgeLoss All the above, are just string aliases for the underlying torch.nn.Module classes, implementing the specified loss functions. @@ -54,14 +54,14 @@ When doing so, in your `my_training_hyperparams.yaml` file: ```yaml ... -loss: yolox_loss +loss: YoloXDetectionLoss criterion_params: strides: [8, 16, 32] # output strides of all yolo outputs num_classes: 80 ``` -Note that two `training_params` parameters define the loss function: `loss` which defines the type of the loss, and`criterion_params` dictionary which will be unpacked to the underlying `yolox_loss` class constructor. +Note that two `training_params` parameters define the loss function: `loss` which defines the type of the loss, and`criterion_params` dictionary which will be unpacked to the underlying `YoloXDetectionLoss` class constructor. ## Passing Instantiated nn.Module Objects as Loss Functions diff --git a/documentation/source/ObjectDetection.md b/documentation/source/ObjectDetection.md index 92292ec430..230554e451 100644 --- a/documentation/source/ObjectDetection.md +++ b/documentation/source/ObjectDetection.md @@ -12,8 +12,8 @@ In SuperGradients, we aim to collect such models and make them very convenient a | Model | Yaml | Model class | Loss Class | NMS Callback | |----------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| [SSD](https://arxiv.org/abs/1512.02325) | [ssd_lite_mobilenetv2_arch_params](https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/recipes/arch_params/ssd_lite_mobilenetv2_arch_params.yaml) | [SSDLiteMobileNetV2](https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/models/detection_models/ssd.py) | [SSDLoss](https://docs.deci.ai/super-gradients/docstring/training/losses.html#training.losses.ssd_loss.SSDLoss) | [SSDPostPredictCallback](https://docs.deci.ai/super-gradients/docstring/training/utils.html#training.utils.ssd_utils.SSDPostPredictCallback) | -| [YOLOX](https://arxiv.org/abs/2107.08430) | [yolox_s_arch_params](https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml) | [YoloX_S](https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/models/detection_models/yolox.py) | [YoloXFastDetectionLoss](https://docs.deci.ai/super-gradients/docstring/training/losses.html#training.losses.yolox_loss.YoloXFastDetectionLoss) | 
[YoloXPostPredictionCallback](https://docs.deci.ai/super-gradients/docstring/training/models.html#training.models.detection_models.yolo_base.YoloXPostPredictionCallback) | +| [SSD](https://arxiv.org/abs/1512.02325) | [ssd_lite_mobilenetv2_arch_params](https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/recipes/arch_params/ssd_lite_mobilenetv2_arch_params.yaml) | [SSDLiteMobileNetV2](https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/models/detection_models/ssd.py) | [SSDLoss](https://docs.deci.ai/super-gradients/docstring/training/losses.html#training.losses.SSDLoss.SSDLoss) | [SSDPostPredictCallback](https://docs.deci.ai/super-gradients/docstring/training/utils.html#training.utils.ssd_utils.SSDPostPredictCallback) | +| [YOLOX](https://arxiv.org/abs/2107.08430) | [yolox_s_arch_params](https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml) | [YoloX_S](https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/models/detection_models/yolox.py) | [YoloXFastDetectionLoss](https://docs.deci.ai/super-gradients/docstring/training/losses.html#training.losses.YoloXDetectionLoss.YoloXFastDetectionLoss) | [YoloXPostPredictionCallback](https://docs.deci.ai/super-gradients/docstring/training/models.html#training.models.detection_models.yolo_base.YoloXPostPredictionCallback) | | [PPYolo](https://arxiv.org/abs/2007.12099) | [ppyoloe_arch_params](https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/recipes/arch_params/ppyoloe_arch_params.yaml) | [PPYoloE](https://docs.deci.ai/super-gradients/docstring/training/models.html#training.models.detection_models.pp_yolo_e.pp_yolo_e.PPYoloE) | [PPYoloELoss](https://docs.deci.ai/super-gradients/docstring/training/losses.html#training.losses.ppyolo_loss.PPYoloELoss) | [PPYoloEPostPredictionCallback](https://docs.deci.ai/super-gradients/docstring/training/models.html#training.models.detection_models.pp_yolo_e.post_prediction_callback.PPYoloEPostPredictionCallback) | | YoloNAS | [yolo_nas_s_arch_params](https://github.com/Deci-AI/super-gradients/blob/e1db4d99492a25f8e65b5d3e17a6ff2672c5467b/src/super_gradients/recipes/arch_params/yolo_nas_s_arch_params.yaml) | [Yolo NAS S](https://github.com/Deci-AI/super-gradients/blob/e1db4d99492a25f8e65b5d3e17a6ff2672c5467b/src/super_gradients/training/models/detection_models/yolo_nas/yolo_nas_variants.py#L16) | [PPYoloELoss](https://docs.deci.ai/super-gradients/docstring/training/losses.html#training.losses.ppyolo_loss.PPYoloELoss) | [PPYoloEPostPredictionCallback](https://docs.deci.ai/super-gradients/docstring/training/models.html#training.models.detection_models.pp_yolo_e.post_prediction_callback.PPYoloEPostPredictionCallback) | diff --git a/documentation/source/PhaseCallbacks.md b/documentation/source/PhaseCallbacks.md index a447843d62..77596fd743 100644 --- a/documentation/source/PhaseCallbacks.md +++ b/documentation/source/PhaseCallbacks.md @@ -30,7 +30,7 @@ off augmentations and incorporate L1 loss starting from epoch 285: max_epochs: 300 ... -loss: yolox_loss +loss: YoloXDetectionLoss ... 
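Reviewer note (illustrative only, not part of the patch): the renamed class-name aliases are consumed in `training_params` exactly like the old lowercase strings, as the updated unit tests above show. A minimal sketch of a training run using the new loss alias follows; the model name "resnet18" and the "cifar10_train"/"cifar10_val" dataloaders are placeholders chosen for the example, not something this patch touches.

```python
# Minimal sketch, assuming a standard SuperGradients setup; names of model and
# dataloaders below are example placeholders, not part of this patch.
from super_gradients import Trainer
from super_gradients.training import models, dataloaders

trainer = Trainer(experiment_name="loss_alias_example")
model = models.get("resnet18", num_classes=10)

training_params = {
    "max_epochs": 2,
    "lr_mode": "StepLRCallback",
    "lr_updates": [1],
    "lr_decay_factor": 0.1,
    "initial_lr": 0.1,
    # New class-name alias; the deprecated "cross_entropy" is expected to still
    # resolve, with a deprecation warning pointing to this name.
    "loss": "LabelSmoothingCrossEntropyLoss",
    "criterion_params": {},
    "optimizer": "SGD",
    "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9},
    "train_metrics_list": ["Accuracy"],
    "valid_metrics_list": ["Accuracy"],
    "metric_to_watch": "Accuracy",
}

trainer.train(
    model=model,
    training_params=training_params,
    train_loader=dataloaders.get("cifar10_train"),
    valid_loader=dataloaders.get("cifar10_val"),
)
```

Recipes that still reference the old lowercase aliases are expected to keep working through the deprecation logic added in the registry, with a warning that names the new alias.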
diff --git a/documentation/source/PoseEstimation.md b/documentation/source/PoseEstimation.md index f8c9c66956..f60538255b 100644 --- a/documentation/source/PoseEstimation.md +++ b/documentation/source/PoseEstimation.md @@ -16,7 +16,7 @@ In summary, top-down approach starts with detecting an object and then estimates | Model | Model class | Target Generator | Loss Class | Decoding Callback | Visualization Callback | |------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| [DEKR](https://arxiv.org/abs/2104.02300) | [DEKRPoseEstimationModel](https://docs.deci.ai/super-gradients/docstring/training/models.html#training.models.pose_estimation_models.dekr_hrnet.DEKRPoseEstimationModel) | [DEKRTargetsGenerator](https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/datasets/pose_estimation_datasets/target_generators.py#L8) | [DEKRLoss](https://docs.deci.ai/super-gradients/docstring/training/losses.html#training.losses.dekr_loss.DEKRLoss) | [DEKRPoseEstimationDecodeCallback](https://docs.deci.ai/super-gradients/docstring/training/utils.html#training.utils.pose_estimation.dekr_decode_callbacks.DEKRPoseEstimationDecodeCallback) | [DEKRVisualizationCallback](https://docs.deci.ai/super-gradients/docstring/training/utils.html#training.utils.pose_estimation.dekr_visualization_callbacks.DEKRVisualizationCallback) | +| [DEKR](https://arxiv.org/abs/2104.02300) | [DEKRPoseEstimationModel](https://docs.deci.ai/super-gradients/docstring/training/models.html#training.models.pose_estimation_models.dekr_hrnet.DEKRPoseEstimationModel) | [DEKRTargetsGenerator](https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/datasets/pose_estimation_datasets/target_generators.py#L8) | [DEKRLoss](https://docs.deci.ai/super-gradients/docstring/training/losses.html#training.losses.DEKRLoss.DEKRLoss) | [DEKRPoseEstimationDecodeCallback](https://docs.deci.ai/super-gradients/docstring/training/utils.html#training.utils.pose_estimation.dekr_decode_callbacks.DEKRPoseEstimationDecodeCallback) | [DEKRVisualizationCallback](https://docs.deci.ai/super-gradients/docstring/training/utils.html#training.utils.pose_estimation.dekr_visualization_callbacks.DEKRVisualizationCallback) | ## Training diff --git a/documentation/source/Segmentation.md b/documentation/source/Segmentation.md index 3e6ba3a940..57f0b2a705 100644 --- a/documentation/source/Segmentation.md +++ b/documentation/source/Segmentation.md @@ -33,12 +33,12 @@ The following table summarizes the loss functions currently supported by SuperGr | Loss function class | Loss name in YAML | Description | 
|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------|----------------------------------------------------------------------| -| [BCEDiceLoss](https://docs.deci.ai/super-gradients/docstring/training/losses.html#training.losses.bce_dice_loss.BCEDiceLoss) | bce_dice_loss | Weighted average of BCE and Dice loss | -| [LabelSmoothingCrossEntropyLoss](https://docs.deci.ai/super-gradients/docstring/training/losses.html#training.losses.label_smoothing_cross_entropy_loss.LabelSmoothingCrossEntropyLoss) | cross_entropy | Cross entropy loss with label smoothing support | +| [BCEDiceLoss](https://docs.deci.ai/super-gradients/docstring/training/losses.html#training.losses.BCEDiceLoss.BCEDiceLoss) | BCEDiceLoss | Weighted average of BCE and Dice loss | +| [LabelSmoothingCrossEntropyLoss](https://docs.deci.ai/super-gradients/docstring/training/losses.html#training.losses.label_smoothing_cross_entropy_loss.LabelSmoothingCrossEntropyLoss) | LabelSmoothingCrossEntropyLoss | Cross entropy loss with label smoothing support | | [DiceLoss](https://docs.deci.ai/super-gradients/docstring/training/losses.html#training.losses.dice_loss.DiceLoss) | N/A | Dice loss for multiclass segmentation | | [BinaryDiceLoss](https://docs.deci.ai/super-gradients/docstring/training/losses.html#training.losses.dice_loss.BinaryDiceLoss) | N/A | Dice loss for binary segmentation | | [GeneralizedDiceLoss](https://docs.deci.ai/super-gradients/docstring/training/losses.html#training.losses.dice_loss.GeneralizedDiceLoss) | N/A | Generalized dice loss | -| [DiceCEEdgeLoss](https://docs.deci.ai/super-gradients/docstring/training/losses.html#training.losses.dice_ce_edge_loss.DiceCEEdgeLoss) | dice_ce_edge_loss | Dice loss + Cross entropy loss + Edge loss | +| [DiceCEEdgeLoss](https://docs.deci.ai/super-gradients/docstring/training/losses.html#training.losses.DiceCEEdgeLoss.DiceCEEdgeLoss) | DiceCEEdgeLoss | Dice loss + Cross entropy loss + Edge loss | | [SegKDLoss](https://docs.deci.ai/super-gradients/docstring/training/losses.html#training.losses.seg_kd_loss.SegKDLoss) | N/A | A loss function for knowledge distillation for semantic segmentation | ## Metrics diff --git a/documentation/source/configuration_files.md b/documentation/source/configuration_files.md index bf7a4b43ab..bb253bd8c3 100644 --- a/documentation/source/configuration_files.md +++ b/documentation/source/configuration_files.md @@ -28,7 +28,7 @@ lr_decay_factor: 0.1 lr_mode: step lr_warmup_epochs: 0 initial_lr: 0.1 -loss: cross_entropy +loss: LabelSmoothingCrossEntropyLoss optimizer: SGD criterion_params: {} diff --git a/documentation/source/ptq_qat.md b/documentation/source/ptq_qat.md index f9376b14b7..7c59dcc7a6 100644 --- a/documentation/source/ptq_qat.md +++ b/documentation/source/ptq_qat.md @@ -310,7 +310,7 @@ selective_quantizer_params: skip_modules: # optional list of module names (strings) to skip from quantization calib_params: - histogram_calib_method: "percentile" # calibration method for all "histogram" calibrators, acceptable types are ["percentile", "entropy", mse"], "max" calibrators always use "max" + histogram_calib_method: "percentile" # calibration method for all "histogram" calibrators, acceptable types are ["percentile", "entropy", MSE"], "max" calibrators always use "max" percentile: 99.99 # percentile for all histogram calibrators with method "percentile", other calibrators are not affected 
num_calib_batches: # number of batches to use for calibration, if None, 512 / batch_size will be used verbose: False # if calibrator should be verbose diff --git a/src/super_gradients/recipes/cityscapes_al_ddrnet.yaml b/src/super_gradients/recipes/cityscapes_al_ddrnet.yaml index d5b69683a1..48dd55122d 100644 --- a/src/super_gradients/recipes/cityscapes_al_ddrnet.yaml +++ b/src/super_gradients/recipes/cityscapes_al_ddrnet.yaml @@ -61,7 +61,7 @@ training_hyperparams: max_epochs: 200 initial_lr: 0.0075 # batch size 24 loss: - dice_ce_edge_loss: + DiceCEEdgeLoss: num_classes: 19 ignore_index: 19 num_aux_heads: 1 diff --git a/src/super_gradients/recipes/cityscapes_ddrnet.yaml b/src/super_gradients/recipes/cityscapes_ddrnet.yaml index 9c590c4727..d6763593c8 100644 --- a/src/super_gradients/recipes/cityscapes_ddrnet.yaml +++ b/src/super_gradients/recipes/cityscapes_ddrnet.yaml @@ -57,7 +57,7 @@ training_hyperparams: max_epochs: 500 initial_lr: 0.0075 # batch size 24 loss: - dice_ce_edge_loss: + DiceCEEdgeLoss: num_classes: 19 ignore_index: 19 num_aux_heads: 1 diff --git a/src/super_gradients/recipes/cityscapes_kd_base.yaml b/src/super_gradients/recipes/cityscapes_kd_base.yaml index d6a99d88a9..40d0d6e2b4 100644 --- a/src/super_gradients/recipes/cityscapes_kd_base.yaml +++ b/src/super_gradients/recipes/cityscapes_kd_base.yaml @@ -55,7 +55,7 @@ training_hyperparams: weights: [ 1. ] kd_loss_weights: [1., 6.] - kd_loss: + KDLogitsLoss: _target_: super_gradients.training.losses.cwd_loss.ChannelWiseKnowledgeDistillationLoss temperature: 3. normalization_mode: channel_wise diff --git a/src/super_gradients/recipes/cityscapes_pplite_seg50.yaml b/src/super_gradients/recipes/cityscapes_pplite_seg50.yaml index 0843464532..ad1b90d03a 100644 --- a/src/super_gradients/recipes/cityscapes_pplite_seg50.yaml +++ b/src/super_gradients/recipes/cityscapes_pplite_seg50.yaml @@ -68,7 +68,7 @@ checkpoint_params: training_hyperparams: sync_bn: True loss: - dice_ce_edge_loss: + DiceCEEdgeLoss: num_classes: 19 ignore_index: 19 num_aux_heads: 3 diff --git a/src/super_gradients/recipes/cityscapes_pplite_seg75.yaml b/src/super_gradients/recipes/cityscapes_pplite_seg75.yaml index b20030cff8..cbc19e4660 100644 --- a/src/super_gradients/recipes/cityscapes_pplite_seg75.yaml +++ b/src/super_gradients/recipes/cityscapes_pplite_seg75.yaml @@ -63,7 +63,7 @@ checkpoint_params: training_hyperparams: sync_bn: True loss: - dice_ce_edge_loss: + DiceCEEdgeLoss: num_classes: 19 ignore_index: 19 num_aux_heads: 3 diff --git a/src/super_gradients/recipes/cityscapes_regseg48.yaml b/src/super_gradients/recipes/cityscapes_regseg48.yaml index a9e8b0e393..2135d76e71 100644 --- a/src/super_gradients/recipes/cityscapes_regseg48.yaml +++ b/src/super_gradients/recipes/cityscapes_regseg48.yaml @@ -62,7 +62,7 @@ training_hyperparams: ema: True - loss: cross_entropy + loss: LabelSmoothingCrossEntropyLoss criterion_params: ignore_index: ${cityscapes_ignored_label} diff --git a/src/super_gradients/recipes/cityscapes_segformer.yaml b/src/super_gradients/recipes/cityscapes_segformer.yaml index 60c57ba273..fb4d8bb227 100644 --- a/src/super_gradients/recipes/cityscapes_segformer.yaml +++ b/src/super_gradients/recipes/cityscapes_segformer.yaml @@ -95,7 +95,7 @@ training_hyperparams: sync_bn: True - loss: cross_entropy + loss: LabelSmoothingCrossEntropyLoss criterion_params: ignore_index: ${cityscapes_ignored_label} diff --git a/src/super_gradients/recipes/cityscapes_stdc_seg50.yaml b/src/super_gradients/recipes/cityscapes_stdc_seg50.yaml index 
4c0edec4f7..05f565256d 100644 --- a/src/super_gradients/recipes/cityscapes_stdc_seg50.yaml +++ b/src/super_gradients/recipes/cityscapes_stdc_seg50.yaml @@ -62,7 +62,7 @@ checkpoint_params: training_hyperparams: sync_bn: True loss: - dice_ce_edge_loss: + DiceCEEdgeLoss: num_classes: 19 ignore_index: 19 weights: [ 1., 0.6, 0.4, 1. ] diff --git a/src/super_gradients/recipes/cityscapes_stdc_seg75.yaml b/src/super_gradients/recipes/cityscapes_stdc_seg75.yaml index f63ad19072..c5b6ff7b5a 100644 --- a/src/super_gradients/recipes/cityscapes_stdc_seg75.yaml +++ b/src/super_gradients/recipes/cityscapes_stdc_seg75.yaml @@ -68,7 +68,7 @@ training_hyperparams: sync_bn: True loss: - stdc_loss: + STDCLoss: num_classes: 19 ignore_index: 19 mining_percent: 0.0625 # mining percentage is 1/16 of pixels following original implementation. diff --git a/src/super_gradients/recipes/imagenet_resnet50_kd.yaml b/src/super_gradients/recipes/imagenet_resnet50_kd.yaml index 60bb7db496..2bc9109f46 100644 --- a/src/super_gradients/recipes/imagenet_resnet50_kd.yaml +++ b/src/super_gradients/recipes/imagenet_resnet50_kd.yaml @@ -25,7 +25,7 @@ val_dataloader: imagenet_val resume: False training_hyperparams: resume: ${resume} - loss: kd_loss + loss: KDLogitsLoss criterion_params: distillation_loss_coeff: 0.8 task_loss_fn: diff --git a/src/super_gradients/recipes/quantization_params/default_quantization_params.yaml b/src/super_gradients/recipes/quantization_params/default_quantization_params.yaml index 2b08ae0c02..81bad8b7cd 100644 --- a/src/super_gradients/recipes/quantization_params/default_quantization_params.yaml +++ b/src/super_gradients/recipes/quantization_params/default_quantization_params.yaml @@ -7,7 +7,7 @@ selective_quantizer_params: skip_modules: # optional list of module names (strings) to skip from quantization calib_params: - histogram_calib_method: "percentile" # calibration method for all "histogram" calibrators, acceptable types are ["percentile", "entropy", mse"], "max" calibrators always use "max" + histogram_calib_method: "percentile" # calibration method for all "histogram" calibrators, acceptable types are ["percentile", "entropy", MSE"], "max" calibrators always use "max" percentile: 99.99 # percentile for all histogram calibrators with method "percentile", other calibrators are not affected num_calib_batches: # number of batches to use for calibration, if None, 512 / batch_size will be used verbose: False # if calibrator should be verbose diff --git a/src/super_gradients/recipes/roboflow_ppyoloe.yaml b/src/super_gradients/recipes/roboflow_ppyoloe.yaml index 23a2801a66..c904cf96c5 100644 --- a/src/super_gradients/recipes/roboflow_ppyoloe.yaml +++ b/src/super_gradients/recipes/roboflow_ppyoloe.yaml @@ -47,7 +47,7 @@ training_hyperparams: dataset_name: ${dataset_name} output_path: ${result_path} loss: - ppyoloe_loss: + PPYoloELoss: num_classes: ${num_classes} reg_max: ${arch_params.head.reg_max} diff --git a/src/super_gradients/recipes/roboflow_yolo_nas_m.yaml b/src/super_gradients/recipes/roboflow_yolo_nas_m.yaml index 79582bab0b..b2c9ba5370 100644 --- a/src/super_gradients/recipes/roboflow_yolo_nas_m.yaml +++ b/src/super_gradients/recipes/roboflow_yolo_nas_m.yaml @@ -66,7 +66,7 @@ training_hyperparams: phase_callbacks: [] loss: - ppyoloe_loss: + PPYoloELoss: num_classes: ${num_classes} reg_max: 16 diff --git a/src/super_gradients/recipes/roboflow_yolo_nas_s.yaml b/src/super_gradients/recipes/roboflow_yolo_nas_s.yaml index a3a54f5140..4698e76a84 100644 --- 
a/src/super_gradients/recipes/roboflow_yolo_nas_s.yaml +++ b/src/super_gradients/recipes/roboflow_yolo_nas_s.yaml @@ -66,7 +66,7 @@ training_hyperparams: phase_callbacks: [] loss: - ppyoloe_loss: + PPYoloELoss: num_classes: ${num_classes} reg_max: 16 diff --git a/src/super_gradients/recipes/supervisely_unet.yaml b/src/super_gradients/recipes/supervisely_unet.yaml index a05b8f8bd0..e5caa4b3b8 100644 --- a/src/super_gradients/recipes/supervisely_unet.yaml +++ b/src/super_gradients/recipes/supervisely_unet.yaml @@ -29,7 +29,7 @@ training_hyperparams: initial_lr: 0.025 loss: - bce_dice_loss: + BCEDiceLoss: loss_weights: [ 1., 1. ] logits: True diff --git a/src/super_gradients/recipes/training_hyperparams/cifar10_resnet_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/cifar10_resnet_train_params.yaml index 38fb211a75..ad14908673 100644 --- a/src/super_gradients/recipes/training_hyperparams/cifar10_resnet_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/cifar10_resnet_train_params.yaml @@ -13,7 +13,7 @@ lr_decay_factor: 0.1 lr_mode: StepLRCallback lr_warmup_epochs: 0 initial_lr: 0.1 -loss: cross_entropy +loss: LabelSmoothingCrossEntropyLoss optimizer: SGD criterion_params: {} diff --git a/src/super_gradients/recipes/training_hyperparams/coco2017_dekr_pose_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/coco2017_dekr_pose_train_params.yaml index 03bfc50de0..360bb96c96 100644 --- a/src/super_gradients/recipes/training_hyperparams/coco2017_dekr_pose_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/coco2017_dekr_pose_train_params.yaml @@ -12,7 +12,7 @@ lr_mode: CosineLRCallback cosine_final_lr_ratio: 0.1 batch_accumulate: 1 initial_lr: 1e-3 -loss: dekr_loss +loss: DEKRLoss criterion_params: heatmap_loss: qfl diff --git a/src/super_gradients/recipes/training_hyperparams/coco2017_ppyoloe_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/coco2017_ppyoloe_train_params.yaml index 53d6fcf965..2e2443cf20 100644 --- a/src/super_gradients/recipes/training_hyperparams/coco2017_ppyoloe_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/coco2017_ppyoloe_train_params.yaml @@ -19,7 +19,7 @@ batch_accumulate: 1 save_ckpt_epoch_list: [200, 250, 300, 350, 400, 450] loss: - ppyoloe_loss: + PPYoloELoss: num_classes: ${arch_params.num_classes} reg_max: ${arch_params.head.reg_max} diff --git a/src/super_gradients/recipes/training_hyperparams/coco2017_rescoring_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/coco2017_rescoring_train_params.yaml index f440564436..04f32cbcd1 100644 --- a/src/super_gradients/recipes/training_hyperparams/coco2017_rescoring_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/coco2017_rescoring_train_params.yaml @@ -12,7 +12,7 @@ lr_mode: CosineLRCallback cosine_final_lr_ratio: 0.1 batch_accumulate: 1 initial_lr: 0.001 -loss: rescoring_loss +loss: RescoringLoss criterion_params: {} mixed_precision: False diff --git a/src/super_gradients/recipes/training_hyperparams/coco2017_ssd_lite_mobilenet_v2_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/coco2017_ssd_lite_mobilenet_v2_train_params.yaml index 1a6b39d4c1..0ac79dcca9 100644 --- a/src/super_gradients/recipes/training_hyperparams/coco2017_ssd_lite_mobilenet_v2_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/coco2017_ssd_lite_mobilenet_v2_train_params.yaml @@ -7,7 +7,7 @@ lr_mode: CosineLRCallback cosine_final_lr_ratio: 0.01 
batch_accumulate: 1 initial_lr: 0.01 -loss: ssd_loss +loss: SSDLoss criterion_params: alpha: 1.0 diff --git a/src/super_gradients/recipes/training_hyperparams/coco2017_yolo_nas_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/coco2017_yolo_nas_train_params.yaml index 9eea40f750..7ce80baa58 100644 --- a/src/super_gradients/recipes/training_hyperparams/coco2017_yolo_nas_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/coco2017_yolo_nas_train_params.yaml @@ -18,7 +18,7 @@ batch_accumulate: 1 save_ckpt_epoch_list: [100, 200, 250] loss: - ppyoloe_loss: + PPYoloELoss: use_static_assigner: False num_classes: ${arch_params.num_classes} reg_max: 16 diff --git a/src/super_gradients/recipes/training_hyperparams/coco2017_yolox_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/coco2017_yolox_train_params.yaml index cd16842c37..f338781583 100644 --- a/src/super_gradients/recipes/training_hyperparams/coco2017_yolox_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/coco2017_yolox_train_params.yaml @@ -12,7 +12,7 @@ batch_accumulate: 1 save_ckpt_epoch_list: [285] -loss: yolox_loss +loss: YoloXDetectionLoss criterion_params: strides: [8, 16, 32] # output strides of all yolo outputs diff --git a/src/super_gradients/recipes/training_hyperparams/coco_segmentation_shelfnet_lw_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/coco_segmentation_shelfnet_lw_train_params.yaml index ca6f3782c9..7501be5c5a 100644 --- a/src/super_gradients/recipes/training_hyperparams/coco_segmentation_shelfnet_lw_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/coco_segmentation_shelfnet_lw_train_params.yaml @@ -3,7 +3,7 @@ defaults: max_epochs: 150 initial_lr: 5e-3 -loss: shelfnet_ohem_loss +loss: ShelfNetOHEMLoss optimizer: SGD mixed_precision: True batch_accumulate: 3 diff --git a/src/super_gradients/recipes/training_hyperparams/imagenet_efficientnet_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/imagenet_efficientnet_train_params.yaml index 31a96c0f2f..b9e6688472 100644 --- a/src/super_gradients/recipes/training_hyperparams/imagenet_efficientnet_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/imagenet_efficientnet_train_params.yaml @@ -20,7 +20,7 @@ ema_params: decay: 0.9999 decay_type: constant -loss: cross_entropy +loss: LabelSmoothingCrossEntropyLoss criterion_params: smooth_eps: 0.1 diff --git a/src/super_gradients/recipes/training_hyperparams/imagenet_mobilenetv2_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/imagenet_mobilenetv2_train_params.yaml index 56a5a8e665..eee328aacd 100644 --- a/src/super_gradients/recipes/training_hyperparams/imagenet_mobilenetv2_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/imagenet_mobilenetv2_train_params.yaml @@ -20,7 +20,7 @@ optimizer_params: alpha: 0.9 eps: 0.001 -loss: cross_entropy +loss: LabelSmoothingCrossEntropyLoss zero_weight_decay_on_bias_and_bn: True ema: True diff --git a/src/super_gradients/recipes/training_hyperparams/imagenet_mobilenetv3_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/imagenet_mobilenetv3_train_params.yaml index c6c5305660..9a7cb96938 100644 --- a/src/super_gradients/recipes/training_hyperparams/imagenet_mobilenetv3_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/imagenet_mobilenetv3_train_params.yaml @@ -10,7 +10,7 @@ optimizer_params: weight_decay: 0.00004 lr_warmup_epochs: 5 -loss: 
cross_entropy +loss: LabelSmoothingCrossEntropyLoss criterion_params: smooth_eps: 0.1 diff --git a/src/super_gradients/recipes/training_hyperparams/imagenet_regnetY_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/imagenet_regnetY_train_params.yaml index d0b9add9de..44a757c4f0 100644 --- a/src/super_gradients/recipes/training_hyperparams/imagenet_regnetY_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/imagenet_regnetY_train_params.yaml @@ -20,7 +20,7 @@ ema_params: decay_type: constant decay: 0.9999 -loss: cross_entropy +loss: LabelSmoothingCrossEntropyLoss criterion_params: smooth_eps: 0.1 diff --git a/src/super_gradients/recipes/training_hyperparams/imagenet_repvgg_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/imagenet_repvgg_train_params.yaml index a219060350..f359ed2127 100644 --- a/src/super_gradients/recipes/training_hyperparams/imagenet_repvgg_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/imagenet_repvgg_train_params.yaml @@ -6,7 +6,7 @@ lr_mode: CosineLRCallback initial_lr: 0.1 cosine_final_lr_ratio: 0 -loss: cross_entropy +loss: LabelSmoothingCrossEntropyLoss zero_weight_decay_on_bias_and_bn: True average_best_models: True diff --git a/src/super_gradients/recipes/training_hyperparams/imagenet_resnet50_kd_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/imagenet_resnet50_kd_train_params.yaml index f36c971229..55c2d4867c 100644 --- a/src/super_gradients/recipes/training_hyperparams/imagenet_resnet50_kd_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/imagenet_resnet50_kd_train_params.yaml @@ -12,7 +12,7 @@ zero_weight_decay_on_bias_and_bn: True optimizer: Lamb optimizer_params: weight_decay: 0.02 -loss: cross_entropy +loss: LabelSmoothingCrossEntropyLoss train_metrics_list: # metrics for evaluation - Accuracy - Top5 diff --git a/src/super_gradients/recipes/training_hyperparams/imagenet_resnet50_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/imagenet_resnet50_train_params.yaml index 61948dbdaa..150297b727 100644 --- a/src/super_gradients/recipes/training_hyperparams/imagenet_resnet50_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/imagenet_resnet50_train_params.yaml @@ -9,7 +9,7 @@ ema: False save_ckpt_epoch_list: [ 50, 100, 150, 200, 300 ] mixed_precision: True zero_weight_decay_on_bias_and_bn: True -loss: cross_entropy +loss: LabelSmoothingCrossEntropyLoss train_metrics_list: # metrics for evaluation - Accuracy - Top5 diff --git a/src/super_gradients/recipes/training_hyperparams/imagenet_vit_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/imagenet_vit_train_params.yaml index 97c19b5347..dd19f41630 100644 --- a/src/super_gradients/recipes/training_hyperparams/imagenet_vit_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/imagenet_vit_train_params.yaml @@ -9,7 +9,7 @@ lr_warmup_epochs: 1 warmup_initial_lr: 0 warmup_mode: EpochStepWarmupLRCallback ema: False -loss: cross_entropy +loss: LabelSmoothingCrossEntropyLoss clip_grad_norm: 1 optimizer: SGD optimizer_params: diff --git a/src/super_gradients/training/losses/seg_kd_loss.py b/src/super_gradients/training/losses/seg_kd_loss.py index 58f9ea3ec8..2791b527d8 100644 --- a/src/super_gradients/training/losses/seg_kd_loss.py +++ b/src/super_gradients/training/losses/seg_kd_loss.py @@ -9,7 +9,7 @@ class SegKDLoss(nn.Module): """ Wrapper loss for semantic segmentation KD. 
- This loss includes two loss components, `ce_loss` i.e CrossEntropyLoss, and `kd_loss` i.e + This loss includes two loss components, `ce_loss` i.e CrossEntropyLoss, and `KDLogitsLoss` i.e `ChannelWiseKnowledgeDistillationLoss`. """ From dfd527749f8e0ea6d5f9d961c21a9ffa20543639 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Mon, 4 Sep 2023 16:13:52 +0300 Subject: [PATCH 09/21] fix --- documentation/source/ObjectDetection.md | 4 ++-- documentation/source/PoseEstimation.md | 2 +- documentation/source/Segmentation.md | 6 +++--- documentation/source/ptq_qat.md | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/documentation/source/ObjectDetection.md b/documentation/source/ObjectDetection.md index 230554e451..92292ec430 100644 --- a/documentation/source/ObjectDetection.md +++ b/documentation/source/ObjectDetection.md @@ -12,8 +12,8 @@ In SuperGradients, we aim to collect such models and make them very convenient a | Model | Yaml | Model class | Loss Class | NMS Callback | |----------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| [SSD](https://arxiv.org/abs/1512.02325) | [ssd_lite_mobilenetv2_arch_params](https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/recipes/arch_params/ssd_lite_mobilenetv2_arch_params.yaml) | [SSDLiteMobileNetV2](https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/models/detection_models/ssd.py) | [SSDLoss](https://docs.deci.ai/super-gradients/docstring/training/losses.html#training.losses.SSDLoss.SSDLoss) | [SSDPostPredictCallback](https://docs.deci.ai/super-gradients/docstring/training/utils.html#training.utils.ssd_utils.SSDPostPredictCallback) | -| [YOLOX](https://arxiv.org/abs/2107.08430) | [yolox_s_arch_params](https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml) | [YoloX_S](https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/models/detection_models/yolox.py) | [YoloXFastDetectionLoss](https://docs.deci.ai/super-gradients/docstring/training/losses.html#training.losses.YoloXDetectionLoss.YoloXFastDetectionLoss) | [YoloXPostPredictionCallback](https://docs.deci.ai/super-gradients/docstring/training/models.html#training.models.detection_models.yolo_base.YoloXPostPredictionCallback) | +| [SSD](https://arxiv.org/abs/1512.02325) | [ssd_lite_mobilenetv2_arch_params](https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/recipes/arch_params/ssd_lite_mobilenetv2_arch_params.yaml) | [SSDLiteMobileNetV2](https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/models/detection_models/ssd.py) | [SSDLoss](https://docs.deci.ai/super-gradients/docstring/training/losses.html#training.losses.ssd_loss.SSDLoss) | 
[SSDPostPredictCallback](https://docs.deci.ai/super-gradients/docstring/training/utils.html#training.utils.ssd_utils.SSDPostPredictCallback) | +| [YOLOX](https://arxiv.org/abs/2107.08430) | [yolox_s_arch_params](https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/recipes/arch_params/yolox_s_arch_params.yaml) | [YoloX_S](https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/models/detection_models/yolox.py) | [YoloXFastDetectionLoss](https://docs.deci.ai/super-gradients/docstring/training/losses.html#training.losses.yolox_loss.YoloXFastDetectionLoss) | [YoloXPostPredictionCallback](https://docs.deci.ai/super-gradients/docstring/training/models.html#training.models.detection_models.yolo_base.YoloXPostPredictionCallback) | | [PPYolo](https://arxiv.org/abs/2007.12099) | [ppyoloe_arch_params](https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/recipes/arch_params/ppyoloe_arch_params.yaml) | [PPYoloE](https://docs.deci.ai/super-gradients/docstring/training/models.html#training.models.detection_models.pp_yolo_e.pp_yolo_e.PPYoloE) | [PPYoloELoss](https://docs.deci.ai/super-gradients/docstring/training/losses.html#training.losses.ppyolo_loss.PPYoloELoss) | [PPYoloEPostPredictionCallback](https://docs.deci.ai/super-gradients/docstring/training/models.html#training.models.detection_models.pp_yolo_e.post_prediction_callback.PPYoloEPostPredictionCallback) | | YoloNAS | [yolo_nas_s_arch_params](https://github.com/Deci-AI/super-gradients/blob/e1db4d99492a25f8e65b5d3e17a6ff2672c5467b/src/super_gradients/recipes/arch_params/yolo_nas_s_arch_params.yaml) | [Yolo NAS S](https://github.com/Deci-AI/super-gradients/blob/e1db4d99492a25f8e65b5d3e17a6ff2672c5467b/src/super_gradients/training/models/detection_models/yolo_nas/yolo_nas_variants.py#L16) | [PPYoloELoss](https://docs.deci.ai/super-gradients/docstring/training/losses.html#training.losses.ppyolo_loss.PPYoloELoss) | [PPYoloEPostPredictionCallback](https://docs.deci.ai/super-gradients/docstring/training/models.html#training.models.detection_models.pp_yolo_e.post_prediction_callback.PPYoloEPostPredictionCallback) | diff --git a/documentation/source/PoseEstimation.md b/documentation/source/PoseEstimation.md index f60538255b..f8c9c66956 100644 --- a/documentation/source/PoseEstimation.md +++ b/documentation/source/PoseEstimation.md @@ -16,7 +16,7 @@ In summary, top-down approach starts with detecting an object and then estimates | Model | Model class | Target Generator | Loss Class | Decoding Callback | Visualization Callback | |------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| [DEKR](https://arxiv.org/abs/2104.02300) | 
[DEKRPoseEstimationModel](https://docs.deci.ai/super-gradients/docstring/training/models.html#training.models.pose_estimation_models.dekr_hrnet.DEKRPoseEstimationModel) | [DEKRTargetsGenerator](https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/datasets/pose_estimation_datasets/target_generators.py#L8) | [DEKRLoss](https://docs.deci.ai/super-gradients/docstring/training/losses.html#training.losses.DEKRLoss.DEKRLoss) | [DEKRPoseEstimationDecodeCallback](https://docs.deci.ai/super-gradients/docstring/training/utils.html#training.utils.pose_estimation.dekr_decode_callbacks.DEKRPoseEstimationDecodeCallback) | [DEKRVisualizationCallback](https://docs.deci.ai/super-gradients/docstring/training/utils.html#training.utils.pose_estimation.dekr_visualization_callbacks.DEKRVisualizationCallback) | +| [DEKR](https://arxiv.org/abs/2104.02300) | [DEKRPoseEstimationModel](https://docs.deci.ai/super-gradients/docstring/training/models.html#training.models.pose_estimation_models.dekr_hrnet.DEKRPoseEstimationModel) | [DEKRTargetsGenerator](https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/datasets/pose_estimation_datasets/target_generators.py#L8) | [DEKRLoss](https://docs.deci.ai/super-gradients/docstring/training/losses.html#training.losses.dekr_loss.DEKRLoss) | [DEKRPoseEstimationDecodeCallback](https://docs.deci.ai/super-gradients/docstring/training/utils.html#training.utils.pose_estimation.dekr_decode_callbacks.DEKRPoseEstimationDecodeCallback) | [DEKRVisualizationCallback](https://docs.deci.ai/super-gradients/docstring/training/utils.html#training.utils.pose_estimation.dekr_visualization_callbacks.DEKRVisualizationCallback) | ## Training diff --git a/documentation/source/Segmentation.md b/documentation/source/Segmentation.md index 57f0b2a705..3e6ba3a940 100644 --- a/documentation/source/Segmentation.md +++ b/documentation/source/Segmentation.md @@ -33,12 +33,12 @@ The following table summarizes the loss functions currently supported by SuperGr | Loss function class | Loss name in YAML | Description | |-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------|----------------------------------------------------------------------| -| [BCEDiceLoss](https://docs.deci.ai/super-gradients/docstring/training/losses.html#training.losses.BCEDiceLoss.BCEDiceLoss) | BCEDiceLoss | Weighted average of BCE and Dice loss | -| [LabelSmoothingCrossEntropyLoss](https://docs.deci.ai/super-gradients/docstring/training/losses.html#training.losses.label_smoothing_cross_entropy_loss.LabelSmoothingCrossEntropyLoss) | LabelSmoothingCrossEntropyLoss | Cross entropy loss with label smoothing support | +| [BCEDiceLoss](https://docs.deci.ai/super-gradients/docstring/training/losses.html#training.losses.bce_dice_loss.BCEDiceLoss) | bce_dice_loss | Weighted average of BCE and Dice loss | +| [LabelSmoothingCrossEntropyLoss](https://docs.deci.ai/super-gradients/docstring/training/losses.html#training.losses.label_smoothing_cross_entropy_loss.LabelSmoothingCrossEntropyLoss) | cross_entropy | Cross entropy loss with label smoothing support | | [DiceLoss](https://docs.deci.ai/super-gradients/docstring/training/losses.html#training.losses.dice_loss.DiceLoss) | N/A | Dice loss for multiclass segmentation | | 
[BinaryDiceLoss](https://docs.deci.ai/super-gradients/docstring/training/losses.html#training.losses.dice_loss.BinaryDiceLoss) | N/A | Dice loss for binary segmentation | | [GeneralizedDiceLoss](https://docs.deci.ai/super-gradients/docstring/training/losses.html#training.losses.dice_loss.GeneralizedDiceLoss) | N/A | Generalized dice loss | -| [DiceCEEdgeLoss](https://docs.deci.ai/super-gradients/docstring/training/losses.html#training.losses.DiceCEEdgeLoss.DiceCEEdgeLoss) | DiceCEEdgeLoss | Dice loss + Cross entropy loss + Edge loss | +| [DiceCEEdgeLoss](https://docs.deci.ai/super-gradients/docstring/training/losses.html#training.losses.dice_ce_edge_loss.DiceCEEdgeLoss) | dice_ce_edge_loss | Dice loss + Cross entropy loss + Edge loss | | [SegKDLoss](https://docs.deci.ai/super-gradients/docstring/training/losses.html#training.losses.seg_kd_loss.SegKDLoss) | N/A | A loss function for knowledge distillation for semantic segmentation | ## Metrics diff --git a/documentation/source/ptq_qat.md b/documentation/source/ptq_qat.md index 7c59dcc7a6..f9376b14b7 100644 --- a/documentation/source/ptq_qat.md +++ b/documentation/source/ptq_qat.md @@ -310,7 +310,7 @@ selective_quantizer_params: skip_modules: # optional list of module names (strings) to skip from quantization calib_params: - histogram_calib_method: "percentile" # calibration method for all "histogram" calibrators, acceptable types are ["percentile", "entropy", MSE"], "max" calibrators always use "max" + histogram_calib_method: "percentile" # calibration method for all "histogram" calibrators, acceptable types are ["percentile", "entropy", mse"], "max" calibrators always use "max" percentile: 99.99 # percentile for all histogram calibrators with method "percentile", other calibrators are not affected num_calib_batches: # number of batches to use for calibration, if None, 512 / batch_size will be used verbose: False # if calibrator should be verbose From 5ca35b93ffc61d23b684cebfd91a623990ab8d31 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Mon, 4 Sep 2023 16:49:23 +0300 Subject: [PATCH 10/21] fix LRWarmups --- .../source/Example_Classification.md | 4 ++-- documentation/source/LRScheduling.md | 10 +++++----- .../training/sg_trainer/sg_trainer.py | 20 ++++++++++--------- .../training/utils/callbacks/callbacks.py | 12 ++++++----- 4 files changed, 25 insertions(+), 21 deletions(-) diff --git a/documentation/source/Example_Classification.md b/documentation/source/Example_Classification.md index 699b80bf70..0e04a5379d 100644 --- a/documentation/source/Example_Classification.md +++ b/documentation/source/Example_Classification.md @@ -321,7 +321,7 @@ Output (Training parameters): 'loss': "LabelSmoothingCrossEntropyLoss", 'lr_cooldown_epochs': 0, 'lr_decay_factor': 0.1, - 'lr_mode': 'step', + 'lr_mode': 'StepLRCallback', 'lr_schedule_function': None, 'lr_updates': array([100, 150, 200]), 'lr_warmup_epochs': 0, @@ -355,7 +355,7 @@ Output (Training parameters): 'train_metrics_list': ['Accuracy', 'Top5'], 'valid_metrics_list': ['Accuracy', 'Top5'], 'warmup_initial_lr': None, - 'warmup_mode': 'linear_epoch_step', + 'warmup_mode': 'EpochStepWarmupLRCallback', 'zero_weight_decay_on_bias_and_bn': False } ``` diff --git a/documentation/source/LRScheduling.md b/documentation/source/LRScheduling.md index e1992762c2..5e6a158da3 100644 --- a/documentation/source/LRScheduling.md +++ b/documentation/source/LRScheduling.md @@ -7,13 +7,13 @@ Learning rate scheduling type is controlled by the training parameter `lr_mode`. 
When str: - Learning rate scheduling policy, one of ['step','poly','cosine','function']. + Learning rate scheduling policy, one of ['StepLRCallback','PolyLRCallback','CosineLRCallback','function']. - 'step' refers to constant updates at epoch numbers passed through `lr_updates`. Each update decays the learning rate by `lr_decay_factor`. + 'StepLRCallback' refers to constant updates at epoch numbers passed through `lr_updates`. Each update decays the learning rate by `lr_decay_factor`. - 'cosine' refers to the Cosine Anealing policy as mentioned in https://arxiv.org/abs/1608.03983. The final learning rate ratio is controlled by `cosine_final_lr_ratio` training parameter. + 'CosineLRCallback' refers to the Cosine Anealing policy as mentioned in https://arxiv.org/abs/1608.03983. The final learning rate ratio is controlled by `cosine_final_lr_ratio` training parameter. - 'poly' refers to the polynomial decrease: in each epoch iteration `self.lr = self.initial_lr * pow((1.0 - (current_iter / max_iter)), 0.9)` + 'PolyLRCallback' refers to the polynomial decrease: in each epoch iteration `self.lr = self.initial_lr * pow((1.0 - (current_iter / max_iter)), 0.9)` 'function' refers to a user-defined learning rate scheduling function, that is passed through `lr_schedule_function`. @@ -45,7 +45,7 @@ trainer.train(model=model, training_params=train_params, train_loader=train_data ```yaml training_hyperparams: initial_lr: 0.1 - lr_mode: step + lr_mode: StepLRCallback user_lr_updates: - 100 - 150 diff --git a/src/super_gradients/training/sg_trainer/sg_trainer.py b/src/super_gradients/training/sg_trainer/sg_trainer.py index 7417a44082..0ff54391de 100755 --- a/src/super_gradients/training/sg_trainer/sg_trainer.py +++ b/src/super_gradients/training/sg_trainer/sg_trainer.py @@ -766,27 +766,29 @@ def train( - `lr_updates` : list(int) - List of fixed epoch numbers to perform learning rate updates when `lr_mode='step'`. + List of fixed epoch numbers to perform learning rate updates when `lr_mode='StepLRCallback'`. - `lr_decay_factor` : float - Decay factor to apply to the learning rate at each update when `lr_mode='step'`. + Decay factor to apply to the learning rate at each update when `lr_mode='StepLRCallback'`. - `lr_mode` : Union[str, Mapping], When str: - Learning rate scheduling policy, one of ['step','poly','cosine','function']. + Learning rate scheduling policy, one of ['StepLRCallback','PolyLRCallback','CosineLRCallback','FunctionLRCallback']. - 'step' refers to constant updates at epoch numbers passed through `lr_updates`. Each update decays the learning rate by `lr_decay_factor`. + 'StepLRCallback' refers to constant updates at epoch numbers passed through `lr_updates`. + Each update decays the learning rate by `lr_decay_factor`. - 'cosine' refers to the Cosine Anealing policy as mentioned in https://arxiv.org/abs/1608.03983. + 'CosineLRCallback' refers to the Cosine Anealing policy as mentioned in https://arxiv.org/abs/1608.03983. The final learning rate ratio is controlled by `cosine_final_lr_ratio` training parameter. - 'poly' refers to the polynomial decrease: in each epoch iteration `self.lr = self.initial_lr * pow((1.0 - (current_iter / max_iter)), 0.9)` + 'PolyLRCallback' refers to the polynomial decrease: + in each epoch iteration `self.lr = self.initial_lr * pow((1.0 - (current_iter / max_iter)), 0.9)` - 'function' refers to a user-defined learning rate scheduling function, that is passed through `lr_schedule_function`. 
+ 'FunctionLRCallback' refers to a user-defined learning rate scheduling function, that is passed through `lr_schedule_function`. @@ -821,7 +823,7 @@ def train( - `lr_schedule_function` : Union[callable,None] - Learning rate scheduling function to be used when `lr_mode` is 'function'. + Learning rate scheduling function to be used when `lr_mode` is 'FunctionLRCallback'. - `warmup_mode`: Union[str, Type[LRCallbackBase], None] @@ -844,7 +846,7 @@ def train( The capping is done to avoid interference of warmup with epoch-based schedulers. - `cosine_final_lr_ratio` : float (default=0.01) - Final learning rate ratio (only relevant when `lr_mode`='cosine'). The cosine starts from initial_lr and reaches + Final learning rate ratio (only relevant when `lr_mode`='CosineLRCallback'). The cosine starts from initial_lr and reaches initial_lr * cosine_final_lr_ratio in last epoch - `inital_lr` : float diff --git a/src/super_gradients/training/utils/callbacks/callbacks.py b/src/super_gradients/training/utils/callbacks/callbacks.py index 5f53cb7e9c..33881b060c 100644 --- a/src/super_gradients/training/utils/callbacks/callbacks.py +++ b/src/super_gradients/training/utils/callbacks/callbacks.py @@ -924,16 +924,18 @@ def create_lr_scheduler_callback( When str: - Learning rate scheduling policy, one of ['step','poly','cosine','function']. + Learning rate scheduling policy, one of ['StepLRCallback','PolyLRCallback','CosineLRCallback','FunctionLRCallback']. - 'step' refers to constant updates at epoch numbers passed through `lr_updates`. Each update decays the learning rate by `lr_decay_factor`. + 'StepLRCallback' refers to constant updates at epoch numbers passed through `lr_updates`. + Each update decays the learning rate by `lr_decay_factor`. - 'cosine' refers to the Cosine Anealing policy as mentioned in https://arxiv.org/abs/1608.03983. + 'CosineLRCallback' refers to the Cosine Anealing policy as mentioned in https://arxiv.org/abs/1608.03983. The final learning rate ratio is controlled by `cosine_final_lr_ratio` training parameter. - 'poly' refers to the polynomial decrease: in each epoch iteration `self.lr = self.initial_lr * pow((1.0 - (current_iter / max_iter)), 0.9)` + 'PolyLRCallback' refers to the polynomial decrease: + in each epoch iteration `self.lr = self.initial_lr * pow((1.0 - (current_iter / max_iter)), 0.9)` - 'function' refers to a user-defined learning rate scheduling function, that is passed through `lr_schedule_function`. + 'FunctionLRCallback' refers to a user-defined learning rate scheduling function, that is passed through `lr_schedule_function`. From d644f324086533cddb7a320044983681abe635b9 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Mon, 4 Sep 2023 16:52:11 +0300 Subject: [PATCH 11/21] leftover --- documentation/source/LRScheduling.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/documentation/source/LRScheduling.md b/documentation/source/LRScheduling.md index 5e6a158da3..a02ccea8f3 100644 --- a/documentation/source/LRScheduling.md +++ b/documentation/source/LRScheduling.md @@ -7,7 +7,7 @@ Learning rate scheduling type is controlled by the training parameter `lr_mode`. When str: - Learning rate scheduling policy, one of ['StepLRCallback','PolyLRCallback','CosineLRCallback','function']. + Learning rate scheduling policy, one of ['StepLRCallback','PolyLRCallback','CosineLRCallback','FunctionLRCallback']. 'StepLRCallback' refers to constant updates at epoch numbers passed through `lr_updates`. Each update decays the learning rate by `lr_decay_factor`. 
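As a point of reference for the renamed `'FunctionLRCallback'` policy, here is a minimal sketch of how a user-defined schedule is passed through `lr_schedule_function`; the keyword arguments the callback forwards to the function are assumed here for illustration, not taken from this diff:

```python
# Hedged sketch: a custom schedule used with "lr_mode": "FunctionLRCallback".
# The callback is assumed to call the function with these keyword arguments.
def step_decay_schedule(initial_lr, epoch, iter, max_epoch, iters_per_epoch, **kwargs):
    return initial_lr * (0.5 ** (epoch // 30))  # halve the LR every 30 epochs

train_params = {
    "initial_lr": 0.1,
    "lr_mode": "FunctionLRCallback",             # renamed from 'function' above
    "lr_schedule_function": step_decay_schedule,
    # ... remaining training hyper-parameters
}
```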
@@ -15,7 +15,7 @@ Learning rate scheduling type is controlled by the training parameter `lr_mode`. 'PolyLRCallback' refers to the polynomial decrease: in each epoch iteration `self.lr = self.initial_lr * pow((1.0 - (current_iter / max_iter)), 0.9)` - 'function' refers to a user-defined learning rate scheduling function, that is passed through `lr_schedule_function`. + 'FunctionLRCallback' refers to a user-defined learning rate scheduling function, that is passed through `lr_schedule_function`. For example, the training code below will start with an initial learning rate of 0.1 and decay by 0.1 at epochs 100,150 and 200: From 703829ad778fc3aa7f6a0b17d2fc3983baba85b2 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Mon, 4 Sep 2023 17:05:52 +0300 Subject: [PATCH 12/21] minor change --- .../quantization_params/default_quantization_params.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/super_gradients/recipes/quantization_params/default_quantization_params.yaml b/src/super_gradients/recipes/quantization_params/default_quantization_params.yaml index 81bad8b7cd..0d9ceb8e36 100644 --- a/src/super_gradients/recipes/quantization_params/default_quantization_params.yaml +++ b/src/super_gradients/recipes/quantization_params/default_quantization_params.yaml @@ -7,7 +7,7 @@ selective_quantizer_params: skip_modules: # optional list of module names (strings) to skip from quantization calib_params: - histogram_calib_method: "percentile" # calibration method for all "histogram" calibrators, acceptable types are ["percentile", "entropy", MSE"], "max" calibrators always use "max" + histogram_calib_method: "percentile" # calibration method for all "histogram" calibrators, acceptable types are ["percentile", "entropy", "mse"], "max" calibrators always use "max" percentile: 99.99 # percentile for all histogram calibrators with method "percentile", other calibrators are not affected num_calib_batches: # number of batches to use for calibration, if None, 512 / batch_size will be used verbose: False # if calibrator should be verbose From 92a5c88daee8ddafba9b451042f47219eb9e8653 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Mon, 4 Sep 2023 17:37:10 +0300 Subject: [PATCH 13/21] fix --- src/super_gradients/training/losses/dekr_loss.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/super_gradients/training/losses/dekr_loss.py b/src/super_gradients/training/losses/dekr_loss.py index c94708e81d..8b2a8ea8b5 100644 --- a/src/super_gradients/training/losses/dekr_loss.py +++ b/src/super_gradients/training/losses/dekr_loss.py @@ -16,14 +16,14 @@ class DEKRLoss(nn.Module): This loss should be used in conjunction with DEKRTargetsGenerator. """ - def __init__(self, heatmap_loss_factor: float = 1.0, offset_loss_factor: float = 0.1, heatmap_loss: str = "MSE"): + def __init__(self, heatmap_loss_factor: float = 1.0, offset_loss_factor: float = 0.1, heatmap_loss: str = "mse"): """ Instantiate the DEKR loss function. It is two-component loss function, consisting of a heatmap (MSE) loss and an offset (Smooth L1) losses. The total loss is the sum of the two individual losses, weighted by the corresponding factors. :param heatmap_loss_factor: Weighting factor for heatmap loss :param offset_loss_factor: Weighting factor for offset loss - :param heatmap_loss: Type of heatmap loss to use. Can be "MSE" (Used in DEKR paper) or "qfl" (Quality Focal Loss). + :param heatmap_loss: Type of heatmap loss to use. Can be "mse" (Used in DEKR paper) or "qfl" (Quality Focal Loss). 
We use QFL in our recipe as it produces better results. """ super().__init__() From 1daae1f38c9b3f49a34a2814e5d8d44ece7ae215 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Mon, 11 Sep 2023 22:39:58 +0300 Subject: [PATCH 14/21] LabelSmoothingCrossEntropyLoss -> CrossEntropyLoss --- src/super_gradients/common/object_names.py | 2 +- .../ddrnet_imagenet/ddrnet_classification_example.py | 2 +- .../deci_lab_export_example.py | 2 +- .../examples/early_stop/early_stop_example.py | 4 ++-- .../loggers_examples/clearml_logger_example.py | 2 +- .../loggers_examples/deci_platform_logger_example.py | 2 +- .../examples/quantization/resnet_qat_example.py | 2 +- .../train_with_test_set/train_with_test_example.py | 2 +- src/super_gradients/training/losses/__init__.py | 4 ++-- src/super_gradients/training/losses/kd_losses.py | 2 +- .../losses/label_smoothing_cross_entropy_loss.py | 4 ++-- .../training/sg_trainer/sg_trainer.py | 2 +- .../training/utils/callbacks/callbacks.py | 2 +- tests/end_to_end_tests/cifar_trainer_test.py | 4 ++-- tests/end_to_end_tests/trainer_test.py | 2 +- tests/integration_tests/conversion_callback_test.py | 4 ++-- tests/integration_tests/deci_lab_export_test.py | 2 +- .../integration_tests/ema_train_integration_test.py | 2 +- tests/integration_tests/lr_test.py | 2 +- tests/integration_tests/pretrained_models_test.py | 4 ++-- tests/recipe_training_tests/coded_qat_launch_test.py | 4 ++-- tests/unit_tests/early_stop_test.py | 2 +- tests/unit_tests/factories_test.py | 6 +++--- tests/unit_tests/forward_pass_prep_fn_test.py | 2 +- tests/unit_tests/initialize_with_dataloaders_test.py | 2 +- tests/unit_tests/load_ema_ckpt_test.py | 2 +- tests/unit_tests/local_ckpt_head_replacement_test.py | 2 +- tests/unit_tests/lr_cooldown_test.py | 2 +- tests/unit_tests/lr_warmup_test.py | 10 +++++----- tests/unit_tests/max_batches_loop_break_test.py | 4 ++-- tests/unit_tests/optimizer_params_override_test.py | 4 ++-- tests/unit_tests/phase_context_test.py | 2 +- tests/unit_tests/resume_training_test.py | 8 ++++---- tests/unit_tests/save_ckpt_test.py | 4 ++-- tests/unit_tests/train_logging_test.py | 2 +- .../train_with_intialized_param_args_test.py | 12 ++++++------ tests/unit_tests/train_with_precise_bn_test.py | 4 ++-- tests/unit_tests/update_param_groups_unit_test.py | 2 +- tests/unit_tests/vit_unit_test.py | 2 +- 39 files changed, 64 insertions(+), 64 deletions(-) diff --git a/src/super_gradients/common/object_names.py b/src/super_gradients/common/object_names.py index bc510657a3..07efc6816e 100644 --- a/src/super_gradients/common/object_names.py +++ b/src/super_gradients/common/object_names.py @@ -1,7 +1,7 @@ class Losses: """Static class holding all the supported loss names""" - CROSS_ENTROPY = "LabelSmoothingCrossEntropyLoss" + CROSS_ENTROPY = "CrossEntropyLoss" MSE = "MSE" R_SQUARED_LOSS = "RSquaredLoss" SHELFNET_OHEM_LOSS = "ShelfNetOHEMLoss" diff --git a/src/super_gradients/examples/ddrnet_imagenet/ddrnet_classification_example.py b/src/super_gradients/examples/ddrnet_imagenet/ddrnet_classification_example.py index 20ceb0ede5..2fe96fc43f 100644 --- a/src/super_gradients/examples/ddrnet_imagenet/ddrnet_classification_example.py +++ b/src/super_gradients/examples/ddrnet_imagenet/ddrnet_classification_example.py @@ -45,7 +45,7 @@ "initial_lr": 0.1 * devices, "optimizer": "SGD", "optimizer_params": {"weight_decay": 0.0001, "momentum": 0.9, "nesterov": True}, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "train_metrics_list": [Accuracy(), Top5()], "valid_metrics_list": 
[Accuracy(), Top5()], "metric_to_watch": "Accuracy", diff --git a/src/super_gradients/examples/deci_lab_export_example/deci_lab_export_example.py b/src/super_gradients/examples/deci_lab_export_example/deci_lab_export_example.py index d5dc9cb0b6..91c9c976fe 100644 --- a/src/super_gradients/examples/deci_lab_export_example/deci_lab_export_example.py +++ b/src/super_gradients/examples/deci_lab_export_example/deci_lab_export_example.py @@ -59,7 +59,7 @@ def main(architecture_name: str): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "train_metrics_list": [Accuracy(), Top5()], diff --git a/src/super_gradients/examples/early_stop/early_stop_example.py b/src/super_gradients/examples/early_stop/early_stop_example.py index 93c691808a..3a60223956 100644 --- a/src/super_gradients/examples/early_stop/early_stop_example.py +++ b/src/super_gradients/examples/early_stop/early_stop_example.py @@ -12,7 +12,7 @@ super_gradients.init_trainer() early_stop_acc = EarlyStop(Phase.VALIDATION_EPOCH_END, monitor="Accuracy", mode="max", patience=3, verbose=True) -early_stop_val_loss = EarlyStop(Phase.VALIDATION_EPOCH_END, monitor="LabelSmoothingCrossEntropyLoss", mode="min", patience=3, verbose=True) +early_stop_val_loss = EarlyStop(Phase.VALIDATION_EPOCH_END, monitor="CrossEntropyLoss", mode="min", patience=3, verbose=True) train_params = { "max_epochs": 250, @@ -21,7 +21,7 @@ "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, diff --git a/src/super_gradients/examples/loggers_examples/clearml_logger_example.py b/src/super_gradients/examples/loggers_examples/clearml_logger_example.py index 94228bb964..e2c6a4eb34 100644 --- a/src/super_gradients/examples/loggers_examples/clearml_logger_example.py +++ b/src/super_gradients/examples/loggers_examples/clearml_logger_example.py @@ -13,7 +13,7 @@ "lr_decay_factor": 0.1, "lr_mode": "StepLRCallback", "initial_lr": 0.1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": "SGD", "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, "train_metrics_list": [Accuracy(), Top5()], diff --git a/src/super_gradients/examples/loggers_examples/deci_platform_logger_example.py b/src/super_gradients/examples/loggers_examples/deci_platform_logger_example.py index 981c93ca55..65a2d70aca 100644 --- a/src/super_gradients/examples/loggers_examples/deci_platform_logger_example.py +++ b/src/super_gradients/examples/loggers_examples/deci_platform_logger_example.py @@ -16,7 +16,7 @@ "lr_decay_factor": 0.1, "lr_mode": "StepLRCallback", "initial_lr": 0.1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": "SGD", "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, "train_metrics_list": [Accuracy(), Top5()], diff --git a/src/super_gradients/examples/quantization/resnet_qat_example.py b/src/super_gradients/examples/quantization/resnet_qat_example.py index 5b213e4359..c442b7303e 100644 --- a/src/super_gradients/examples/quantization/resnet_qat_example.py +++ b/src/super_gradients/examples/quantization/resnet_qat_example.py @@ -89,7 +89,7 @@ def sg_selective_qdq_resnet50(): "initial_lr": args.lr, "optimizer": "SGD", "optimizer_params": {"weight_decay": 0.0001, "momentum": 0.9, 
"nesterov": True}, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "train_metrics_list": [Accuracy(), Top5()], "valid_metrics_list": [Accuracy(), Top5()], "test_metrics_list": [Accuracy(), Top5()], diff --git a/src/super_gradients/examples/train_with_test_set/train_with_test_example.py b/src/super_gradients/examples/train_with_test_set/train_with_test_example.py index ceb3b6cf42..8729101b8c 100644 --- a/src/super_gradients/examples/train_with_test_set/train_with_test_example.py +++ b/src/super_gradients/examples/train_with_test_set/train_with_test_example.py @@ -12,7 +12,7 @@ "lr_decay_factor": 0.1, "lr_mode": "step", "initial_lr": 0.1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": "SGD", "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, "train_metrics_list": [Accuracy(), Top5()], diff --git a/src/super_gradients/training/losses/__init__.py b/src/super_gradients/training/losses/__init__.py index f14781c2a2..81cebc9485 100755 --- a/src/super_gradients/training/losses/__init__.py +++ b/src/super_gradients/training/losses/__init__.py @@ -1,6 +1,6 @@ from super_gradients.training.losses.focal_loss import FocalLoss from super_gradients.training.losses.kd_losses import KDLogitsLoss -from super_gradients.training.losses.label_smoothing_cross_entropy_loss import LabelSmoothingCrossEntropyLoss +from super_gradients.training.losses.label_smoothing_cross_entropy_loss import CrossEntropyLoss from super_gradients.training.losses.r_squared_loss import RSquaredLoss from super_gradients.training.losses.shelfnet_ohem_loss import ShelfNetOHEMLoss from super_gradients.training.losses.shelfnet_semantic_encoding_loss import ShelfNetSemanticEncodingLoss @@ -20,7 +20,7 @@ "LOSSES", "Losses", "FocalLoss", - "LabelSmoothingCrossEntropyLoss", + "CrossEntropyLoss", "ShelfNetOHEMLoss", "ShelfNetSemanticEncodingLoss", "YoloXDetectionLoss", diff --git a/src/super_gradients/training/losses/kd_losses.py b/src/super_gradients/training/losses/kd_losses.py index 156245bc8b..a42ee2c448 100644 --- a/src/super_gradients/training/losses/kd_losses.py +++ b/src/super_gradients/training/losses/kd_losses.py @@ -21,7 +21,7 @@ class KDLogitsLoss(_Loss): def __init__(self, task_loss_fn: _Loss, distillation_loss_fn: _Loss = KDklDivLoss(), distillation_loss_coeff: float = 0.5): """ - :param task_loss_fn: task loss. E.g., LabelSmoothingCrossEntropyLoss + :param task_loss_fn: task loss. E.g., CrossEntropyLoss :param distillation_loss_fn: distillation loss. 
E.g., KLDivLoss :param distillation_loss_coeff: """ diff --git a/src/super_gradients/training/losses/label_smoothing_cross_entropy_loss.py b/src/super_gradients/training/losses/label_smoothing_cross_entropy_loss.py index c203c22af4..583b177482 100755 --- a/src/super_gradients/training/losses/label_smoothing_cross_entropy_loss.py +++ b/src/super_gradients/training/losses/label_smoothing_cross_entropy_loss.py @@ -84,11 +84,11 @@ def cross_entropy(inputs, target, weight=None, ignore_index=-100, reduction="mea @register_loss(name=Losses.CROSS_ENTROPY, deprecated_name="cross_entropy") -class LabelSmoothingCrossEntropyLoss(nn.CrossEntropyLoss): +class CrossEntropyLoss(nn.CrossEntropyLoss): """CrossEntropyLoss - with ability to recieve distrbution as targets, and optional label smoothing""" def __init__(self, weight=None, ignore_index=-100, reduction="mean", smooth_eps=None, smooth_dist=None, from_logits=True): - super(LabelSmoothingCrossEntropyLoss, self).__init__(weight=weight, ignore_index=ignore_index, reduction=reduction) + super(CrossEntropyLoss, self).__init__(weight=weight, ignore_index=ignore_index, reduction=reduction) self.smooth_eps = smooth_eps self.smooth_dist = smooth_dist self.from_logits = from_logits diff --git a/src/super_gradients/training/sg_trainer/sg_trainer.py b/src/super_gradients/training/sg_trainer/sg_trainer.py index 0ff54391de..449fdf78ad 100755 --- a/src/super_gradients/training/sg_trainer/sg_trainer.py +++ b/src/super_gradients/training/sg_trainer/sg_trainer.py @@ -858,7 +858,7 @@ def train( Loss function for training. One of SuperGradient's built in options: - - LabelSmoothingCrossEntropyLoss, + - CrossEntropyLoss, - MSELoss, - RSquaredLoss, - YoLoV3DetectionLoss, diff --git a/src/super_gradients/training/utils/callbacks/callbacks.py b/src/super_gradients/training/utils/callbacks/callbacks.py index 33881b060c..d38fb382a8 100644 --- a/src/super_gradients/training/utils/callbacks/callbacks.py +++ b/src/super_gradients/training/utils/callbacks/callbacks.py @@ -1341,7 +1341,7 @@ class ExtremeBatchSegVisualizationCallback(ExtremeBatchCaseVisualizationCallback max=False ignore_idx=19), ExtremeBatchSegVisualizationCallback( - loss_to_monitor="LabelSmoothingCrossEntropyLoss" + loss_to_monitor="CrossEntropyLoss" max=True ignore_idx=19)] ...} diff --git a/tests/end_to_end_tests/cifar_trainer_test.py b/tests/end_to_end_tests/cifar_trainer_test.py index d95f28fe68..00398ddbd0 100644 --- a/tests/end_to_end_tests/cifar_trainer_test.py +++ b/tests/end_to_end_tests/cifar_trainer_test.py @@ -25,7 +25,7 @@ def test_train_cifar10_dataloader(self): training_params={ "max_epochs": 1, "initial_lr": 0.1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "train_metrics_list": ["Accuracy"], "valid_metrics_list": ["Accuracy"], "metric_to_watch": "Accuracy", @@ -44,7 +44,7 @@ def test_train_cifar100_dataloader(self): training_params={ "max_epochs": 1, "initial_lr": 0.1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "train_metrics_list": ["Accuracy"], "valid_metrics_list": ["Accuracy"], "metric_to_watch": "Accuracy", diff --git a/tests/end_to_end_tests/trainer_test.py b/tests/end_to_end_tests/trainer_test.py index 228e9ec372..c881cc4821 100644 --- a/tests/end_to_end_tests/trainer_test.py +++ b/tests/end_to_end_tests/trainer_test.py @@ -26,7 +26,7 @@ def setUp(cls): "initial_lr": 0.1, "lr_updates": [4], "lr_mode": "StepLRCallback", - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "train_metrics_list": [Accuracy(), 
Top5()], "valid_metrics_list": [Accuracy(), Top5()], "metric_to_watch": "Accuracy", diff --git a/tests/integration_tests/conversion_callback_test.py b/tests/integration_tests/conversion_callback_test.py index c172318f93..89ddefb344 100644 --- a/tests/integration_tests/conversion_callback_test.py +++ b/tests/integration_tests/conversion_callback_test.py @@ -57,7 +57,7 @@ def test_classification_architectures(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "train_metrics_list": [Accuracy(), Top5()], @@ -90,7 +90,7 @@ def get_architecture_custom_config(architecture_name: str): } elif re.search(r"regseg", architecture_name): return { - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", } else: raise Exception("You tried to run a conversion test on an unknown architecture") diff --git a/tests/integration_tests/deci_lab_export_test.py b/tests/integration_tests/deci_lab_export_test.py index a5a116aaee..691c3ae2e8 100644 --- a/tests/integration_tests/deci_lab_export_test.py +++ b/tests/integration_tests/deci_lab_export_test.py @@ -47,7 +47,7 @@ def test_train_with_deci_lab_integration(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": self.optimizer, "criterion_params": {}, "train_metrics_list": [Accuracy(), Top5()], diff --git a/tests/integration_tests/ema_train_integration_test.py b/tests/integration_tests/ema_train_integration_test.py index 990a698915..ea252d52a9 100644 --- a/tests/integration_tests/ema_train_integration_test.py +++ b/tests/integration_tests/ema_train_integration_test.py @@ -53,7 +53,7 @@ def _train(self, ema_params): "lr_decay_factor": 0.1, "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "ema": True, diff --git a/tests/integration_tests/lr_test.py b/tests/integration_tests/lr_test.py index 836b053cae..08b81e338f 100644 --- a/tests/integration_tests/lr_test.py +++ b/tests/integration_tests/lr_test.py @@ -19,7 +19,7 @@ def setUp(cls): "max_epochs": 1, "silent_mode": True, "initial_lr": 0.1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "train_metrics_list": [Accuracy(), Top5()], "valid_metrics_list": [Accuracy(), Top5()], "metric_to_watch": "Accuracy", diff --git a/tests/integration_tests/pretrained_models_test.py b/tests/integration_tests/pretrained_models_test.py index 269ec484f0..855a439054 100644 --- a/tests/integration_tests/pretrained_models_test.py +++ b/tests/integration_tests/pretrained_models_test.py @@ -86,7 +86,7 @@ def setUp(self) -> None: "lr_updates": [1], "lr_decay_factor": 0.1, "initial_lr": 0.6, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "lr_mode": "StepLRCallback", "optimizer_params": {"weight_decay": 0.000, "momentum": 0.9}, "train_metrics_list": [Accuracy()], @@ -246,7 +246,7 @@ def setUp(self) -> None: self.regseg_transfer_segmentation_train_params = { "max_epochs": 3, "initial_lr": 1e-2, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "lr_mode": "PolyLRCallback", "ema": True, # unlike the paper (not specified in paper) "optimizer": "SGD", diff --git a/tests/recipe_training_tests/coded_qat_launch_test.py b/tests/recipe_training_tests/coded_qat_launch_test.py index f6abb8bac2..9198d296d4 
100644 --- a/tests/recipe_training_tests/coded_qat_launch_test.py +++ b/tests/recipe_training_tests/coded_qat_launch_test.py @@ -20,7 +20,7 @@ def test_qat_launch(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, @@ -81,7 +81,7 @@ def test_ptq_launch(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, diff --git a/tests/unit_tests/early_stop_test.py b/tests/unit_tests/early_stop_test.py index fa73207f19..036fc6f576 100644 --- a/tests/unit_tests/early_stop_test.py +++ b/tests/unit_tests/early_stop_test.py @@ -52,7 +52,7 @@ def setUp(self) -> None: "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, diff --git a/tests/unit_tests/factories_test.py b/tests/unit_tests/factories_test.py index 0897ddb3ab..873fc37c89 100644 --- a/tests/unit_tests/factories_test.py +++ b/tests/unit_tests/factories_test.py @@ -8,7 +8,7 @@ from super_gradients.common.object_names import Models from super_gradients.training import models from super_gradients.training.dataloaders.dataloaders import classification_test_dataloader -from super_gradients.training.losses import LabelSmoothingCrossEntropyLoss +from super_gradients.training.losses import CrossEntropyLoss from super_gradients.training.metrics import Accuracy, Top5 from torch import nn @@ -24,7 +24,7 @@ def test_training_with_factories(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": "torch.optim.ASGD", # use an optimizer by factory "criterion_params": {}, "optimizer_params": {"lambd": 0.0001, "alpha": 0.75}, @@ -64,7 +64,7 @@ def test_training_with_factories_with_typos(self): self.assertIsInstance(trainer.train_metrics.Accuracy, Accuracy) self.assertIsInstance(trainer.valid_metrics.Top5, Top5) self.assertIsInstance(trainer.optimizer, torch.optim.Adam) - self.assertIsInstance(trainer.criterion, LabelSmoothingCrossEntropyLoss) + self.assertIsInstance(trainer.criterion, CrossEntropyLoss) def test_activations_factory(self): class DummyModel(nn.Module): diff --git a/tests/unit_tests/forward_pass_prep_fn_test.py b/tests/unit_tests/forward_pass_prep_fn_test.py index 2832686d25..7dfe59ada6 100644 --- a/tests/unit_tests/forward_pass_prep_fn_test.py +++ b/tests/unit_tests/forward_pass_prep_fn_test.py @@ -42,7 +42,7 @@ def test_resizing_with_forward_pass_prep_fn(self): "lr_cooldown_epochs": 2, "lr_warmup_epochs": 3, "initial_lr": 1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, diff --git a/tests/unit_tests/initialize_with_dataloaders_test.py b/tests/unit_tests/initialize_with_dataloaders_test.py index eb21132a06..da98237816 100644 --- a/tests/unit_tests/initialize_with_dataloaders_test.py +++ b/tests/unit_tests/initialize_with_dataloaders_test.py @@ -37,7 +37,7 @@ def test_train_with_dataloaders(self): "lr_decay_factor": 0.01, 
"lr_mode": "StepLRCallback", "initial_lr": 0.01, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": "SGD", "optimizer_params": {"weight_decay": 1e-5, "momentum": 0.9}, "train_metrics_list": [Accuracy()], diff --git a/tests/unit_tests/load_ema_ckpt_test.py b/tests/unit_tests/load_ema_ckpt_test.py index c49f38f988..8239f31739 100644 --- a/tests/unit_tests/load_ema_ckpt_test.py +++ b/tests/unit_tests/load_ema_ckpt_test.py @@ -26,7 +26,7 @@ def setUp(self) -> None: "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, diff --git a/tests/unit_tests/local_ckpt_head_replacement_test.py b/tests/unit_tests/local_ckpt_head_replacement_test.py index e45788860e..b5567fc304 100644 --- a/tests/unit_tests/local_ckpt_head_replacement_test.py +++ b/tests/unit_tests/local_ckpt_head_replacement_test.py @@ -17,7 +17,7 @@ def test_local_ckpt_head_replacement(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, diff --git a/tests/unit_tests/lr_cooldown_test.py b/tests/unit_tests/lr_cooldown_test.py index ee0aedc0eb..f3972ec84e 100644 --- a/tests/unit_tests/lr_cooldown_test.py +++ b/tests/unit_tests/lr_cooldown_test.py @@ -23,7 +23,7 @@ def test_lr_cooldown_with_lr_scheduling(self): "lr_cooldown_epochs": 2, "lr_warmup_epochs": 3, "initial_lr": 1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, diff --git a/tests/unit_tests/lr_warmup_test.py b/tests/unit_tests/lr_warmup_test.py index 36fbfd570d..a86935eb23 100644 --- a/tests/unit_tests/lr_warmup_test.py +++ b/tests/unit_tests/lr_warmup_test.py @@ -61,7 +61,7 @@ def test_lr_warmup(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 3, "initial_lr": 1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, @@ -97,7 +97,7 @@ def test_lr_warmup_with_lr_scheduling(self): "lr_mode": "CosineLRCallback", "lr_warmup_epochs": 3, "initial_lr": 1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, @@ -143,7 +143,7 @@ def test_warmup_linear_batch_step(self): "warmup_mode": "BatchStepLinearWarmupLRCallback", "lr_warmup_steps": lr_warmup_steps, "initial_lr": 1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, @@ -190,7 +190,7 @@ def test_warmup_linear_epoch_step(self): "lr_warmup_epochs": 3, "initial_lr": 1, "warmup_initial_lr": 4.0, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, @@ -226,7 +226,7 @@ def test_custom_lr_warmup(self): "lr_decay_factor": 0.1, "lr_mode": "StepLRCallback", "lr_warmup_epochs": 3, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": "SGD", 
"criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, diff --git a/tests/unit_tests/max_batches_loop_break_test.py b/tests/unit_tests/max_batches_loop_break_test.py index c00ca8fe4c..21f78adb44 100644 --- a/tests/unit_tests/max_batches_loop_break_test.py +++ b/tests/unit_tests/max_batches_loop_break_test.py @@ -26,7 +26,7 @@ def test_max_train_batches_loop_break(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, @@ -61,7 +61,7 @@ def test_max_valid_batches_loop_break(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, diff --git a/tests/unit_tests/optimizer_params_override_test.py b/tests/unit_tests/optimizer_params_override_test.py index b8aac3dd46..2fee20413d 100644 --- a/tests/unit_tests/optimizer_params_override_test.py +++ b/tests/unit_tests/optimizer_params_override_test.py @@ -19,7 +19,7 @@ def test_optimizer_params_partial_override(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"momentum": 0.9}, @@ -48,7 +48,7 @@ def test_optimizer_params_full_override(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "zero_weight_decay_on_bias_and_bn": True, diff --git a/tests/unit_tests/phase_context_test.py b/tests/unit_tests/phase_context_test.py index ba680f3822..a9317f1bdf 100644 --- a/tests/unit_tests/phase_context_test.py +++ b/tests/unit_tests/phase_context_test.py @@ -31,7 +31,7 @@ def context_information_in_train_test(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, diff --git a/tests/unit_tests/resume_training_test.py b/tests/unit_tests/resume_training_test.py index ee7bab7076..96506ff9da 100644 --- a/tests/unit_tests/resume_training_test.py +++ b/tests/unit_tests/resume_training_test.py @@ -34,7 +34,7 @@ def test_resume_training(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, @@ -80,7 +80,7 @@ def test_resume_run_id_training(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, @@ -145,7 +145,7 @@ def test_resume_external_training(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, @@ -193,7 +193,7 @@ def 
test_resume_external_training_same_dir(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, diff --git a/tests/unit_tests/save_ckpt_test.py b/tests/unit_tests/save_ckpt_test.py index 06d4c539c6..baa2633e02 100644 --- a/tests/unit_tests/save_ckpt_test.py +++ b/tests/unit_tests/save_ckpt_test.py @@ -16,12 +16,12 @@ def setUp(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, "save_ckpt_epoch_list": [1, 3], - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "train_metrics_list": [Accuracy(), Top5()], "valid_metrics_list": [Accuracy(), Top5()], "metric_to_watch": "Accuracy", diff --git a/tests/unit_tests/train_logging_test.py b/tests/unit_tests/train_logging_test.py index 19cb4fe488..c361703372 100644 --- a/tests/unit_tests/train_logging_test.py +++ b/tests/unit_tests/train_logging_test.py @@ -22,7 +22,7 @@ def test_train_logging(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, diff --git a/tests/unit_tests/train_with_intialized_param_args_test.py b/tests/unit_tests/train_with_intialized_param_args_test.py index 52e18de2d4..c6641fc6a4 100644 --- a/tests/unit_tests/train_with_intialized_param_args_test.py +++ b/tests/unit_tests/train_with_intialized_param_args_test.py @@ -55,7 +55,7 @@ def test_train_with_external_optimizer(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": optimizer, "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, @@ -81,7 +81,7 @@ def test_train_with_external_scheduler(self): "phase_callbacks": phase_callbacks, "lr_warmup_epochs": 0, "initial_lr": lr, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": optimizer, "criterion_params": {}, "train_metrics_list": [Accuracy(), Top5()], @@ -103,7 +103,7 @@ def test_train_with_external_scheduler_class(self): "max_epochs": 2, "lr_warmup_epochs": 0, "initial_lr": 0.3, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": optimizer, "criterion_params": {}, "train_metrics_list": [Accuracy(), Top5()], @@ -128,7 +128,7 @@ def test_train_with_reduce_on_plateau(self): "phase_callbacks": phase_callbacks, "lr_warmup_epochs": 0, "initial_lr": lr, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": optimizer, "criterion_params": {}, "train_metrics_list": [Accuracy(), Top5()], @@ -151,7 +151,7 @@ def test_train_with_external_metric(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, @@ -181,7 +181,7 @@ def test_train_with_external_dataloaders(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "LabelSmoothingCrossEntropyLoss", + 
"loss": "CrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, diff --git a/tests/unit_tests/train_with_precise_bn_test.py b/tests/unit_tests/train_with_precise_bn_test.py index 20c2974c2e..771997628b 100644 --- a/tests/unit_tests/train_with_precise_bn_test.py +++ b/tests/unit_tests/train_with_precise_bn_test.py @@ -21,7 +21,7 @@ def test_train_with_precise_bn_explicit_size(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, @@ -50,7 +50,7 @@ def test_train_with_precise_bn_implicit_size(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, diff --git a/tests/unit_tests/update_param_groups_unit_test.py b/tests/unit_tests/update_param_groups_unit_test.py index 3a483de818..bf772e28a7 100644 --- a/tests/unit_tests/update_param_groups_unit_test.py +++ b/tests/unit_tests/update_param_groups_unit_test.py @@ -38,7 +38,7 @@ def test_lr_scheduling_with_update_param_groups(self): "lr_updates": [0, 1, 2], "initial_lr": 0.1, "lr_decay_factor": 1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, diff --git a/tests/unit_tests/vit_unit_test.py b/tests/unit_tests/vit_unit_test.py index b005653438..8436eb6976 100644 --- a/tests/unit_tests/vit_unit_test.py +++ b/tests/unit_tests/vit_unit_test.py @@ -18,7 +18,7 @@ def setUp(self): "lr_mode": "StepLRCallback", "lr_warmup_epochs": 0, "initial_lr": 0.1, - "loss": "LabelSmoothingCrossEntropyLoss", + "loss": "CrossEntropyLoss", "optimizer": "SGD", "criterion_params": {}, "optimizer_params": {"weight_decay": 1e-4, "momentum": 0.9}, From a3fd76c58491a64eb0bc0341c9fccb67e2d3682a Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Mon, 11 Sep 2023 22:56:41 +0300 Subject: [PATCH 15/21] add deprecated to LRSchedulers --- .../training/utils/callbacks/callbacks.py | 50 ++++++++++++++----- 1 file changed, 38 insertions(+), 12 deletions(-) diff --git a/src/super_gradients/training/utils/callbacks/callbacks.py b/src/super_gradients/training/utils/callbacks/callbacks.py index d38fb382a8..7b9c54298b 100644 --- a/src/super_gradients/training/utils/callbacks/callbacks.py +++ b/src/super_gradients/training/utils/callbacks/callbacks.py @@ -12,9 +12,9 @@ import onnx import onnxruntime import torch -from deprecated import deprecated from torch.utils.data import DataLoader from torchmetrics import MetricCollection, Metric +from torchvision.utils import draw_segmentation_masks from super_gradients.common.abstractions.abstract_logger import get_logger from super_gradients.common.decorators.factory_decorator import resolve_param @@ -32,7 +32,8 @@ from super_gradients.training.utils.segmentation_utils import BinarySegmentationVisualization from super_gradients.common.environment.checkpoints_dir_utils import get_project_checkpoints_dir_path from super_gradients.training.utils.utils import unwrap_model -from torchvision.utils import draw_segmentation_masks +from super_gradients.common.deprecate import deprecated + logger = get_logger(__name__) @@ -385,13 +386,13 @@ def update_lr(self, optimizer, 
epoch, batch_idx=None): @register_lr_scheduler(LRSchedulers.STEP, deprecated_name="step") -class StepLRCallback(LRCallbackBase): +class StepLRScheduler(LRCallbackBase): """ Hard coded step learning rate scheduling (i.e at specific milestones). """ def __init__(self, lr_updates, lr_decay_factor, step_lr_update_freq=None, **kwargs): - super(StepLRCallback, self).__init__(Phase.TRAIN_EPOCH_END, **kwargs) + super().__init__(Phase.TRAIN_EPOCH_END, **kwargs) if step_lr_update_freq and len(lr_updates): raise ValueError("Only one of [lr_updates, step_lr_update_freq] should be passed to StepLRCallback constructor") @@ -415,8 +416,13 @@ def is_lr_scheduling_enabled(self, context): return self.training_params.lr_warmup_epochs <= context.epoch +@deprecated(deprecated_since="3.2.1", removed_from="3.5.0", target=StepLRScheduler) +class StepLRCallback(StepLRScheduler): + ... + + @register_lr_scheduler(LRSchedulers.EXP, deprecated_name="exp") -class ExponentialLRCallback(LRCallbackBase): +class ExponentialLRScheduler(LRCallbackBase): """ Exponential decay learning rate scheduling. Decays the learning rate by `lr_decay_factor` every epoch. """ @@ -436,14 +442,19 @@ def is_lr_scheduling_enabled(self, context): return self.training_params.lr_warmup_epochs <= context.epoch < post_warmup_epochs +@deprecated(deprecated_since="3.2.1", removed_from="3.5.0", target=ExponentialLRScheduler) +class ExponentialLRCallback(ExponentialLRScheduler): + ... + + @register_lr_scheduler(LRSchedulers.POLY, deprecated_name="poly") -class PolyLRCallback(LRCallbackBase): +class PolyLRScheduler(LRCallbackBase): """ Hard coded polynomial decay learning rate scheduling (i.e at specific milestones). """ def __init__(self, max_epochs, **kwargs): - super(PolyLRCallback, self).__init__(Phase.TRAIN_BATCH_STEP, **kwargs) + super().__init__(Phase.TRAIN_BATCH_STEP, **kwargs) self.max_epochs = max_epochs def perform_scheduling(self, context): @@ -459,14 +470,19 @@ def is_lr_scheduling_enabled(self, context): return self.training_params.lr_warmup_epochs <= context.epoch < post_warmup_epochs +@deprecated(deprecated_since="3.2.1", removed_from="3.5.0", target=PolyLRScheduler) +class PolyLRCallback(PolyLRScheduler): + ... + + @register_lr_scheduler(LRSchedulers.COSINE, deprecated_name="cosine") -class CosineLRCallback(LRCallbackBase): +class CosineLRScheduler(LRCallbackBase): """ Hard coded step Cosine anealing learning rate scheduling. """ def __init__(self, max_epochs, cosine_final_lr_ratio, **kwargs): - super(CosineLRCallback, self).__init__(Phase.TRAIN_BATCH_STEP, **kwargs) + super().__init__(Phase.TRAIN_BATCH_STEP, **kwargs) self.max_epochs = max_epochs self.cosine_final_lr_ratio = cosine_final_lr_ratio @@ -497,15 +513,20 @@ def compute_learning_rate(cls, step: Union[float, np.ndarray], total_steps: floa return lr * (1 - final_lr_ratio) + (initial_lr * final_lr_ratio) +@deprecated(deprecated_since="3.2.1", removed_from="3.5.0", target=CosineLRScheduler) +class CosineLRCallback(CosineLRScheduler): + ... + + @register_lr_scheduler(LRSchedulers.FUNCTION, deprecated_name="function") -class FunctionLRCallback(LRCallbackBase): +class FunctionLRScheduler(LRCallbackBase): """ Hard coded rate scheduling for user defined lr scheduling function. 
""" - @deprecated(version="3.2.0", reason="This callback is deprecated and will be removed in future versions.") + @deprecated(deprecated_since="3.2.0", removed_from="3.5.0", reason="This callback is deprecated and will be removed in future versions.") def __init__(self, max_epochs, lr_schedule_function, **kwargs): - super(FunctionLRCallback, self).__init__(Phase.TRAIN_BATCH_STEP, **kwargs) + super().__init__(Phase.TRAIN_BATCH_STEP, **kwargs) assert callable(lr_schedule_function), "self.lr_function must be callable" self.lr_schedule_function = lr_schedule_function self.max_epochs = max_epochs @@ -527,6 +548,11 @@ def perform_scheduling(self, context): self.update_lr(context.optimizer, context.epoch, context.batch_idx) +@deprecated(deprecated_since="3.2.1", removed_from="3.5.0", target=FunctionLRScheduler) +class FunctionLRCallback(FunctionLRScheduler): + ... + + class IllegalLRSchedulerMetric(Exception): """Exception raised illegal combination of training parameters. From 89b03e0c722637e698fdc16f98a3233fa2a9ca2b Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Mon, 11 Sep 2023 23:00:46 +0300 Subject: [PATCH 16/21] rename LRSchedulers by replacing Callbacks to Scheduler in name --- .../source/Example_Classification.md | 2 +- .../Example_Training-an-external-model.md | 2 +- documentation/source/LRScheduling.md | 16 +++++++-------- documentation/source/PhaseCallbacks.md | 10 +++++----- documentation/source/Segmentation.md | 2 +- src/super_gradients/common/object_names.py | 10 +++++----- .../ddrnet_classification_example.py | 2 +- .../deci_lab_export_example.py | 2 +- .../examples/early_stop/early_stop_example.py | 2 +- .../clearml_logger_example.py | 2 +- .../deci_platform_logger_example.py | 2 +- .../regseg_transfer_learning_example.py | 2 +- .../recipes/cityscapes_regseg48.yaml | 2 +- .../recipes/cityscapes_segformer_b0.yaml | 2 +- .../recipes/cityscapes_segformer_b1.yaml | 2 +- .../recipes/cityscapes_segformer_b2.yaml | 2 +- .../recipes/cityscapes_segformer_b3.yaml | 2 +- .../recipes/cityscapes_segformer_b4.yaml | 2 +- .../recipes/cityscapes_segformer_b5.yaml | 2 +- .../cifar10_resnet_train_params.yaml | 2 +- .../cityscapes_default_train_params.yaml | 2 +- .../coco2017_dekr_pose_train_params.yaml | 2 +- .../coco2017_ppyoloe_train_params.yaml | 2 +- .../coco2017_rescoring_train_params.yaml | 2 +- ...17_ssd_lite_mobilenet_v2_train_params.yaml | 2 +- .../coco2017_yolo_nas_train_params.yaml | 2 +- .../coco2017_yolox_train_params.yaml | 2 +- ...segmentation_shelfnet_lw_train_params.yaml | 2 +- .../default_train_params.yaml | 8 ++++---- .../imagenet_efficientnet_train_params.yaml | 2 +- .../imagenet_mobilenetv2_train_params.yaml | 2 +- .../imagenet_mobilenetv3_train_params.yaml | 2 +- .../imagenet_regnetY_train_params.yaml | 2 +- .../imagenet_repvgg_train_params.yaml | 2 +- .../imagenet_resnet50_kd_train_params.yaml | 2 +- .../imagenet_resnet50_train_params.yaml | 2 +- .../imagenet_vit_train_params.yaml | 2 +- .../supervisely_default_train_params.yaml | 2 +- src/super_gradients/training/params.py | 2 +- .../pre_launch_callbacks.py | 4 ++-- .../training/sg_trainer/sg_trainer.py | 18 ++++++++--------- .../training/utils/callbacks/__init__.py | 20 +++++++++---------- .../training/utils/callbacks/callbacks.py | 12 +++++------ tests/end_to_end_tests/trainer_test.py | 2 +- .../conversion_callback_test.py | 4 ++-- .../integration_tests/deci_lab_export_test.py | 2 +- .../ema_train_integration_test.py | 2 +- tests/integration_tests/lr_test.py | 8 ++++---- .../pretrained_models_test.py | 12 +++++------ 
.../coded_qat_launch_test.py | 4 ++-- tests/unit_tests/dataset_statistics_test.py | 2 +- tests/unit_tests/detection_dataset_test.py | 2 +- tests/unit_tests/double_training_test.py | 2 +- tests/unit_tests/early_stop_test.py | 2 +- tests/unit_tests/extreme_batch_cb_test.py | 4 ++-- tests/unit_tests/factories_test.py | 4 ++-- tests/unit_tests/forward_pass_prep_fn_test.py | 2 +- .../initialize_with_dataloaders_test.py | 2 +- tests/unit_tests/kd_ema_test.py | 2 +- tests/unit_tests/kd_trainer_test.py | 2 +- tests/unit_tests/load_ema_ckpt_test.py | 2 +- .../local_ckpt_head_replacement_test.py | 2 +- tests/unit_tests/loss_loggings_test.py | 6 +++--- tests/unit_tests/lr_cooldown_test.py | 2 +- tests/unit_tests/lr_warmup_test.py | 14 ++++++------- .../unit_tests/max_batches_loop_break_test.py | 4 ++-- .../optimizer_params_override_test.py | 4 ++-- tests/unit_tests/phase_context_test.py | 2 +- tests/unit_tests/preprocessing_unit_test.py | 4 ++-- tests/unit_tests/resume_training_test.py | 8 ++++---- tests/unit_tests/save_ckpt_test.py | 2 +- tests/unit_tests/train_after_test_test.py | 2 +- tests/unit_tests/train_logging_test.py | 2 +- .../train_with_intialized_param_args_test.py | 8 ++++---- .../unit_tests/train_with_precise_bn_test.py | 4 ++-- .../update_param_groups_unit_test.py | 2 +- tests/unit_tests/vit_unit_test.py | 2 +- 77 files changed, 148 insertions(+), 148 deletions(-) diff --git a/documentation/source/Example_Classification.md b/documentation/source/Example_Classification.md index 0e04a5379d..f0b0292eed 100644 --- a/documentation/source/Example_Classification.md +++ b/documentation/source/Example_Classification.md @@ -321,7 +321,7 @@ Output (Training parameters): 'loss': "LabelSmoothingCrossEntropyLoss", 'lr_cooldown_epochs': 0, 'lr_decay_factor': 0.1, - 'lr_mode': 'StepLRCallback', + 'lr_mode': 'StepLRScheduler', 'lr_schedule_function': None, 'lr_updates': array([100, 150, 200]), 'lr_warmup_epochs': 0, diff --git a/documentation/source/Example_Training-an-external-model.md b/documentation/source/Example_Training-an-external-model.md index c1bbc51919..9b5379dba7 100644 --- a/documentation/source/Example_Training-an-external-model.md +++ b/documentation/source/Example_Training-an-external-model.md @@ -640,7 +640,7 @@ And lastly, we need to define the training hyperparameters: ```python train_params = { "max_epochs": 100, - "lr_mode": "CosineLRCallback", + "lr_mode": "CosineLRScheduler", "initial_lr": 0.001, "optimizer": "Adam", "loss": CustomSegLoss(), diff --git a/documentation/source/LRScheduling.md b/documentation/source/LRScheduling.md index a02ccea8f3..04cfa238bf 100644 --- a/documentation/source/LRScheduling.md +++ b/documentation/source/LRScheduling.md @@ -7,15 +7,15 @@ Learning rate scheduling type is controlled by the training parameter `lr_mode`. When str: - Learning rate scheduling policy, one of ['StepLRCallback','PolyLRCallback','CosineLRCallback','FunctionLRCallback']. + Learning rate scheduling policy, one of ['StepLRScheduler','PolyLRScheduler','CosineLRScheduler','FunctionLRScheduler']. - 'StepLRCallback' refers to constant updates at epoch numbers passed through `lr_updates`. Each update decays the learning rate by `lr_decay_factor`. + 'StepLRScheduler' refers to constant updates at epoch numbers passed through `lr_updates`. Each update decays the learning rate by `lr_decay_factor`. - 'CosineLRCallback' refers to the Cosine Anealing policy as mentioned in https://arxiv.org/abs/1608.03983. 
The final learning rate ratio is controlled by `cosine_final_lr_ratio` training parameter. + 'CosineLRScheduler' refers to the Cosine Anealing policy as mentioned in https://arxiv.org/abs/1608.03983. The final learning rate ratio is controlled by `cosine_final_lr_ratio` training parameter. - 'PolyLRCallback' refers to the polynomial decrease: in each epoch iteration `self.lr = self.initial_lr * pow((1.0 - (current_iter / max_iter)), 0.9)` + 'PolyLRScheduler' refers to the polynomial decrease: in each epoch iteration `self.lr = self.initial_lr * pow((1.0 - (current_iter / max_iter)), 0.9)` - 'FunctionLRCallback' refers to a user-defined learning rate scheduling function, that is passed through `lr_schedule_function`. + 'FunctionLRScheduler' refers to a user-defined learning rate scheduling function, that is passed through `lr_schedule_function`. For example, the training code below will start with an initial learning rate of 0.1 and decay by 0.1 at epochs 100,150 and 200: @@ -30,7 +30,7 @@ valid_dataloader = ... model = ... train_params = { "initial_lr": 0.1, - "lr_mode":"StepLRCallback", + "lr_mode":"StepLRScheduler", "lr_updates": [100, 150, 200], "lr_decay_factor": 0.1, ..., @@ -45,7 +45,7 @@ trainer.train(model=model, training_params=train_params, train_loader=train_data ```yaml training_hyperparams: initial_lr: 0.1 - lr_mode: StepLRCallback + lr_mode: StepLRScheduler user_lr_updates: - 100 - 150 @@ -66,7 +66,7 @@ Prerequisites: [phase callbacks](PhaseCallbacks.md), [training with configuratio In SG, learning rate schedulers are implemented as [phase callbacks](PhaseCallbacks.md). They read the learning rate from the `PhaseContext` in their `__call__` method, calculate the new learning rate according to the current state of training, and update the optimizer's param groups. -For example, the code snippet from the previous section translates "lr_mode":"StepLRCallback" to a `super_gradients.training.utils.callbacks.callbacks.StepLRCallback` instance, which is added to the phase callbacks list. +For example, the code snippet from the previous section translates "lr_mode":"StepLRScheduler" to a `super_gradients.training.utils.callbacks.callbacks.StepLRScheduler` instance, which is added to the phase callbacks list. 
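Because the old callback classes are kept as thin deprecated aliases of the new scheduler classes (see the `@deprecated` subclasses added in the previous patch), code that still imports the old names is expected to keep working while warning about the rename. A minimal sketch, assuming both names are re-exported from `super_gradients.training.utils.callbacks`:

```python
from super_gradients.training.utils.callbacks import StepLRCallback, StepLRScheduler

# "lr_mode": "StepLRScheduler" resolves to the new class through the LR-scheduler registry,
# while the old class name remains a deprecated subclass of it.
assert issubclass(StepLRCallback, StepLRScheduler)
```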
### Implementing Your Own Scheduler A custom learning rate scheduler should inherit from `LRCallbackBase`, so let's take a look at it: diff --git a/documentation/source/PhaseCallbacks.md b/documentation/source/PhaseCallbacks.md index 77596fd743..c2b29f65b5 100644 --- a/documentation/source/PhaseCallbacks.md +++ b/documentation/source/PhaseCallbacks.md @@ -10,11 +10,11 @@ SG's `super_gradients.training.utils.callbacks` module implements some common us LRCallbackBase EpochStepWarmupLRCallback BatchStepLinearWarmupLRCallback - StepLRCallback - ExponentialLRCallback - PolyLRCallback - CosineLRCallback - FunctionLRCallback + StepLRScheduler + ExponentialLRScheduler + PolyLRScheduler + CosineLRScheduler + FunctionLRScheduler LRSchedulerCallback DetectionVisualizationCallback BinarySegmentationVisualizationCallback diff --git a/documentation/source/Segmentation.md b/documentation/source/Segmentation.md index 3e6ba3a940..b5e0b536ad 100644 --- a/documentation/source/Segmentation.md +++ b/documentation/source/Segmentation.md @@ -143,7 +143,7 @@ from super_gradients.training.metrics.segmentation_metrics import BinaryIOU train_params = { "max_epochs": 30, - "lr_mode": "CosineLRCallback", + "lr_mode": "CosineLRScheduler", "initial_lr": 0.005, "lr_warmup_epochs": 5, "multiply_head_lr": 10, diff --git a/src/super_gradients/common/object_names.py b/src/super_gradients/common/object_names.py index 07efc6816e..4a19bf3485 100644 --- a/src/super_gradients/common/object_names.py +++ b/src/super_gradients/common/object_names.py @@ -154,11 +154,11 @@ class Callbacks: class LRSchedulers: """Static class to hold all the supported LR Scheduler names""" - STEP = "StepLRCallback" - POLY = "PolyLRCallback" - COSINE = "CosineLRCallback" - EXP = "ExponentialLRCallback" - FUNCTION = "FunctionLRCallback" + STEP = "StepLRScheduler" + POLY = "PolyLRScheduler" + COSINE = "CosineLRScheduler" + EXP = "ExponentialLRScheduler" + FUNCTION = "FunctionLRScheduler" class LRWarmups: diff --git a/src/super_gradients/examples/ddrnet_imagenet/ddrnet_classification_example.py b/src/super_gradients/examples/ddrnet_imagenet/ddrnet_classification_example.py index 2fe96fc43f..c760571169 100644 --- a/src/super_gradients/examples/ddrnet_imagenet/ddrnet_classification_example.py +++ b/src/super_gradients/examples/ddrnet_imagenet/ddrnet_classification_example.py @@ -39,7 +39,7 @@ train_params_ddr = { "max_epochs": args.max_epochs, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_updates": [30, 60, 90], "lr_decay_factor": 0.1, "initial_lr": 0.1 * devices, diff --git a/src/super_gradients/examples/deci_lab_export_example/deci_lab_export_example.py b/src/super_gradients/examples/deci_lab_export_example/deci_lab_export_example.py index 91c9c976fe..9f49c0130f 100644 --- a/src/super_gradients/examples/deci_lab_export_example/deci_lab_export_example.py +++ b/src/super_gradients/examples/deci_lab_export_example/deci_lab_export_example.py @@ -56,7 +56,7 @@ def main(architecture_name: str): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "CrossEntropyLoss", diff --git a/src/super_gradients/examples/early_stop/early_stop_example.py b/src/super_gradients/examples/early_stop/early_stop_example.py index 3a60223956..5cf1124b06 100644 --- a/src/super_gradients/examples/early_stop/early_stop_example.py +++ b/src/super_gradients/examples/early_stop/early_stop_example.py @@ -18,7 +18,7 @@ "max_epochs": 250, "lr_updates": 
[100, 150, 200], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "CrossEntropyLoss", diff --git a/src/super_gradients/examples/loggers_examples/clearml_logger_example.py b/src/super_gradients/examples/loggers_examples/clearml_logger_example.py index e2c6a4eb34..7a96dea5d0 100644 --- a/src/super_gradients/examples/loggers_examples/clearml_logger_example.py +++ b/src/super_gradients/examples/loggers_examples/clearml_logger_example.py @@ -11,7 +11,7 @@ "max_epochs": 20, "lr_updates": [5, 10, 15], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "initial_lr": 0.1, "loss": "CrossEntropyLoss", "optimizer": "SGD", diff --git a/src/super_gradients/examples/loggers_examples/deci_platform_logger_example.py b/src/super_gradients/examples/loggers_examples/deci_platform_logger_example.py index 65a2d70aca..9a69d3ef86 100644 --- a/src/super_gradients/examples/loggers_examples/deci_platform_logger_example.py +++ b/src/super_gradients/examples/loggers_examples/deci_platform_logger_example.py @@ -14,7 +14,7 @@ "max_epochs": 20, "lr_updates": [5, 10, 15], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "initial_lr": 0.1, "loss": "CrossEntropyLoss", "optimizer": "SGD", diff --git a/src/super_gradients/examples/regseg_transfer_learning_example/regseg_transfer_learning_example.py b/src/super_gradients/examples/regseg_transfer_learning_example/regseg_transfer_learning_example.py index f04adea84c..4e363f460e 100644 --- a/src/super_gradients/examples/regseg_transfer_learning_example/regseg_transfer_learning_example.py +++ b/src/super_gradients/examples/regseg_transfer_learning_example/regseg_transfer_learning_example.py @@ -39,7 +39,7 @@ # DEFINE TRAINING PARAMS. SEE DOCS FOR THE FULL LIST. 
train_params = { "max_epochs": 50, - "lr_mode": "CosineLRCallback", + "lr_mode": "CosineLRScheduler", "initial_lr": 0.0064, # for batch_size=16 "optimizer_params": {"momentum": 0.843, "weight_decay": 0.00036, "nesterov": True}, "cosine_final_lr_ratio": 0.1, diff --git a/src/super_gradients/recipes/cityscapes_regseg48.yaml b/src/super_gradients/recipes/cityscapes_regseg48.yaml index 2135d76e71..27c4cbd5ac 100644 --- a/src/super_gradients/recipes/cityscapes_regseg48.yaml +++ b/src/super_gradients/recipes/cityscapes_regseg48.yaml @@ -52,7 +52,7 @@ training_hyperparams: sync_bn: True resume: ${resume} max_epochs: 800 - lr_mode: PolyLRCallback + lr_mode: PolyLRScheduler initial_lr: 0.02 # for effective batch_size=16 lr_warmup_epochs: 0 optimizer: SGD diff --git a/src/super_gradients/recipes/cityscapes_segformer_b0.yaml b/src/super_gradients/recipes/cityscapes_segformer_b0.yaml index 865732a5a3..a4e54798f5 100644 --- a/src/super_gradients/recipes/cityscapes_segformer_b0.yaml +++ b/src/super_gradients/recipes/cityscapes_segformer_b0.yaml @@ -18,7 +18,7 @@ experiment_name: ${architecture}_cityscapes training_hyperparams: max_epochs: 2 - lr_mode: PolyLRCallback + lr_mode: PolyLRScheduler initial_lr: 0.00006 # for effective batch_size=8 multi_gpu: DDP diff --git a/src/super_gradients/recipes/cityscapes_segformer_b1.yaml b/src/super_gradients/recipes/cityscapes_segformer_b1.yaml index d0347f8cb7..7bd9f4a26f 100644 --- a/src/super_gradients/recipes/cityscapes_segformer_b1.yaml +++ b/src/super_gradients/recipes/cityscapes_segformer_b1.yaml @@ -18,7 +18,7 @@ experiment_name: ${architecture}_cityscapes training_hyperparams: max_epochs: 2 - lr_mode: PolyLRCallback + lr_mode: PolyLRScheduler initial_lr: 0.00006 # for effective batch_size=8 multi_gpu: DDP diff --git a/src/super_gradients/recipes/cityscapes_segformer_b2.yaml b/src/super_gradients/recipes/cityscapes_segformer_b2.yaml index 683c8b4966..c793c3e1f2 100644 --- a/src/super_gradients/recipes/cityscapes_segformer_b2.yaml +++ b/src/super_gradients/recipes/cityscapes_segformer_b2.yaml @@ -18,7 +18,7 @@ experiment_name: ${architecture}_cityscapes training_hyperparams: max_epochs: 2 - lr_mode: PolyLRCallback + lr_mode: PolyLRScheduler initial_lr: 0.00006 # for effective batch_size=8 multi_gpu: DDP diff --git a/src/super_gradients/recipes/cityscapes_segformer_b3.yaml b/src/super_gradients/recipes/cityscapes_segformer_b3.yaml index 682817388c..31245514f1 100644 --- a/src/super_gradients/recipes/cityscapes_segformer_b3.yaml +++ b/src/super_gradients/recipes/cityscapes_segformer_b3.yaml @@ -18,7 +18,7 @@ experiment_name: ${architecture}_cityscapes training_hyperparams: max_epochs: 2 - lr_mode: PolyLRCallback + lr_mode: PolyLRScheduler initial_lr: 0.00006 # for effective batch_size=8 multi_gpu: DDP diff --git a/src/super_gradients/recipes/cityscapes_segformer_b4.yaml b/src/super_gradients/recipes/cityscapes_segformer_b4.yaml index 5a42f3bec3..dc82e01c99 100644 --- a/src/super_gradients/recipes/cityscapes_segformer_b4.yaml +++ b/src/super_gradients/recipes/cityscapes_segformer_b4.yaml @@ -18,7 +18,7 @@ experiment_name: ${architecture}_cityscapes training_hyperparams: max_epochs: 2 - lr_mode: PolyLRCallback + lr_mode: PolyLRScheduler initial_lr: 0.00006 # for effective batch_size=8 mixed_precision: True diff --git a/src/super_gradients/recipes/cityscapes_segformer_b5.yaml b/src/super_gradients/recipes/cityscapes_segformer_b5.yaml index 03cd95e87b..e812db4fa3 100644 --- a/src/super_gradients/recipes/cityscapes_segformer_b5.yaml +++ 
b/src/super_gradients/recipes/cityscapes_segformer_b5.yaml @@ -18,7 +18,7 @@ experiment_name: ${architecture}_cityscapes training_hyperparams: max_epochs: 2 - lr_mode: PolyLRCallback + lr_mode: PolyLRScheduler initial_lr: 0.00006 # for effective batch_size=8 mixed_precision: True diff --git a/src/super_gradients/recipes/training_hyperparams/cifar10_resnet_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/cifar10_resnet_train_params.yaml index ad14908673..0905ba57ff 100644 --- a/src/super_gradients/recipes/training_hyperparams/cifar10_resnet_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/cifar10_resnet_train_params.yaml @@ -10,7 +10,7 @@ lr_updates: step: 50 lr_decay_factor: 0.1 -lr_mode: StepLRCallback +lr_mode: StepLRScheduler lr_warmup_epochs: 0 initial_lr: 0.1 loss: LabelSmoothingCrossEntropyLoss diff --git a/src/super_gradients/recipes/training_hyperparams/cityscapes_default_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/cityscapes_default_train_params.yaml index dcccdde62a..3ceb393b10 100644 --- a/src/super_gradients/recipes/training_hyperparams/cityscapes_default_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/cityscapes_default_train_params.yaml @@ -2,7 +2,7 @@ defaults: - default_train_params max_epochs: 800 -lr_mode: PolyLRCallback +lr_mode: PolyLRScheduler initial_lr: 0.01 # for effective batch_size=32 lr_warmup_epochs: 10 multiply_head_lr: 10. diff --git a/src/super_gradients/recipes/training_hyperparams/coco2017_dekr_pose_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/coco2017_dekr_pose_train_params.yaml index 360bb96c96..7ed162ad83 100644 --- a/src/super_gradients/recipes/training_hyperparams/coco2017_dekr_pose_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/coco2017_dekr_pose_train_params.yaml @@ -8,7 +8,7 @@ ema_params: beta: 20 max_epochs: 150 -lr_mode: CosineLRCallback +lr_mode: CosineLRScheduler cosine_final_lr_ratio: 0.1 batch_accumulate: 1 initial_lr: 1e-3 diff --git a/src/super_gradients/recipes/training_hyperparams/coco2017_ppyoloe_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/coco2017_ppyoloe_train_params.yaml index 2e2443cf20..40e0bdd788 100644 --- a/src/super_gradients/recipes/training_hyperparams/coco2017_ppyoloe_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/coco2017_ppyoloe_train_params.yaml @@ -10,7 +10,7 @@ lr_warmup_steps: 1000 lr_warmup_epochs: 0 initial_lr: 2e-3 -lr_mode: CosineLRCallback +lr_mode: CosineLRScheduler cosine_final_lr_ratio: 0.1 zero_weight_decay_on_bias_and_bn: False diff --git a/src/super_gradients/recipes/training_hyperparams/coco2017_rescoring_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/coco2017_rescoring_train_params.yaml index 04f32cbcd1..62ce33e6f2 100644 --- a/src/super_gradients/recipes/training_hyperparams/coco2017_rescoring_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/coco2017_rescoring_train_params.yaml @@ -8,7 +8,7 @@ ema_params: beta: 20 max_epochs: 50 -lr_mode: CosineLRCallback +lr_mode: CosineLRScheduler cosine_final_lr_ratio: 0.1 batch_accumulate: 1 initial_lr: 0.001 diff --git a/src/super_gradients/recipes/training_hyperparams/coco2017_ssd_lite_mobilenet_v2_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/coco2017_ssd_lite_mobilenet_v2_train_params.yaml index 0ac79dcca9..65239ffa13 100644 --- 
a/src/super_gradients/recipes/training_hyperparams/coco2017_ssd_lite_mobilenet_v2_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/coco2017_ssd_lite_mobilenet_v2_train_params.yaml @@ -3,7 +3,7 @@ defaults: ema: True max_epochs: 400 -lr_mode: CosineLRCallback +lr_mode: CosineLRScheduler cosine_final_lr_ratio: 0.01 batch_accumulate: 1 initial_lr: 0.01 diff --git a/src/super_gradients/recipes/training_hyperparams/coco2017_yolo_nas_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/coco2017_yolo_nas_train_params.yaml index 7ce80baa58..b34fb54847 100644 --- a/src/super_gradients/recipes/training_hyperparams/coco2017_yolo_nas_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/coco2017_yolo_nas_train_params.yaml @@ -9,7 +9,7 @@ lr_warmup_steps: 1000 lr_warmup_epochs: 0 initial_lr: 2e-4 -lr_mode: CosineLRCallback +lr_mode: CosineLRScheduler cosine_final_lr_ratio: 0.1 zero_weight_decay_on_bias_and_bn: True diff --git a/src/super_gradients/recipes/training_hyperparams/coco2017_yolox_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/coco2017_yolox_train_params.yaml index f338781583..fcc3fa4ba1 100644 --- a/src/super_gradients/recipes/training_hyperparams/coco2017_yolox_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/coco2017_yolox_train_params.yaml @@ -2,7 +2,7 @@ defaults: - default_train_params max_epochs: 300 -lr_mode: CosineLRCallback +lr_mode: CosineLRScheduler cosine_final_lr_ratio: 0.05 lr_warmup_epochs: 5 lr_cooldown_epochs: 15 diff --git a/src/super_gradients/recipes/training_hyperparams/coco_segmentation_shelfnet_lw_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/coco_segmentation_shelfnet_lw_train_params.yaml index 7501be5c5a..69e0fe5032 100644 --- a/src/super_gradients/recipes/training_hyperparams/coco_segmentation_shelfnet_lw_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/coco_segmentation_shelfnet_lw_train_params.yaml @@ -7,7 +7,7 @@ loss: ShelfNetOHEMLoss optimizer: SGD mixed_precision: True batch_accumulate: 3 -lr_mode: PolyLRCallback +lr_mode: PolyLRScheduler optimizer_params: momentum: 0.9 weight_decay: 1e-4 diff --git a/src/super_gradients/recipes/training_hyperparams/default_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/default_train_params.yaml index c3de57c2d4..df1a100792 100644 --- a/src/super_gradients/recipes/training_hyperparams/default_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/default_train_params.yaml @@ -13,16 +13,16 @@ resume_from_remote_sg_logger: False # bool (default=False), When true, ckpt_name ckpt_name: ckpt_latest.pth # The checkpoint (.pth file) filename in CKPT_ROOT_DIR/EXPERIMENT_NAME/ to use when resume=True and resume_path=None lr_mode: # Union[str, Mapping] - # when str: Learning rate scheduling policy, one of ["StepLRCallback", "PolyLRCallback", "CosineLRCallback", "ExponentialLRCallback", "FunctionLRCallback"] + # when str: Learning rate scheduling policy, one of ["StepLRScheduler", "PolyLRScheduler", "CosineLRScheduler", "ExponentialLRScheduler", "FunctionLRScheduler"] # when Mapping: refers to a torch.optim.lr_scheduler._LRScheduler, following the below API: lr_mode = {LR_SCHEDULER_CLASS_NAME: {**LR_SCHEDULER_KWARGS, "phase": XXX, "metric_name": XXX) -lr_schedule_function: # Learning rate scheduling function to be used when `lr_mode` is 'FunctionLRCallback'. 
+lr_schedule_function: # Learning rate scheduling function to be used when `lr_mode` is 'FunctionLRScheduler'. lr_warmup_epochs: 0 # number of epochs for learning rate warm up - see https://arxiv.org/pdf/1706.02677.pdf (Section 2.2). lr_warmup_steps: 0 # number of warmup steps (Used when warmup_mode=BatchStepLinearWarmupLRCallback) lr_cooldown_epochs: 0 # epochs to cooldown LR (i.e the last epoch from scheduling view point=max_epochs-cooldown) warmup_initial_lr: # Initial lr for EpochStepWarmupLRCallback/BatchStepLinearWarmupLRCallback. When none is given, initial_lr/(warmup_epochs+1) will be used. -step_lr_update_freq: # (float) update frequency in epoch units for computing lr_updates when lr_mode=`StepLRCallback`. -cosine_final_lr_ratio: 0.01 # final learning rate ratio (only relevant when `lr_mode`='CosineLRCallback') +step_lr_update_freq: # (float) update frequency in epoch units for computing lr_updates when lr_mode=`StepLRScheduler`. +cosine_final_lr_ratio: 0.01 # final learning rate ratio (only relevant when `lr_mode`='CosineLRScheduler') warmup_mode: EpochStepWarmupLRCallback # learning rate warmup scheme, currently ['LinearStepWarmupLRCallback', 'EpochStepWarmupLRCallback', 'BatchStepLinearWarmupLRCallback'] are supported lr_updates: diff --git a/src/super_gradients/recipes/training_hyperparams/imagenet_efficientnet_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/imagenet_efficientnet_train_params.yaml index b9e6688472..766b968597 100644 --- a/src/super_gradients/recipes/training_hyperparams/imagenet_efficientnet_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/imagenet_efficientnet_train_params.yaml @@ -2,7 +2,7 @@ defaults: - default_train_params max_epochs: 450 -lr_mode: StepLRCallback +lr_mode: StepLRScheduler step_lr_update_freq: 2.4 initial_lr: 0.016 lr_warmup_epochs: 3 diff --git a/src/super_gradients/recipes/training_hyperparams/imagenet_mobilenetv2_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/imagenet_mobilenetv2_train_params.yaml index eee328aacd..813ff21a43 100644 --- a/src/super_gradients/recipes/training_hyperparams/imagenet_mobilenetv2_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/imagenet_mobilenetv2_train_params.yaml @@ -3,7 +3,7 @@ defaults: max_epochs: 450 -lr_mode: StepLRCallback +lr_mode: StepLRScheduler initial_lr: 0.032 # for total batch-size of 512 lr_decay_factor: 0.973 lr_updates: diff --git a/src/super_gradients/recipes/training_hyperparams/imagenet_mobilenetv3_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/imagenet_mobilenetv3_train_params.yaml index 9a7cb96938..1dddb79b14 100644 --- a/src/super_gradients/recipes/training_hyperparams/imagenet_mobilenetv3_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/imagenet_mobilenetv3_train_params.yaml @@ -2,7 +2,7 @@ defaults: - default_train_params max_epochs: 150 -lr_mode: CosineLRCallback +lr_mode: CosineLRScheduler initial_lr: 0.1 optimizer: SGD diff --git a/src/super_gradients/recipes/training_hyperparams/imagenet_regnetY_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/imagenet_regnetY_train_params.yaml index 44a757c4f0..b1b90729ea 100644 --- a/src/super_gradients/recipes/training_hyperparams/imagenet_regnetY_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/imagenet_regnetY_train_params.yaml @@ -2,7 +2,7 @@ defaults: - default_train_params max_epochs: 450 -lr_mode: StepLRCallback +lr_mode: StepLRScheduler step_lr_update_freq: 
2.4 initial_lr: 0.016 lr_warmup_epochs: 3 diff --git a/src/super_gradients/recipes/training_hyperparams/imagenet_repvgg_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/imagenet_repvgg_train_params.yaml index f359ed2127..966aa8b194 100644 --- a/src/super_gradients/recipes/training_hyperparams/imagenet_repvgg_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/imagenet_repvgg_train_params.yaml @@ -2,7 +2,7 @@ defaults: - default_train_params max_epochs: 120 -lr_mode: CosineLRCallback +lr_mode: CosineLRScheduler initial_lr: 0.1 cosine_final_lr_ratio: 0 diff --git a/src/super_gradients/recipes/training_hyperparams/imagenet_resnet50_kd_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/imagenet_resnet50_kd_train_params.yaml index 55c2d4867c..6e39f6a4d2 100644 --- a/src/super_gradients/recipes/training_hyperparams/imagenet_resnet50_kd_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/imagenet_resnet50_kd_train_params.yaml @@ -3,7 +3,7 @@ defaults: max_epochs: 610 initial_lr: 5e-3 -lr_mode: CosineLRCallback +lr_mode: CosineLRScheduler lr_warmup_epochs: 5 lr_cooldown_epochs: 10 ema: True diff --git a/src/super_gradients/recipes/training_hyperparams/imagenet_resnet50_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/imagenet_resnet50_train_params.yaml index 150297b727..4dac223ac0 100644 --- a/src/super_gradients/recipes/training_hyperparams/imagenet_resnet50_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/imagenet_resnet50_train_params.yaml @@ -3,7 +3,7 @@ defaults: max_epochs: 400 initial_lr: 0.1 -lr_mode: CosineLRCallback +lr_mode: CosineLRScheduler lr_warmup_epochs: 5 ema: False save_ckpt_epoch_list: [ 50, 100, 150, 200, 300 ] diff --git a/src/super_gradients/recipes/training_hyperparams/imagenet_vit_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/imagenet_vit_train_params.yaml index dd19f41630..258c20d8e1 100644 --- a/src/super_gradients/recipes/training_hyperparams/imagenet_vit_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/imagenet_vit_train_params.yaml @@ -3,7 +3,7 @@ defaults: max_epochs: 10 initial_lr: 0.03 -lr_mode: CosineLRCallback +lr_mode: CosineLRScheduler cosine_final_lr_ratio: 0 lr_warmup_epochs: 1 warmup_initial_lr: 0 diff --git a/src/super_gradients/recipes/training_hyperparams/supervisely_default_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/supervisely_default_train_params.yaml index 3a5507e52c..b544a381ad 100644 --- a/src/super_gradients/recipes/training_hyperparams/supervisely_default_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/supervisely_default_train_params.yaml @@ -2,7 +2,7 @@ defaults: - default_train_params max_epochs: 100 -lr_mode: CosineLRCallback +lr_mode: CosineLRScheduler cosine_final_lr_ratio: 0.01 initial_lr: 0.1 lr_warmup_epochs: 0 diff --git a/src/super_gradients/training/params.py b/src/super_gradients/training/params.py index 907d3ffaa4..7234c05fb2 100755 --- a/src/super_gradients/training/params.py +++ b/src/super_gradients/training/params.py @@ -100,7 +100,7 @@ "lr_warmup_epochs": {"type": "number", "minimum": 0, "maximum": 10}, "initial_lr": {"type": "number", "exclusiveMinimum": 0, "maximum": 10}, }, - "if": {"properties": {"lr_mode": {"const": "StepLRCallback"}}}, + "if": {"properties": {"lr_mode": {"const": "StepLRScheduler"}}}, "then": {"required": ["lr_updates", "lr_decay_factor"]}, "required": ["max_epochs", "lr_mode", 
"initial_lr", "loss"], } diff --git a/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py b/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py index e8420ba62e..cca464fb5f 100644 --- a/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py +++ b/src/super_gradients/training/pre_launch_callbacks/pre_launch_callbacks.py @@ -300,8 +300,8 @@ def modify_params_for_qat( logger.warning(f"New learning rate: {training_hyperparams['initial_lr']}") logger.warning(f"New weight decay: {training_hyperparams['optimizer_params']['weight_decay']}") # as recommended by pytorch-quantization docs - if get_param(training_hyperparams, "lr_mode") != "CosineLRCallback": - training_hyperparams["lr_mode"] = "CosineLRCallback" + if get_param(training_hyperparams, "lr_mode") != "CosineLRScheduler": + training_hyperparams["lr_mode"] = "CosineLRScheduler" training_hyperparams["cosine_final_lr_ratio"] = cosine_final_lr_ratio logger.warning( f"lr_mode will be set to cosine for QAT run instead of {get_param(training_hyperparams, 'lr_mode')} with " diff --git a/src/super_gradients/training/sg_trainer/sg_trainer.py b/src/super_gradients/training/sg_trainer/sg_trainer.py index 449fdf78ad..ec8f846e3a 100755 --- a/src/super_gradients/training/sg_trainer/sg_trainer.py +++ b/src/super_gradients/training/sg_trainer/sg_trainer.py @@ -766,29 +766,29 @@ def train( - `lr_updates` : list(int) - List of fixed epoch numbers to perform learning rate updates when `lr_mode='StepLRCallback'`. + List of fixed epoch numbers to perform learning rate updates when `lr_mode='StepLRScheduler'`. - `lr_decay_factor` : float - Decay factor to apply to the learning rate at each update when `lr_mode='StepLRCallback'`. + Decay factor to apply to the learning rate at each update when `lr_mode='StepLRScheduler'`. - `lr_mode` : Union[str, Mapping], When str: - Learning rate scheduling policy, one of ['StepLRCallback','PolyLRCallback','CosineLRCallback','FunctionLRCallback']. + Learning rate scheduling policy, one of ['StepLRScheduler','PolyLRScheduler','CosineLRScheduler','FunctionLRScheduler']. - 'StepLRCallback' refers to constant updates at epoch numbers passed through `lr_updates`. + 'StepLRScheduler' refers to constant updates at epoch numbers passed through `lr_updates`. Each update decays the learning rate by `lr_decay_factor`. - 'CosineLRCallback' refers to the Cosine Anealing policy as mentioned in https://arxiv.org/abs/1608.03983. + 'CosineLRScheduler' refers to the Cosine Anealing policy as mentioned in https://arxiv.org/abs/1608.03983. The final learning rate ratio is controlled by `cosine_final_lr_ratio` training parameter. - 'PolyLRCallback' refers to the polynomial decrease: + 'PolyLRScheduler' refers to the polynomial decrease: in each epoch iteration `self.lr = self.initial_lr * pow((1.0 - (current_iter / max_iter)), 0.9)` - 'FunctionLRCallback' refers to a user-defined learning rate scheduling function, that is passed through `lr_schedule_function`. + 'FunctionLRScheduler' refers to a user-defined learning rate scheduling function, that is passed through `lr_schedule_function`. @@ -823,7 +823,7 @@ def train( - `lr_schedule_function` : Union[callable,None] - Learning rate scheduling function to be used when `lr_mode` is 'FunctionLRCallback'. + Learning rate scheduling function to be used when `lr_mode` is 'FunctionLRScheduler'. 
- `warmup_mode`: Union[str, Type[LRCallbackBase], None] @@ -846,7 +846,7 @@ def train( The capping is done to avoid interference of warmup with epoch-based schedulers. - `cosine_final_lr_ratio` : float (default=0.01) - Final learning rate ratio (only relevant when `lr_mode`='CosineLRCallback'). The cosine starts from initial_lr and reaches + Final learning rate ratio (only relevant when `lr_mode`='CosineLRScheduler'). The cosine starts from initial_lr and reaches initial_lr * cosine_final_lr_ratio in last epoch - `inital_lr` : float diff --git a/src/super_gradients/training/utils/callbacks/__init__.py b/src/super_gradients/training/utils/callbacks/__init__.py index db705a5d5c..faef75994e 100644 --- a/src/super_gradients/training/utils/callbacks/__init__.py +++ b/src/super_gradients/training/utils/callbacks/__init__.py @@ -5,11 +5,11 @@ LRCallbackBase, EpochStepWarmupLRCallback, BatchStepLinearWarmupLRCallback, - StepLRCallback, - ExponentialLRCallback, - PolyLRCallback, - CosineLRCallback, - FunctionLRCallback, + StepLRScheduler, + ExponentialLRScheduler, + PolyLRScheduler, + CosineLRScheduler, + FunctionLRScheduler, IllegalLRSchedulerMetric, LRSchedulerCallback, MetricsUpdateCallback, @@ -42,11 +42,11 @@ "LRCallbackBase", "EpochStepWarmupLRCallback", "BatchStepLinearWarmupLRCallback", - "StepLRCallback", - "ExponentialLRCallback", - "PolyLRCallback", - "CosineLRCallback", - "FunctionLRCallback", + "StepLRScheduler", + "ExponentialLRScheduler", + "PolyLRScheduler", + "CosineLRScheduler", + "FunctionLRScheduler", "IllegalLRSchedulerMetric", "LRSchedulerCallback", "MetricsUpdateCallback", diff --git a/src/super_gradients/training/utils/callbacks/callbacks.py b/src/super_gradients/training/utils/callbacks/callbacks.py index 7b9c54298b..7ca4ff8289 100644 --- a/src/super_gradients/training/utils/callbacks/callbacks.py +++ b/src/super_gradients/training/utils/callbacks/callbacks.py @@ -394,7 +394,7 @@ class StepLRScheduler(LRCallbackBase): def __init__(self, lr_updates, lr_decay_factor, step_lr_update_freq=None, **kwargs): super().__init__(Phase.TRAIN_EPOCH_END, **kwargs) if step_lr_update_freq and len(lr_updates): - raise ValueError("Only one of [lr_updates, step_lr_update_freq] should be passed to StepLRCallback constructor") + raise ValueError("Only one of [lr_updates, step_lr_update_freq] should be passed to StepLRScheduler constructor") if step_lr_update_freq: max_epochs = self.training_params.max_epochs - self.training_params.lr_cooldown_epochs @@ -950,18 +950,18 @@ def create_lr_scheduler_callback( When str: - Learning rate scheduling policy, one of ['StepLRCallback','PolyLRCallback','CosineLRCallback','FunctionLRCallback']. + Learning rate scheduling policy, one of ['StepLRScheduler','PolyLRScheduler','CosineLRScheduler','FunctionLRScheduler']. - 'StepLRCallback' refers to constant updates at epoch numbers passed through `lr_updates`. + 'StepLRScheduler' refers to constant updates at epoch numbers passed through `lr_updates`. Each update decays the learning rate by `lr_decay_factor`. - 'CosineLRCallback' refers to the Cosine Anealing policy as mentioned in https://arxiv.org/abs/1608.03983. + 'CosineLRScheduler' refers to the Cosine Anealing policy as mentioned in https://arxiv.org/abs/1608.03983. The final learning rate ratio is controlled by `cosine_final_lr_ratio` training parameter. 
- 'PolyLRCallback' refers to the polynomial decrease: + 'PolyLRScheduler' refers to the polynomial decrease: in each epoch iteration `self.lr = self.initial_lr * pow((1.0 - (current_iter / max_iter)), 0.9)` - 'FunctionLRCallback' refers to a user-defined learning rate scheduling function, that is passed through `lr_schedule_function`. + 'FunctionLRScheduler' refers to a user-defined learning rate scheduling function, that is passed through `lr_schedule_function`. diff --git a/tests/end_to_end_tests/trainer_test.py b/tests/end_to_end_tests/trainer_test.py index c881cc4821..edfb4ec486 100644 --- a/tests/end_to_end_tests/trainer_test.py +++ b/tests/end_to_end_tests/trainer_test.py @@ -25,7 +25,7 @@ def setUp(cls): "lr_decay_factor": 0.1, "initial_lr": 0.1, "lr_updates": [4], - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "loss": "CrossEntropyLoss", "train_metrics_list": [Accuracy(), Top5()], "valid_metrics_list": [Accuracy(), Top5()], diff --git a/tests/integration_tests/conversion_callback_test.py b/tests/integration_tests/conversion_callback_test.py index 89ddefb344..22b01512cf 100644 --- a/tests/integration_tests/conversion_callback_test.py +++ b/tests/integration_tests/conversion_callback_test.py @@ -54,7 +54,7 @@ def test_classification_architectures(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "CrossEntropyLoss", @@ -107,7 +107,7 @@ def get_architecture_custom_config(architecture_name: str): train_params = { "max_epochs": 3, "initial_lr": 1e-2, - "lr_mode": "PolyLRCallback", + "lr_mode": "PolyLRScheduler", "ema": True, # unlike the paper (not specified in paper) "optimizer": "SGD", "optimizer_params": {"weight_decay": 5e-4, "momentum": 0.9}, diff --git a/tests/integration_tests/deci_lab_export_test.py b/tests/integration_tests/deci_lab_export_test.py index 691c3ae2e8..50e6132d2e 100644 --- a/tests/integration_tests/deci_lab_export_test.py +++ b/tests/integration_tests/deci_lab_export_test.py @@ -44,7 +44,7 @@ def test_train_with_deci_lab_integration(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "CrossEntropyLoss", diff --git a/tests/integration_tests/ema_train_integration_test.py b/tests/integration_tests/ema_train_integration_test.py index ea252d52a9..3bca4b3204 100644 --- a/tests/integration_tests/ema_train_integration_test.py +++ b/tests/integration_tests/ema_train_integration_test.py @@ -49,7 +49,7 @@ def _train(self, ema_params): training_params = { "max_epochs": 4, "lr_updates": [4], - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_decay_factor": 0.1, "lr_warmup_epochs": 0, "initial_lr": 0.1, diff --git a/tests/integration_tests/lr_test.py b/tests/integration_tests/lr_test.py index 08b81e338f..1b7a4ce245 100644 --- a/tests/integration_tests/lr_test.py +++ b/tests/integration_tests/lr_test.py @@ -45,12 +45,12 @@ def test_lr_function(initial_lr, epoch, iter, max_epoch, iters_per_epoch, **kwar return initial_lr * (1 - ((epoch * iters_per_epoch + iter) / (max_epoch * iters_per_epoch))) # test if we are able that lr_function supports functions with this structure - training_params = {**self.training_params, "lr_mode": "FunctionLRCallback", "lr_schedule_function": test_lr_function} + training_params = {**self.training_params, "lr_mode": "FunctionLRScheduler", "lr_schedule_function": 
test_lr_function} trainer.train( model=model, training_params=training_params, train_loader=classification_test_dataloader(), valid_loader=classification_test_dataloader() ) # test that we assert lr_function is callable - training_params = {**self.training_params, "lr_mode": "FunctionLRCallback"} + training_params = {**self.training_params, "lr_mode": "FunctionLRScheduler"} with self.assertRaises(AssertionError): trainer.train( model=model, training_params=training_params, train_loader=classification_test_dataloader(), valid_loader=classification_test_dataloader() @@ -58,14 +58,14 @@ def test_lr_function(initial_lr, epoch, iter, max_epoch, iters_per_epoch, **kwar def test_cosine_lr(self): trainer, model = self.get_trainer(self.folder_name) - training_params = {**self.training_params, "lr_mode": "CosineLRCallback", "cosine_final_lr_ratio": 0.01} + training_params = {**self.training_params, "lr_mode": "CosineLRScheduler", "cosine_final_lr_ratio": 0.01} trainer.train( model=model, training_params=training_params, train_loader=classification_test_dataloader(), valid_loader=classification_test_dataloader() ) def test_step_lr(self): trainer, model = self.get_trainer(self.folder_name) - training_params = {**self.training_params, "lr_mode": "StepLRCallback", "lr_decay_factor": 0.1, "lr_updates": [4]} + training_params = {**self.training_params, "lr_mode": "StepLRScheduler", "lr_decay_factor": 0.1, "lr_updates": [4]} trainer.train( model=model, training_params=training_params, train_loader=classification_test_dataloader(), valid_loader=classification_test_dataloader() ) diff --git a/tests/integration_tests/pretrained_models_test.py b/tests/integration_tests/pretrained_models_test.py index 855a439054..2ef2b7b23d 100644 --- a/tests/integration_tests/pretrained_models_test.py +++ b/tests/integration_tests/pretrained_models_test.py @@ -87,7 +87,7 @@ def setUp(self) -> None: "lr_decay_factor": 0.1, "initial_lr": 0.6, "loss": "CrossEntropyLoss", - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "optimizer_params": {"weight_decay": 0.000, "momentum": 0.9}, "train_metrics_list": [Accuracy()], "valid_metrics_list": [Accuracy()], @@ -128,7 +128,7 @@ def setUp(self) -> None: ssd_dboxes = DEFAULT_SSD_LITE_MOBILENET_V2_ARCH_PARAMS["heads"]["SSDHead"]["anchors"] self.transfer_detection_train_params_ssd = { "max_epochs": 3, - "lr_mode": "CosineLRCallback", + "lr_mode": "CosineLRScheduler", "initial_lr": 0.01, "cosine_final_lr_ratio": 0.01, "lr_warmup_epochs": 3, @@ -145,7 +145,7 @@ def setUp(self) -> None: } self.transfer_detection_train_params_yolox = { "max_epochs": 3, - "lr_mode": "CosineLRCallback", + "lr_mode": "CosineLRScheduler", "cosine_final_lr_ratio": 0.05, "warmup_bias_lr": 0.0, "warmup_momentum": 0.9, @@ -215,7 +215,7 @@ def setUp(self) -> None: "max_epochs": 3, "initial_lr": 1e-2, "loss": DDRNetLoss(), - "lr_mode": "PolyLRCallback", + "lr_mode": "PolyLRScheduler", "ema": True, # unlike the paper (not specified in paper) "average_best_models": True, "optimizer": "SGD", @@ -232,7 +232,7 @@ def setUp(self) -> None: "max_epochs": 3, "initial_lr": 1e-2, "loss": STDCLoss(num_classes=5), - "lr_mode": "PolyLRCallback", + "lr_mode": "PolyLRScheduler", "ema": True, # unlike the paper (not specified in paper) "optimizer": "SGD", "optimizer_params": {"weight_decay": 5e-4, "momentum": 0.9}, @@ -247,7 +247,7 @@ def setUp(self) -> None: "max_epochs": 3, "initial_lr": 1e-2, "loss": "CrossEntropyLoss", - "lr_mode": "PolyLRCallback", + "lr_mode": "PolyLRScheduler", "ema": True, # unlike the paper (not 
specified in paper) "optimizer": "SGD", "optimizer_params": {"weight_decay": 5e-4, "momentum": 0.9}, diff --git a/tests/recipe_training_tests/coded_qat_launch_test.py b/tests/recipe_training_tests/coded_qat_launch_test.py index 9198d296d4..243d78cd53 100644 --- a/tests/recipe_training_tests/coded_qat_launch_test.py +++ b/tests/recipe_training_tests/coded_qat_launch_test.py @@ -17,7 +17,7 @@ def test_qat_launch(self): "max_epochs": 10, "lr_updates": [], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "CrossEntropyLoss", @@ -78,7 +78,7 @@ def test_ptq_launch(self): "max_epochs": 10, "lr_updates": [], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "CrossEntropyLoss", diff --git a/tests/unit_tests/dataset_statistics_test.py b/tests/unit_tests/dataset_statistics_test.py index 875ab475cf..f68fbb562e 100644 --- a/tests/unit_tests/dataset_statistics_test.py +++ b/tests/unit_tests/dataset_statistics_test.py @@ -24,7 +24,7 @@ def test_dataset_statistics_tensorboard_logger(self): training_params = { "max_epochs": 1, # we dont really need the actual training to run - "lr_mode": "CosineLRCallback", + "lr_mode": "CosineLRScheduler", "initial_lr": 0.01, "loss": "YoloXDetectionLoss", "criterion_params": {"strides": [8, 16, 32], "num_classes": 80}, diff --git a/tests/unit_tests/detection_dataset_test.py b/tests/unit_tests/detection_dataset_test.py index 55bd3767db..cb38faa7bb 100644 --- a/tests/unit_tests/detection_dataset_test.py +++ b/tests/unit_tests/detection_dataset_test.py @@ -168,7 +168,7 @@ def test_coco_detection_metrics_with_classwise_ap(self): detection_train_params_yolox = { "max_epochs": 5, - "lr_mode": "CosineLRCallback", + "lr_mode": "CosineLRScheduler", "cosine_final_lr_ratio": 0.05, "warmup_bias_lr": 0.0, "warmup_momentum": 0.9, diff --git a/tests/unit_tests/double_training_test.py b/tests/unit_tests/double_training_test.py index 44bb373565..4a9ab0b265 100644 --- a/tests/unit_tests/double_training_test.py +++ b/tests/unit_tests/double_training_test.py @@ -24,7 +24,7 @@ def test_call_train_twice(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": torch.nn.CrossEntropyLoss(), diff --git a/tests/unit_tests/early_stop_test.py b/tests/unit_tests/early_stop_test.py index 036fc6f576..2082d2fd73 100644 --- a/tests/unit_tests/early_stop_test.py +++ b/tests/unit_tests/early_stop_test.py @@ -49,7 +49,7 @@ def setUp(self) -> None: "max_epochs": self.max_epochs, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "CrossEntropyLoss", diff --git a/tests/unit_tests/extreme_batch_cb_test.py b/tests/unit_tests/extreme_batch_cb_test.py index db767f3b36..26bfd636a2 100644 --- a/tests/unit_tests/extreme_batch_cb_test.py +++ b/tests/unit_tests/extreme_batch_cb_test.py @@ -40,7 +40,7 @@ def setUpClass(cls): "max_epochs": 3, "initial_lr": 1e-2, "loss": DDRNetLoss(), - "lr_mode": "PolyLRCallback", + "lr_mode": "PolyLRScheduler", "ema": True, "optimizer": "SGD", "mixed_precision": False, @@ -56,7 +56,7 @@ def setUpClass(cls): "max_epochs": 3, "initial_lr": 1e-2, "loss": PPYoloELoss(num_classes=1, use_static_assigner=False, reg_max=16), - "lr_mode": "PolyLRCallback", + "lr_mode": "PolyLRScheduler", "ema": True, 
"optimizer": "SGD", "mixed_precision": False, diff --git a/tests/unit_tests/factories_test.py b/tests/unit_tests/factories_test.py index 873fc37c89..e3b7babba0 100644 --- a/tests/unit_tests/factories_test.py +++ b/tests/unit_tests/factories_test.py @@ -21,7 +21,7 @@ def test_training_with_factories(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "CrossEntropyLoss", @@ -47,7 +47,7 @@ def test_training_with_factories_with_typos(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "crossEnt_ropy", diff --git a/tests/unit_tests/forward_pass_prep_fn_test.py b/tests/unit_tests/forward_pass_prep_fn_test.py index 7dfe59ada6..57ccf27f69 100644 --- a/tests/unit_tests/forward_pass_prep_fn_test.py +++ b/tests/unit_tests/forward_pass_prep_fn_test.py @@ -38,7 +38,7 @@ def test_resizing_with_forward_pass_prep_fn(self): train_params = { "max_epochs": 2, "cosine_final_lr_ratio": 0.2, - "lr_mode": "CosineLRCallback", + "lr_mode": "CosineLRScheduler", "lr_cooldown_epochs": 2, "lr_warmup_epochs": 3, "initial_lr": 1, diff --git a/tests/unit_tests/initialize_with_dataloaders_test.py b/tests/unit_tests/initialize_with_dataloaders_test.py index da98237816..9853431498 100644 --- a/tests/unit_tests/initialize_with_dataloaders_test.py +++ b/tests/unit_tests/initialize_with_dataloaders_test.py @@ -35,7 +35,7 @@ def test_train_with_dataloaders(self): "max_epochs": 2, "lr_updates": [5, 6, 12], "lr_decay_factor": 0.01, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "initial_lr": 0.01, "loss": "CrossEntropyLoss", "optimizer": "SGD", diff --git a/tests/unit_tests/kd_ema_test.py b/tests/unit_tests/kd_ema_test.py index ebbd642a8c..bbdf9164bd 100644 --- a/tests/unit_tests/kd_ema_test.py +++ b/tests/unit_tests/kd_ema_test.py @@ -20,7 +20,7 @@ def setUp(cls): "max_epochs": 3, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": KDLogitsLoss(torch.nn.CrossEntropyLoss()), diff --git a/tests/unit_tests/kd_trainer_test.py b/tests/unit_tests/kd_trainer_test.py index 454078eb94..98b3a37f3f 100644 --- a/tests/unit_tests/kd_trainer_test.py +++ b/tests/unit_tests/kd_trainer_test.py @@ -42,7 +42,7 @@ def setUp(cls): "max_epochs": 3, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": KDLogitsLoss(torch.nn.CrossEntropyLoss()), diff --git a/tests/unit_tests/load_ema_ckpt_test.py b/tests/unit_tests/load_ema_ckpt_test.py index 8239f31739..c1d1fe1d98 100644 --- a/tests/unit_tests/load_ema_ckpt_test.py +++ b/tests/unit_tests/load_ema_ckpt_test.py @@ -23,7 +23,7 @@ def setUp(self) -> None: "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "CrossEntropyLoss", diff --git a/tests/unit_tests/local_ckpt_head_replacement_test.py b/tests/unit_tests/local_ckpt_head_replacement_test.py index b5567fc304..0d100e364a 100644 --- a/tests/unit_tests/local_ckpt_head_replacement_test.py +++ b/tests/unit_tests/local_ckpt_head_replacement_test.py @@ -14,7 +14,7 @@ def test_local_ckpt_head_replacement(self): "max_epochs": 1, "lr_updates": [1], "lr_decay_factor": 0.1, - 
"lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "CrossEntropyLoss", diff --git a/tests/unit_tests/loss_loggings_test.py b/tests/unit_tests/loss_loggings_test.py index 151cff0475..5294885bd1 100644 --- a/tests/unit_tests/loss_loggings_test.py +++ b/tests/unit_tests/loss_loggings_test.py @@ -35,7 +35,7 @@ def test_single_item_logging(self): "max_epochs": 1, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": torch.nn.CrossEntropyLoss(), @@ -59,7 +59,7 @@ def test_multiple_unnamed_components_loss_logging(self): "max_epochs": 1, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": CriterionWithUnnamedComponents(), @@ -83,7 +83,7 @@ def test_multiple_named_components_loss_logging(self): "max_epochs": 1, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": CriterionWithNamedComponents(), diff --git a/tests/unit_tests/lr_cooldown_test.py b/tests/unit_tests/lr_cooldown_test.py index f3972ec84e..668bc0c74f 100644 --- a/tests/unit_tests/lr_cooldown_test.py +++ b/tests/unit_tests/lr_cooldown_test.py @@ -19,7 +19,7 @@ def test_lr_cooldown_with_lr_scheduling(self): train_params = { "max_epochs": 7, "cosine_final_lr_ratio": 0.2, - "lr_mode": "CosineLRCallback", + "lr_mode": "CosineLRScheduler", "lr_cooldown_epochs": 2, "lr_warmup_epochs": 3, "initial_lr": 1, diff --git a/tests/unit_tests/lr_warmup_test.py b/tests/unit_tests/lr_warmup_test.py index a86935eb23..ec3442ae87 100644 --- a/tests/unit_tests/lr_warmup_test.py +++ b/tests/unit_tests/lr_warmup_test.py @@ -6,7 +6,7 @@ from super_gradients.training.dataloaders.dataloaders import classification_test_dataloader from super_gradients.training.metrics import Accuracy from super_gradients.training.models import LeNet -from super_gradients.training.utils.callbacks import TestLRCallback, LRCallbackBase, Phase, Callback, PhaseContext, CosineLRCallback +from super_gradients.training.utils.callbacks import TestLRCallback, LRCallbackBase, Phase, Callback, PhaseContext, CosineLRScheduler class CollectLRCallback(Callback): @@ -58,7 +58,7 @@ def test_lr_warmup(self): "max_epochs": 5, "lr_updates": [], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 3, "initial_lr": 1, "loss": "CrossEntropyLoss", @@ -94,7 +94,7 @@ def test_lr_warmup_with_lr_scheduling(self): train_params = { "max_epochs": 5, "cosine_final_lr_ratio": 0.2, - "lr_mode": "CosineLRCallback", + "lr_mode": "CosineLRScheduler", "lr_warmup_epochs": 3, "initial_lr": 1, "loss": "CrossEntropyLoss", @@ -137,7 +137,7 @@ def test_warmup_linear_batch_step(self): train_params = { "max_epochs": max_epochs, - "lr_mode": "CosineLRCallback", + "lr_mode": "CosineLRScheduler", "cosine_final_lr_ratio": cosine_final_lr_ratio, "warmup_initial_lr": warmup_initial_lr, "warmup_mode": "BatchStepLinearWarmupLRCallback", @@ -161,7 +161,7 @@ def test_warmup_linear_batch_step(self): expected_warmup_lrs = np.linspace(warmup_initial_lr, initial_lr, lr_warmup_steps).tolist() total_steps = max_epochs * len(train_loader) - lr_warmup_steps - expected_cosine_lrs = CosineLRCallback.compute_learning_rate( + expected_cosine_lrs = CosineLRScheduler.compute_learning_rate( step=np.arange(0, total_steps), 
total_steps=total_steps, initial_lr=initial_lr, final_lr_ratio=cosine_final_lr_ratio ) @@ -186,7 +186,7 @@ def test_warmup_linear_epoch_step(self): "max_epochs": 5, "lr_updates": [], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 3, "initial_lr": 1, "warmup_initial_lr": 4.0, @@ -224,7 +224,7 @@ def test_custom_lr_warmup(self): "max_epochs": 5, "lr_updates": [], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 3, "loss": "CrossEntropyLoss", "optimizer": "SGD", diff --git a/tests/unit_tests/max_batches_loop_break_test.py b/tests/unit_tests/max_batches_loop_break_test.py index 21f78adb44..bbaa483e09 100644 --- a/tests/unit_tests/max_batches_loop_break_test.py +++ b/tests/unit_tests/max_batches_loop_break_test.py @@ -23,7 +23,7 @@ def test_max_train_batches_loop_break(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "CrossEntropyLoss", @@ -58,7 +58,7 @@ def test_max_valid_batches_loop_break(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "CrossEntropyLoss", diff --git a/tests/unit_tests/optimizer_params_override_test.py b/tests/unit_tests/optimizer_params_override_test.py index 2fee20413d..f0b250b160 100644 --- a/tests/unit_tests/optimizer_params_override_test.py +++ b/tests/unit_tests/optimizer_params_override_test.py @@ -16,7 +16,7 @@ def test_optimizer_params_partial_override(self): "max_epochs": 1, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "CrossEntropyLoss", @@ -45,7 +45,7 @@ def test_optimizer_params_full_override(self): "max_epochs": 1, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "CrossEntropyLoss", diff --git a/tests/unit_tests/phase_context_test.py b/tests/unit_tests/phase_context_test.py index a9317f1bdf..5fb20101c4 100644 --- a/tests/unit_tests/phase_context_test.py +++ b/tests/unit_tests/phase_context_test.py @@ -28,7 +28,7 @@ def context_information_in_train_test(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "CrossEntropyLoss", diff --git a/tests/unit_tests/preprocessing_unit_test.py b/tests/unit_tests/preprocessing_unit_test.py index 1e98d694c9..4c1d20f805 100644 --- a/tests/unit_tests/preprocessing_unit_test.py +++ b/tests/unit_tests/preprocessing_unit_test.py @@ -97,7 +97,7 @@ def test_setting_preprocessing_params_from_validation_set(self): detection_train_params_yolox = { "max_epochs": 1, - "lr_mode": "CosineLRCallback", + "lr_mode": "CosineLRScheduler", "cosine_final_lr_ratio": 0.05, "warmup_bias_lr": 0.0, "warmup_momentum": 0.9, @@ -168,7 +168,7 @@ def test_setting_preprocessing_params_from_checkpoint(self): detection_train_params_yolox = { "max_epochs": 1, - "lr_mode": "CosineLRCallback", + "lr_mode": "CosineLRScheduler", "cosine_final_lr_ratio": 0.05, "warmup_bias_lr": 0.0, "warmup_momentum": 0.9, diff --git a/tests/unit_tests/resume_training_test.py b/tests/unit_tests/resume_training_test.py index 96506ff9da..6c8bc0b465 100644 --- 
a/tests/unit_tests/resume_training_test.py +++ b/tests/unit_tests/resume_training_test.py @@ -31,7 +31,7 @@ def test_resume_training(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "CrossEntropyLoss", @@ -77,7 +77,7 @@ def test_resume_run_id_training(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "CrossEntropyLoss", @@ -142,7 +142,7 @@ def test_resume_external_training(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "CrossEntropyLoss", @@ -190,7 +190,7 @@ def test_resume_external_training_same_dir(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "CrossEntropyLoss", diff --git a/tests/unit_tests/save_ckpt_test.py b/tests/unit_tests/save_ckpt_test.py index baa2633e02..11ae820467 100644 --- a/tests/unit_tests/save_ckpt_test.py +++ b/tests/unit_tests/save_ckpt_test.py @@ -13,7 +13,7 @@ def setUp(self): "max_epochs": 4, "lr_decay_factor": 0.1, "lr_updates": [4], - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "CrossEntropyLoss", diff --git a/tests/unit_tests/train_after_test_test.py b/tests/unit_tests/train_after_test_test.py index b2e3158e7e..d0a7ec085e 100644 --- a/tests/unit_tests/train_after_test_test.py +++ b/tests/unit_tests/train_after_test_test.py @@ -20,7 +20,7 @@ def setUp(self) -> None: "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": torch.nn.CrossEntropyLoss(), diff --git a/tests/unit_tests/train_logging_test.py b/tests/unit_tests/train_logging_test.py index c361703372..5fbb16a539 100644 --- a/tests/unit_tests/train_logging_test.py +++ b/tests/unit_tests/train_logging_test.py @@ -19,7 +19,7 @@ def test_train_logging(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "CrossEntropyLoss", diff --git a/tests/unit_tests/train_with_intialized_param_args_test.py b/tests/unit_tests/train_with_intialized_param_args_test.py index c6641fc6a4..d1dcefbd22 100644 --- a/tests/unit_tests/train_with_intialized_param_args_test.py +++ b/tests/unit_tests/train_with_intialized_param_args_test.py @@ -28,7 +28,7 @@ def test_train_with_external_criterion(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": torch.nn.CrossEntropyLoss(), @@ -52,7 +52,7 @@ def test_train_with_external_optimizer(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "CrossEntropyLoss", @@ -148,7 +148,7 @@ def test_train_with_external_metric(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "CrossEntropyLoss", @@ -178,7 
+178,7 @@ def test_train_with_external_dataloaders(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "CrossEntropyLoss", diff --git a/tests/unit_tests/train_with_precise_bn_test.py b/tests/unit_tests/train_with_precise_bn_test.py index 771997628b..a67d87bb40 100644 --- a/tests/unit_tests/train_with_precise_bn_test.py +++ b/tests/unit_tests/train_with_precise_bn_test.py @@ -18,7 +18,7 @@ def test_train_with_precise_bn_explicit_size(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "CrossEntropyLoss", @@ -47,7 +47,7 @@ def test_train_with_precise_bn_implicit_size(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "CrossEntropyLoss", diff --git a/tests/unit_tests/update_param_groups_unit_test.py b/tests/unit_tests/update_param_groups_unit_test.py index bf772e28a7..e4edd4ca02 100644 --- a/tests/unit_tests/update_param_groups_unit_test.py +++ b/tests/unit_tests/update_param_groups_unit_test.py @@ -34,7 +34,7 @@ def test_lr_scheduling_with_update_param_groups(self): train_params = { "max_epochs": 3, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_updates": [0, 1, 2], "initial_lr": 0.1, "lr_decay_factor": 1, diff --git a/tests/unit_tests/vit_unit_test.py b/tests/unit_tests/vit_unit_test.py index 8436eb6976..b9a3527761 100644 --- a/tests/unit_tests/vit_unit_test.py +++ b/tests/unit_tests/vit_unit_test.py @@ -15,7 +15,7 @@ def setUp(self): "max_epochs": 2, "lr_updates": [1], "lr_decay_factor": 0.1, - "lr_mode": "StepLRCallback", + "lr_mode": "StepLRScheduler", "lr_warmup_epochs": 0, "initial_lr": 0.1, "loss": "CrossEntropyLoss", From 7097f73ef756da4c305080946b0f053cb16b0a68 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Mon, 11 Sep 2023 23:15:11 +0300 Subject: [PATCH 17/21] add deprecate on LRWarmups --- .../training/utils/callbacks/callbacks.py | 33 ++++++++++++------- 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/src/super_gradients/training/utils/callbacks/callbacks.py b/src/super_gradients/training/utils/callbacks/callbacks.py index 7ca4ff8289..437f63cc50 100644 --- a/src/super_gradients/training/utils/callbacks/callbacks.py +++ b/src/super_gradients/training/utils/callbacks/callbacks.py @@ -278,7 +278,7 @@ def update_lr(self, optimizer, epoch, batch_idx=None): @register_lr_warmup(LRWarmups.LINEAR_EPOCH_STEP, deprecated_name="linear_epoch_step") -class EpochStepWarmupLRCallback(LRCallbackBase): +class LinearEpochLRWarmup(LRCallbackBase): """ LR scheduling callback for linear step warmup. This scheduler uses a whole epoch as single step. LR climbs from warmup_initial_lr with even steps to initial lr. 
When warmup_initial_lr is None - LR climb starts from @@ -287,7 +287,7 @@ class EpochStepWarmupLRCallback(LRCallbackBase): """ def __init__(self, **kwargs): - super(EpochStepWarmupLRCallback, self).__init__(Phase.TRAIN_EPOCH_START, **kwargs) + super().__init__(Phase.TRAIN_EPOCH_START, **kwargs) self.warmup_initial_lr = self.training_params.warmup_initial_lr or self.initial_lr / (self.training_params.lr_warmup_epochs + 1) self.warmup_step_size = ( (self.initial_lr - self.warmup_initial_lr) / self.training_params.lr_warmup_epochs if self.training_params.lr_warmup_epochs > 0 else 0 @@ -301,20 +301,24 @@ def is_lr_scheduling_enabled(self, context): return self.training_params.lr_warmup_epochs > 0 and self.training_params.lr_warmup_epochs >= context.epoch +@deprecated(deprecated_since="3.2.1", removed_from="3.5.0", target=LinearEpochLRWarmup) +class EpochStepWarmupLRCallback(LinearEpochLRWarmup): + ... + + @register_lr_warmup(LRWarmups.LINEAR_STEP, deprecated_name="linear_step") -class LinearStepWarmupLRCallback(EpochStepWarmupLRCallback): - """Deprecated, use EpochStepWarmupLRCallback instead""" +@deprecated(deprecated_since="3.2.1", removed_from="3.5.0", target=LinearEpochLRWarmup) +class LinearLRWarmup(LinearEpochLRWarmup): + ... - def __init__(self, **kwargs): - logger.warning( - f"Parameter {LRWarmups.LINEAR_STEP} has been made deprecated and will be removed in the next SG release. " - f"Please use `{LRWarmups.LINEAR_EPOCH_STEP}` instead." - ) - super(LinearStepWarmupLRCallback, self).__init__(**kwargs) + +@deprecated(deprecated_since="3.2.1", removed_from="3.5.0", target=LinearEpochLRWarmup) +class LinearStepWarmupLRCallback(LinearEpochLRWarmup): + ... @register_lr_warmup(LRWarmups.LINEAR_BATCH_STEP, deprecated_name="linear_batch_step") -class BatchStepLinearWarmupLRCallback(Callback): +class LinearBatchLRWarmup(Callback): """ LR scheduling callback for linear step warmup on each batch step. LR climbs from warmup_initial_lr with to initial lr. @@ -340,7 +344,7 @@ def __init__( :param kwargs: """ - super(BatchStepLinearWarmupLRCallback, self).__init__() + super().__init__() if lr_warmup_steps > train_loader_len: logger.warning( @@ -385,6 +389,11 @@ def update_lr(self, optimizer, epoch, batch_idx=None): param_group["lr"] = self.lr +@deprecated(deprecated_since="3.2.1", removed_from="3.5.0", target=LinearBatchLRWarmup) +class BatchStepLinearWarmupLRCallback(LinearBatchLRWarmup): + ... 
+ + @register_lr_scheduler(LRSchedulers.STEP, deprecated_name="step") class StepLRScheduler(LRCallbackBase): """ From 9eb134d91814fa87fe1f2f9ec7a8edaf57c09aef Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Mon, 11 Sep 2023 23:17:12 +0300 Subject: [PATCH 18/21] rename LRWarmups to be more simple --- documentation/source/Example_Classification.md | 2 +- documentation/source/PhaseCallbacks.md | 4 ++-- src/super_gradients/common/object_names.py | 6 +++--- src/super_gradients/recipes/roboflow_yolo_nas_m.yaml | 2 +- src/super_gradients/recipes/roboflow_yolo_nas_s.yaml | 2 +- .../coco2017_ppyoloe_train_params.yaml | 2 +- .../coco2017_yolo_nas_train_params.yaml | 2 +- .../training_hyperparams/default_train_params.yaml | 6 +++--- .../training_hyperparams/imagenet_vit_train_params.yaml | 2 +- src/super_gradients/training/params.py | 2 +- src/super_gradients/training/utils/callbacks/__init__.py | 8 ++++---- src/super_gradients/training/utils/deprecated_utils.py | 6 +++--- tests/unit_tests/lr_warmup_test.py | 8 ++++---- 13 files changed, 26 insertions(+), 26 deletions(-) diff --git a/documentation/source/Example_Classification.md b/documentation/source/Example_Classification.md index f0b0292eed..f6d9d6606b 100644 --- a/documentation/source/Example_Classification.md +++ b/documentation/source/Example_Classification.md @@ -355,7 +355,7 @@ Output (Training parameters): 'train_metrics_list': ['Accuracy', 'Top5'], 'valid_metrics_list': ['Accuracy', 'Top5'], 'warmup_initial_lr': None, - 'warmup_mode': 'EpochStepWarmupLRCallback', + 'warmup_mode': 'LinearEpochLRWarmup', 'zero_weight_decay_on_bias_and_bn': False } ``` diff --git a/documentation/source/PhaseCallbacks.md b/documentation/source/PhaseCallbacks.md index c2b29f65b5..cc7480aa06 100644 --- a/documentation/source/PhaseCallbacks.md +++ b/documentation/source/PhaseCallbacks.md @@ -8,8 +8,8 @@ SG's `super_gradients.training.utils.callbacks` module implements some common us ModelConversionCheckCallback LRCallbackBase - EpochStepWarmupLRCallback - BatchStepLinearWarmupLRCallback + LinearEpochLRWarmup + LinearBatchLRWarmup StepLRScheduler ExponentialLRScheduler PolyLRScheduler diff --git a/src/super_gradients/common/object_names.py b/src/super_gradients/common/object_names.py index 4a19bf3485..f21b44d999 100644 --- a/src/super_gradients/common/object_names.py +++ b/src/super_gradients/common/object_names.py @@ -164,9 +164,9 @@ class LRSchedulers: class LRWarmups: """Static class to hold all the supported LR Warmup names""" - LINEAR_STEP = "LinearStepWarmupLRCallback" - LINEAR_EPOCH_STEP = "EpochStepWarmupLRCallback" - LINEAR_BATCH_STEP = "BatchStepLinearWarmupLRCallback" + LINEAR_STEP = "LinearEpochLRWarmup" + LINEAR_EPOCH_STEP = "LinearEpochLRWarmup" + LINEAR_BATCH_STEP = "LinearBatchLRWarmup" class Samplers: diff --git a/src/super_gradients/recipes/roboflow_yolo_nas_m.yaml b/src/super_gradients/recipes/roboflow_yolo_nas_m.yaml index b2c9ba5370..2d6641e801 100644 --- a/src/super_gradients/recipes/roboflow_yolo_nas_m.yaml +++ b/src/super_gradients/recipes/roboflow_yolo_nas_m.yaml @@ -46,7 +46,7 @@ training_hyperparams: zero_weight_decay_on_bias_and_bn: True lr_warmup_epochs: 3 - warmup_mode: EpochStepWarmupLRCallback + warmup_mode: LinearEpochLRWarmup initial_lr: 4e-4 cosine_final_lr_ratio: 0.1 diff --git a/src/super_gradients/recipes/roboflow_yolo_nas_s.yaml b/src/super_gradients/recipes/roboflow_yolo_nas_s.yaml index 4698e76a84..8fb2baf901 100644 --- a/src/super_gradients/recipes/roboflow_yolo_nas_s.yaml +++ 
b/src/super_gradients/recipes/roboflow_yolo_nas_s.yaml @@ -46,7 +46,7 @@ training_hyperparams: zero_weight_decay_on_bias_and_bn: True lr_warmup_epochs: 3 - warmup_mode: EpochStepWarmupLRCallback + warmup_mode: LinearEpochLRWarmup initial_lr: 5e-4 cosine_final_lr_ratio: 0.1 diff --git a/src/super_gradients/recipes/training_hyperparams/coco2017_ppyoloe_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/coco2017_ppyoloe_train_params.yaml index 40e0bdd788..b80d8fcb66 100644 --- a/src/super_gradients/recipes/training_hyperparams/coco2017_ppyoloe_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/coco2017_ppyoloe_train_params.yaml @@ -4,7 +4,7 @@ defaults: max_epochs: 500 static_assigner_end_epoch: 150 -warmup_mode: BatchStepLinearWarmupLRCallback +warmup_mode: LinearBatchLRWarmup warmup_initial_lr: 1e-6 lr_warmup_steps: 1000 lr_warmup_epochs: 0 diff --git a/src/super_gradients/recipes/training_hyperparams/coco2017_yolo_nas_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/coco2017_yolo_nas_train_params.yaml index b34fb54847..cf7c7add94 100644 --- a/src/super_gradients/recipes/training_hyperparams/coco2017_yolo_nas_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/coco2017_yolo_nas_train_params.yaml @@ -3,7 +3,7 @@ defaults: max_epochs: 300 -warmup_mode: BatchStepLinearWarmupLRCallback +warmup_mode: LinearBatchLRWarmup warmup_initial_lr: 1e-6 lr_warmup_steps: 1000 lr_warmup_epochs: 0 diff --git a/src/super_gradients/recipes/training_hyperparams/default_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/default_train_params.yaml index df1a100792..0015f58e9d 100644 --- a/src/super_gradients/recipes/training_hyperparams/default_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/default_train_params.yaml @@ -18,12 +18,12 @@ lr_mode: # Union[str, Mapping] lr_schedule_function: # Learning rate scheduling function to be used when `lr_mode` is 'FunctionLRScheduler'. lr_warmup_epochs: 0 # number of epochs for learning rate warm up - see https://arxiv.org/pdf/1706.02677.pdf (Section 2.2). -lr_warmup_steps: 0 # number of warmup steps (Used when warmup_mode=BatchStepLinearWarmupLRCallback) +lr_warmup_steps: 0 # number of warmup steps (Used when warmup_mode=LinearBatchLRWarmup) lr_cooldown_epochs: 0 # epochs to cooldown LR (i.e the last epoch from scheduling view point=max_epochs-cooldown) -warmup_initial_lr: # Initial lr for EpochStepWarmupLRCallback/BatchStepLinearWarmupLRCallback. When none is given, initial_lr/(warmup_epochs+1) will be used. +warmup_initial_lr: # Initial lr for LinearEpochLRWarmup/LinearBatchLRWarmup. When none is given, initial_lr/(warmup_epochs+1) will be used. step_lr_update_freq: # (float) update frequency in epoch units for computing lr_updates when lr_mode=`StepLRScheduler`. cosine_final_lr_ratio: 0.01 # final learning rate ratio (only relevant when `lr_mode`='CosineLRScheduler') -warmup_mode: EpochStepWarmupLRCallback # learning rate warmup scheme, currently ['LinearStepWarmupLRCallback', 'EpochStepWarmupLRCallback', 'BatchStepLinearWarmupLRCallback'] are supported +warmup_mode: LinearEpochLRWarmup # learning rate warmup scheme, currently ['LinearEpochLRWarmup', 'LinearEpochLRWarmup', 'LinearBatchLRWarmup'] are supported lr_updates: _target_: super_gradients.training.utils.utils.empty_list # This is a workaround to instantiate a list using _target_. 
If we would instantiate as "lr_updates: []", diff --git a/src/super_gradients/recipes/training_hyperparams/imagenet_vit_train_params.yaml b/src/super_gradients/recipes/training_hyperparams/imagenet_vit_train_params.yaml index 258c20d8e1..63598dd9ce 100644 --- a/src/super_gradients/recipes/training_hyperparams/imagenet_vit_train_params.yaml +++ b/src/super_gradients/recipes/training_hyperparams/imagenet_vit_train_params.yaml @@ -7,7 +7,7 @@ lr_mode: CosineLRScheduler cosine_final_lr_ratio: 0 lr_warmup_epochs: 1 warmup_initial_lr: 0 -warmup_mode: EpochStepWarmupLRCallback +warmup_mode: LinearEpochLRWarmup ema: False loss: LabelSmoothingCrossEntropyLoss clip_grad_norm: 1 diff --git a/src/super_gradients/training/params.py b/src/super_gradients/training/params.py index 7234c05fb2..1388457841 100755 --- a/src/super_gradients/training/params.py +++ b/src/super_gradients/training/params.py @@ -48,7 +48,7 @@ "save_tensorboard_remote": False, # upload tensorboard files to s3 "save_logs_remote": False, }, # upload log files to s3 - "warmup_mode": "LinearStepWarmupLRCallback", + "warmup_mode": "LinearEpochLRWarmup", "step_lr_update_freq": None, "lr_updates": [], "clip_grad_norm": None, diff --git a/src/super_gradients/training/utils/callbacks/__init__.py b/src/super_gradients/training/utils/callbacks/__init__.py index faef75994e..88c4d2f45f 100644 --- a/src/super_gradients/training/utils/callbacks/__init__.py +++ b/src/super_gradients/training/utils/callbacks/__init__.py @@ -3,8 +3,8 @@ ModelConversionCheckCallback, DeciLabUploadCallback, LRCallbackBase, - EpochStepWarmupLRCallback, - BatchStepLinearWarmupLRCallback, + LinearEpochLRWarmup, + LinearBatchLRWarmup, StepLRScheduler, ExponentialLRScheduler, PolyLRScheduler, @@ -40,8 +40,8 @@ "ModelConversionCheckCallback", "DeciLabUploadCallback", "LRCallbackBase", - "EpochStepWarmupLRCallback", - "BatchStepLinearWarmupLRCallback", + "LinearEpochLRWarmup", + "LinearBatchLRWarmup", "StepLRScheduler", "ExponentialLRScheduler", "PolyLRScheduler", diff --git a/src/super_gradients/training/utils/deprecated_utils.py b/src/super_gradients/training/utils/deprecated_utils.py index 433615d0e1..8da5257139 100644 --- a/src/super_gradients/training/utils/deprecated_utils.py +++ b/src/super_gradients/training/utils/deprecated_utils.py @@ -10,14 +10,14 @@ def wrap_with_warning(cls: Callable, message: str) -> Any: Emits a warning when target class of function is called. >>> from super_gradients.training.utils.deprecated_utils import wrap_with_warning - >>> from super_gradients.training.utils.callbacks import EpochStepWarmupLRCallback, BatchStepLinearWarmupLRCallback + >>> from super_gradients.training.utils.callbacks import LinearEpochLRWarmup, LinearBatchLRWarmup >>> >>> LR_WARMUP_CLS_DICT = { >>> "linear": wrap_with_warning( - >>> EpochStepWarmupLRCallback, + >>> LinearEpochLRWarmup, >>> message=f"Parameter `linear` has been made deprecated and will be removed in the next SG release. 
Please use `linear_epoch` instead", >>> ), - >>> 'linear_epoch`': EpochStepWarmupLRCallback, + >>> 'linear_epoch`': LinearEpochLRWarmup, >>> } :param cls: A class or function to wrap diff --git a/tests/unit_tests/lr_warmup_test.py b/tests/unit_tests/lr_warmup_test.py index ec3442ae87..2521090499 100644 --- a/tests/unit_tests/lr_warmup_test.py +++ b/tests/unit_tests/lr_warmup_test.py @@ -71,7 +71,7 @@ def test_lr_warmup(self): "greater_metric_to_watch_is_better": True, "ema": False, "phase_callbacks": phase_callbacks, - "warmup_mode": "EpochStepWarmupLRCallback", + "warmup_mode": "LinearEpochLRWarmup", } expected_lrs = [0.25, 0.5, 0.75, 1.0, 1.0] @@ -107,7 +107,7 @@ def test_lr_warmup_with_lr_scheduling(self): "greater_metric_to_watch_is_better": True, "ema": False, "phase_callbacks": phase_callbacks, - "warmup_mode": "EpochStepWarmupLRCallback", + "warmup_mode": "LinearEpochLRWarmup", } expected_lrs = [0.25, 0.5, 0.75, 0.9236067977499791, 0.4763932022500211] @@ -140,7 +140,7 @@ def test_warmup_linear_batch_step(self): "lr_mode": "CosineLRScheduler", "cosine_final_lr_ratio": cosine_final_lr_ratio, "warmup_initial_lr": warmup_initial_lr, - "warmup_mode": "BatchStepLinearWarmupLRCallback", + "warmup_mode": "LinearBatchLRWarmup", "lr_warmup_steps": lr_warmup_steps, "initial_lr": 1, "loss": "CrossEntropyLoss", @@ -200,7 +200,7 @@ def test_warmup_linear_epoch_step(self): "greater_metric_to_watch_is_better": True, "ema": False, "phase_callbacks": [collect_lr_callback], - "warmup_mode": "EpochStepWarmupLRCallback", + "warmup_mode": "LinearEpochLRWarmup", } expected_lrs = [4.0, 3.0, 2.0, 1.0, 1.0] From 0b913fa4f8e12d9279eca0cffb24e4931f790aa3 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Tue, 19 Sep 2023 15:18:33 +0300 Subject: [PATCH 19/21] removed double registry from linear epoch lr warmup --- src/super_gradients/training/utils/callbacks/callbacks.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/super_gradients/training/utils/callbacks/callbacks.py b/src/super_gradients/training/utils/callbacks/callbacks.py index 437f63cc50..e0b52fa327 100644 --- a/src/super_gradients/training/utils/callbacks/callbacks.py +++ b/src/super_gradients/training/utils/callbacks/callbacks.py @@ -306,7 +306,6 @@ class EpochStepWarmupLRCallback(LinearEpochLRWarmup): ... -@register_lr_warmup(LRWarmups.LINEAR_STEP, deprecated_name="linear_step") @deprecated(deprecated_since="3.2.1", removed_from="3.5.0", target=LinearEpochLRWarmup) class LinearLRWarmup(LinearEpochLRWarmup): ... 
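For reference, the combined effect of patches 16 through 19 on user-facing configs: `lr_mode` now takes the scheduler class name (e.g. "StepLRScheduler" instead of "StepLRCallback") and `warmup_mode` takes the warmup class name ("LinearEpochLRWarmup" or "LinearBatchLRWarmup"), while the lowercase names ("step", "linear_epoch_step", "linear_batch_step") stay registered as deprecated aliases. A minimal sketch of training params using the new names, assuming the usual Trainer/model/dataloader setup that is not part of these patches:

training_params = {
    "max_epochs": 2,
    "initial_lr": 0.1,
    "lr_mode": "StepLRScheduler",           # was "StepLRCallback"
    "lr_updates": [1],
    "lr_decay_factor": 0.1,
    "lr_warmup_epochs": 1,
    "warmup_mode": "LinearEpochLRWarmup",   # was "EpochStepWarmupLRCallback"
    "loss": "CrossEntropyLoss",
    "train_metrics_list": ["Accuracy"],
    "valid_metrics_list": ["Accuracy"],
}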
From 1ff60b4cabdf50ccb55e532a744a1e1720894d32 Mon Sep 17 00:00:00 2001 From: shayaharon Date: Tue, 19 Sep 2023 15:47:29 +0300 Subject: [PATCH 20/21] fixed some failing breaking changes --- src/super_gradients/training/losses/__init__.py | 3 ++- .../losses/label_smoothing_cross_entropy_loss.py | 6 ++++++ .../training/utils/callbacks/__init__.py | 14 ++++++++++++++ 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/super_gradients/training/losses/__init__.py b/src/super_gradients/training/losses/__init__.py index 81cebc9485..b0eae2eb2e 100755 --- a/src/super_gradients/training/losses/__init__.py +++ b/src/super_gradients/training/losses/__init__.py @@ -1,6 +1,6 @@ from super_gradients.training.losses.focal_loss import FocalLoss from super_gradients.training.losses.kd_losses import KDLogitsLoss -from super_gradients.training.losses.label_smoothing_cross_entropy_loss import CrossEntropyLoss +from super_gradients.training.losses.label_smoothing_cross_entropy_loss import CrossEntropyLoss, LabelSmoothingCrossEntropyLoss from super_gradients.training.losses.r_squared_loss import RSquaredLoss from super_gradients.training.losses.shelfnet_ohem_loss import ShelfNetOHEMLoss from super_gradients.training.losses.shelfnet_semantic_encoding_loss import ShelfNetSemanticEncodingLoss @@ -34,4 +34,5 @@ "DEKRLoss", "STDCLoss", "RescoringLoss", + "LabelSmoothingCrossEntropyLoss", ] diff --git a/src/super_gradients/training/losses/label_smoothing_cross_entropy_loss.py b/src/super_gradients/training/losses/label_smoothing_cross_entropy_loss.py index 583b177482..708c23ba62 100755 --- a/src/super_gradients/training/losses/label_smoothing_cross_entropy_loss.py +++ b/src/super_gradients/training/losses/label_smoothing_cross_entropy_loss.py @@ -4,6 +4,7 @@ from super_gradients.common.object_names import Losses from super_gradients.common.registry.registry import register_loss +from super_gradients.common.deprecate import deprecated def onehot(indexes, N=None, ignore_index=None): @@ -109,3 +110,8 @@ def forward(self, input, target, smooth_dist=None): # CHANGED TO THE CURRENT FORMAT- OUR CRITERION FUNCTIONS SHOULD ALL NPW RETURN A TUPLE OF (LOSS_FOR_BACKPROP, ADDITIONAL_ITEMS) # WHERE ADDITIONAL ITEMS ARE TORCH TENSORS OF SIZE (N_ITEMS,...) DETACHED FROM THEIR GRADIENTS FOR LOGGING return loss, loss.unsqueeze(0).detach() + + +@deprecated(deprecated_since="3.2.1", removed_from="3.5.0", target=CrossEntropyLoss) +class FunctionLRCallback(CrossEntropyLoss): + ... 
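The empty subclasses above, like the LRWarmup aliases introduced in patch 17, lean on `super_gradients.common.deprecate.deprecated`. Its implementation is not part of this patch series; as a rough mental model only (an assumed sketch, not the library's actual code), the decorator behaves roughly like this:

import warnings


def deprecated(deprecated_since: str, removed_from: str, target: type):
    # Assumed behaviour: the old class keeps working, but constructing it warns and points to the new name.
    def decorator(cls):
        original_init = cls.__init__

        def __init__(self, *args, **kwargs):
            warnings.warn(
                f"`{cls.__name__}` is deprecated since {deprecated_since} and will be removed in {removed_from}. "
                f"Please use `{target.__name__}` instead.",
                DeprecationWarning,
            )
            original_init(self, *args, **kwargs)

        cls.__init__ = __init__
        return cls

    return decorator

This keeps old call sites running (the alias remains a real subclass of the target) while making the migration path explicit.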
diff --git a/src/super_gradients/training/utils/callbacks/__init__.py b/src/super_gradients/training/utils/callbacks/__init__.py index 88c4d2f45f..31103bd3ee 100644 --- a/src/super_gradients/training/utils/callbacks/__init__.py +++ b/src/super_gradients/training/utils/callbacks/__init__.py @@ -21,6 +21,13 @@ YoloXTrainingStageSwitchCallback, TestLRCallback, TimerCallback, + EpochStepWarmupLRCallback, + BatchStepLinearWarmupLRCallback, + StepLRCallback, + ExponentialLRCallback, + PolyLRCallback, + CosineLRCallback, + FunctionLRCallback, ) from super_gradients.training.utils.callbacks.ppyoloe_switch_callback import PPYoloETrainingStageSwitchCallback from super_gradients.common.object_names import Callbacks, LRSchedulers, LRWarmups @@ -60,4 +67,11 @@ "TestLRCallback", "PPYoloETrainingStageSwitchCallback", "TimerCallback", + "EpochStepWarmupLRCallback", + "BatchStepLinearWarmupLRCallback", + "StepLRCallback", + "ExponentialLRCallback", + "PolyLRCallback", + "CosineLRCallback", + "FunctionLRCallback", ] From dd4ae3fe8787f197c7125cb38ed65cc4990fcb4b Mon Sep 17 00:00:00 2001 From: shayaharon Date: Tue, 19 Sep 2023 15:48:48 +0300 Subject: [PATCH 21/21] fixed changed CELoss ref --- .../training/losses/label_smoothing_cross_entropy_loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/super_gradients/training/losses/label_smoothing_cross_entropy_loss.py b/src/super_gradients/training/losses/label_smoothing_cross_entropy_loss.py index 708c23ba62..f9a1f36476 100755 --- a/src/super_gradients/training/losses/label_smoothing_cross_entropy_loss.py +++ b/src/super_gradients/training/losses/label_smoothing_cross_entropy_loss.py @@ -113,5 +113,5 @@ def forward(self, input, target, smooth_dist=None): @deprecated(deprecated_since="3.2.1", removed_from="3.5.0", target=CrossEntropyLoss) -class FunctionLRCallback(CrossEntropyLoss): +class LabelSmoothingCrossEntropyLoss(CrossEntropyLoss): ...
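Taken together, patches 20 and 21 keep the old public names importable while steering users to the renamed classes. A quick smoke check of the backward-compatibility surface (a hypothetical snippet, not part of the patch series) would be:

>>> from super_gradients.training.losses import CrossEntropyLoss, LabelSmoothingCrossEntropyLoss
>>> from super_gradients.training.utils.callbacks import LinearEpochLRWarmup, EpochStepWarmupLRCallback
>>> issubclass(LabelSmoothingCrossEntropyLoss, CrossEntropyLoss)
True
>>> issubclass(EpochStepWarmupLRCallback, LinearEpochLRWarmup)
True

Instantiating either deprecated alias is expected to emit a DeprecationWarning naming its replacement, per the `@deprecated(deprecated_since="3.2.1", removed_from="3.5.0", ...)` decorators introduced above.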