Deci-AI · BloodAxe · Oct 13, 2023 · Oct 2, 2023 · Oct 2, 2023 · Oct 3, 2023
@@ -16,6 +16,7 @@ class Losses:
     DICE_CE_EDGE_LOSS = "DiceCEEdgeLoss"
     DEKR_LOSS = "DEKRLoss"
     RESCORING_LOSS = "RescoringLoss"
+    YOLONAS_POSE_LOSS = "YoloNASPoseLoss"
 
 
 class Metrics:
@@ -113,11 +114,13 @@ class Transforms:
     KeypointsImageNormalize = "KeypointsImageNormalize"
     KeypointsImageStandardize = "KeypointsImageStandardize"
     KeypointsImageToTensor = "KeypointsImageToTensor"
-    KeypointTransform = "KeypointTransform"
     KeypointsPadIfNeeded = "KeypointsPadIfNeeded"
     KeypointsLongestMaxSize = "KeypointsLongestMaxSize"
     KeypointsRandomVerticalFlip = "KeypointsRandomVerticalFlip"
     KeypointsRandomHorizontalFlip = "KeypointsRandomHorizontalFlip"
+    KeypointsRescale = "KeypointsRescale"
+    KeypointsRandomRotate90 = "KeypointsRandomRotate90"
+    KeypointsRemoveSmallObjects = "KeypointsRemoveSmallObjects"
 
 
 class Optimizers:
@@ -312,6 +315,11 @@ class Models:
     POSE_RESCORING = "pose_rescoring_custom"
     POSE_RESCORING_COCO = "pose_rescoring_coco"
 
+    YOLO_NAS_POSE_N = "yolo_nas_pose_n"
+    YOLO_NAS_POSE_S = "yolo_nas_pose_s"
+    YOLO_NAS_POSE_M = "yolo_nas_pose_m"
+    YOLO_NAS_POSE_L = "yolo_nas_pose_l"
+
 
 class ConcatenatedTensorFormats:
     XYXY_LABEL = "XYXY_LABEL"
@@ -418,6 +426,7 @@ class Processings:
     DetectionLongestMaxSizeRescale = "DetectionLongestMaxSizeRescale"
     DetectionBottomRightPadding = "DetectionBottomRightPadding"
     DetectionRescale = "DetectionRescale"
+    KeypointsRescale = "KeypointsRescale"
     KeypointsLongestMaxSizeRescale = "KeypointsLongestMaxSizeRescale"
     KeypointsBottomRightPadding = "KeypointsBottomRightPadding"
     ImagePermute = "ImagePermute"

@@ -1,5 +1,8 @@
 from .module_interfaces import HasPredict, HasPreprocessingParams, SupportsReplaceNumClasses
-from .exportable_detector import ExportableObjectDetectionModel, AbstractObjectDetectionDecodingModule, ModelHasNoPreprocessingParamsException
+from .exceptions import ModelHasNoPreprocessingParamsException
+from .exportable_detector import ExportableObjectDetectionModel, AbstractObjectDetectionDecodingModule
+from .exportable_pose_estimation import ExportablePoseEstimationModel, PoseEstimationModelExportResult, AbstractPoseEstimationDecodingModule
+from .pose_estimation_post_prediction_callback import AbstractPoseEstimationPostPredictionCallback, PoseEstimationPredictions
 
 __all__ = [
     "HasPredict",
@@ -8,4 +11,9 @@
     "ExportableObjectDetectionModel",
     "AbstractObjectDetectionDecodingModule",
     "ModelHasNoPreprocessingParamsException",
+    "AbstractPoseEstimationPostPredictionCallback",
+    "PoseEstimationPredictions",
+    "ExportablePoseEstimationModel",
+    "PoseEstimationModelExportResult",
+    "AbstractPoseEstimationDecodingModule",
 ]
@@ -0,0 +1,6 @@
+class ModelHasNoPreprocessingParamsException(Exception):
+    """
+    Exception raised when a model does not have preprocessing parameters.
+    """
+
+    pass
diff --git a/src/super_gradients/module_interfaces/pose_estimation_post_prediction_callback.py b/src/super_gradients/module_interfaces/pose_estimation_post_prediction_callback.py
@@ -0,0 +1,37 @@
+import abc
+import dataclasses
+import numpy as np
+
+from typing import Any, List
+from typing import Union, Optional
+from torch import Tensor
+
+__all__ = ["PoseEstimationPredictions", "AbstractPoseEstimationPostPredictionCallback"]
+
+
+@dataclasses.dataclass
+class PoseEstimationPredictions:
+    """
+    A data class that encapsulates pose estimation predictions for a single image.
+
+    :param poses:        Array of shape [N, K, 3] where N is the number of poses and K is the number of joints.
+                         Last dimension is [x, y, score] where score is the confidence score for the specific joint
+                         with [0..1] range.
+    :param scores:       Array of shape [N] with scores for each pose with [0..1] range.
+    :param bboxes_xyxy:  Array of shape [N, 4] with bounding boxes for each pose in XYXY format.
+                         Can be None if bounding boxes are not available (for instance, DEKR model does not output boxes).
+    """
+
+    poses: Union[Tensor, np.ndarray]
+    scores: Union[Tensor, np.ndarray]
+    bboxes_xyxy: Optional[Union[Tensor, np.ndarray]]
+
+
+class AbstractPoseEstimationPostPredictionCallback(abc.ABC):
+    """
+    An abstract base class defining the interface of a post-prediction callback for pose estimation models.
+    """
+
+    @abc.abstractmethod
+    def __call__(self, predictions: Any) -> List[PoseEstimationPredictions]:
+        ...
@@ -0,0 +1,143 @@
+in_channels: 3
+
+backbone:
+  NStageBackbone:
+
+    stem:
+      YoloNASStem:
+        out_channels: 48
+
+    stages:
+      - YoloNASStage:
+          out_channels: 96
+          num_blocks: 2
+          activation_type: relu
+          hidden_channels: 96
+          concat_intermediates: True
+
+      - YoloNASStage:
+          out_channels: 192
+          num_blocks: 3
+          activation_type: relu
+          hidden_channels: 128
+          concat_intermediates: True
+
+      - YoloNASStage:
+          out_channels: 384
+          num_blocks: 5
+          activation_type: relu
+          hidden_channels: 256
+          concat_intermediates: True
+
+      - YoloNASStage:
+          out_channels: 768
+          num_blocks: 2
+          activation_type: relu
+          hidden_channels: 512
+          concat_intermediates: True
+
+
+    context_module:
+      SPP:
+        output_channels: 768
+        activation_type: relu
+        k: [5,9,13]
+
+    out_layers: [stage1, stage2, stage3, context_module]
+
+neck:
+  YoloNASPANNeckWithC2:
+
+    neck1:
+      YoloNASUpStage:
+        out_channels: 192
+        num_blocks: 4
+        hidden_channels: 128
+        width_mult: 1
+        depth_mult: 1
+        activation_type: relu
+        reduce_channels: True
+
+    neck2:
+      YoloNASUpStage:
+        out_channels: 96
+        num_blocks: 4
+        hidden_channels: 128
+        width_mult: 1
+        depth_mult: 1
+        activation_type: relu
+        reduce_channels: True
+
+    neck3:
+      YoloNASDownStage:
+        out_channels: 192
+        num_blocks: 4
+        hidden_channels: 128
+        activation_type: relu
+        width_mult: 1
+        depth_mult: 1
+
+    neck4:
+      YoloNASDownStage:
+        out_channels: 384
+        num_blocks: 4
+        hidden_channels: 256
+        activation_type: relu
+        width_mult: 1
+        depth_mult: 1
+
+heads:
+  YoloNASPoseNDFLHeads:
+    num_classes: 17
+    reg_max: 16
+    heads_list:
+      - YoloNASPoseDFLHead:
+          bbox_inter_channels: 128
+          pose_inter_channels: 128
+          pose_regression_blocks: 2
+          shared_stem: False
+          width_mult: 1
+          pose_conf_in_class_head: True
+          pose_block_use_repvgg: False
+          first_conv_group_size: 0
+          num_classes:
+          stride: 8
+          reg_max: 16
+          cls_dropout_rate: 0.0
+          reg_dropout_rate: 0.0
+
+      - YoloNASPoseDFLHead:
+          bbox_inter_channels: 256
+          pose_inter_channels: 512
+          pose_regression_blocks: 2
+          shared_stem: False
+          width_mult: 1
+          pose_conf_in_class_head: True
+          pose_block_use_repvgg: False
+          first_conv_group_size: 0
+          num_classes:
+          stride: 16
+          reg_max: 16
+          cls_dropout_rate: 0.0
+          reg_dropout_rate: 0.0
+
+      - YoloNASPoseDFLHead:
+          bbox_inter_channels: 512
+          pose_inter_channels: 512
+          pose_regression_blocks: 3
+          shared_stem: False
+          width_mult: 1
+          pose_conf_in_class_head: True
+          pose_block_use_repvgg: False
+          first_conv_group_size: 0
+          num_classes:
+          stride: 32
+          reg_max: 16
+          cls_dropout_rate: 0.0
+          reg_dropout_rate: 0.0
+
+bn_eps: 1e-6
+bn_momentum: 0.03
+inplace_act: True
+
+_convert_: all
@@ -0,0 +1,139 @@
+in_channels: 3
+
+backbone:
+  NStageBackbone:
+
+    stem:
+      YoloNASStem:
+        out_channels: 48
+
+    stages:
+      - YoloNASStage:
+          out_channels: 96
+          num_blocks: 2
+          activation_type: relu
+          hidden_channels: 64
+          concat_intermediates: True
+
+      - YoloNASStage:
+          out_channels: 192
+          num_blocks: 3
+          activation_type: relu
+          hidden_channels: 128
+          concat_intermediates: True
+
+      - YoloNASStage:
+          out_channels: 384
+          num_blocks: 5
+          activation_type: relu
+          hidden_channels: 256
+          concat_intermediates: True
+
+      - YoloNASStage:
+          out_channels: 768
+          num_blocks: 2
+          activation_type: relu
+          hidden_channels: 384
+          concat_intermediates: False
+
+
+    context_module:
+      SPP:
+        output_channels: 768
+        activation_type: relu
+        k: [5,9,13]
+
+    out_layers: [stage1, stage2, stage3, context_module]
+
+neck:
+  YoloNASPANNeckWithC2:
+
+    neck1:
+      YoloNASUpStage:
+        out_channels: 192
+        num_blocks: 2
+        hidden_channels: 192
+        width_mult: 1
+        depth_mult: 1
+        activation_type: relu
+        reduce_channels: True
+
+    neck2:
+      YoloNASUpStage:
+        out_channels: 96
+        num_blocks: 3
+        hidden_channels: 64
+        width_mult: 1
+        depth_mult: 1
+        activation_type: relu
+        reduce_channels: True
+
+    neck3:
+      YoloNASDownStage:
+        out_channels: 192
+        num_blocks: 2
+        hidden_channels: 192
+        activation_type: relu
+        width_mult: 1
+        depth_mult: 1
+
+    neck4:
+      YoloNASDownStage:
+        out_channels: 384
+        num_blocks: 3
+        hidden_channels: 256
+        activation_type: relu
+        width_mult: 1
+        depth_mult: 1
+
+heads:
+  YoloNASPoseNDFLHeads:
+    num_classes: 17
+    reg_max: 16
+    pose_offset_multiplier: 1.0
+    compensate_grid_cell_offset: True
+    inference_mode: False # Set to True only when benchmarking
+    heads_list:
+      - YoloNASPoseDFLHead:
+          bbox_inter_channels: 128
+          pose_inter_channels: 128
+          pose_regression_blocks: 2
+          shared_stem: False
+          width_mult: 0.75
+          pose_conf_in_class_head: True
+          pose_block_use_repvgg: False
+          first_conv_group_size: 0
+          num_classes:
+          stride: 8
+          reg_max: 16
+      - YoloNASPoseDFLHead:
+          bbox_inter_channels: 256
+          pose_inter_channels: 512
+          pose_regression_blocks: 2
+          shared_stem: False
+          width_mult: 0.75
+          pose_conf_in_class_head: True
+          pose_block_use_repvgg: False
+          first_conv_group_size: 0
+          num_classes:
+          stride: 16
+          reg_max: 16
+      - YoloNASPoseDFLHead:
+          bbox_inter_channels: 512
+          pose_inter_channels: 512
+          pose_regression_blocks: 3
+          shared_stem: False
+          width_mult: 0.75
+          pose_conf_in_class_head: True
+          pose_block_use_repvgg: False
+          first_conv_group_size: 0
+          num_classes:
+          stride: 32
+          reg_max: 16
+
+
+bn_eps: 1e-6
+bn_momentum: 0.1
+inplace_act: True
+
+_convert_: all