Squashed changes with YoloNASPose & Loss (#1512)

* Introduce sample-centric keypoint transforms * Cleanup leftovers * Fixed way of checking transforms that require additional samples * Docstrings * :attr -> :param * Added docs clarifying behavior of mosaic & mixup * Added docs clarifying behavior of mosaic & mixup * Improved tests * Additional docstrings & typing annotations * Added missing additional_samples_count field * KeypointsRemoveSmallObjects * KeypointsRemoveSmallObjects * Feature/sg 1060 yolo nas pose release pr to add datasets and metric (#1506) * Cherry pick changes to post-prediction, visualization and metric * Remove unwanted references to new datasets * Remove YoloNASPoseCollateFN * Fixed unit test * Improve clarify of bbox format by giving it more explicit name and added a bunch of docstrings * Improve variable names * Squashed changes with YoloNASPose & Loss * Remove print statement * Fixed attribute name that was not renamed * Improve docstrings to use 'Num Keypoints' instead of magic number 17 * Fixed PoseNMS export to work with custom number of keypoints * Added docstrings * Simplify forward/forward_eval * Simplify forward/forward_eval * _insert_heads_list_params * Added tests * Refactor the way we generate usage instructions. Should be easier to follow and update * Improve docstrings * Improved docstrings * Improved docstrings * Improved docstrings * Improved docstrings * Rename bboxes -> bboxes_xyxy * Fixed instructions text * Improve efficiency of training --------- Co-authored-by: Shay Aharon <80472096+shaydeci@users.noreply.github.com>
Deci-AI · Oct 13, 2023 · 3a19f85 · 3a19f85
1 parent 0b79e68
commit 3a19f85
Show file tree

Hide file tree

Showing 28 changed files with 3,862 additions and 10 deletions.
diff --git a/src/super_gradients/common/object_names.py b/src/super_gradients/common/object_names.py
@@ -16,6 +16,7 @@ class Losses:
     DICE_CE_EDGE_LOSS = "DiceCEEdgeLoss"
     DEKR_LOSS = "DEKRLoss"
     RESCORING_LOSS = "RescoringLoss"
+    YOLONAS_POSE_LOSS = "YoloNASPoseLoss"
 
 
 class Metrics:
@@ -314,6 +315,11 @@ class Models:
     POSE_RESCORING = "pose_rescoring_custom"
     POSE_RESCORING_COCO = "pose_rescoring_coco"
 
+    YOLO_NAS_POSE_N = "yolo_nas_pose_n"
+    YOLO_NAS_POSE_S = "yolo_nas_pose_s"
+    YOLO_NAS_POSE_M = "yolo_nas_pose_m"
+    YOLO_NAS_POSE_L = "yolo_nas_pose_l"
+
 
 class ConcatenatedTensorFormats:
     XYXY_LABEL = "XYXY_LABEL"

diff --git a/src/super_gradients/conversion/onnx/pose_nms.py b/src/super_gradients/conversion/onnx/pose_nms.py
diff --git a/src/super_gradients/module_interfaces/__init__.py b/src/super_gradients/module_interfaces/__init__.py
@@ -1,5 +1,7 @@
 from .module_interfaces import HasPredict, HasPreprocessingParams, SupportsReplaceNumClasses
-from .exportable_detector import ExportableObjectDetectionModel, AbstractObjectDetectionDecodingModule, ModelHasNoPreprocessingParamsException
+from .exceptions import ModelHasNoPreprocessingParamsException
+from .exportable_detector import ExportableObjectDetectionModel, AbstractObjectDetectionDecodingModule
+from .exportable_pose_estimation import ExportablePoseEstimationModel, PoseEstimationModelExportResult, AbstractPoseEstimationDecodingModule
 from .pose_estimation_post_prediction_callback import AbstractPoseEstimationPostPredictionCallback, PoseEstimationPredictions
 
 __all__ = [
@@ -11,4 +13,7 @@
     "ModelHasNoPreprocessingParamsException",
     "AbstractPoseEstimationPostPredictionCallback",
     "PoseEstimationPredictions",
+    "ExportablePoseEstimationModel",
+    "PoseEstimationModelExportResult",
+    "AbstractPoseEstimationDecodingModule",
 ]
diff --git a/src/super_gradients/module_interfaces/exceptions.py b/src/super_gradients/module_interfaces/exceptions.py
@@ -0,0 +1,6 @@
+class ModelHasNoPreprocessingParamsException(Exception):
+    """
+    Exception that is raised when model does not have preprocessing parameters.
+    """
+
+    pass
diff --git a/src/super_gradients/module_interfaces/exportable_pose_estimation.py b/src/super_gradients/module_interfaces/exportable_pose_estimation.py
diff --git a/src/super_gradients/module_interfaces/usage_instructions.py b/src/super_gradients/module_interfaces/usage_instructions.py
@@ -0,0 +1,155 @@
+import numpy as np
+from torch import nn
+
+from super_gradients.conversion.conversion_utils import torch_dtype_to_numpy_dtype
+from super_gradients.conversion.conversion_enums import ExportTargetBackend, ExportQuantizationMode, DetectionOutputFormatMode
+
+
+def build_preprocessing_hint_text(preprocessing_module: nn.Module) -> str:
+    module_repr = repr(preprocessing_module)
+    return f"""
+Exported model already contains preprocessing (normalization) step, so you don't need to do it manually.
+Preprocessing steps to be applied to input image are:
+{module_repr}
+"""
+
+
+def build_postprocessing_hint_text(num_pre_nms_predictions, max_predictions_per_image, nms_threshold, confidence_threshold, output_predictions_format) -> str:
+    return f"""
+Exported model contains postprocessing (NMS) step with the following parameters:
+    num_pre_nms_predictions={num_pre_nms_predictions}
+    max_predictions_per_image={max_predictions_per_image}
+    nms_threshold={nms_threshold}
+    confidence_threshold={confidence_threshold}
+    output_predictions_format={output_predictions_format}
+
+"""
+
+
+def build_usage_instructions_for_pose_estimation(
+    *,
+    output,
+    batch_size,
+    input_image_channels,
+    input_image_shape,
+    input_image_dtype,
+    preprocessing,
+    preprocessing_module,
+    postprocessing,
+    postprocessing_module,
+    num_pre_nms_predictions,
+    max_predictions_per_image,
+    confidence_threshold,
+    nms_threshold,
+    output_predictions_format,
+    engine: ExportTargetBackend,
+    quantization_mode: ExportQuantizationMode,
+) -> str:
+    # Add usage instructions
+    usage_instructions = f"""
+Model exported successfully to {output}
+Model expects input image of shape [{batch_size}, {input_image_channels}, {input_image_shape[0]}, {input_image_shape[1]}]
+Input image dtype is {input_image_dtype}"""
+
+    if preprocessing:
+        preprocessing_hint_text = build_preprocessing_hint_text(preprocessing_module)
+        usage_instructions += f"\n{preprocessing_hint_text}"
+
+    if postprocessing:
+        postprocessing_hint_text = build_postprocessing_hint_text(
+            num_pre_nms_predictions, max_predictions_per_image, nms_threshold, confidence_threshold, output_predictions_format
+        )
+        usage_instructions += f"\n{postprocessing_hint_text}"
+
+    if engine in (ExportTargetBackend.ONNXRUNTIME, ExportTargetBackend.TENSORRT):
+        dtype_name = np.dtype(torch_dtype_to_numpy_dtype(input_image_dtype)).name
+
+        usage_instructions += f"""
+Exported model is in ONNX format and can be used with ONNXRuntime
+To run inference with ONNXRuntime, please use the following code snippet:
+
+    import onnxruntime
+    import numpy as np
+    session = onnxruntime.InferenceSession("{output}", providers=["CUDAExecutionProvider", "CPUExecutionProvider"])
+    inputs = [o.name for o in session.get_inputs()]
+    outputs = [o.name for o in session.get_outputs()]
+
+    example_input_image = np.zeros(({batch_size}, {input_image_channels}, {input_image_shape[0]}, {input_image_shape[1]})).astype(np.{dtype_name})
+    predictions = session.run(outputs, {{inputs[0]: example_input_image}})
+
+Exported model can also be used with TensorRT
+To run inference with TensorRT, please see TensorRT deployment documentation
+You can benchmark the model using the following code snippet:
+
+    trtexec --onnx={output} {'--int8' if quantization_mode == ExportQuantizationMode.INT8 else '--fp16'} --avgRuns=100 --duration=15
+
+"""
+
+    if postprocessing is True:
+        if output_predictions_format == DetectionOutputFormatMode.FLAT_FORMAT:
+            usage_instructions += f"""
+Exported model has predictions in {output_predictions_format} format:
+
+# flat_predictions is a 2D array of [N,K] shape
+# Each row represents (image_index, x_min, y_min, x_max, y_max, confidence, joints...)
+# Please note all values are floats, so you have to convert them to integers if needed
+
+[flat_predictions] = predictions"""
+
+            if batch_size == 1:
+                usage_instructions += """
+pred_bboxes = flat_predictions[:, 1:5]
+pred_scores = flat_predictions[:, 5]
+pred_joints = flat_predictions[:, 6:].reshape((len(pred_bboxes), -1, 3))
+for i in range(len(pred_bboxes)):
+    confidence = pred_scores[i]
+    x_min, y_min, x_max, y_max = pred_bboxes[i]
+    print(f"Detected pose with confidence={{confidence}}, x_min={{x_min}}, y_min={{y_min}}, x_max={{x_max}}, y_max={{y_max}}")
+    for joint_index, (x, y, confidence) in enumerate(pred_joints[i]):")
+        print(f"Joint {{joint_index}} has coordinates x={{x}}, y={{y}}, confidence={{confidence}}")
+
+"""
+
+            else:
+                usage_instructions += f"""
+for current_sample in range({batch_size}):
+    predictions_for_current_sample = predictions[predictions[0] == current_sample]
+    print("Predictions for sample " + str(current_sample))
+    pred_bboxes = predictions_for_current_sample[:, 1:5]
+    pred_scores = predictions_for_current_sample[:, 5]
+    pred_joints = predictions_for_current_sample[:, 6:].reshape((len(pred_bboxes), -1, 3))
+    for i in range(len(pred_bboxes)):
+        confidence = pred_scores[i]
+        x_min, y_min, x_max, y_max = pred_bboxes[i]
+        print(f"Detected pose with confidence={{confidence}}, x_min={{x_min}}, y_min={{y_min}}, x_max={{x_max}}, y_max={{y_max}}")
+        for joint_index, (x, y, confidence) in enumerate(pred_joints[i]):
+            print(f"Joint {{joint_index}} has coordinates x={{x}}, y={{y}}, confidence={{confidence}}")
+
+"""
+
+        elif output_predictions_format == DetectionOutputFormatMode.BATCH_FORMAT:
+            # fmt: off
+            usage_instructions += f"""Exported model has predictions in {output_predictions_format} format:
+
+    num_detections, pred_boxes, pred_scores, pred_joints = predictions
+    for image_index in range(num_detections.shape[0]):
+        for i in range(num_detections[image_index,0]):
+            confidence = pred_scores[image_index, i]
+            x_min, y_min, x_max, y_max = pred_boxes[image_index, i]
+            pred_joints = pred_joints[image_index, i]
+            print(f"Detected pose with confidence={{confidence}}, x_min={{x_min}}, y_min={{y_min}}, x_max={{x_max}}, y_max={{y_max}}")
+            for joint_index, (x, y, confidence) in enumerate(pred_joints[i]):
+                print(f"Joint {{joint_index}} has coordinates x={{x}}, y={{y}}, confidence={{confidence}}")
+
+"""
+    elif postprocessing is False:
+        usage_instructions += """Model exported with postprocessing=False
+No decoding or NMS is added to the model, so you will have to decode predictions manually.
+Please refer to the documentation for the model you exported"""
+    elif isinstance(postprocessing_module, nn.Module):
+        usage_instructions += f""""Exported model contains a custom postprocessing step.
+We are unable to provide usage instructions to user-provided postprocessing module
+But here is the human-friendly representation of the postprocessing module:
+        {repr(postprocessing_module)}"""
+
+    return usage_instructions
diff --git a/src/super_gradients/recipes/arch_params/yolo_nas_pose_l_arch_params.yaml b/src/super_gradients/recipes/arch_params/yolo_nas_pose_l_arch_params.yaml
@@ -0,0 +1,143 @@
+in_channels: 3
+
+backbone:
+  NStageBackbone:
+
+    stem:
+      YoloNASStem:
+        out_channels: 48
+
+    stages:
+      - YoloNASStage:
+          out_channels: 96
+          num_blocks: 2
+          activation_type: relu
+          hidden_channels: 96
+          concat_intermediates: True
+
+      - YoloNASStage:
+          out_channels: 192
+          num_blocks: 3
+          activation_type: relu
+          hidden_channels: 128
+          concat_intermediates: True
+
+      - YoloNASStage:
+          out_channels: 384
+          num_blocks: 5
+          activation_type: relu
+          hidden_channels: 256
+          concat_intermediates: True
+
+      - YoloNASStage:
+          out_channels: 768
+          num_blocks: 2
+          activation_type: relu
+          hidden_channels: 512
+          concat_intermediates: True
+
+
+    context_module:
+      SPP:
+        output_channels: 768
+        activation_type: relu
+        k: [5,9,13]
+
+    out_layers: [stage1, stage2, stage3, context_module]
+
+neck:
+  YoloNASPANNeckWithC2:
+
+    neck1:
+      YoloNASUpStage:
+        out_channels: 192
+        num_blocks: 4
+        hidden_channels: 128
+        width_mult: 1
+        depth_mult: 1
+        activation_type: relu
+        reduce_channels: True
+
+    neck2:
+      YoloNASUpStage:
+        out_channels: 96
+        num_blocks: 4
+        hidden_channels: 128
+        width_mult: 1
+        depth_mult: 1
+        activation_type: relu
+        reduce_channels: True
+
+    neck3:
+      YoloNASDownStage:
+        out_channels: 192
+        num_blocks: 4
+        hidden_channels: 128
+        activation_type: relu
+        width_mult: 1
+        depth_mult: 1
+
+    neck4:
+      YoloNASDownStage:
+        out_channels: 384
+        num_blocks: 4
+        hidden_channels: 256
+        activation_type: relu
+        width_mult: 1
+        depth_mult: 1
+
+heads:
+  YoloNASPoseNDFLHeads:
+    num_classes: 17
+    reg_max: 16
+    heads_list:
+      - YoloNASPoseDFLHead:
+          bbox_inter_channels: 128
+          pose_inter_channels: 128
+          pose_regression_blocks: 2
+          shared_stem: False
+          width_mult: 1
+          pose_conf_in_class_head: True
+          pose_block_use_repvgg: False
+          first_conv_group_size: 0
+          num_classes:
+          stride: 8
+          reg_max: 16
+          cls_dropout_rate: 0.0
+          reg_dropout_rate: 0.0
+
+      - YoloNASPoseDFLHead:
+          bbox_inter_channels: 256
+          pose_inter_channels: 512
+          pose_regression_blocks: 2
+          shared_stem: False
+          width_mult: 1
+          pose_conf_in_class_head: True
+          pose_block_use_repvgg: False
+          first_conv_group_size: 0
+          num_classes:
+          stride: 16
+          reg_max: 16
+          cls_dropout_rate: 0.0
+          reg_dropout_rate: 0.0
+
+      - YoloNASPoseDFLHead:
+          bbox_inter_channels: 512
+          pose_inter_channels: 512
+          pose_regression_blocks: 3
+          shared_stem: False
+          width_mult: 1
+          pose_conf_in_class_head: True
+          pose_block_use_repvgg: False
+          first_conv_group_size: 0
+          num_classes:
+          stride: 32
+          reg_max: 16
+          cls_dropout_rate: 0.0
+          reg_dropout_rate: 0.0
+
+bn_eps: 1e-6
+bn_momentum: 0.03
+inplace_act: True
+
+_convert_: all