-
Notifications
You must be signed in to change notification settings - Fork 488
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Squashed changes with YoloNASPose & Loss (#1512)
* Introduce sample-centric keypoint transforms * Cleanup leftovers * Fixed way of checking transforms that require additional samples * Docstrings * :attr -> :param * Added docs clarifying behavior of mosaic & mixup * Added docs clarifying behavior of mosaic & mixup * Improved tests * Additional docstrings & typing annotations * Added missing additional_samples_count field * KeypointsRemoveSmallObjects * KeypointsRemoveSmallObjects * Feature/sg 1060 yolo nas pose release pr to add datasets and metric (#1506) * Cherry pick changes to post-prediction, visualization and metric * Remove unwanted references to new datasets * Remove YoloNASPoseCollateFN * Fixed unit test * Improve clarify of bbox format by giving it more explicit name and added a bunch of docstrings * Improve variable names * Squashed changes with YoloNASPose & Loss * Remove print statement * Fixed attribute name that was not renamed * Improve docstrings to use 'Num Keypoints' instead of magic number 17 * Fixed PoseNMS export to work with custom number of keypoints * Added docstrings * Simplify forward/forward_eval * Simplify forward/forward_eval * _insert_heads_list_params * Added tests * Refactor the way we generate usage instructions. Should be easier to follow and update * Improve docstrings * Improved docstrings * Improved docstrings * Improved docstrings * Improved docstrings * Rename bboxes -> bboxes_xyxy * Fixed instructions text * Improve efficiency of training --------- Co-authored-by: Shay Aharon <80472096+shaydeci@users.noreply.github.com>
- Loading branch information
Showing
28 changed files
with
3,862 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
class ModelHasNoPreprocessingParamsException(Exception): | ||
""" | ||
Exception that is raised when model does not have preprocessing parameters. | ||
""" | ||
|
||
pass |
552 changes: 552 additions & 0 deletions
552
src/super_gradients/module_interfaces/exportable_pose_estimation.py
Large diffs are not rendered by default.
Oops, something went wrong.
155 changes: 155 additions & 0 deletions
155
src/super_gradients/module_interfaces/usage_instructions.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,155 @@ | ||
import numpy as np | ||
from torch import nn | ||
|
||
from super_gradients.conversion.conversion_utils import torch_dtype_to_numpy_dtype | ||
from super_gradients.conversion.conversion_enums import ExportTargetBackend, ExportQuantizationMode, DetectionOutputFormatMode | ||
|
||
|
||
def build_preprocessing_hint_text(preprocessing_module: nn.Module) -> str: | ||
module_repr = repr(preprocessing_module) | ||
return f""" | ||
Exported model already contains preprocessing (normalization) step, so you don't need to do it manually. | ||
Preprocessing steps to be applied to input image are: | ||
{module_repr} | ||
""" | ||
|
||
|
||
def build_postprocessing_hint_text(num_pre_nms_predictions, max_predictions_per_image, nms_threshold, confidence_threshold, output_predictions_format) -> str: | ||
return f""" | ||
Exported model contains postprocessing (NMS) step with the following parameters: | ||
num_pre_nms_predictions={num_pre_nms_predictions} | ||
max_predictions_per_image={max_predictions_per_image} | ||
nms_threshold={nms_threshold} | ||
confidence_threshold={confidence_threshold} | ||
output_predictions_format={output_predictions_format} | ||
""" | ||
|
||
|
||
def build_usage_instructions_for_pose_estimation( | ||
*, | ||
output, | ||
batch_size, | ||
input_image_channels, | ||
input_image_shape, | ||
input_image_dtype, | ||
preprocessing, | ||
preprocessing_module, | ||
postprocessing, | ||
postprocessing_module, | ||
num_pre_nms_predictions, | ||
max_predictions_per_image, | ||
confidence_threshold, | ||
nms_threshold, | ||
output_predictions_format, | ||
engine: ExportTargetBackend, | ||
quantization_mode: ExportQuantizationMode, | ||
) -> str: | ||
# Add usage instructions | ||
usage_instructions = f""" | ||
Model exported successfully to {output} | ||
Model expects input image of shape [{batch_size}, {input_image_channels}, {input_image_shape[0]}, {input_image_shape[1]}] | ||
Input image dtype is {input_image_dtype}""" | ||
|
||
if preprocessing: | ||
preprocessing_hint_text = build_preprocessing_hint_text(preprocessing_module) | ||
usage_instructions += f"\n{preprocessing_hint_text}" | ||
|
||
if postprocessing: | ||
postprocessing_hint_text = build_postprocessing_hint_text( | ||
num_pre_nms_predictions, max_predictions_per_image, nms_threshold, confidence_threshold, output_predictions_format | ||
) | ||
usage_instructions += f"\n{postprocessing_hint_text}" | ||
|
||
if engine in (ExportTargetBackend.ONNXRUNTIME, ExportTargetBackend.TENSORRT): | ||
dtype_name = np.dtype(torch_dtype_to_numpy_dtype(input_image_dtype)).name | ||
|
||
usage_instructions += f""" | ||
Exported model is in ONNX format and can be used with ONNXRuntime | ||
To run inference with ONNXRuntime, please use the following code snippet: | ||
import onnxruntime | ||
import numpy as np | ||
session = onnxruntime.InferenceSession("{output}", providers=["CUDAExecutionProvider", "CPUExecutionProvider"]) | ||
inputs = [o.name for o in session.get_inputs()] | ||
outputs = [o.name for o in session.get_outputs()] | ||
example_input_image = np.zeros(({batch_size}, {input_image_channels}, {input_image_shape[0]}, {input_image_shape[1]})).astype(np.{dtype_name}) | ||
predictions = session.run(outputs, {{inputs[0]: example_input_image}}) | ||
Exported model can also be used with TensorRT | ||
To run inference with TensorRT, please see TensorRT deployment documentation | ||
You can benchmark the model using the following code snippet: | ||
trtexec --onnx={output} {'--int8' if quantization_mode == ExportQuantizationMode.INT8 else '--fp16'} --avgRuns=100 --duration=15 | ||
""" | ||
|
||
if postprocessing is True: | ||
if output_predictions_format == DetectionOutputFormatMode.FLAT_FORMAT: | ||
usage_instructions += f""" | ||
Exported model has predictions in {output_predictions_format} format: | ||
# flat_predictions is a 2D array of [N,K] shape | ||
# Each row represents (image_index, x_min, y_min, x_max, y_max, confidence, joints...) | ||
# Please note all values are floats, so you have to convert them to integers if needed | ||
[flat_predictions] = predictions""" | ||
|
||
if batch_size == 1: | ||
usage_instructions += """ | ||
pred_bboxes = flat_predictions[:, 1:5] | ||
pred_scores = flat_predictions[:, 5] | ||
pred_joints = flat_predictions[:, 6:].reshape((len(pred_bboxes), -1, 3)) | ||
for i in range(len(pred_bboxes)): | ||
confidence = pred_scores[i] | ||
x_min, y_min, x_max, y_max = pred_bboxes[i] | ||
print(f"Detected pose with confidence={{confidence}}, x_min={{x_min}}, y_min={{y_min}}, x_max={{x_max}}, y_max={{y_max}}") | ||
for joint_index, (x, y, confidence) in enumerate(pred_joints[i]):") | ||
print(f"Joint {{joint_index}} has coordinates x={{x}}, y={{y}}, confidence={{confidence}}") | ||
""" | ||
|
||
else: | ||
usage_instructions += f""" | ||
for current_sample in range({batch_size}): | ||
predictions_for_current_sample = predictions[predictions[0] == current_sample] | ||
print("Predictions for sample " + str(current_sample)) | ||
pred_bboxes = predictions_for_current_sample[:, 1:5] | ||
pred_scores = predictions_for_current_sample[:, 5] | ||
pred_joints = predictions_for_current_sample[:, 6:].reshape((len(pred_bboxes), -1, 3)) | ||
for i in range(len(pred_bboxes)): | ||
confidence = pred_scores[i] | ||
x_min, y_min, x_max, y_max = pred_bboxes[i] | ||
print(f"Detected pose with confidence={{confidence}}, x_min={{x_min}}, y_min={{y_min}}, x_max={{x_max}}, y_max={{y_max}}") | ||
for joint_index, (x, y, confidence) in enumerate(pred_joints[i]): | ||
print(f"Joint {{joint_index}} has coordinates x={{x}}, y={{y}}, confidence={{confidence}}") | ||
""" | ||
|
||
elif output_predictions_format == DetectionOutputFormatMode.BATCH_FORMAT: | ||
# fmt: off | ||
usage_instructions += f"""Exported model has predictions in {output_predictions_format} format: | ||
num_detections, pred_boxes, pred_scores, pred_joints = predictions | ||
for image_index in range(num_detections.shape[0]): | ||
for i in range(num_detections[image_index,0]): | ||
confidence = pred_scores[image_index, i] | ||
x_min, y_min, x_max, y_max = pred_boxes[image_index, i] | ||
pred_joints = pred_joints[image_index, i] | ||
print(f"Detected pose with confidence={{confidence}}, x_min={{x_min}}, y_min={{y_min}}, x_max={{x_max}}, y_max={{y_max}}") | ||
for joint_index, (x, y, confidence) in enumerate(pred_joints[i]): | ||
print(f"Joint {{joint_index}} has coordinates x={{x}}, y={{y}}, confidence={{confidence}}") | ||
""" | ||
elif postprocessing is False: | ||
usage_instructions += """Model exported with postprocessing=False | ||
No decoding or NMS is added to the model, so you will have to decode predictions manually. | ||
Please refer to the documentation for the model you exported""" | ||
elif isinstance(postprocessing_module, nn.Module): | ||
usage_instructions += f""""Exported model contains a custom postprocessing step. | ||
We are unable to provide usage instructions to user-provided postprocessing module | ||
But here is the human-friendly representation of the postprocessing module: | ||
{repr(postprocessing_module)}""" | ||
|
||
return usage_instructions |
143 changes: 143 additions & 0 deletions
143
src/super_gradients/recipes/arch_params/yolo_nas_pose_l_arch_params.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,143 @@ | ||
in_channels: 3 | ||
|
||
backbone: | ||
NStageBackbone: | ||
|
||
stem: | ||
YoloNASStem: | ||
out_channels: 48 | ||
|
||
stages: | ||
- YoloNASStage: | ||
out_channels: 96 | ||
num_blocks: 2 | ||
activation_type: relu | ||
hidden_channels: 96 | ||
concat_intermediates: True | ||
|
||
- YoloNASStage: | ||
out_channels: 192 | ||
num_blocks: 3 | ||
activation_type: relu | ||
hidden_channels: 128 | ||
concat_intermediates: True | ||
|
||
- YoloNASStage: | ||
out_channels: 384 | ||
num_blocks: 5 | ||
activation_type: relu | ||
hidden_channels: 256 | ||
concat_intermediates: True | ||
|
||
- YoloNASStage: | ||
out_channels: 768 | ||
num_blocks: 2 | ||
activation_type: relu | ||
hidden_channels: 512 | ||
concat_intermediates: True | ||
|
||
|
||
context_module: | ||
SPP: | ||
output_channels: 768 | ||
activation_type: relu | ||
k: [5,9,13] | ||
|
||
out_layers: [stage1, stage2, stage3, context_module] | ||
|
||
neck: | ||
YoloNASPANNeckWithC2: | ||
|
||
neck1: | ||
YoloNASUpStage: | ||
out_channels: 192 | ||
num_blocks: 4 | ||
hidden_channels: 128 | ||
width_mult: 1 | ||
depth_mult: 1 | ||
activation_type: relu | ||
reduce_channels: True | ||
|
||
neck2: | ||
YoloNASUpStage: | ||
out_channels: 96 | ||
num_blocks: 4 | ||
hidden_channels: 128 | ||
width_mult: 1 | ||
depth_mult: 1 | ||
activation_type: relu | ||
reduce_channels: True | ||
|
||
neck3: | ||
YoloNASDownStage: | ||
out_channels: 192 | ||
num_blocks: 4 | ||
hidden_channels: 128 | ||
activation_type: relu | ||
width_mult: 1 | ||
depth_mult: 1 | ||
|
||
neck4: | ||
YoloNASDownStage: | ||
out_channels: 384 | ||
num_blocks: 4 | ||
hidden_channels: 256 | ||
activation_type: relu | ||
width_mult: 1 | ||
depth_mult: 1 | ||
|
||
heads: | ||
YoloNASPoseNDFLHeads: | ||
num_classes: 17 | ||
reg_max: 16 | ||
heads_list: | ||
- YoloNASPoseDFLHead: | ||
bbox_inter_channels: 128 | ||
pose_inter_channels: 128 | ||
pose_regression_blocks: 2 | ||
shared_stem: False | ||
width_mult: 1 | ||
pose_conf_in_class_head: True | ||
pose_block_use_repvgg: False | ||
first_conv_group_size: 0 | ||
num_classes: | ||
stride: 8 | ||
reg_max: 16 | ||
cls_dropout_rate: 0.0 | ||
reg_dropout_rate: 0.0 | ||
|
||
- YoloNASPoseDFLHead: | ||
bbox_inter_channels: 256 | ||
pose_inter_channels: 512 | ||
pose_regression_blocks: 2 | ||
shared_stem: False | ||
width_mult: 1 | ||
pose_conf_in_class_head: True | ||
pose_block_use_repvgg: False | ||
first_conv_group_size: 0 | ||
num_classes: | ||
stride: 16 | ||
reg_max: 16 | ||
cls_dropout_rate: 0.0 | ||
reg_dropout_rate: 0.0 | ||
|
||
- YoloNASPoseDFLHead: | ||
bbox_inter_channels: 512 | ||
pose_inter_channels: 512 | ||
pose_regression_blocks: 3 | ||
shared_stem: False | ||
width_mult: 1 | ||
pose_conf_in_class_head: True | ||
pose_block_use_repvgg: False | ||
first_conv_group_size: 0 | ||
num_classes: | ||
stride: 32 | ||
reg_max: 16 | ||
cls_dropout_rate: 0.0 | ||
reg_dropout_rate: 0.0 | ||
|
||
bn_eps: 1e-6 | ||
bn_momentum: 0.03 | ||
inplace_act: True | ||
|
||
_convert_: all |
Oops, something went wrong.