From 48a4bd4e53dd91f8cd00369ce44afb6287f90b22 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Wed, 24 Apr 2024 23:13:44 +0800 Subject: [PATCH 01/15] --feat=add rtmpose3d inference demo --- .../datasets/transforms/common_transforms.py | 2 +- .../datasets/transforms/topdown_transforms.py | 3 + projects/rtmpose3d/body3d_img2pose_demo.py | 439 +++++++++++ projects/rtmpose3d/configs/default_runtime.py | 54 ++ .../configs/rtmdet_m_640-8xb32_coco-person.py | 20 + .../rtmw3d-l_8xb64_cocktail14-384x288.py | 706 ++++++++++++++++++ .../rtmw3d-x_8xb64_cocktail14-384x288.py | 706 ++++++++++++++++++ projects/rtmpose3d/rtmpose3d/__init__.py | 6 + projects/rtmpose3d/rtmpose3d/loss.py | 37 + .../rtmpose3d/rtmpose3d/pose_estimator.py | 116 +++ projects/rtmpose3d/rtmpose3d/rtmw3d_head.py | 444 +++++++++++ .../rtmpose3d/rtmpose3d/simcc_3d_label.py | 335 +++++++++ projects/rtmpose3d/rtmpose3d/utils.py | 76 ++ 13 files changed, 2943 insertions(+), 1 deletion(-) create mode 100644 projects/rtmpose3d/body3d_img2pose_demo.py create mode 100644 projects/rtmpose3d/configs/default_runtime.py create mode 100644 projects/rtmpose3d/configs/rtmdet_m_640-8xb32_coco-person.py create mode 100644 projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py create mode 100644 projects/rtmpose3d/configs/rtmw3d-x_8xb64_cocktail14-384x288.py create mode 100644 projects/rtmpose3d/rtmpose3d/__init__.py create mode 100644 projects/rtmpose3d/rtmpose3d/loss.py create mode 100644 projects/rtmpose3d/rtmpose3d/pose_estimator.py create mode 100644 projects/rtmpose3d/rtmpose3d/rtmw3d_head.py create mode 100644 projects/rtmpose3d/rtmpose3d/simcc_3d_label.py create mode 100644 projects/rtmpose3d/rtmpose3d/utils.py diff --git a/mmpose/datasets/transforms/common_transforms.py b/mmpose/datasets/transforms/common_transforms.py index 33f9c560c0..b29417f045 100644 --- a/mmpose/datasets/transforms/common_transforms.py +++ b/mmpose/datasets/transforms/common_transforms.py @@ -973,7 +973,7 @@ def transform(self, results: Dict) -> Optional[dict]: # For single encoding, the encoded items will be directly added # into results. auxiliary_encode_kwargs = { - key: results[key] + key: results.get(key, None) for key in self.encoder.auxiliary_encode_keys } encoded = self.encoder.encode( diff --git a/mmpose/datasets/transforms/topdown_transforms.py b/mmpose/datasets/transforms/topdown_transforms.py index 3480c5b38c..c76d45e46a 100644 --- a/mmpose/datasets/transforms/topdown_transforms.py +++ b/mmpose/datasets/transforms/topdown_transforms.py @@ -126,6 +126,9 @@ def transform(self, results: Dict) -> Optional[dict]: transformed_keypoints[..., :2] = cv2.transform( results['keypoints'][..., :2], warp_mat) results['transformed_keypoints'] = transformed_keypoints + else: + results['transformed_keypoints'] = np.zeros([]) + results['keypoints_visible'] = np.ones((1, 1, 1)) results['input_size'] = (w, h) results['input_center'] = center diff --git a/projects/rtmpose3d/body3d_img2pose_demo.py b/projects/rtmpose3d/body3d_img2pose_demo.py new file mode 100644 index 0000000000..200043d7d4 --- /dev/null +++ b/projects/rtmpose3d/body3d_img2pose_demo.py @@ -0,0 +1,439 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
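+# Example usage (a sketch only: the two ``*.pth`` checkpoint paths and the
+# input/output names below are placeholders, not files added by this patch):
+#   python body3d_img2pose_demo.py \
+#       configs/rtmdet_m_640-8xb32_coco-person.py rtmdet_m_ckpt.pth \
+#       configs/rtmw3d-l_8xb64_cocktail14-384x288.py rtmw3d_l_ckpt.pth \
+#       --input demo.mp4 --output-root vis_results --save-predictions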
+import logging +import mimetypes +import os +import time +from argparse import ArgumentParser +from typing import List + +import cv2 +import json_tricks as json +import mmcv +import mmengine +import numpy as np +from mmengine.logging import print_log + +from mmpose.apis import inference_topdown, init_model +from mmpose.registry import VISUALIZERS +from mmpose.structures import (PoseDataSample, merge_data_samples, + split_instances) +from mmpose.utils import adapt_mmdet_pipeline +from mmpose.visualization import Pose3dLocalVisualizer +from rtmpose3d import * + +try: + from mmdet.apis import inference_detector, init_detector + has_mmdet = True +except (ImportError, ModuleNotFoundError): + has_mmdet = False + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument('det_config', help='Config file for detection') + parser.add_argument('det_checkpoint', help='Checkpoint file for detection') + parser.add_argument( + 'pose3d_estimator_config', + type=str, + default=None, + help='Config file for the 3D pose estimator') + parser.add_argument( + 'pose3d_estimator_checkpoint', + type=str, + default=None, + help='Checkpoint file for the 3D pose estimator') + parser.add_argument('--input', type=str, default='', help='Video path') + parser.add_argument( + '--show', + action='store_true', + default=False, + help='Whether to show visualizations') + parser.add_argument( + '--disable-rebase-keypoint', + action='store_true', + default=False, + help='Whether to disable rebasing the predicted 3D pose so its ' + 'lowest keypoint has a height of 0 (landing on the ground). Rebase ' + 'is useful for visualization when the model do not predict the ' + 'global position of the 3D pose.') + parser.add_argument( + '--disable-norm-pose-2d', + action='store_true', + default=False, + help='Whether to scale the bbox (along with the 2D pose) to the ' + 'average bbox scale of the dataset, and move the bbox (along with the ' + '2D pose) to the average bbox center of the dataset. This is useful ' + 'when bbox is small, especially in multi-person scenarios.') + parser.add_argument( + '--num-instances', + type=int, + default=1, + help='The number of 3D poses to be visualized in every frame. If ' + 'less than 0, it will be set to the number of pose results in the ' + 'first frame.') + parser.add_argument( + '--output-root', + type=str, + default='', + help='Root of the output video file. 
' + 'Default not saving the visualization video.') + parser.add_argument( + '--save-predictions', + action='store_true', + default=False, + help='Whether to save predicted results') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--det-cat-id', + type=int, + default=0, + help='Category id for bounding box detection model') + parser.add_argument( + '--bbox-thr', + type=float, + default=0.5, + help='Bounding box score threshold') + parser.add_argument('--kpt-thr', type=float, default=0.3) + parser.add_argument( + '--use-oks-tracking', action='store_true', help='Using OKS tracking') + parser.add_argument( + '--tracking-thr', type=float, default=0.3, help='Tracking threshold') + parser.add_argument( + '--show-interval', type=int, default=0, help='Sleep seconds per frame') + parser.add_argument( + '--thickness', + type=int, + default=1, + help='Link thickness for visualization') + parser.add_argument( + '--radius', + type=int, + default=3, + help='Keypoint radius for visualization') + parser.add_argument( + '--online', + action='store_true', + default=False, + help='Inference mode. If set to True, can not use future frame' + 'information when using multi frames for inference in the 2D pose' + 'detection stage. Default: False.') + + args = parser.parse_args() + return args + + +def process_one_image(args, detector, frame: np.ndarray, frame_idx: int, + pose_estimator: TopdownPoseEstimator3D, + pose_est_results_last: List[PoseDataSample], + pose_est_results_list: List[List[PoseDataSample]], + next_id: int, visualize_frame: np.ndarray, + visualizer: Pose3dLocalVisualizer): + """Visualize detected and predicted keypoints of one image. + + Pipeline of this function: + + frame + | + V + +-----------------+ + | detector | + +-----------------+ + | det_result + V + +-----------------+ + | pose_estimator | + +-----------------+ + | pose_est_results + V + +-----------------+ + | post-processing | + +-----------------+ + | pred_3d_data_samples + V + +------------+ + | visualizer | + +------------+ + + Args: + args (Argument): Custom command-line arguments. + detector (mmdet.BaseDetector): The mmdet detector. + frame (np.ndarray): The image frame read from input image or video. + frame_idx (int): The index of current frame. + pose_estimator (TopdownPoseEstimator): The pose estimator for 2d pose. + pose_est_results_last (list(PoseDataSample)): The results of pose + estimation from the last frame for tracking instances. + pose_est_results_list (list(list(PoseDataSample))): The list of all + pose estimation results converted by + ``convert_keypoint_definition`` from previous frames. In + pose-lifting stage it is used to obtain the 2d estimation sequence. + next_id (int): The next track id to be used. + pose_lifter (PoseLifter): The pose-lifter for estimating 3d pose. + visualize_frame (np.ndarray): The image for drawing the results on. + visualizer (Visualizer): The visualizer for visualizing the 2d and 3d + pose estimation results. + + Returns: + pose_est_results (list(PoseDataSample)): The pose estimation result of + the current frame. + pose_est_results_list (list(list(PoseDataSample))): The list of all + converted pose estimation results until the current frame. + pred_3d_instances (InstanceData): The result of pose-lifting. + Specifically, the predicted keypoints and scores are saved at + ``pred_3d_instances.keypoints`` and + ``pred_3d_instances.keypoint_scores``. + next_id (int): The next track id to be used. 
+ """ + # pose_dataset = pose_estimator.cfg.test_dataloader.dataset + pose_det_dataset_name = pose_estimator.dataset_meta['dataset_name'] + + # First stage: conduct 2D pose detection in a Topdown manner + # use detector to obtain person bounding boxes + det_result = inference_detector(detector, frame) + pred_instance = det_result.pred_instances.cpu().numpy() + + # filter out the person instances with category and bbox threshold + # e.g. 0 for person in COCO + bboxes = pred_instance.bboxes + bboxes = bboxes[np.logical_and(pred_instance.labels == args.det_cat_id, + pred_instance.scores > args.bbox_thr)] + + # estimate pose results for current image + pose_est_results = inference_topdown(pose_estimator, frame, bboxes) + + # post-processing + for idx, pose_est_result in enumerate(pose_est_results): + pose_est_result.track_id = pose_est_results[idx].get('track_id', 1e4) + + pred_instances = pose_est_result.pred_instances + keypoints = pred_instances.keypoints + keypoint_scores = pred_instances.keypoint_scores + if keypoint_scores.ndim == 3: + keypoint_scores = np.squeeze(keypoint_scores, axis=1) + pose_est_results[ + idx].pred_instances.keypoint_scores = keypoint_scores + if keypoints.ndim == 4: + keypoints = np.squeeze(keypoints, axis=1) + + keypoints = -keypoints[..., [0, 2, 1]] + # keypoints[..., 0] = -keypoints[..., 0] + # keypoints[..., 2] = -keypoints[..., 2] + + # rebase height (z-axis) + if not args.disable_rebase_keypoint: + keypoints[..., 2] -= np.min( + keypoints[..., 2], axis=-1, keepdims=True) + + pose_est_results[idx].pred_instances.keypoints = keypoints + + pose_est_results = sorted( + pose_est_results, key=lambda x: x.get('track_id', 1e4)) + + pred_3d_data_samples = merge_data_samples(pose_est_results) + pred_3d_instances = pred_3d_data_samples.get('pred_instances', None) + + if args.num_instances < 0: + args.num_instances = len(pose_est_results) + + # Visualization + if visualizer is not None: + visualizer.add_datasample( + 'result', + visualize_frame, + data_sample=pred_3d_data_samples, + det_data_sample=pred_3d_data_samples, + draw_gt=False, + draw_2d=True, + dataset_2d=pose_det_dataset_name, + dataset_3d=pose_det_dataset_name, + show=args.show, + draw_bbox=True, + kpt_thr=args.kpt_thr, + convert_keypoint=False, + axis_limit=400, + axis_azimuth=70, + axis_elev=15, + num_instances=args.num_instances, + wait_time=args.show_interval, + root_index=[11, 12]) + + return pose_est_results, pose_est_results_list, pred_3d_instances, next_id + + +def main(): + assert has_mmdet, 'Please install mmdet to run the demo.' 
+ + args = parse_args() + + assert args.show or (args.output_root != '') + assert args.input != '' + assert args.det_config is not None + assert args.det_checkpoint is not None + + detector = init_detector( + args.det_config, args.det_checkpoint, device=args.device.lower()) + detector.cfg = adapt_mmdet_pipeline(detector.cfg) + + pose_estimator = init_model( + args.pose3d_estimator_config, + args.pose3d_estimator_checkpoint, + device=args.device.lower()) + + det_kpt_color = pose_estimator.dataset_meta.get('keypoint_colors', None) + det_dataset_skeleton = pose_estimator.dataset_meta.get( + 'skeleton_links', None) + det_dataset_link_color = pose_estimator.dataset_meta.get( + 'skeleton_link_colors', None) + + pose_estimator.cfg.model.test_cfg.mode = 'simcc' + pose_estimator.cfg.visualizer.radius = args.radius + pose_estimator.cfg.visualizer.line_width = args.thickness + pose_estimator.cfg.visualizer.det_kpt_color = det_kpt_color + pose_estimator.cfg.visualizer.det_dataset_skeleton = det_dataset_skeleton + pose_estimator.cfg.visualizer.det_dataset_link_color = det_dataset_link_color # noqa: E501 + pose_estimator.cfg.visualizer.skeleton = det_dataset_skeleton + pose_estimator.cfg.visualizer.link_color = det_dataset_link_color + pose_estimator.cfg.visualizer.kpt_color = det_kpt_color + visualizer = VISUALIZERS.build(pose_estimator.cfg.visualizer) + + if args.input == 'webcam': + input_type = 'webcam' + else: + input_type = mimetypes.guess_type(args.input)[0].split('/')[0] + + if args.output_root == '': + save_output = False + else: + mmengine.mkdir_or_exist(args.output_root) + output_file = os.path.join(args.output_root, + os.path.basename(args.input)) + if args.input == 'webcam': + output_file += '.mp4' + save_output = True + + if args.save_predictions: + assert args.output_root != '' + args.pred_save_path = f'{args.output_root}/results_' \ + f'{os.path.splitext(os.path.basename(args.input))[0]}.json' + + if save_output: + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + + pose_est_results_list = [] + pred_instances_list = [] + if input_type == 'image': + frame = mmcv.imread(args.input, channel_order='rgb') + _, _, pred_3d_instances, _ = process_one_image( + args=args, + detector=detector, + frame=args.input, + frame_idx=0, + pose_estimator=pose_estimator, + pose_est_results_last=[], + pose_est_results_list=pose_est_results_list, + next_id=0, + visualize_frame=frame, + visualizer=visualizer) + + if args.save_predictions: + # save prediction results + pred_instances_list = split_instances(pred_3d_instances) + + if save_output: + frame_vis = visualizer.get_image() + mmcv.imwrite(mmcv.rgb2bgr(frame_vis), output_file) + + elif input_type in ['webcam', 'video']: + next_id = 0 + pose_est_results = [] + + if args.input == 'webcam': + video = cv2.VideoCapture(0) + else: + video = cv2.VideoCapture(args.input) + + (major_ver, minor_ver, subminor_ver) = (cv2.__version__).split('.') + if int(major_ver) < 3: + fps = video.get(cv2.cv.CV_CAP_PROP_FPS) + else: + fps = video.get(cv2.CAP_PROP_FPS) + + video_writer = None + frame_idx = 0 + + while video.isOpened(): + success, frame = video.read() + frame_idx += 1 + + if not success: + break + + pose_est_results_last = pose_est_results + + # First stage: 2D pose detection + # make person results for current image + (pose_est_results, pose_est_results_list, pred_3d_instances, + next_id) = process_one_image( + args=args, + detector=detector, + frame=frame, + frame_idx=frame_idx, + pose_estimator=pose_estimator, + pose_est_results_last=pose_est_results_last, + 
pose_est_results_list=pose_est_results_list, + next_id=next_id, + visualize_frame=mmcv.bgr2rgb(frame), + visualizer=visualizer) + + if args.save_predictions: + # save prediction results + pred_instances_list.append( + dict( + frame_id=frame_idx, + instances=split_instances(pred_3d_instances))) + + if save_output: + frame_vis = visualizer.get_image() + if video_writer is None: + # the size of the image with visualization may vary + # depending on the presence of heatmaps + video_writer = cv2.VideoWriter(output_file, fourcc, fps, + (frame_vis.shape[1], + frame_vis.shape[0])) + video_writer.write(mmcv.rgb2bgr(frame_vis)) + + if args.show: + # press ESC to exit + if cv2.waitKey(5) & 0xFF == 27: + break + time.sleep(args.show_interval) + + video.release() + + if video_writer: + video_writer.release() + else: + args.save_predictions = False + raise ValueError( + f'file {os.path.basename(args.input)} has invalid format.') + + if args.save_predictions: + with open(args.pred_save_path, 'w') as f: + json.dump( + dict( + meta_info=pose_estimator.dataset_meta, + instance_info=pred_instances_list), + f, + indent='\t') + print(f'predictions have been saved at {args.pred_save_path}') + + if save_output: + input_type = input_type.replace('webcam', 'video') + print_log( + f'the output {input_type} has been saved at {output_file}', + logger='current', + level=logging.INFO) + + +if __name__ == '__main__': + main() diff --git a/projects/rtmpose3d/configs/default_runtime.py b/projects/rtmpose3d/configs/default_runtime.py new file mode 100644 index 0000000000..6f27c0345a --- /dev/null +++ b/projects/rtmpose3d/configs/default_runtime.py @@ -0,0 +1,54 @@ +default_scope = 'mmpose' + +# hooks +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=50), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict(type='CheckpointHook', interval=10), + sampler_seed=dict(type='DistSamplerSeedHook'), + visualization=dict(type='PoseVisualizationHook', enable=False), + badcase=dict( + type='BadCaseAnalysisHook', + enable=False, + out_dir='badcase', + metric_type='loss', + badcase_thr=5)) + +# custom hooks +custom_hooks = [ + # Synchronize model buffers such as running_mean and running_var in BN + # at the end of each epoch + dict(type='SyncBuffersHook') +] + +# multi-processing backend +env_cfg = dict( + cudnn_benchmark=False, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl'), +) + +# visualizer +vis_backends = [ + dict(type='LocalVisBackend'), + # dict(type='TensorboardVisBackend'), + # dict(type='WandbVisBackend'), +] +visualizer = dict( + type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +# logger +log_processor = dict( + type='LogProcessor', window_size=50, by_epoch=True, num_digits=6) +log_level = 'INFO' +load_from = None +resume = False + +# file I/O backend +backend_args = dict(backend='local') + +# training/validation/testing progress +train_cfg = dict(by_epoch=True) +val_cfg = dict() +test_cfg = dict() diff --git a/projects/rtmpose3d/configs/rtmdet_m_640-8xb32_coco-person.py b/projects/rtmpose3d/configs/rtmdet_m_640-8xb32_coco-person.py new file mode 100644 index 0000000000..620de8dc8f --- /dev/null +++ b/projects/rtmpose3d/configs/rtmdet_m_640-8xb32_coco-person.py @@ -0,0 +1,20 @@ +_base_ = 'mmdet::rtmdet/rtmdet_m_8xb32-300e_coco.py' + +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth' # noqa + +model = 
dict( + backbone=dict( + init_cfg=dict( + type='Pretrained', prefix='backbone.', checkpoint=checkpoint)), + bbox_head=dict(num_classes=1), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) + +train_dataloader = dict(dataset=dict(metainfo=dict(classes=('person', )))) + +val_dataloader = dict(dataset=dict(metainfo=dict(classes=('person', )))) +test_dataloader = val_dataloader diff --git a/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py b/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py new file mode 100644 index 0000000000..832742788d --- /dev/null +++ b/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py @@ -0,0 +1,706 @@ +_base_ = ['./default_runtime.py'] + +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +# runtime +max_epochs = 270 +stage2_num_epochs = 10 +base_lr = 5e-4 +num_keypoints = 133 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=2024) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=4096) + +# codec settings +codec = dict( + type='SimCC3DLabel', + input_size=(288, 384, 288), + sigma=(6., 6.93, 6.), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False, + root_index=(11, 12)) + +# model settings +model = dict( + type='TopdownPoseEstimator3D', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1., + widen_factor=1., + channel_attention=True, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='checkpoints/rtmpose-l_simcc-ucoco_dw-ucoco_270e-256x192-4d6dfc62_20230728.pth' # noqa + )), + neck=dict( + type='CSPNeXtPAFPN', + in_channels=[256, 512, 1024], + out_channels=None, + out_indices=( + 1, + 2, + ), + num_csp_blocks=2, + expand_ratio=0.5, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU', inplace=True)), + head=dict( + type='RTMW3DHead', + in_channels=1024, + out_channels=133, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0.1, + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=[ + dict( + type='KLDiscretLoss2', + use_target_weight=True, + beta=10., + label_softmax=True), + dict( + type='BoneLoss', + joint_parents=[0, 1, 2, 3, 4, 5, 6, 5, 6, 7, 8, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 50, 50, 51, 52, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 3, 3, 
3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 91, 92, 93, 94, 91, 96, 97, 98, 91, 100, 101, 102, 91, 104, 105, 106, 91, 108, 109, 110, 8, 112, 113, 114, 113, 112, 117, 118, 117, 112, 121, 122, 123, 112, 125, 126, 127, 112, 129, 130, 131], + use_target_weight=True, + loss_weight=2.0 + ) + ], + decoder=codec), + # test_cfg=dict(flip_test=False, mode='2d') + test_cfg=dict(flip_test=False) +) + +# base dataset settings +data_mode = 'topdown' +dataset_type = 'H36MWholeBodyDataset' +backend_args = dict(backend='local') + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='RandomBackground', + bg_dir='/mnt/data/oss_beijing/mmseg/obj365v1_images', + bg_prob=0.5, + ), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=(288, 384)), + dict(type='YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=(288, 384)), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.5, 1.5], + rotate_factor=90), + dict(type='TopdownAffine', input_size=(288, 384)), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# h3wb dataset +h3wb_dataset = dict( + type='H36MWholeBodyDataset', + ann_file='annotation_body3d/h3wb_train_bbox.npz', + seq_len=1, + causal=True, + data_root='data/h36m/', + data_prefix=dict(img='images/'), + test_mode=False, + pipeline=[]) + + +# dna rendering dataset +dna_rendering_dataset = dict( + type='DNARenderingDataset', + data_root='data/dna_rendering_part1', + data_mode='topdown', + ann_file='instances.npz', + subset_frac=0.1, + pipeline=[ + dict(type='LoadMask', backend_args=backend_args) + ], +) + +# mapping + +aic_coco133 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7), (5, 9), (6, 12), + (7, 14), (8, 16), (9, 11), (10, 13), (11, 15)] + +crowdpose_coco133 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11), + (7, 12), (8, 13), (9, 14), (10, 15), (11, 16)] + +mpii_coco133 = [ + (0, 16), + (1, 14), + (2, 12), + (3, 11), + (4, 13), + (5, 15), + (10, 10), + (11, 8), + (12, 6), + (13, 5), + (14, 7), + (15, 9), +] + +jhmdb_coco133 = [ + (3, 6), + (4, 5), + (5, 12), + (6, 11), + (7, 8), + (8, 7), + (9, 14), + (10, 13), + (11, 10), + (12, 9), + (13, 16), + (14, 15), +] + +halpe_coco133 = [(i, i) + for i in range(17)] + [(20, 17), (21, 20), (22, 18), (23, 21), + (24, 19), + (25, 22)] + [(i, i - 3) + for i in range(26, 136)] + +posetrack_coco133 = [ + (0, 0), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + 
(13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +humanart_coco133 = [(i, i) for i in range(17)] + [(17, 99), (18, 120), + (19, 17), (20, 20)] + +data_mode = 'topdown' +data_root = 'data/' + +# train datasets +dataset_coco = dict( + type='CocoWholeBodyDataset', + data_root='data/coco/', + data_mode='topdown', + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=[], +) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=aic_coco133) + ], +) + +dataset_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=crowdpose_coco133) + ], +) + +dataset_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_train.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=mpii_coco133) + ], +) + +dataset_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_train.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=jhmdb_coco133) + ], +) + +dataset_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_train_v1.json', + data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=halpe_coco133) + ], +) + +dataset_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_train.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=posetrack_coco133) + ], +) + +dataset_humanart = dict( + type='HumanArt21Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='HumanArt/annotations/training_humanart.json', + filter_cfg=dict(scenes=['real_human']), + data_prefix=dict(img='pose/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=humanart_coco133) + ]) + +face_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale', padding=1.25), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[1.5, 2.0], + rotate_factor=0), +] + +wflw_coco133 = [(i * 2, 23 + i) + for i in range(17)] + [(33 + i, 40 + i) for i in range(5)] + [ + (42 + i, 45 + i) for i in range(5) + ] + [(51 + i, 50 + i) + for i in range(9)] + [(60, 59), (61, 60), (63, 61), + (64, 62), (65, 63), (67, 64), + (68, 65), (69, 66), (71, 67), + (72, 68), (73, 69), + (75, 70)] + [(76 + i, 71 + i) + for i in range(20)] +dataset_wflw = dict( + type='WFLWDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='wflw/annotations/face_landmarks_wflw_train.json', + data_prefix=dict(img='pose/WFLW/images/'), + pipeline=[ + dict( + 
type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=wflw_coco133), *face_pipeline + ], +) + +mapping_300w_coco133 = [(i, 23 + i) for i in range(68)] +dataset_300w = dict( + type='Face300WDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='300w/annotations/face_landmarks_300w_train.json', + data_prefix=dict(img='pose/300w/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=mapping_300w_coco133), *face_pipeline + ], +) + +cofw_coco133 = [(0, 40), (2, 44), (4, 42), (1, 49), (3, 45), (6, 47), (8, 59), + (10, 62), (9, 68), (11, 65), (18, 54), (19, 58), (20, 53), + (21, 56), (22, 71), (23, 77), (24, 74), (25, 85), (26, 89), + (27, 80), (28, 31)] +dataset_cofw = dict( + type='COFWDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='cofw/annotations/cofw_train.json', + data_prefix=dict(img='pose/COFW/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=cofw_coco133), *face_pipeline + ], +) + +lapa_coco133 = [(i * 2, 23 + i) for i in range(17)] + [ + (33 + i, 40 + i) for i in range(5) +] + [(42 + i, 45 + i) for i in range(5)] + [ + (51 + i, 50 + i) for i in range(4) +] + [(58 + i, 54 + i) for i in range(5)] + [(66, 59), (67, 60), (69, 61), + (70, 62), (71, 63), (73, 64), + (75, 65), (76, 66), (78, 67), + (79, 68), (80, 69), + (82, 70)] + [(84 + i, 71 + i) + for i in range(20)] +dataset_lapa = dict( + type='LapaDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='LaPa/annotations/lapa_trainval.json', + data_prefix=dict(img='pose/LaPa/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=lapa_coco133), *face_pipeline + ], +) + +dataset_wb = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=[ + dataset_coco, + dataset_halpe + ], + pipeline=[], + test_mode=False, +) + +dataset_body = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=[ + dataset_aic, + dataset_crowdpose, + dataset_mpii, + dataset_jhmdb, + dataset_posetrack, + # dataset_humanart, + ], + pipeline=[], + test_mode=False, +) + +dataset_face = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=[ + dataset_wflw, + dataset_300w, + dataset_cofw, + dataset_lapa, + ], + pipeline=[], + test_mode=False, +) + +hand_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[1.5, 2.0], + rotate_factor=0), +] + +interhand_left = [(21, 95), (22, 94), (23, 93), (24, 92), (25, 99), (26, 98), + (27, 97), (28, 96), (29, 103), (30, 102), (31, 101), + (32, 100), (33, 107), (34, 106), (35, 105), (36, 104), + (37, 111), (38, 110), (39, 109), (40, 108), (41, 91)] +interhand_right = [(i - 21, j + 21) for i, j in interhand_left] +interhand_coco133 = interhand_right + interhand_left + +dataset_interhand2d = dict( + type='InterHand2DDoubleDataset', + data_root='data/interhand2.6m/', + data_mode='topdown', + ann_file='annotations/all/InterHand2.6M_train_data.json', + camera_param_file='annotations/all/InterHand2.6M_train_camera.json', + joint_file='annotations/all/InterHand2.6M_train_joint_3d.json', + data_prefix=dict(img='images/train/'), + sample_interval=10, + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=interhand_coco133, + 
), *hand_pipeline + ], +) + +dataset_interhand3d = dict( + type='InterHand3DDataset', + data_root='data/interhand2.6m/', + data_mode='topdown', + ann_file='annotations/all/InterHand2.6M_train_data.json', + camera_param_file='annotations/all/InterHand2.6M_train_camera.json', + joint_file='annotations/all/InterHand2.6M_train_joint_3d.json', + use_gt_root_depth=True, + rootnet_result_file=None, + data_prefix=dict(img='images/train/'), + sample_interval=10, + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=interhand_coco133, + ), *hand_pipeline + ], +) + +dataset_hand = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=[dataset_interhand3d], + pipeline=[], + test_mode=False, +) + + +# ubody dataset +scenes = [ + 'Magic_show', + 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', + 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', + 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' +] +ubody_datasets = [] +for scene in scenes: + train_ann = f'annotations/{scene}/train_3dkeypoint_annotation.json' + ubody = dict( + type='UBody3dDataset', + data_root='data/UBody/', + ann_file=train_ann, + data_mode='topdown', + causal=True, + seq_len=1, + data_prefix=dict(img='images/'), + subset_frac=0.1, + pipeline=[]) + ubody_datasets.append(ubody) + + +train_datasets = [ + dataset_wb, + dataset_body, + dataset_face, + # dataset_hand, + *ubody_datasets, + h3wb_dataset, + # dna_rendering_dataset +] + + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + datasets=train_datasets, + pipeline=train_pipeline, + metainfo=dict(from_file='configs/_base_/datasets/h3wb.py'), + test_mode=False)) + +# hooks +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + save_best='MPJPE', + rule='less', + max_keep_ckpts=1)) + +# hooks +# default_hooks = dict( +# checkpoint=dict( +# save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# eval h3wb +# val_dataloader = dict( +# batch_size=64, +# num_workers=10, +# persistent_workers=True, +# drop_last=False, +# sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), +# dataset=dict( +# type='H36MWholeBodyDataset', +# ann_file='annotation_body3d/h3wb_train_bbox.npz', +# seq_len=1, +# causal=True, +# data_root='data/h36m/', +# data_prefix=dict(img='images/'), +# test_mode=True, +# pipeline=val_pipeline)) +# test_dataloader = val_dataloader + +# # evaluators +# val_evaluator = [ +# dict(type='SimpleMPJPE', mode='mpjpe'), +# dict(type='SimpleMPJPE', mode='p-mpjpe') +# ] +# test_evaluator = val_evaluator + +# eval coco +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CocoWholeBodyDataset', + data_root='data/coco/', + data_mode='topdown', + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + bbox_file='data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json', + pipeline=val_pipeline, + 
)) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file='data/coco/' + 'annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/projects/rtmpose3d/configs/rtmw3d-x_8xb64_cocktail14-384x288.py b/projects/rtmpose3d/configs/rtmw3d-x_8xb64_cocktail14-384x288.py new file mode 100644 index 0000000000..3a822f50b8 --- /dev/null +++ b/projects/rtmpose3d/configs/rtmw3d-x_8xb64_cocktail14-384x288.py @@ -0,0 +1,706 @@ +_base_ = ['../../_base_/default_runtime.py'] + +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +# runtime +max_epochs = 270 +stage2_num_epochs = 10 +base_lr = 5e-4 +num_keypoints = 133 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=2024) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=4096) + +# codec settings +codec = dict( + type='SimCC3DLabel', + input_size=(288, 384, 288), + sigma=(6., 6.93, 6.), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False, + root_index=(11, 12)) + +# model settings +model = dict( + type='TopdownPoseEstimator3D', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1.33, + widen_factor=1.25, + channel_attention=True, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='checkpoints/rtmpose-x_simcc-ucoco_pt-aic-coco_270e-384x288-f5b50679_20230822.pth' # noqa + )), + neck=dict( + type='CSPNeXtPAFPN', + in_channels=[320, 640, 1280], + out_channels=None, + out_indices=( + 1, + 2, + ), + num_csp_blocks=2, + expand_ratio=0.5, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU', inplace=True)), + head=dict( + type='RTMW3DHead', + in_channels=1280, + out_channels=133, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=[ + dict( + type='KLDiscretLoss2', + use_target_weight=True, + beta=10., + label_softmax=True), + dict( + type='BoneLoss', + joint_parents=[0, 1, 2, 3, 4, 5, 6, 5, 6, 7, 8, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 50, 50, 51, 52, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 91, 92, 93, 94, 91, 96, 97, 98, 91, 100, 101, 102, 91, 104, 105, 106, 91, 108, 109, 110, 8, 112, 113, 114, 113, 112, 117, 118, 117, 112, 121, 122, 123, 112, 
125, 126, 127, 112, 129, 130, 131], + use_target_weight=True, + loss_weight=2.0 + ) + ], + decoder=codec), + test_cfg=dict(flip_test=False, mode='2d') + # test_cfg=dict(flip_test=False) +) + +# base dataset settings +data_mode = 'topdown' + +backend_args = dict(backend='local') + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='RandomBackground', + bg_dir='/mnt/data/oss_beijing/mmseg/obj365v1_images', + bg_prob=0.5, + ), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=(288, 384)), + dict(type='YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=(288, 384)), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.5, 1.5], + rotate_factor=90), + dict(type='TopdownAffine', input_size=(288, 384)), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# h3wb dataset +h3wb_dataset = dict( + type='H36MWholeBodyDataset', + ann_file='annotation_body3d/h3wb_train_bbox.npz', + seq_len=1, + causal=True, + data_root='data/h36m/', + data_prefix=dict(img='images/'), + test_mode=False, + pipeline=[]) + + +# dna rendering dataset +dna_rendering_dataset = dict( + type='DNARenderingDataset', + data_root='data/dna_rendering_part1', + data_mode='topdown', + ann_file='instances.npz', + subset_frac=0.1, + pipeline=[ + dict(type='LoadMask', backend_args=backend_args) + ], +) + +# mapping + +aic_coco133 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7), (5, 9), (6, 12), + (7, 14), (8, 16), (9, 11), (10, 13), (11, 15)] + +crowdpose_coco133 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11), + (7, 12), (8, 13), (9, 14), (10, 15), (11, 16)] + +mpii_coco133 = [ + (0, 16), + (1, 14), + (2, 12), + (3, 11), + (4, 13), + (5, 15), + (10, 10), + (11, 8), + (12, 6), + (13, 5), + (14, 7), + (15, 9), +] + +jhmdb_coco133 = [ + (3, 6), + (4, 5), + (5, 12), + (6, 11), + (7, 8), + (8, 7), + (9, 14), + (10, 13), + (11, 10), + (12, 9), + (13, 16), + (14, 15), +] + +halpe_coco133 = [(i, i) + for i in range(17)] + [(20, 17), (21, 20), (22, 18), (23, 21), + (24, 19), + (25, 22)] + [(i, i - 3) + for i in range(26, 136)] + +posetrack_coco133 = [ + (0, 0), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +humanart_coco133 = [(i, i) for i in range(17)] + [(17, 99), (18, 120), + (19, 17), (20, 20)] + +data_mode = 'topdown' +data_root = 'data/' + +# train datasets +dataset_coco = dict( + type='CocoWholeBodyDataset', + 
data_root='data/coco/', + data_mode='topdown', + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=[], +) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=aic_coco133) + ], +) + +dataset_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=crowdpose_coco133) + ], +) + +dataset_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_train.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=mpii_coco133) + ], +) + +dataset_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_train.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=jhmdb_coco133) + ], +) + +dataset_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_train_v1.json', + data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=halpe_coco133) + ], +) + +dataset_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_train.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=posetrack_coco133) + ], +) + +dataset_humanart = dict( + type='HumanArt21Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='HumanArt/annotations/training_humanart.json', + filter_cfg=dict(scenes=['real_human']), + data_prefix=dict(img='pose/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=humanart_coco133) + ]) + +face_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale', padding=1.25), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[1.5, 2.0], + rotate_factor=0), +] + +wflw_coco133 = [(i * 2, 23 + i) + for i in range(17)] + [(33 + i, 40 + i) for i in range(5)] + [ + (42 + i, 45 + i) for i in range(5) + ] + [(51 + i, 50 + i) + for i in range(9)] + [(60, 59), (61, 60), (63, 61), + (64, 62), (65, 63), (67, 64), + (68, 65), (69, 66), (71, 67), + (72, 68), (73, 69), + (75, 70)] + [(76 + i, 71 + i) + for i in range(20)] +dataset_wflw = dict( + type='WFLWDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='wflw/annotations/face_landmarks_wflw_train.json', + data_prefix=dict(img='pose/WFLW/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=wflw_coco133), *face_pipeline + ], +) + +mapping_300w_coco133 = [(i, 23 + i) for i in range(68)] +dataset_300w = dict( + type='Face300WDataset', + data_root=data_root, + data_mode=data_mode, + 
ann_file='300w/annotations/face_landmarks_300w_train.json', + data_prefix=dict(img='pose/300w/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=mapping_300w_coco133), *face_pipeline + ], +) + +cofw_coco133 = [(0, 40), (2, 44), (4, 42), (1, 49), (3, 45), (6, 47), (8, 59), + (10, 62), (9, 68), (11, 65), (18, 54), (19, 58), (20, 53), + (21, 56), (22, 71), (23, 77), (24, 74), (25, 85), (26, 89), + (27, 80), (28, 31)] +dataset_cofw = dict( + type='COFWDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='cofw/annotations/cofw_train.json', + data_prefix=dict(img='pose/COFW/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=cofw_coco133), *face_pipeline + ], +) + +lapa_coco133 = [(i * 2, 23 + i) for i in range(17)] + [ + (33 + i, 40 + i) for i in range(5) +] + [(42 + i, 45 + i) for i in range(5)] + [ + (51 + i, 50 + i) for i in range(4) +] + [(58 + i, 54 + i) for i in range(5)] + [(66, 59), (67, 60), (69, 61), + (70, 62), (71, 63), (73, 64), + (75, 65), (76, 66), (78, 67), + (79, 68), (80, 69), + (82, 70)] + [(84 + i, 71 + i) + for i in range(20)] +dataset_lapa = dict( + type='LapaDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='LaPa/annotations/lapa_trainval.json', + data_prefix=dict(img='pose/LaPa/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=lapa_coco133), *face_pipeline + ], +) + +dataset_wb = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=[ + dataset_coco, + dataset_halpe + ], + pipeline=[], + test_mode=False, +) + +dataset_body = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=[ + dataset_aic, + dataset_crowdpose, + dataset_mpii, + dataset_jhmdb, + dataset_posetrack, + # dataset_humanart, + ], + pipeline=[], + test_mode=False, +) + +dataset_face = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=[ + dataset_wflw, + dataset_300w, + dataset_cofw, + dataset_lapa, + ], + pipeline=[], + test_mode=False, +) + +hand_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[1.5, 2.0], + rotate_factor=0), +] + +interhand_left = [(21, 95), (22, 94), (23, 93), (24, 92), (25, 99), (26, 98), + (27, 97), (28, 96), (29, 103), (30, 102), (31, 101), + (32, 100), (33, 107), (34, 106), (35, 105), (36, 104), + (37, 111), (38, 110), (39, 109), (40, 108), (41, 91)] +interhand_right = [(i - 21, j + 21) for i, j in interhand_left] +interhand_coco133 = interhand_right + interhand_left + +dataset_interhand2d = dict( + type='InterHand2DDoubleDataset', + data_root='data/interhand2.6m/', + data_mode='topdown', + ann_file='annotations/all/InterHand2.6M_train_data.json', + camera_param_file='annotations/all/InterHand2.6M_train_camera.json', + joint_file='annotations/all/InterHand2.6M_train_joint_3d.json', + data_prefix=dict(img='images/train/'), + sample_interval=10, + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=interhand_coco133, + ), *hand_pipeline + ], +) + +dataset_interhand3d = dict( + type='InterHand3DDataset', + data_root='data/interhand2.6m/', + data_mode='topdown', + ann_file='annotations/all/InterHand2.6M_train_data.json', + 
camera_param_file='annotations/all/InterHand2.6M_train_camera.json', + joint_file='annotations/all/InterHand2.6M_train_joint_3d.json', + use_gt_root_depth=True, + rootnet_result_file=None, + data_prefix=dict(img='images/train/'), + sample_interval=10, + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=interhand_coco133, + ), *hand_pipeline + ], +) + +dataset_hand = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=[dataset_interhand3d], + pipeline=[], + test_mode=False, +) + + +# ubody dataset +scenes = [ + 'Magic_show', + 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', + 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', + 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' +] +ubody_datasets = [] +for scene in scenes: + train_ann = f'annotations/{scene}/train_3dkeypoint_annotation.json' + ubody = dict( + type='UBody3dDataset', + data_root='data/UBody/', + ann_file=train_ann, + data_mode='topdown', + causal=True, + seq_len=1, + data_prefix=dict(img='images/'), + subset_frac=0.1, + pipeline=[]) + ubody_datasets.append(ubody) + + +train_datasets = [ + dataset_wb, + dataset_body, + dataset_face, + dataset_hand, + *ubody_datasets, + h3wb_dataset, + # dna_rendering_dataset +] + + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + datasets=train_datasets, + pipeline=train_pipeline, + metainfo=dict(from_file='configs/_base_/datasets/h3wb.py'), + test_mode=False)) + +# hooks +# default_hooks = dict( +# checkpoint=dict( +# type='CheckpointHook', +# save_best='MPJPE', +# rule='less', +# max_keep_ckpts=1)) + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# eval h3wb +# val_dataloader = dict( +# batch_size=64, +# num_workers=10, +# persistent_workers=True, +# drop_last=False, +# sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), +# dataset=dict( +# type='H36MWholeBodyDataset', +# ann_file='annotation_body3d/h3wb_train_bbox.npz', +# seq_len=1, +# causal=True, +# data_root='data/h36m/', +# data_prefix=dict(img='images/'), +# test_mode=True, +# pipeline=val_pipeline)) +# test_dataloader = val_dataloader + +# # evaluators +# val_evaluator = [ +# dict(type='SimpleMPJPE', mode='mpjpe'), +# dict(type='SimpleMPJPE', mode='p-mpjpe') +# ] +# test_evaluator = val_evaluator + +# eval coco +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CocoWholeBodyDataset', + data_root='data/coco/', + data_mode='topdown', + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + bbox_file='data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json', + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file='data/coco/' + 'annotations/coco_wholebody_val_v1.0.json') +test_evaluator = 
val_evaluator diff --git a/projects/rtmpose3d/rtmpose3d/__init__.py b/projects/rtmpose3d/rtmpose3d/__init__.py new file mode 100644 index 0000000000..eec926b2c8 --- /dev/null +++ b/projects/rtmpose3d/rtmpose3d/__init__.py @@ -0,0 +1,6 @@ +from .pose_estimator import TopdownPoseEstimator3D +from .rtmw3d_head import RTMW3DHead +from .simcc_3d_label import SimCC3DLabel +from .loss import KLDiscretLoss2 + +__all__ = ['TopdownPoseEstimator3D', 'RTMW3DHead', 'SimCC3DLabel', 'KLDiscretLoss2'] diff --git a/projects/rtmpose3d/rtmpose3d/loss.py b/projects/rtmpose3d/rtmpose3d/loss.py new file mode 100644 index 0000000000..499befa5a0 --- /dev/null +++ b/projects/rtmpose3d/rtmpose3d/loss.py @@ -0,0 +1,37 @@ +from mmpose.registry import MODELS +from mmpose.models.losses import KLDiscretLoss + +@MODELS.register_module() +class KLDiscretLoss2(KLDiscretLoss): + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self._loss_name = 'loss_kld' + + def forward(self, pred_simcc, gt_simcc, target_weight): + N, K, _ = pred_simcc[0].shape + loss = 0 + + for pred, target, weight in zip(pred_simcc, gt_simcc, target_weight): + pred = pred.reshape(-1, pred.size(-1)) + target = target.reshape(-1, target.size(-1)) + weight = weight.reshape(-1) + + t_loss = self.criterion(pred, target).mul(weight) + + if self.mask is not None: + t_loss = t_loss.reshape(N, K) + t_loss[:, self.mask] = t_loss[:, self.mask] * self.mask_weight + + loss = loss + t_loss.sum() + + return loss / K + + @property + def loss_name(self): + """Loss Name. + + Returns: + str: The name of this loss item. + """ + return self._loss_name \ No newline at end of file diff --git a/projects/rtmpose3d/rtmpose3d/pose_estimator.py b/projects/rtmpose3d/rtmpose3d/pose_estimator.py new file mode 100644 index 0000000000..6854205b4b --- /dev/null +++ b/projects/rtmpose3d/rtmpose3d/pose_estimator.py @@ -0,0 +1,116 @@ +from itertools import zip_longest +from typing import Optional + +import numpy as np + +from mmpose.utils.typing import InstanceList, PixelDataList, SampleList +from mmpose.registry import MODELS +from mmpose.models.pose_estimators import TopdownPoseEstimator + +@MODELS.register_module() +class TopdownPoseEstimator3D(TopdownPoseEstimator): + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.camera_param = { + 'c': [512.54150496, 515.45148698], + 'f': [1145.04940459, 1143.78109572], + } + + def add_pred_to_datasample(self, batch_pred_instances: InstanceList, + batch_pred_fields: Optional[PixelDataList], + batch_data_samples: SampleList) -> SampleList: + """Add predictions into data samples. + + Args: + batch_pred_instances (List[InstanceData]): The predicted instances + of the input data batch + batch_pred_fields (List[PixelData], optional): The predicted + fields (e.g. heatmaps) of the input batch + batch_data_samples (List[PoseDataSample]): The input data batch + + Returns: + List[PoseDataSample]: A list of data samples where the predictions + are stored in the ``pred_instances`` field of each data sample. 
+ """ + assert len(batch_pred_instances) == len(batch_data_samples) + if batch_pred_fields is None: + batch_pred_fields = [] + output_keypoint_indices = self.test_cfg.get('output_keypoint_indices', + None) + mode = self.test_cfg.get('mode', '3d') + assert mode in ['2d', '3d', 'vis', 'simcc'] + for pred_instances, pred_fields, data_sample in zip_longest( + batch_pred_instances, batch_pred_fields, batch_data_samples): + + gt_instances = data_sample.gt_instances + + # convert keypoint coordinates from input space to image space + input_center = data_sample.metainfo['input_center'] + input_scale = data_sample.metainfo['input_scale'] + input_size = data_sample.metainfo['input_size'] + keypoints_3d = pred_instances.keypoints + keypoints_2d = pred_instances.keypoints_2d + keypoints_simcc = pred_instances.keypoints_simcc + keypoints_2d = keypoints_2d / input_size * input_scale \ + + input_center - 0.5 * input_scale + + if gt_instances.get('camera_params', None) is not None: + camera_params = gt_instances.camera_params[0] + f = np.array(camera_params['f']) + c = np.array(camera_params['c']) + else: + f = np.array([1145.04940459, 1143.78109572]) + c = np.array(data_sample.ori_shape) + + kpts_pixel = np.concatenate([ + keypoints_2d, + (keypoints_3d[..., 2] + gt_instances.root_z)[..., None] + ], + axis=-1) + kpts_cam = kpts_pixel.copy() + kpts_cam[..., :2] = (kpts_pixel[..., :2] - c) / f * kpts_pixel[..., + 2:] + if mode == '3d': + pred_instances.keypoints = kpts_cam + pred_instances.transformed_keypoints = keypoints_2d + elif mode == 'vis': + pred_instances.keypoints = keypoints_3d + pred_instances.transformed_keypoints = keypoints_2d + elif mode == 'simcc': + pred_instances.keypoints = keypoints_simcc + pred_instances.transformed_keypoints = keypoints_2d + else: + pred_instances.keypoints = keypoints_2d + pred_instances.transformed_keypoints = keypoints_2d + + if 'keypoints_visible' not in pred_instances: + pred_instances.keypoints_visible = \ + pred_instances.keypoint_scores + + if output_keypoint_indices is not None: + # select output keypoints with given indices + num_keypoints = pred_instances.keypoints.shape[1] + for key, value in pred_instances.all_items(): + if key.startswith('keypoint'): + pred_instances.set_field( + value[:, output_keypoint_indices], key) + + # add bbox information into pred_instances + pred_instances.bboxes = gt_instances.bboxes + pred_instances.bbox_scores = gt_instances.bbox_scores + + data_sample.pred_instances = pred_instances + + if pred_fields is not None: + if output_keypoint_indices is not None: + # select output heatmap channels with keypoint indices + # when the number of heatmap channel matches num_keypoints + for key, value in pred_fields.all_items(): + if value.shape[0] != num_keypoints: + continue + pred_fields.set_field(value[output_keypoint_indices], + key) + data_sample.pred_fields = pred_fields + + return batch_data_samples diff --git a/projects/rtmpose3d/rtmpose3d/rtmw3d_head.py b/projects/rtmpose3d/rtmpose3d/rtmw3d_head.py new file mode 100644 index 0000000000..bbf6bd2b48 --- /dev/null +++ b/projects/rtmpose3d/rtmpose3d/rtmw3d_head.py @@ -0,0 +1,444 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Optional, Sequence, Tuple, Union + +import numpy as np +import torch +from mmcv.cnn import ConvModule +from mmengine.structures import InstanceData +from torch import Tensor, nn + +from mmpose.codecs.utils import get_simcc_maximum as get_2d_simcc_maximum +from mmpose.evaluation.functional import keypoint_mpjpe +from mmpose.models.utils.rtmcc_block import RTMCCBlock, ScaleNorm +from mmpose.registry import KEYPOINT_CODECS, MODELS +from mmpose.utils.tensor_utils import to_numpy +from mmpose.utils.typing import (ConfigType, InstanceList, OptConfigType, + OptSampleList) +from mmpose.models.heads import BaseHead +from .utils import get_simcc_maximum + +OptIntSeq = Optional[Sequence[int]] + + +@MODELS.register_module() +class RTMW3DHead(BaseHead): + """Top-down head introduced in RTMPose-Wholebody (2023). + + Args: + in_channels (int | sequence[int]): Number of channels in the input + feature map. + out_channels (int): Number of channels in the output heatmap. + input_size (tuple): Size of input image in shape [w, h]. + in_featuremap_size (int | sequence[int]): Size of input feature map. + simcc_split_ratio (float): Split ratio of pixels. + Default: 2.0. + final_layer_kernel_size (int): Kernel size of the convolutional layer. + Default: 1. + gau_cfg (Config): Config dict for the Gated Attention Unit. + Default: dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='ReLU', + use_rel_bias=False, + pos_enc=False). + loss (Config): Config of the keypoint loss. Defaults to use + :class:`KLDiscretLoss` + decoder (Config, optional): The decoder config that controls decoding + keypoint coordinates from the network output. Defaults to ``None`` + init_cfg (Config, optional): Config to control the initialization. 
See + :attr:`default_init_cfg` for default settings + """ + + def __init__( + self, + in_channels: Union[int, Sequence[int]], + out_channels: int, + input_size: Tuple[int, int], + in_featuremap_size: Tuple[int, int], + simcc_split_ratio: float = 2.0, + final_layer_kernel_size: int = 1, + gau_cfg: ConfigType = dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='ReLU', + use_rel_bias=False, + pos_enc=False), + loss: ConfigType = dict(type='KLDiscretLoss', use_target_weight=True), + decoder: OptConfigType = None, + init_cfg: OptConfigType = None, + ): + + if init_cfg is None: + init_cfg = self.default_init_cfg + + super().__init__(init_cfg) + + self.in_channels = in_channels + self.out_channels = out_channels + self.input_size = input_size + self.in_featuremap_size = in_featuremap_size + self.simcc_split_ratio = simcc_split_ratio + + self.loss_module = nn.ModuleList() + if isinstance(loss, dict): + self.loss_module.append(MODELS.build(loss)) + elif isinstance(loss, (list, tuple)): + for cfg in loss: + self.loss_module.append(MODELS.build(cfg)) + else: + raise TypeError(f'loss_decode must be a dict or sequence of dict,\ + but got {type(loss)}') + + if decoder is not None: + self.decoder = KEYPOINT_CODECS.build(decoder) + else: + self.decoder = None + + if isinstance(in_channels, (tuple, list)): + raise ValueError( + f'{self.__class__.__name__} does not support selecting ' + 'multiple input features.') + + # Define SimCC layers + flatten_dims = self.in_featuremap_size[0] * self.in_featuremap_size[1] + + ps = 2 + self.ps = nn.PixelShuffle(ps) + self.conv_dec = ConvModule( + in_channels // ps**2, + in_channels // 4, + kernel_size=final_layer_kernel_size, + stride=1, + padding=final_layer_kernel_size // 2, + norm_cfg=dict(type='BN', requires_grad=True), + act_cfg=dict(type='ReLU')) + + self.final_layer = ConvModule( + in_channels, + out_channels, + kernel_size=final_layer_kernel_size, + stride=1, + padding=final_layer_kernel_size // 2, + norm_cfg=dict(type='BN', requires_grad=True), + act_cfg=dict(type='ReLU')) + self.final_layer2 = ConvModule( + in_channels // ps + in_channels // 4, + out_channels, + kernel_size=final_layer_kernel_size, + stride=1, + padding=final_layer_kernel_size // 2, + norm_cfg=dict(type='BN', requires_grad=True), + act_cfg=dict(type='ReLU')) + + self.mlp = nn.Sequential( + ScaleNorm(flatten_dims), + nn.Linear(flatten_dims, gau_cfg['hidden_dims'] // 2, bias=False)) + + self.mlp2 = nn.Sequential( + ScaleNorm(flatten_dims * ps**2), + nn.Linear( + flatten_dims * ps**2, gau_cfg['hidden_dims'] // 2, bias=False)) + + W = int(self.input_size[0] * self.simcc_split_ratio) + H = int(self.input_size[1] * self.simcc_split_ratio) + D = int(self.input_size[2] * self.simcc_split_ratio) + + self.gau = RTMCCBlock( + self.out_channels, + gau_cfg['hidden_dims'], + gau_cfg['hidden_dims'], + s=gau_cfg['s'], + expansion_factor=gau_cfg['expansion_factor'], + dropout_rate=gau_cfg['dropout_rate'], + drop_path=gau_cfg['drop_path'], + attn_type='self-attn', + act_fn=gau_cfg['act_fn'], + use_rel_bias=gau_cfg['use_rel_bias'], + pos_enc=gau_cfg['pos_enc']) + + self.cls_x = nn.Linear(gau_cfg['hidden_dims'], W, bias=False) + self.cls_y = nn.Linear(gau_cfg['hidden_dims'], H, bias=False) + self.cls_z = nn.Linear(gau_cfg['hidden_dims'], D, bias=False) + + def forward(self, feats: Tuple[Tensor, + Tensor]) -> Tuple[Tensor, Tensor, Tensor]: + """Forward the network. + + The input is the feature map extracted by backbone and the + output is the simcc representation. 
+ + Args: + feats (Tuple[Tensor]): Multi scale feature maps. + + Returns: + pred_x (Tensor): 1d representation of x. + pred_y (Tensor): 1d representation of y. + """ + # enc_b n / 2, h, w + # enc_t n, h, w + enc_b, enc_t = feats + + feats_t = self.final_layer(enc_t) + feats_t = torch.flatten(feats_t, 2) + feats_t = self.mlp(feats_t) + + dec_t = self.ps(enc_t) + dec_t = self.conv_dec(dec_t) + enc_b = torch.cat([dec_t, enc_b], dim=1) + + feats_b = self.final_layer2(enc_b) + feats_b = torch.flatten(feats_b, 2) + feats_b = self.mlp2(feats_b) + + feats = torch.cat([feats_t, feats_b], dim=2) + + feats = self.gau(feats) + + pred_x = self.cls_x(feats) + pred_y = self.cls_y(feats) + pred_z = self.cls_z(feats) + + return pred_x, pred_y, pred_z + + def decode(self, batch_outputs: Union[Tensor, + Tuple[Tensor]]) -> InstanceList: + """Decode keypoints from outputs. + + Args: + batch_outputs (Tensor | Tuple[Tensor]): The network outputs of + a data batch + + Returns: + List[InstanceData]: A list of InstanceData, each contains the + decoded pose information of the instances of one data sample. + """ + + def _pack_and_call(args, func): + if not isinstance(args, tuple): + args = (args, ) + return func(*args) + + if self.decoder is None: + raise RuntimeError( + f'The decoder has not been set in {self.__class__.__name__}. ' + 'Please set the decoder configs in the init parameters to ' + 'enable head methods `head.predict()` and `head.decode()`') + + batch_output_np = to_numpy(batch_outputs, unzip=True) + batch_keypoints = [] + batch_keypoints2d = [] + batch_keypoints_simcc = [] + batch_scores = [] + for outputs in batch_output_np: + keypoints_2d, keypoints, keypoints_simcc, scores = _pack_and_call( + outputs, self.decoder.decode) + batch_keypoints2d.append(keypoints_2d) + batch_keypoints.append(keypoints) + batch_keypoints_simcc.append(keypoints_simcc) + batch_scores.append(scores) + + preds = [] + for keypoints_2d, keypoints, keypoints_simcc, scores in zip(batch_keypoints2d, + batch_keypoints, + batch_keypoints_simcc, + batch_scores): + pred = InstanceData( + keypoints_2d=keypoints_2d, + keypoints=keypoints, + keypoints_simcc=keypoints_simcc, + keypoint_scores=scores) + preds.append(pred) + + return preds + + def predict( + self, + feats: Tuple[Tensor], + batch_data_samples: OptSampleList, + test_cfg: OptConfigType = {}, + ) -> InstanceList: + """Predict results from features. + + Args: + feats (Tuple[Tensor] | List[Tuple[Tensor]]): The multi-stage + features (or multiple multi-stage features in TTA) + batch_data_samples (List[:obj:`PoseDataSample`]): The batch + data samples + test_cfg (dict): The runtime config for testing process. 
Defaults + to {} + + Returns: + List[InstanceData]: The pose predictions, each contains + the following fields: + - keypoints (np.ndarray): predicted keypoint coordinates in + shape (num_instances, K, D) where K is the keypoint number + and D is the keypoint dimension + - keypoint_scores (np.ndarray): predicted keypoint scores in + shape (num_instances, K) + - keypoint_x_labels (np.ndarray, optional): The predicted 1-D + intensity distribution in the x direction + - keypoint_y_labels (np.ndarray, optional): The predicted 1-D + intensity distribution in the y direction + """ + x, y, z = self.forward(feats) + + preds = self.decode((x, y, z)) + + if test_cfg.get('output_heatmaps', False): + raise NotImplementedError + else: + return preds + + def loss( + self, + feats: Tuple[Tensor], + batch_data_samples: OptSampleList, + train_cfg: OptConfigType = {}, + ) -> dict: + """Calculate losses from a batch of inputs and data samples.""" + + pred_x, pred_y, pred_z = self.forward(feats) + + gt_x = torch.cat([ + d.gt_instance_labels.keypoint_x_labels for d in batch_data_samples + ], + dim=0) + gt_y = torch.cat([ + d.gt_instance_labels.keypoint_y_labels for d in batch_data_samples + ], + dim=0) + gt_z = torch.cat([ + d.gt_instance_labels.keypoint_z_labels for d in batch_data_samples + ], + dim=0) + keypoint_weights = torch.cat( + [ + d.gt_instance_labels.keypoint_weights + for d in batch_data_samples + ], + dim=0, + ) + + weight_z = torch.cat( + [d.gt_instance_labels.weight_z for d in batch_data_samples], + dim=0, + ) + + with_z_labels = [ + d.gt_instance_labels.with_z_label[0] for d in batch_data_samples + ] + + N, K, _ = pred_x.shape + keypoint_weights_ = keypoint_weights.clone() + pred_simcc = (pred_x, pred_y, pred_z) + gt_simcc = (gt_x, gt_y, gt_z) + + keypoint_weights = torch.cat([ + keypoint_weights[None, ...], keypoint_weights[None, ...], + weight_z[None, ...] + ]) + + # calculate losses + losses = dict() + for i, loss_ in enumerate(self.loss_module): + if loss_.loss_name == 'loss_bone' or loss_.loss_name == 'loss_mpjpe': + pred_coords = get_3d_coord(pred_x, pred_y, pred_z, + with_z_labels) + gt_coords = get_3d_coord(gt_x, gt_y, gt_z, with_z_labels) + loss = loss_(pred_coords, gt_coords, keypoint_weights_) + else: + loss = loss_(pred_simcc, gt_simcc, keypoint_weights) + losses[loss_.loss_name] = loss + + # calculate accuracy + error = simcc_mpjpe( + output=to_numpy(pred_simcc), + target=to_numpy(gt_simcc), + simcc_split_ratio=self.simcc_split_ratio, + mask=to_numpy(keypoint_weights_) > 0, + ) + + mpjpe = torch.tensor(error, device=gt_x.device) + losses.update(mpjpe=mpjpe) + + return losses + + @property + def default_init_cfg(self): + init_cfg = [ + dict(type='Normal', layer=['Conv2d'], std=0.001), + dict(type='Constant', layer='BatchNorm2d', val=1), + dict(type='Normal', layer=['Linear'], std=0.01, bias=0), + ] + return init_cfg + + +def simcc_mpjpe(output: Tuple[np.ndarray, np.ndarray, np.ndarray], + target: Tuple[np.ndarray, np.ndarray, np.ndarray], + simcc_split_ratio: float, + mask: np.ndarray, + thr: float = 0.05) -> float: + """Calculate the pose accuracy of PCK for each individual keypoint and the + averaged accuracy across all keypoints from 3D SimCC. + + Note: + - PCK metric measures accuracy of the localization of the body joints. + - The distances between predicted positions and the ground-truth ones + are typically normalized by the bounding box size. + + Args: + output (Tuple[np.ndarray, np.ndarray, np.ndarray]): Model predicted + 3D SimCC (x, y, z). 
+        target (Tuple[np.ndarray, np.ndarray, np.ndarray]): Groundtruth
+            3D SimCC (x, y, z).
+        simcc_split_ratio (float): SimCC split ratio for recovering actual
+            coordinates.
+        mask (np.ndarray[N, K]): Visibility mask for the target. False for
+            invisible joints, and True for visible.
+        thr (float): Unused; kept for interface compatibility with the PCK
+            helpers. Default 0.05.
+
+    Returns:
+        float: The MPJPE averaged over all visible keypoints, measured in the
+        coordinate space recovered by dividing the SimCC locations by
+        ``simcc_split_ratio``.
+    """
+    if len(output) == 3:
+        pred_x, pred_y, pred_z = output
+        gt_x, gt_y, gt_z = target
+        pred_coords, _ = get_simcc_maximum(pred_x, pred_y, pred_z)
+        gt_coords, _ = get_simcc_maximum(gt_x, gt_y, gt_z)
+
+    else:
+        pred_x, pred_y = output
+        gt_x, gt_y = target
+        pred_coords, _ = get_2d_simcc_maximum(pred_x, pred_y)
+        gt_coords, _ = get_2d_simcc_maximum(gt_x, gt_y)
+
+    pred_coords /= simcc_split_ratio
+    gt_coords /= simcc_split_ratio
+
+    return keypoint_mpjpe(pred_coords, gt_coords, mask)
+
+
+def get_3d_coord(simcc_x, simcc_y, simcc_z, with_z_labels):
+    N, K, W = simcc_x.shape
+    # zero out the z-axis responses for samples that carry no z labels
+    for i, with_z in enumerate(with_z_labels):
+        if not with_z:
+            simcc_z[i] = torch.zeros_like(simcc_z[i])
+    x_locs = simcc_x.reshape(N * K, -1).argmax(dim=1)
+    y_locs = simcc_y.reshape(N * K, -1).argmax(dim=1)
+    z_locs = simcc_z.reshape(N * K, -1).argmax(dim=1)
+
+    locs = torch.stack((x_locs, y_locs, z_locs),
+                       dim=-1).to(simcc_x).reshape(N, K, 3)
+    return locs
diff --git a/projects/rtmpose3d/rtmpose3d/simcc_3d_label.py b/projects/rtmpose3d/rtmpose3d/simcc_3d_label.py
new file mode 100644
index 0000000000..4440caa667
--- /dev/null
+++ b/projects/rtmpose3d/rtmpose3d/simcc_3d_label.py
@@ -0,0 +1,335 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from itertools import product
+from typing import Any, Optional, Tuple, Union
+
+import numpy as np
+from numpy import ndarray
+
+from mmpose.registry import KEYPOINT_CODECS
+from mmpose.codecs.base import BaseKeypointCodec
+
+from .utils import get_simcc_maximum
+
+
+@KEYPOINT_CODECS.register_module()
+class SimCC3DLabel(BaseKeypointCodec):
+    r"""Generate keypoint representation via "SimCC" approach.
+    See the paper: `SimCC: a Simple Coordinate Classification Perspective for
+    Human Pose Estimation`_ by Li et al (2022) for more details.
+    Old name: SimDR
+
+    Note:
+
+        - instance number: N
+        - keypoint number: K
+        - keypoint dimension: D
+        - input size: [w, h, d]
+
+    Encoded:
+
+        - keypoint_x_labels (np.ndarray): The generated SimCC label for x-axis.
+            The label shape is (N, K, Wx) if ``smoothing_type=='gaussian'``
+            and (N, K) if ``smoothing_type=='standard'``, where
+            :math:`Wx=w*simcc_split_ratio`
+        - keypoint_y_labels (np.ndarray): The generated SimCC label for y-axis.
+            The label shape is (N, K, Wy) if ``smoothing_type=='gaussian'``
+            and (N, K) if ``smoothing_type=='standard'``, where
+            :math:`Wy=h*simcc_split_ratio`
+        - keypoint_z_labels (np.ndarray): The generated SimCC label for the
+            z-axis, analogous to the x- and y-axis labels.
+        - keypoint_weights (np.ndarray): The target weights in shape (N, K)
+
+    Args:
+        input_size (tuple): Input image size in [w, h, d]
+        smoothing_type (str): The SimCC label smoothing strategy. Options are
+            ``'gaussian'`` and ``'standard'``. Defaults to ``'gaussian'``
+        sigma (float | int | tuple): The sigma value in the Gaussian SimCC
+            label. Defaults to 6.0
+        simcc_split_ratio (float): The ratio of the label size to the input
+            size. For example, if the input width is ``w``, the x label size
+            will be :math:`w*simcc_split_ratio`.
Defaults to 2.0 + label_smooth_weight (float): Label Smoothing weight. Defaults to 0.0 + normalize (bool): Whether to normalize the heatmaps. Defaults to True. + use_dark (bool): Whether to use the DARK post processing. Defaults to + False. + decode_visibility (bool): Whether to decode the visibility. Defaults + to False. + decode_beta (float): The beta value for decoding visibility. Defaults + to 150.0. + + .. _`SimCC: a Simple Coordinate Classification Perspective for Human Pose + Estimation`: https://arxiv.org/abs/2107.03332 + """ + + auxiliary_encode_keys = {'keypoints_3d'} + + label_mapping_table = dict( + keypoint_x_labels='keypoint_x_labels', + keypoint_y_labels='keypoint_y_labels', + keypoint_z_labels='keypoint_z_labels', + keypoint_weights='keypoint_weights', + weight_z='weight_z', + with_z_label='with_z_label') + + instance_mapping_table = dict( + bbox='bboxes', + bbox_score='bbox_scores', + bbox_scale='bbox_scales', + lifting_target='lifting_target', + lifting_target_visible='lifting_target_visible', + camera_param='camera_params', + root_z='root_z') + + def __init__(self, + input_size: Tuple[int, int, int], + smoothing_type: str = 'gaussian', + sigma: Union[float, int, Tuple[float]] = 6.0, + simcc_split_ratio: float = 2.0, + label_smooth_weight: float = 0.0, + normalize: bool = True, + use_dark: bool = False, + decode_visibility: bool = False, + decode_beta: float = 150.0, + root_index: Union[int, Tuple[int]] = 0, + z_range: Optional[int] = None, + sigmoid_z: bool = False) -> None: + super().__init__() + + self.input_size = input_size + self.smoothing_type = smoothing_type + self.simcc_split_ratio = simcc_split_ratio + self.label_smooth_weight = label_smooth_weight + self.normalize = normalize + self.use_dark = use_dark + self.decode_visibility = decode_visibility + self.decode_beta = decode_beta + + if isinstance(sigma, (float, int)): + self.sigma = np.array([sigma, sigma, sigma]) + else: + self.sigma = np.array(sigma) + + if self.smoothing_type not in {'gaussian', 'standard'}: + raise ValueError( + f'{self.__class__.__name__} got invalid `smoothing_type` value' + f'{self.smoothing_type}. 
Should be one of ' + '{"gaussian", "standard"}') + + if self.smoothing_type == 'gaussian' and self.label_smooth_weight > 0: + raise ValueError('Attribute `label_smooth_weight` is only ' + 'used for `standard` mode.') + + if self.label_smooth_weight < 0.0 or self.label_smooth_weight > 1.0: + raise ValueError('`label_smooth_weight` should be in range [0, 1]') + + self.root_index = list(root_index) if isinstance( + root_index, tuple) else [root_index] + self.z_range = z_range if z_range is not None else 2.1744869 + self.sigmoid_z = sigmoid_z + self.root_z = [5.14388] + + def encode(self, + keypoints: np.ndarray, + keypoints_3d: Optional[np.ndarray] = None, + keypoints_visible: Optional[np.ndarray] = None) -> dict: + + if keypoints_visible is None: + keypoints_visible = np.ones(keypoints.shape[:2], dtype=np.float32) + lifting_target = [None] + root_z = self.root_z + with_z_label = False + if keypoints_3d is not None: + lifting_target = keypoints_3d.copy() + root_z = keypoints_3d[..., self.root_index, 2].mean(1) + keypoints_3d[..., 2] -= root_z + if self.sigmoid_z: + keypoints_z = (1 / (1 + np.exp(-(3 * keypoints_3d[..., 2]))) + ) * self.input_size[2] + else: + keypoints_z = (keypoints_3d[..., 2] / self.z_range + 1) * ( + self.input_size[2] / 2) + + keypoints_3d = np.concatenate([keypoints, keypoints_z[..., None]], + axis=-1) + x, y, z, keypoint_weights = self._generate_gaussian( + keypoints_3d, keypoints_visible) + weight_z = keypoint_weights + with_z_label = True + else: + if keypoints.shape != np.zeros([]).shape: + keypoints_z = np.ones((keypoints.shape[0], + keypoints.shape[1], 1), dtype=np.float32) + keypoints = np.concatenate([keypoints, keypoints_z], axis=-1) + x, y, z, keypoint_weights = self._generate_gaussian( + keypoints, keypoints_visible) + else: + x, y, z = np.zeros((3, 1), dtype=np.float32) + keypoint_weights = np.ones((1, )) + weight_z = np.zeros_like(keypoint_weights) + with_z_label = False + + encoded = dict( + keypoint_x_labels=x, + keypoint_y_labels=y, + keypoint_z_labels=z, + lifting_target=lifting_target, + root_z=root_z, + keypoint_weights=keypoint_weights, + weight_z=weight_z, + with_z_label=[with_z_label]) + + return encoded + + def decode(self, x: np.ndarray, y: np.ndarray, z: np.ndarray): + """Decode SimCC labels into 3D keypoints. + + Args: + encoded (Tuple[np.ndarray, np.ndarray]): SimCC labels for x-axis, + y-axis and z-axis in shape (N, K, Wx), (N, K, Wy) and (N, K, Wz) + + Returns: + tuple: + - keypoints (np.ndarray): Decoded coordinates in shape (N, K, D) + - scores (np.ndarray): The keypoint scores in shape (N, K). 
+ It usually represents the confidence of the keypoint prediction + """ + + keypoints, scores = get_simcc_maximum(x, y, z) + + # Unsqueeze the instance dimension for single-instance results + if keypoints.ndim == 2: + keypoints = keypoints[None, :] + scores = scores[None, :] + + keypoints /= self.simcc_split_ratio + keypoints_simcc = keypoints.copy() + keypoints_2d = keypoints[..., :2] + keypoints_z = keypoints[..., 2:3] + if self.sigmoid_z: + keypoints_z /= self.input_size[2] + keypoints_z[keypoints_z <= 0] = 1e-8 + scores[(keypoints_z <= 0).squeeze(-1)] = 0 + keypoints[..., 2:3] = np.log(keypoints_z / (1 - keypoints_z)) / 3 + else: + keypoints[..., + 2:3] = (keypoints_z / + (self.input_size[-1] / 2) - 1) * self.z_range + return keypoints_2d, keypoints, keypoints_simcc, scores + + def _map_coordinates( + self, + keypoints: np.ndarray, + keypoints_visible: Optional[np.ndarray] = None + ) -> Tuple[np.ndarray, np.ndarray]: + """Mapping keypoint coordinates into SimCC space.""" + + keypoints_split = keypoints.copy() + keypoints_split = np.around(keypoints_split * self.simcc_split_ratio) + keypoints_split = keypoints_split.astype(np.int64) + keypoint_weights = keypoints_visible.copy() + + return keypoints_split, keypoint_weights + + def _generate_gaussian( + self, + keypoints: np.ndarray, + keypoints_visible: Optional[np.ndarray] = None + ) -> tuple[ndarray, ndarray, ndarray, ndarray]: + """Encoding keypoints into SimCC labels with Gaussian Label Smoothing + strategy.""" + + N, K, _ = keypoints.shape + w, h, d = self.input_size + W = np.around(w * self.simcc_split_ratio).astype(int) + H = np.around(h * self.simcc_split_ratio).astype(int) + D = np.around(d * self.simcc_split_ratio).astype(int) + + keypoints_split, keypoint_weights = self._map_coordinates( + keypoints, keypoints_visible) + + target_x = np.zeros((N, K, W), dtype=np.float32) + target_y = np.zeros((N, K, H), dtype=np.float32) + target_z = np.zeros((N, K, D), dtype=np.float32) + + # 3-sigma rule + radius = self.sigma * 3 + + # xy grid + x = np.arange(0, W, 1, dtype=np.float32) + y = np.arange(0, H, 1, dtype=np.float32) + z = np.arange(0, D, 1, dtype=np.float32) + + for n, k in product(range(N), range(K)): + # skip unlabled keypoints + if keypoints_visible[n, k] < 0.5: + continue + + mu = keypoints_split[n, k] + + # check that the gaussian has in-bounds part + left, top, near = mu - radius + right, bottom, far = mu + radius + 1 + + if left >= W or top >= H or near >= D or right < 0 or bottom < 0 or far < 0: # noqa: E501 + keypoint_weights[n, k] = 0 + continue + + mu_x, mu_y, mu_z = mu + + target_x[n, k] = np.exp(-((x - mu_x)**2) / (2 * self.sigma[0]**2)) + target_y[n, k] = np.exp(-((y - mu_y)**2) / (2 * self.sigma[1]**2)) + target_z[n, k] = np.exp(-((z - mu_z)**2) / (2 * self.sigma[2]**2)) + + if self.normalize: + norm_value = self.sigma * np.sqrt(np.pi * 2) + target_x /= norm_value[0] + target_y /= norm_value[1] + target_z /= norm_value[2] + + return target_x, target_y, target_z, keypoint_weights + + def _generate_standard( + self, + keypoints: np.ndarray, + keypoints_visible: Optional[np.ndarray] = None + ) -> tuple[ndarray, ndarray, ndarray, Any]: + """Encoding keypoints into SimCC labels with Standard Label Smoothing + strategy. 
+ + Labels will be one-hot vectors if self.label_smooth_weight==0.0 + """ + + N, K, _ = keypoints.shape + w, h, d = self.input_size + w = np.around(w * self.simcc_split_ratio).astype(int) + h = np.around(h * self.simcc_split_ratio).astype(int) + d = np.around(d * self.simcc_split_ratio).astype(int) + + keypoints_split, keypoint_weights = self._map_coordinates( + keypoints, keypoints_visible) + + x = np.zeros((N, K, w), dtype=np.float32) + y = np.zeros((N, K, h), dtype=np.float32) + z = np.zeros((N, K, d), dtype=np.float32) + + for n, k in product(range(N), range(K)): + # skip unlabled keypoints + if keypoints_visible[n, k] < 0.5: + continue + + # get center coordinates + mu_x, mu_y, mu_z = keypoints_split[n, k].astype(np.int64) + + # detect abnormal coords and assign the weight 0 + if mu_x >= w or mu_y >= h or mu_x < 0 or mu_y < 0: + keypoint_weights[n, k] = 0 + continue + + if self.label_smooth_weight > 0: + x[n, k] = self.label_smooth_weight / (w - 1) + y[n, k] = self.label_smooth_weight / (h - 1) + z[n, k] = self.label_smooth_weight / (d - 1) + + x[n, k, mu_x] = 1.0 - self.label_smooth_weight + y[n, k, mu_y] = 1.0 - self.label_smooth_weight + z[n, k, mu_z] = 1.0 - self.label_smooth_weight + + return x, y, z, keypoint_weights diff --git a/projects/rtmpose3d/rtmpose3d/utils.py b/projects/rtmpose3d/rtmpose3d/utils.py new file mode 100644 index 0000000000..8dab90de20 --- /dev/null +++ b/projects/rtmpose3d/rtmpose3d/utils.py @@ -0,0 +1,76 @@ +from typing import Tuple + +import numpy as np + + +def get_simcc_maximum(simcc_x: np.ndarray, + simcc_y: np.ndarray, + simcc_z: np.ndarray, + apply_softmax: bool = False + ) -> Tuple[np.ndarray, np.ndarray]: + """Get maximum response location and value from simcc representations. + + Note: + instance number: N + num_keypoints: K + heatmap height: H + heatmap width: W + + Args: + encoded_keypoints (dict): encoded keypoints with simcc representations. + apply_softmax (bool): whether to apply softmax on the heatmap. + Defaults to False. 
+ + Returns: + tuple: + - locs (np.ndarray): locations of maximum heatmap responses in shape + (K, 2) or (N, K, 2) + - vals (np.ndarray): values of maximum heatmap responses in shape + (K,) or (N, K) + """ + assert isinstance(simcc_x, np.ndarray), 'simcc_x should be numpy.ndarray' + assert isinstance(simcc_y, np.ndarray), 'simcc_y should be numpy.ndarray' + assert isinstance(simcc_z, np.ndarray), 'simcc_z should be numpy.ndarray' + assert simcc_x.ndim == 2 or simcc_x.ndim == 3, ( + f'Invalid shape {simcc_x.shape}') + assert simcc_y.ndim == 2 or simcc_y.ndim == 3, ( + f'Invalid shape {simcc_y.shape}') + assert simcc_z.ndim == 2 or simcc_z.ndim == 3, ( + f'Invalid shape {simcc_z.shape}') + assert simcc_x.ndim == simcc_y.ndim == simcc_z.ndim, ( + f'{simcc_x.shape} != {simcc_y.shape} or {simcc_z.shape}') + + if simcc_x.ndim == 3: + n, k, _ = simcc_x.shape + simcc_x = simcc_x.reshape(n * k, -1) + simcc_y = simcc_y.reshape(n * k, -1) + simcc_z = simcc_z.reshape(n * k, -1) + else: + n = None + + if apply_softmax: + simcc_x = simcc_x - np.max(simcc_x, axis=1, keepdims=True) + simcc_y = simcc_y - np.max(simcc_y, axis=1, keepdims=True) + simcc_z = simcc_z - np.max(simcc_z, axis=1, keepdims=True) + ex, ey, ez = np.exp(simcc_x), np.exp(simcc_y), np.exp(simcc_z) + simcc_x = ex / np.sum(ex, axis=1, keepdims=True) + simcc_y = ey / np.sum(ey, axis=1, keepdims=True) + simcc_z = ez / np.sum(ez, axis=1, keepdims=True) + + x_locs = np.argmax(simcc_x, axis=1) + y_locs = np.argmax(simcc_y, axis=1) + z_locs = np.argmax(simcc_z, axis=1) + locs = np.stack((x_locs, y_locs, z_locs), axis=-1).astype(np.float32) + max_val_x = np.amax(simcc_x, axis=1) + max_val_y = np.amax(simcc_y, axis=1) + + mask = max_val_x > max_val_y + max_val_x[mask] = max_val_y[mask] + vals = max_val_x + locs[vals <= 0.] = -1 + + if n is not None: + locs = locs.reshape(n, k, 3) + vals = vals.reshape(n, k) + + return locs, vals \ No newline at end of file From 379358439c1626f9570630c9d5835f62b9b9387a Mon Sep 17 00:00:00 2001 From: xiexinch Date: Wed, 24 Apr 2024 23:33:13 +0800 Subject: [PATCH 02/15] --doc=add rtmpose3d readme --- projects/rtmpose3d/README.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 projects/rtmpose3d/README.md diff --git a/projects/rtmpose3d/README.md b/projects/rtmpose3d/README.md new file mode 100644 index 0000000000..e279078eb0 --- /dev/null +++ b/projects/rtmpose3d/README.md @@ -0,0 +1,16 @@ +# RTMPose3D: Real-Time 3D Pose Estimation toolkit based on RTMPose + +## Abstract + +RTMPose3D is a toolkit for real-time 3D pose estimation. It is based on the RTMPose model, which is a 2D pose estimation model that is capable of predicting 2D keypoints and body part associations in real-time. RTMPose3D extends RTMPose by adding a 3D pose estimation branch that can predict 3D keypoints from images directly. 
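For readers who prefer calling the model from Python rather than through the demo script, the sketch below shows one possible way to drive the released modules with MMPose's high-level APIs. It is a minimal, illustrative example and not part of this patch: the checkpoint file name, image path and printed shape are assumptions, and it presumes the working directory is `projects/rtmpose3d` with the RTMW3D checkpoint downloaded locally. The full demo command is given in the Usage section below.

```python
# Minimal inference sketch (assumed paths; run from projects/rtmpose3d).
from mmpose.apis import inference_topdown, init_model

from rtmpose3d import *  # noqa: F401,F403 -- registers the 3D estimator/head/codec

model = init_model(
    'configs/rtmw3d-l_8xb64_cocktail14-384x288.py',
    'rtmw3d-l_cock14-0d4ad840_20240422.pth',  # assumed local checkpoint path
    device='cpu')

# With no bounding boxes given, the whole image is treated as one instance.
results = inference_topdown(model, 'path/to/image.jpg')
keypoints = results[0].pred_instances.keypoints
print(keypoints.shape)  # expected: (num_instances, K, 3) keypoints in camera space
```

In practice the demo script is the recommended entry point, since it also runs an RTMDet person detector and handles multi-person frames and video input.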
+
+## Usage
+
+👉🏼 TRY RTMPOSE3D NOW
+
+python .\body3d_img2pose_demo.py .\configs\rtmdet_m_640-8xb32_coco-person.py ..\..\..\mmpose\checkpoints\rtmdet_m_8xb32-100e_coco-obj365-person-235e8209.pth .\configs\rtmw3d-l_8xb64_cocktail14-384x288.py ..\..\..\mmpose\checkpoints\rtmw3d-l_cock14-0d4ad840_20240422.pth --input ..\..\tests\data\coco\000000000785.jpg --output-root results
+
+```bash
+cd /path/to/mmpose/projects/rtmpose3d
+python body3d_img2pose_demo.py configs/rtmdet_m_640-8xb32_coco-person.py https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmdet_m_8xb32-100e_coco-obj365-person-235e8209.pth configs/rtmw3d-l_8xb64_cocktail14-384x288.py rtmw3d-l_cock14-0d4ad840_20240422.pth --input /path/to/image --output-root /path/to/output
+```

From dc9114ee478b534e12716a01190fe34035edf9f0 Mon Sep 17 00:00:00 2001
From: xiexinch
Date: Thu, 25 Apr 2024 00:03:44 +0800
Subject: [PATCH 03/15] --doc=remove

---
 projects/rtmpose3d/README.md | 2 --
 1 file changed, 2 deletions(-)

diff --git a/projects/rtmpose3d/README.md b/projects/rtmpose3d/README.md
index e279078eb0..4f75c44fe2 100644
--- a/projects/rtmpose3d/README.md
+++ b/projects/rtmpose3d/README.md
@@ -8,8 +8,6 @@ RTMPose3D is a toolkit for real-time 3D pose estimation. It is based on the RTMP
 ## Usage
 
 👉🏼 TRY RTMPOSE3D NOW
 
-python .\body3d_img2pose_demo.py .\configs\rtmdet_m_640-8xb32_coco-person.py ..\..\..\mmpose\checkpoints\rtmdet_m_8xb32-100e_coco-obj365-person-235e8209.pth .\configs\rtmw3d-l_8xb64_cocktail14-384x288.py ..\..\..\mmpose\checkpoints\rtmw3d-l_cock14-0d4ad840_20240422.pth --input ..\..\tests\data\coco\000000000785.jpg --output-root results
-
 ```bash
 python body3d_img2pose_demo.py configs/rtmdet_m_640-8xb32_coco-person.py https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmdet_m_8xb32-100e_coco-obj365-person-235e8209.pth configs/rtmw3d-l_8xb64_cocktail14-384x288.py rtmw3d-l_cock14-0d4ad840_20240422.pth --input /path/to/image --output-root /path/to/output
 ```

From 9aa8bf5729ec10f2d3e3a390d80f7e8427543766 Mon Sep 17 00:00:00 2001
From: xiexinch
Date: Thu, 25 Apr 2024 00:32:59 +0800
Subject: [PATCH 04/15] --fix=fix infer video

---
 projects/rtmpose3d/body3d_img2pose_demo.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/projects/rtmpose3d/body3d_img2pose_demo.py b/projects/rtmpose3d/body3d_img2pose_demo.py
index 200043d7d4..e32db3323a 100644
--- a/projects/rtmpose3d/body3d_img2pose_demo.py
+++ b/projects/rtmpose3d/body3d_img2pose_demo.py
@@ -216,8 +216,6 @@ def process_one_image(args, detector, frame: np.ndarray, frame_idx: int,
     keypoints = np.squeeze(keypoints, axis=1)
     keypoints = -keypoints[..., [0, 2, 1]]
-    # keypoints[..., 0] = -keypoints[..., 0]
-    # keypoints[..., 2] = -keypoints[..., 2]
 
     # rebase height (z-axis)
     if not args.disable_rebase_keypoint:
@@ -254,8 +252,7 @@ def process_one_image(args, detector, frame: np.ndarray, frame_idx: int,
         axis_azimuth=70,
         axis_elev=15,
         num_instances=args.num_instances,
-        wait_time=args.show_interval,
-        root_index=[11, 12])
+        wait_time=args.show_interval)
 
     return pose_est_results, pose_est_results_list, pred_3d_instances, next_id

From fb15f42e13b0590e22e0e94d47824cfd2543f2be Mon Sep 17 00:00:00 2001
From: xiexinch
Date: Thu, 25 Apr 2024 14:20:27 +0800
Subject: [PATCH 05/15] --fix=fix lint

---
 projects/rtmpose3d/body3d_img2pose_demo.py | 5 +-
 .../rtmw3d-l_8xb64_cocktail14-384x288.py | 51 +++++++++----------
 .../rtmw3d-x_8xb64_cocktail14-384x288.py | 48 ++++++++---------
 projects/rtmpose3d/rtmpose3d/__init__.py | 6 ++-
projects/rtmpose3d/rtmpose3d/loss.py | 5 +- .../rtmpose3d/rtmpose3d/pose_estimator.py | 5 +- projects/rtmpose3d/rtmpose3d/rtmw3d_head.py | 11 ++-- .../rtmpose3d/rtmpose3d/simcc_3d_label.py | 8 +-- projects/rtmpose3d/rtmpose3d/utils.py | 4 +- 9 files changed, 73 insertions(+), 70 deletions(-) diff --git a/projects/rtmpose3d/body3d_img2pose_demo.py b/projects/rtmpose3d/body3d_img2pose_demo.py index e32db3323a..4a0e90040c 100644 --- a/projects/rtmpose3d/body3d_img2pose_demo.py +++ b/projects/rtmpose3d/body3d_img2pose_demo.py @@ -19,7 +19,6 @@ split_instances) from mmpose.utils import adapt_mmdet_pipeline from mmpose.visualization import Pose3dLocalVisualizer -from rtmpose3d import * try: from mmdet.apis import inference_detector, init_detector @@ -27,6 +26,8 @@ except (ImportError, ModuleNotFoundError): has_mmdet = False +from rtmpose3d import * # noqa: F401, F403 + def parse_args(): parser = ArgumentParser() @@ -124,7 +125,7 @@ def parse_args(): def process_one_image(args, detector, frame: np.ndarray, frame_idx: int, - pose_estimator: TopdownPoseEstimator3D, + pose_estimator, pose_est_results_last: List[PoseDataSample], pose_est_results_list: List[List[PoseDataSample]], next_id: int, visualize_frame: np.ndarray, diff --git a/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py b/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py index 832742788d..950216c9bd 100644 --- a/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py +++ b/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py @@ -53,6 +53,8 @@ use_dark=False, root_index=(11, 12)) +backbone_path = 'checkpoints/rtmpose-l_simcc-ucoco_dw-ucoco_270e-256x192-4d6dfc62_20230728.pth' # noqa + # model settings model = dict( type='TopdownPoseEstimator3D', @@ -71,10 +73,7 @@ norm_cfg=dict(type='BN'), act_cfg=dict(type='SiLU'), init_cfg=dict( - type='Pretrained', - prefix='backbone.', - checkpoint='checkpoints/rtmpose-l_simcc-ucoco_dw-ucoco_270e-256x192-4d6dfc62_20230728.pth' # noqa - )), + type='Pretrained', prefix='backbone.', checkpoint=backbone_path)), neck=dict( type='CSPNeXtPAFPN', in_channels=[256, 512, 1024], @@ -112,15 +111,23 @@ label_softmax=True), dict( type='BoneLoss', - joint_parents=[0, 1, 2, 3, 4, 5, 6, 5, 6, 7, 8, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 50, 50, 51, 52, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 91, 92, 93, 94, 91, 96, 97, 98, 91, 100, 101, 102, 91, 104, 105, 106, 91, 108, 109, 110, 8, 112, 113, 114, 113, 112, 117, 118, 117, 112, 121, 122, 123, 112, 125, 126, 127, 112, 129, 130, 131], + joint_parents=[ + 0, 1, 2, 3, 4, 5, 6, 5, 6, 7, 8, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, 2, 2, 2, 2, 2, 3, 3, 3, 3, + 3, 50, 50, 51, 52, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 3, 3, + 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 7, 91, 92, 93, 94, 91, 96, 97, 98, 91, 100, + 101, 102, 91, 104, 105, 106, 91, 108, 109, 110, 8, 112, + 113, 114, 113, 112, 117, 118, 117, 112, 121, 122, 123, 112, + 125, 126, 127, 112, 129, 130, 131 + ], use_target_weight=True, - loss_weight=2.0 - ) + loss_weight=2.0) ], decoder=codec), # test_cfg=dict(flip_test=False, mode='2d') - test_cfg=dict(flip_test=False) -) + test_cfg=dict(flip_test=False)) # base dataset settings data_mode = 'topdown' @@ -130,9 +137,10 @@ # pipelines 
train_pipeline = [ dict(type='LoadImage', backend_args=backend_args), - dict(type='RandomBackground', - bg_dir='/mnt/data/oss_beijing/mmseg/obj365v1_images', - bg_prob=0.5, + dict( + type='RandomBackground', + bg_dir='/mnt/data/oss_beijing/mmseg/obj365v1_images', + bg_prob=0.5, ), dict(type='GetBBoxCenterScale'), dict(type='RandomFlip', direction='horizontal'), @@ -198,7 +206,6 @@ test_mode=False, pipeline=[]) - # dna rendering dataset dna_rendering_dataset = dict( type='DNARenderingDataset', @@ -206,9 +213,7 @@ data_mode='topdown', ann_file='instances.npz', subset_frac=0.1, - pipeline=[ - dict(type='LoadMask', backend_args=backend_args) - ], + pipeline=[dict(type='LoadMask', backend_args=backend_args)], ) # mapping @@ -482,10 +487,7 @@ dataset_wb = dict( type='CombinedDataset', metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), - datasets=[ - dataset_coco, - dataset_halpe - ], + datasets=[dataset_coco, dataset_halpe], pipeline=[], test_mode=False, ) @@ -581,11 +583,9 @@ test_mode=False, ) - # ubody dataset scenes = [ - 'Magic_show', - 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', + 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' ] @@ -604,7 +604,6 @@ pipeline=[]) ubody_datasets.append(ubody) - train_datasets = [ dataset_wb, dataset_body, @@ -615,7 +614,6 @@ # dna_rendering_dataset ] - # data loaders train_dataloader = dict( batch_size=64, @@ -694,7 +692,8 @@ ann_file='annotations/coco_wholebody_val_v1.0.json', data_prefix=dict(img='val2017/'), test_mode=True, - bbox_file='data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', pipeline=val_pipeline, )) test_dataloader = val_dataloader diff --git a/projects/rtmpose3d/configs/rtmw3d-x_8xb64_cocktail14-384x288.py b/projects/rtmpose3d/configs/rtmw3d-x_8xb64_cocktail14-384x288.py index 3a822f50b8..2352072b62 100644 --- a/projects/rtmpose3d/configs/rtmw3d-x_8xb64_cocktail14-384x288.py +++ b/projects/rtmpose3d/configs/rtmw3d-x_8xb64_cocktail14-384x288.py @@ -53,6 +53,8 @@ use_dark=False, root_index=(11, 12)) +backbone_path = 'checkpoints/rtmpose-x_simcc-ucoco_pt-aic-coco_270e-384x288-f5b50679_20230822.pth' # noqa + # model settings model = dict( type='TopdownPoseEstimator3D', @@ -71,10 +73,7 @@ norm_cfg=dict(type='BN'), act_cfg=dict(type='SiLU'), init_cfg=dict( - type='Pretrained', - prefix='backbone.', - checkpoint='checkpoints/rtmpose-x_simcc-ucoco_pt-aic-coco_270e-384x288-f5b50679_20230822.pth' # noqa - )), + type='Pretrained', prefix='backbone.', checkpoint=backbone_path)), neck=dict( type='CSPNeXtPAFPN', in_channels=[320, 640, 1280], @@ -112,10 +111,19 @@ label_softmax=True), dict( type='BoneLoss', - joint_parents=[0, 1, 2, 3, 4, 5, 6, 5, 6, 7, 8, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 50, 50, 51, 52, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 91, 92, 93, 94, 91, 96, 97, 98, 91, 100, 101, 102, 91, 104, 105, 106, 91, 108, 109, 110, 8, 112, 113, 114, 113, 112, 117, 118, 117, 112, 121, 122, 123, 112, 125, 126, 127, 112, 129, 130, 131], + joint_parents=[ + 0, 1, 2, 3, 4, 5, 6, 5, 6, 7, 8, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 23, 24, 25, 26, 27, 28, 29, 
30, + 31, 32, 33, 34, 35, 36, 37, 38, 2, 2, 2, 2, 2, 3, 3, 3, 3, + 3, 50, 50, 51, 52, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 3, 3, + 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 7, 91, 92, 93, 94, 91, 96, 97, 98, 91, 100, + 101, 102, 91, 104, 105, 106, 91, 108, 109, 110, 8, 112, + 113, 114, 113, 112, 117, 118, 117, 112, 121, 122, 123, 112, + 125, 126, 127, 112, 129, 130, 131 + ], use_target_weight=True, - loss_weight=2.0 - ) + loss_weight=2.0) ], decoder=codec), test_cfg=dict(flip_test=False, mode='2d') @@ -130,9 +138,10 @@ # pipelines train_pipeline = [ dict(type='LoadImage', backend_args=backend_args), - dict(type='RandomBackground', - bg_dir='/mnt/data/oss_beijing/mmseg/obj365v1_images', - bg_prob=0.5, + dict( + type='RandomBackground', + bg_dir='/mnt/data/oss_beijing/mmseg/obj365v1_images', + bg_prob=0.5, ), dict(type='GetBBoxCenterScale'), dict(type='RandomFlip', direction='horizontal'), @@ -198,7 +207,6 @@ test_mode=False, pipeline=[]) - # dna rendering dataset dna_rendering_dataset = dict( type='DNARenderingDataset', @@ -206,9 +214,7 @@ data_mode='topdown', ann_file='instances.npz', subset_frac=0.1, - pipeline=[ - dict(type='LoadMask', backend_args=backend_args) - ], + pipeline=[dict(type='LoadMask', backend_args=backend_args)], ) # mapping @@ -482,10 +488,7 @@ dataset_wb = dict( type='CombinedDataset', metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), - datasets=[ - dataset_coco, - dataset_halpe - ], + datasets=[dataset_coco, dataset_halpe], pipeline=[], test_mode=False, ) @@ -581,11 +584,9 @@ test_mode=False, ) - # ubody dataset scenes = [ - 'Magic_show', - 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', + 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' ] @@ -604,7 +605,6 @@ pipeline=[]) ubody_datasets.append(ubody) - train_datasets = [ dataset_wb, dataset_body, @@ -615,7 +615,6 @@ # dna_rendering_dataset ] - # data loaders train_dataloader = dict( batch_size=32, @@ -694,7 +693,8 @@ ann_file='annotations/coco_wholebody_val_v1.0.json', data_prefix=dict(img='val2017/'), test_mode=True, - bbox_file='data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', pipeline=val_pipeline, )) test_dataloader = val_dataloader diff --git a/projects/rtmpose3d/rtmpose3d/__init__.py b/projects/rtmpose3d/rtmpose3d/__init__.py index eec926b2c8..8bbd120d68 100644 --- a/projects/rtmpose3d/rtmpose3d/__init__.py +++ b/projects/rtmpose3d/rtmpose3d/__init__.py @@ -1,6 +1,8 @@ +from .loss import KLDiscretLoss2 from .pose_estimator import TopdownPoseEstimator3D from .rtmw3d_head import RTMW3DHead from .simcc_3d_label import SimCC3DLabel -from .loss import KLDiscretLoss2 -__all__ = ['TopdownPoseEstimator3D', 'RTMW3DHead', 'SimCC3DLabel', 'KLDiscretLoss2'] +__all__ = [ + 'TopdownPoseEstimator3D', 'RTMW3DHead', 'SimCC3DLabel', 'KLDiscretLoss2' +] diff --git a/projects/rtmpose3d/rtmpose3d/loss.py b/projects/rtmpose3d/rtmpose3d/loss.py index 499befa5a0..9869fd00ff 100644 --- a/projects/rtmpose3d/rtmpose3d/loss.py +++ b/projects/rtmpose3d/rtmpose3d/loss.py @@ -1,5 +1,6 @@ -from mmpose.registry import MODELS from mmpose.models.losses import KLDiscretLoss +from mmpose.registry import MODELS + @MODELS.register_module() class KLDiscretLoss2(KLDiscretLoss): @@ -34,4 +35,4 @@ def loss_name(self): 
Returns: str: The name of this loss item. """ - return self._loss_name \ No newline at end of file + return self._loss_name diff --git a/projects/rtmpose3d/rtmpose3d/pose_estimator.py b/projects/rtmpose3d/rtmpose3d/pose_estimator.py index 6854205b4b..3ef7411738 100644 --- a/projects/rtmpose3d/rtmpose3d/pose_estimator.py +++ b/projects/rtmpose3d/rtmpose3d/pose_estimator.py @@ -3,9 +3,10 @@ import numpy as np -from mmpose.utils.typing import InstanceList, PixelDataList, SampleList -from mmpose.registry import MODELS from mmpose.models.pose_estimators import TopdownPoseEstimator +from mmpose.registry import MODELS +from mmpose.utils.typing import InstanceList, PixelDataList, SampleList + @MODELS.register_module() class TopdownPoseEstimator3D(TopdownPoseEstimator): diff --git a/projects/rtmpose3d/rtmpose3d/rtmw3d_head.py b/projects/rtmpose3d/rtmpose3d/rtmw3d_head.py index bbf6bd2b48..c56db1c9b9 100644 --- a/projects/rtmpose3d/rtmpose3d/rtmw3d_head.py +++ b/projects/rtmpose3d/rtmpose3d/rtmw3d_head.py @@ -9,12 +9,12 @@ from mmpose.codecs.utils import get_simcc_maximum as get_2d_simcc_maximum from mmpose.evaluation.functional import keypoint_mpjpe +from mmpose.models.heads import BaseHead from mmpose.models.utils.rtmcc_block import RTMCCBlock, ScaleNorm from mmpose.registry import KEYPOINT_CODECS, MODELS from mmpose.utils.tensor_utils import to_numpy from mmpose.utils.typing import (ConfigType, InstanceList, OptConfigType, OptSampleList) -from mmpose.models.heads import BaseHead from .utils import get_simcc_maximum OptIntSeq = Optional[Sequence[int]] @@ -244,10 +244,9 @@ def _pack_and_call(args, func): batch_scores.append(scores) preds = [] - for keypoints_2d, keypoints, keypoints_simcc, scores in zip(batch_keypoints2d, - batch_keypoints, - batch_keypoints_simcc, - batch_scores): + for keypoints_2d, keypoints, keypoints_simcc, scores in zip( + batch_keypoints2d, batch_keypoints, batch_keypoints_simcc, + batch_scores): pred = InstanceData( keypoints_2d=keypoints_2d, keypoints=keypoints, @@ -347,7 +346,7 @@ def loss( # calculate losses losses = dict() for i, loss_ in enumerate(self.loss_module): - if loss_.loss_name == 'loss_bone' or loss_.loss_name == 'loss_mpjpe': + if loss_.loss_name in ['loss_bone', 'loss_mpjpe']: pred_coords = get_3d_coord(pred_x, pred_y, pred_z, with_z_labels) gt_coords = get_3d_coord(gt_x, gt_y, gt_z, with_z_labels) diff --git a/projects/rtmpose3d/rtmpose3d/simcc_3d_label.py b/projects/rtmpose3d/rtmpose3d/simcc_3d_label.py index 4440caa667..41f2064d15 100644 --- a/projects/rtmpose3d/rtmpose3d/simcc_3d_label.py +++ b/projects/rtmpose3d/rtmpose3d/simcc_3d_label.py @@ -5,9 +5,8 @@ import numpy as np from numpy import ndarray -from mmpose.registry import KEYPOINT_CODECS from mmpose.codecs.base import BaseKeypointCodec - +from mmpose.registry import KEYPOINT_CODECS from .utils import get_simcc_maximum @@ -155,8 +154,9 @@ def encode(self, with_z_label = True else: if keypoints.shape != np.zeros([]).shape: - keypoints_z = np.ones((keypoints.shape[0], - keypoints.shape[1], 1), dtype=np.float32) + keypoints_z = np.ones( + (keypoints.shape[0], keypoints.shape[1], 1), + dtype=np.float32) keypoints = np.concatenate([keypoints, keypoints_z], axis=-1) x, y, z, keypoint_weights = self._generate_gaussian( keypoints, keypoints_visible) diff --git a/projects/rtmpose3d/rtmpose3d/utils.py b/projects/rtmpose3d/rtmpose3d/utils.py index 8dab90de20..62837bde4c 100644 --- a/projects/rtmpose3d/rtmpose3d/utils.py +++ b/projects/rtmpose3d/rtmpose3d/utils.py @@ -63,7 +63,7 @@ def 
get_simcc_maximum(simcc_x: np.ndarray, locs = np.stack((x_locs, y_locs, z_locs), axis=-1).astype(np.float32) max_val_x = np.amax(simcc_x, axis=1) max_val_y = np.amax(simcc_y, axis=1) - + mask = max_val_x > max_val_y max_val_x[mask] = max_val_y[mask] vals = max_val_x @@ -73,4 +73,4 @@ def get_simcc_maximum(simcc_x: np.ndarray, locs = locs.reshape(n, k, 3) vals = vals.reshape(n, k) - return locs, vals \ No newline at end of file + return locs, vals From fc87ebd60ad6e364ad8d52cc44d4a735562864ba Mon Sep 17 00:00:00 2001 From: xiexinch Date: Sun, 28 Apr 2024 15:55:20 +0800 Subject: [PATCH 06/15] --update=refactor --- projects/rtmpose3d/configs/default_runtime.py | 54 ------- .../rtmw3d-l_8xb64_cocktail14-384x288.py | 137 ++++++++---------- ...y => rtmw3d-x_8xb32_cocktail14-384x288.py} | 136 ++++++++--------- .../{ => demo}/body3d_img2pose_demo.py | 2 +- .../rtmdet_m_640-8xb32_coco-person.py | 0 projects/rtmpose3d/rtmpose3d/__init__.py | 5 +- projects/rtmpose3d/rtmpose3d/loss.py | 2 +- .../rtmpose3d/rtmpose3d/pose_estimator.py | 17 ++- projects/rtmpose3d/rtmpose3d/rtmw3d_head.py | 11 +- .../rtmpose3d/rtmpose3d/simcc_3d_label.py | 135 ++++------------- 10 files changed, 177 insertions(+), 322 deletions(-) delete mode 100644 projects/rtmpose3d/configs/default_runtime.py rename projects/rtmpose3d/configs/{rtmw3d-x_8xb64_cocktail14-384x288.py => rtmw3d-x_8xb32_cocktail14-384x288.py} (92%) rename projects/rtmpose3d/{ => demo}/body3d_img2pose_demo.py (99%) rename projects/rtmpose3d/{configs => demo}/rtmdet_m_640-8xb32_coco-person.py (100%) diff --git a/projects/rtmpose3d/configs/default_runtime.py b/projects/rtmpose3d/configs/default_runtime.py deleted file mode 100644 index 6f27c0345a..0000000000 --- a/projects/rtmpose3d/configs/default_runtime.py +++ /dev/null @@ -1,54 +0,0 @@ -default_scope = 'mmpose' - -# hooks -default_hooks = dict( - timer=dict(type='IterTimerHook'), - logger=dict(type='LoggerHook', interval=50), - param_scheduler=dict(type='ParamSchedulerHook'), - checkpoint=dict(type='CheckpointHook', interval=10), - sampler_seed=dict(type='DistSamplerSeedHook'), - visualization=dict(type='PoseVisualizationHook', enable=False), - badcase=dict( - type='BadCaseAnalysisHook', - enable=False, - out_dir='badcase', - metric_type='loss', - badcase_thr=5)) - -# custom hooks -custom_hooks = [ - # Synchronize model buffers such as running_mean and running_var in BN - # at the end of each epoch - dict(type='SyncBuffersHook') -] - -# multi-processing backend -env_cfg = dict( - cudnn_benchmark=False, - mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), - dist_cfg=dict(backend='nccl'), -) - -# visualizer -vis_backends = [ - dict(type='LocalVisBackend'), - # dict(type='TensorboardVisBackend'), - # dict(type='WandbVisBackend'), -] -visualizer = dict( - type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer') - -# logger -log_processor = dict( - type='LogProcessor', window_size=50, by_epoch=True, num_digits=6) -log_level = 'INFO' -load_from = None -resume = False - -# file I/O backend -backend_args = dict(backend='local') - -# training/validation/testing progress -train_cfg = dict(by_epoch=True) -val_cfg = dict() -test_cfg = dict() diff --git a/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py b/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py index 950216c9bd..460ae6300d 100644 --- a/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py +++ b/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py @@ -1,4 +1,6 @@ -_base_ = 
['./default_runtime.py'] +_base_ = ['mmpose::_base_/default_runtime.py'] + +custom_imports = dict(imports=['rtmpose3d'], allow_failed_imports=False) vis_backends = [ dict(type='LocalVisBackend'), @@ -105,7 +107,7 @@ pos_enc=False), loss=[ dict( - type='KLDiscretLoss2', + type='KLDiscretLossWithWeight', use_target_weight=True, beta=10., label_softmax=True), @@ -195,27 +197,6 @@ dict(type='PackPoseInputs') ] -# h3wb dataset -h3wb_dataset = dict( - type='H36MWholeBodyDataset', - ann_file='annotation_body3d/h3wb_train_bbox.npz', - seq_len=1, - causal=True, - data_root='data/h36m/', - data_prefix=dict(img='images/'), - test_mode=False, - pipeline=[]) - -# dna rendering dataset -dna_rendering_dataset = dict( - type='DNARenderingDataset', - data_root='data/dna_rendering_part1', - data_mode='topdown', - ann_file='instances.npz', - subset_frac=0.1, - pipeline=[dict(type='LoadMask', backend_args=backend_args)], -) - # mapping aic_coco133 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7), (5, 9), (6, 12), @@ -604,14 +585,23 @@ pipeline=[]) ubody_datasets.append(ubody) +# h3wb dataset +h3wb_dataset = dict( + type='H36MWholeBodyDataset', + ann_file='annotation_body3d/h3wb_train_bbox.npz', + seq_len=1, + causal=True, + data_root='data/h36m/', + data_prefix=dict(img='images/'), + test_mode=False, + pipeline=[]) + train_datasets = [ dataset_wb, dataset_body, dataset_face, - # dataset_hand, *ubody_datasets, h3wb_dataset, - # dna_rendering_dataset ] # data loaders @@ -626,20 +616,7 @@ pipeline=train_pipeline, metainfo=dict(from_file='configs/_base_/datasets/h3wb.py'), test_mode=False)) - -# hooks -default_hooks = dict( - checkpoint=dict( - type='CheckpointHook', - save_best='MPJPE', - rule='less', - max_keep_ckpts=1)) - # hooks -# default_hooks = dict( -# checkpoint=dict( -# save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) - custom_hooks = [ dict( type='EMAHook', @@ -653,7 +630,39 @@ switch_pipeline=train_pipeline_stage2) ] +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + save_best='MPJPE', + rule='less', + max_keep_ckpts=1)) + # eval h3wb +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='H36MWholeBodyDataset', + ann_file='annotation_body3d/h3wb_train_bbox.npz', + seq_len=1, + causal=True, + data_root='data/h36m/', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline)) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='SimpleMPJPE', mode='mpjpe'), + dict(type='SimpleMPJPE', mode='p-mpjpe') +] +test_evaluator = val_evaluator + +# eval coco # val_dataloader = dict( # batch_size=64, # num_workers=10, @@ -661,45 +670,25 @@ # drop_last=False, # sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), # dataset=dict( -# type='H36MWholeBodyDataset', -# ann_file='annotation_body3d/h3wb_train_bbox.npz', -# seq_len=1, -# causal=True, -# data_root='data/h36m/', -# data_prefix=dict(img='images/'), +# type='CocoWholeBodyDataset', +# data_root='data/coco/', +# data_mode='topdown', +# ann_file='annotations/coco_wholebody_val_v1.0.json', +# data_prefix=dict(img='val2017/'), # test_mode=True, -# pipeline=val_pipeline)) +# bbox_file='data/coco/person_detection_results/' +# 'COCO_val2017_detections_AP_H_56_person.json', +# pipeline=val_pipeline, +# )) # test_dataloader = val_dataloader # # evaluators -# val_evaluator = [ -# dict(type='SimpleMPJPE', mode='mpjpe'), -# 
dict(type='SimpleMPJPE', mode='p-mpjpe') -# ] +# val_evaluator = dict( +# type='CocoWholeBodyMetric', +# ann_file='data/coco/' + 'annotations/coco_wholebody_val_v1.0.json') # test_evaluator = val_evaluator -# eval coco -val_dataloader = dict( - batch_size=64, - num_workers=10, - persistent_workers=True, - drop_last=False, - sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), - dataset=dict( - type='CocoWholeBodyDataset', - data_root='data/coco/', - data_mode='topdown', - ann_file='annotations/coco_wholebody_val_v1.0.json', - data_prefix=dict(img='val2017/'), - test_mode=True, - bbox_file='data/coco/person_detection_results/' - 'COCO_val2017_detections_AP_H_56_person.json', - pipeline=val_pipeline, - )) -test_dataloader = val_dataloader - -# evaluators -val_evaluator = dict( - type='CocoWholeBodyMetric', - ann_file='data/coco/' + 'annotations/coco_wholebody_val_v1.0.json') -test_evaluator = val_evaluator +# hooks +# default_hooks = dict( +# checkpoint=dict( +# save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) diff --git a/projects/rtmpose3d/configs/rtmw3d-x_8xb64_cocktail14-384x288.py b/projects/rtmpose3d/configs/rtmw3d-x_8xb32_cocktail14-384x288.py similarity index 92% rename from projects/rtmpose3d/configs/rtmw3d-x_8xb64_cocktail14-384x288.py rename to projects/rtmpose3d/configs/rtmw3d-x_8xb32_cocktail14-384x288.py index 2352072b62..c171835e0e 100644 --- a/projects/rtmpose3d/configs/rtmw3d-x_8xb64_cocktail14-384x288.py +++ b/projects/rtmpose3d/configs/rtmw3d-x_8xb32_cocktail14-384x288.py @@ -1,4 +1,6 @@ -_base_ = ['../../_base_/default_runtime.py'] +_base_ = ['mmpose::_base_/default_runtime.py'] + +custom_imports = dict(imports=['rtmpose3d'], allow_failed_imports=False) vis_backends = [ dict(type='LocalVisBackend'), @@ -105,7 +107,7 @@ pos_enc=False), loss=[ dict( - type='KLDiscretLoss2', + type='KLDiscretLossWithWeight', use_target_weight=True, beta=10., label_softmax=True), @@ -196,27 +198,6 @@ dict(type='PackPoseInputs') ] -# h3wb dataset -h3wb_dataset = dict( - type='H36MWholeBodyDataset', - ann_file='annotation_body3d/h3wb_train_bbox.npz', - seq_len=1, - causal=True, - data_root='data/h36m/', - data_prefix=dict(img='images/'), - test_mode=False, - pipeline=[]) - -# dna rendering dataset -dna_rendering_dataset = dict( - type='DNARenderingDataset', - data_root='data/dna_rendering_part1', - data_mode='topdown', - ann_file='instances.npz', - subset_frac=0.1, - pipeline=[dict(type='LoadMask', backend_args=backend_args)], -) - # mapping aic_coco133 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7), (5, 9), (6, 12), @@ -605,14 +586,23 @@ pipeline=[]) ubody_datasets.append(ubody) +# h3wb dataset +h3wb_dataset = dict( + type='H36MWholeBodyDataset', + ann_file='annotation_body3d/h3wb_train_bbox.npz', + seq_len=1, + causal=True, + data_root='data/h36m/', + data_prefix=dict(img='images/'), + test_mode=False, + pipeline=[]) + train_datasets = [ dataset_wb, dataset_body, dataset_face, - dataset_hand, *ubody_datasets, h3wb_dataset, - # dna_rendering_dataset ] # data loaders @@ -629,18 +619,6 @@ test_mode=False)) # hooks -# default_hooks = dict( -# checkpoint=dict( -# type='CheckpointHook', -# save_best='MPJPE', -# rule='less', -# max_keep_ckpts=1)) - -# hooks -default_hooks = dict( - checkpoint=dict( - save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) - custom_hooks = [ dict( type='EMAHook', @@ -654,7 +632,39 @@ switch_pipeline=train_pipeline_stage2) ] +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + save_best='MPJPE', + rule='less', + 
max_keep_ckpts=1)) + # eval h3wb +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='H36MWholeBodyDataset', + ann_file='annotation_body3d/h3wb_train_bbox.npz', + seq_len=1, + causal=True, + data_root='data/h36m/', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline)) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='SimpleMPJPE', mode='mpjpe'), + dict(type='SimpleMPJPE', mode='p-mpjpe') +] +test_evaluator = val_evaluator + +# eval coco # val_dataloader = dict( # batch_size=64, # num_workers=10, @@ -662,45 +672,25 @@ # drop_last=False, # sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), # dataset=dict( -# type='H36MWholeBodyDataset', -# ann_file='annotation_body3d/h3wb_train_bbox.npz', -# seq_len=1, -# causal=True, -# data_root='data/h36m/', -# data_prefix=dict(img='images/'), +# type='CocoWholeBodyDataset', +# data_root='data/coco/', +# data_mode='topdown', +# ann_file='annotations/coco_wholebody_val_v1.0.json', +# data_prefix=dict(img='val2017/'), # test_mode=True, -# pipeline=val_pipeline)) +# bbox_file='data/coco/person_detection_results/' +# 'COCO_val2017_detections_AP_H_56_person.json', +# pipeline=val_pipeline, +# )) # test_dataloader = val_dataloader # # evaluators -# val_evaluator = [ -# dict(type='SimpleMPJPE', mode='mpjpe'), -# dict(type='SimpleMPJPE', mode='p-mpjpe') -# ] +# val_evaluator = dict( +# type='CocoWholeBodyMetric', +# ann_file='data/coco/' + 'annotations/coco_wholebody_val_v1.0.json') # test_evaluator = val_evaluator -# eval coco -val_dataloader = dict( - batch_size=64, - num_workers=10, - persistent_workers=True, - drop_last=False, - sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), - dataset=dict( - type='CocoWholeBodyDataset', - data_root='data/coco/', - data_mode='topdown', - ann_file='annotations/coco_wholebody_val_v1.0.json', - data_prefix=dict(img='val2017/'), - test_mode=True, - bbox_file='data/coco/person_detection_results/' - 'COCO_val2017_detections_AP_H_56_person.json', - pipeline=val_pipeline, - )) -test_dataloader = val_dataloader - -# evaluators -val_evaluator = dict( - type='CocoWholeBodyMetric', - ann_file='data/coco/' + 'annotations/coco_wholebody_val_v1.0.json') -test_evaluator = val_evaluator +# hooks +# default_hooks = dict( +# checkpoint=dict( +# save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) diff --git a/projects/rtmpose3d/body3d_img2pose_demo.py b/projects/rtmpose3d/demo/body3d_img2pose_demo.py similarity index 99% rename from projects/rtmpose3d/body3d_img2pose_demo.py rename to projects/rtmpose3d/demo/body3d_img2pose_demo.py index 4a0e90040c..72f259414b 100644 --- a/projects/rtmpose3d/body3d_img2pose_demo.py +++ b/projects/rtmpose3d/demo/body3d_img2pose_demo.py @@ -283,7 +283,7 @@ def main(): det_dataset_link_color = pose_estimator.dataset_meta.get( 'skeleton_link_colors', None) - pose_estimator.cfg.model.test_cfg.mode = 'simcc' + pose_estimator.cfg.model.test_cfg.mode = 'vis' pose_estimator.cfg.visualizer.radius = args.radius pose_estimator.cfg.visualizer.line_width = args.thickness pose_estimator.cfg.visualizer.det_kpt_color = det_kpt_color diff --git a/projects/rtmpose3d/configs/rtmdet_m_640-8xb32_coco-person.py b/projects/rtmpose3d/demo/rtmdet_m_640-8xb32_coco-person.py similarity index 100% rename from projects/rtmpose3d/configs/rtmdet_m_640-8xb32_coco-person.py rename to 
projects/rtmpose3d/demo/rtmdet_m_640-8xb32_coco-person.py diff --git a/projects/rtmpose3d/rtmpose3d/__init__.py b/projects/rtmpose3d/rtmpose3d/__init__.py index 8bbd120d68..740b92d7cb 100644 --- a/projects/rtmpose3d/rtmpose3d/__init__.py +++ b/projects/rtmpose3d/rtmpose3d/__init__.py @@ -1,8 +1,9 @@ -from .loss import KLDiscretLoss2 +from .loss import KLDiscretLossWithWeight from .pose_estimator import TopdownPoseEstimator3D from .rtmw3d_head import RTMW3DHead from .simcc_3d_label import SimCC3DLabel __all__ = [ - 'TopdownPoseEstimator3D', 'RTMW3DHead', 'SimCC3DLabel', 'KLDiscretLoss2' + 'TopdownPoseEstimator3D', 'RTMW3DHead', 'SimCC3DLabel', + 'KLDiscretLossWithWeight' ] diff --git a/projects/rtmpose3d/rtmpose3d/loss.py b/projects/rtmpose3d/rtmpose3d/loss.py index 9869fd00ff..87289762d8 100644 --- a/projects/rtmpose3d/rtmpose3d/loss.py +++ b/projects/rtmpose3d/rtmpose3d/loss.py @@ -3,7 +3,7 @@ @MODELS.register_module() -class KLDiscretLoss2(KLDiscretLoss): +class KLDiscretLossWithWeight(KLDiscretLoss): def __init__(self, **kwargs): super().__init__(**kwargs) diff --git a/projects/rtmpose3d/rtmpose3d/pose_estimator.py b/projects/rtmpose3d/rtmpose3d/pose_estimator.py index 3ef7411738..438cd2b67b 100644 --- a/projects/rtmpose3d/rtmpose3d/pose_estimator.py +++ b/projects/rtmpose3d/rtmpose3d/pose_estimator.py @@ -13,6 +13,8 @@ class TopdownPoseEstimator3D(TopdownPoseEstimator): def __init__(self, **kwargs): super().__init__(**kwargs) + + # a default camera parameter for 3D pose estimation self.camera_param = { 'c': [512.54150496, 515.45148698], 'f': [1145.04940459, 1143.78109572], @@ -40,7 +42,7 @@ def add_pred_to_datasample(self, batch_pred_instances: InstanceList, output_keypoint_indices = self.test_cfg.get('output_keypoint_indices', None) mode = self.test_cfg.get('mode', '3d') - assert mode in ['2d', '3d', 'vis', 'simcc'] + assert mode in ['2d', '3d', 'vis'] for pred_instances, pred_fields, data_sample in zip_longest( batch_pred_instances, batch_pred_fields, batch_data_samples): @@ -51,11 +53,14 @@ def add_pred_to_datasample(self, batch_pred_instances: InstanceList, input_scale = data_sample.metainfo['input_scale'] input_size = data_sample.metainfo['input_size'] keypoints_3d = pred_instances.keypoints - keypoints_2d = pred_instances.keypoints_2d keypoints_simcc = pred_instances.keypoints_simcc + + # convert keypoints from input space to image space + keypoints_2d = keypoints_3d[..., :2].copy() keypoints_2d = keypoints_2d / input_size * input_scale \ + input_center - 0.5 * input_scale + # convert keypoints from image space to camera space if gt_instances.get('camera_params', None) is not None: camera_params = gt_instances.camera_params[0] f = np.array(camera_params['f']) @@ -63,7 +68,6 @@ def add_pred_to_datasample(self, batch_pred_instances: InstanceList, else: f = np.array([1145.04940459, 1143.78109572]) c = np.array(data_sample.ori_shape) - kpts_pixel = np.concatenate([ keypoints_2d, (keypoints_3d[..., 2] + gt_instances.root_z)[..., None] @@ -72,16 +76,17 @@ def add_pred_to_datasample(self, batch_pred_instances: InstanceList, kpts_cam = kpts_pixel.copy() kpts_cam[..., :2] = (kpts_pixel[..., :2] - c) / f * kpts_pixel[..., 2:] + if mode == '3d': + # Evaluation with 3D keypoint coordinates pred_instances.keypoints = kpts_cam pred_instances.transformed_keypoints = keypoints_2d elif mode == 'vis': - pred_instances.keypoints = keypoints_3d - pred_instances.transformed_keypoints = keypoints_2d - elif mode == 'simcc': + # Visualization with SimCC keypoints pred_instances.keypoints = 
keypoints_simcc pred_instances.transformed_keypoints = keypoints_2d else: + # Evaluation with 2D keypoint coordinates pred_instances.keypoints = keypoints_2d pred_instances.transformed_keypoints = keypoints_2d diff --git a/projects/rtmpose3d/rtmpose3d/rtmw3d_head.py b/projects/rtmpose3d/rtmpose3d/rtmw3d_head.py index c56db1c9b9..90e73b7255 100644 --- a/projects/rtmpose3d/rtmpose3d/rtmw3d_head.py +++ b/projects/rtmpose3d/rtmpose3d/rtmw3d_head.py @@ -232,23 +232,20 @@ def _pack_and_call(args, func): batch_output_np = to_numpy(batch_outputs, unzip=True) batch_keypoints = [] - batch_keypoints2d = [] batch_keypoints_simcc = [] batch_scores = [] for outputs in batch_output_np: - keypoints_2d, keypoints, keypoints_simcc, scores = _pack_and_call( + keypoints, keypoints_simcc, scores = _pack_and_call( outputs, self.decoder.decode) - batch_keypoints2d.append(keypoints_2d) batch_keypoints.append(keypoints) batch_keypoints_simcc.append(keypoints_simcc) batch_scores.append(scores) preds = [] - for keypoints_2d, keypoints, keypoints_simcc, scores in zip( - batch_keypoints2d, batch_keypoints, batch_keypoints_simcc, - batch_scores): + for keypoints, keypoints_simcc, scores in zip(batch_keypoints, + batch_keypoints_simcc, + batch_scores): pred = InstanceData( - keypoints_2d=keypoints_2d, keypoints=keypoints, keypoints_simcc=keypoints_simcc, keypoint_scores=scores) diff --git a/projects/rtmpose3d/rtmpose3d/simcc_3d_label.py b/projects/rtmpose3d/rtmpose3d/simcc_3d_label.py index 41f2064d15..ead72c5090 100644 --- a/projects/rtmpose3d/rtmpose3d/simcc_3d_label.py +++ b/projects/rtmpose3d/rtmpose3d/simcc_3d_label.py @@ -1,11 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. from itertools import product -from typing import Any, Optional, Tuple, Union +from typing import Optional, Tuple, Union import numpy as np from numpy import ndarray from mmpose.codecs.base import BaseKeypointCodec +from mmpose.codecs.utils.refinement import refine_simcc_dark from mmpose.registry import KEYPOINT_CODECS from .utils import get_simcc_maximum @@ -27,32 +28,23 @@ class SimCC3DLabel(BaseKeypointCodec): Encoded: - keypoint_x_labels (np.ndarray): The generated SimCC label for x-axis. - The label shape is (N, K, Wx) if ``smoothing_type=='gaussian'`` - and (N, K) if `smoothing_type=='standard'``, where - :math:`Wx=w*simcc_split_ratio` - keypoint_y_labels (np.ndarray): The generated SimCC label for y-axis. - The label shape is (N, K, Wy) if ``smoothing_type=='gaussian'`` - and (N, K) if `smoothing_type=='standard'``, where - :math:`Wy=h*simcc_split_ratio` + - keypoint_z_labels (np.ndarray): The generated SimCC label for z-axis. - keypoint_weights (np.ndarray): The target weights in shape (N, K) Args: input_size (tuple): Input image size in [w, h] - smoothing_type (str): The SimCC label smoothing strategy. Options are - ``'gaussian'`` and ``'standard'``. Defaults to ``'gaussian'`` sigma (float | int | tuple): The sigma value in the Gaussian SimCC label. Defaults to 6.0 simcc_split_ratio (float): The ratio of the label size to the input size. For example, if the input width is ``w``, the x label size will be :math:`w*simcc_split_ratio`. Defaults to 2.0 - label_smooth_weight (float): Label Smoothing weight. Defaults to 0.0 normalize (bool): Whether to normalize the heatmaps. Defaults to True. use_dark (bool): Whether to use the DARK post processing. Defaults to False. - decode_visibility (bool): Whether to decode the visibility. Defaults - to False. - decode_beta (float): The beta value for decoding visibility. Defaults - to 150.0. 
+ root_index (int | tuple): The index of the root keypoint. Defaults to + 0. + z_range (float): The range of the z-axis. Defaults to None. .. _`SimCC: a Simple Coordinate Classification Perspective for Human Pose Estimation`: https://arxiv.org/abs/2107.03332 @@ -79,51 +71,30 @@ class SimCC3DLabel(BaseKeypointCodec): def __init__(self, input_size: Tuple[int, int, int], - smoothing_type: str = 'gaussian', sigma: Union[float, int, Tuple[float]] = 6.0, simcc_split_ratio: float = 2.0, - label_smooth_weight: float = 0.0, normalize: bool = True, use_dark: bool = False, - decode_visibility: bool = False, - decode_beta: float = 150.0, root_index: Union[int, Tuple[int]] = 0, - z_range: Optional[int] = None, - sigmoid_z: bool = False) -> None: + z_range: Optional[int] = None) -> None: super().__init__() self.input_size = input_size - self.smoothing_type = smoothing_type self.simcc_split_ratio = simcc_split_ratio - self.label_smooth_weight = label_smooth_weight self.normalize = normalize self.use_dark = use_dark - self.decode_visibility = decode_visibility - self.decode_beta = decode_beta if isinstance(sigma, (float, int)): self.sigma = np.array([sigma, sigma, sigma]) else: self.sigma = np.array(sigma) - if self.smoothing_type not in {'gaussian', 'standard'}: - raise ValueError( - f'{self.__class__.__name__} got invalid `smoothing_type` value' - f'{self.smoothing_type}. Should be one of ' - '{"gaussian", "standard"}') - - if self.smoothing_type == 'gaussian' and self.label_smooth_weight > 0: - raise ValueError('Attribute `label_smooth_weight` is only ' - 'used for `standard` mode.') - - if self.label_smooth_weight < 0.0 or self.label_smooth_weight > 1.0: - raise ValueError('`label_smooth_weight` should be in range [0, 1]') - self.root_index = list(root_index) if isinstance( root_index, tuple) else [root_index] - self.z_range = z_range if z_range is not None else 2.1744869 - self.sigmoid_z = sigmoid_z + + # Mean value of the root z-axis of datasets self.root_z = [5.14388] + self.z_range = z_range if z_range is not None else 2.1744869 def encode(self, keypoints: np.ndarray, @@ -139,12 +110,8 @@ def encode(self, lifting_target = keypoints_3d.copy() root_z = keypoints_3d[..., self.root_index, 2].mean(1) keypoints_3d[..., 2] -= root_z - if self.sigmoid_z: - keypoints_z = (1 / (1 + np.exp(-(3 * keypoints_3d[..., 2]))) - ) * self.input_size[2] - else: - keypoints_z = (keypoints_3d[..., 2] / self.z_range + 1) * ( - self.input_size[2] / 2) + keypoints_z = (keypoints_3d[..., 2] / self.z_range + 1) * ( + self.input_size[2] / 2) keypoints_3d = np.concatenate([keypoints, keypoints_z[..., None]], axis=-1) @@ -161,6 +128,7 @@ def encode(self, x, y, z, keypoint_weights = self._generate_gaussian( keypoints, keypoints_visible) else: + # placeholder for empty keypoints x, y, z = np.zeros((3, 1), dtype=np.float32) keypoint_weights = np.ones((1, )) weight_z = np.zeros_like(keypoint_weights) @@ -199,20 +167,27 @@ def decode(self, x: np.ndarray, y: np.ndarray, z: np.ndarray): keypoints = keypoints[None, :] scores = scores[None, :] + if self.use_dark: + x_blur = int((self.sigma[0] * 20 - 7) // 3) + y_blur = int((self.sigma[1] * 20 - 7) // 3) + z_blur = int((self.sigma[2] * 20 - 7) // 3) + x_blur -= int((x_blur % 2) == 0) + y_blur -= int((y_blur % 2) == 0) + z_blur -= int((z_blur % 2) == 0) + keypoints[:, :, 0] = refine_simcc_dark(keypoints[:, :, 0], x, + x_blur) + keypoints[:, :, 1] = refine_simcc_dark(keypoints[:, :, 1], y, + y_blur) + keypoints[:, :, 2] = refine_simcc_dark(keypoints[:, :, 2], z, + z_blur) + keypoints /= 
self.simcc_split_ratio keypoints_simcc = keypoints.copy() - keypoints_2d = keypoints[..., :2] keypoints_z = keypoints[..., 2:3] - if self.sigmoid_z: - keypoints_z /= self.input_size[2] - keypoints_z[keypoints_z <= 0] = 1e-8 - scores[(keypoints_z <= 0).squeeze(-1)] = 0 - keypoints[..., 2:3] = np.log(keypoints_z / (1 - keypoints_z)) / 3 - else: - keypoints[..., - 2:3] = (keypoints_z / - (self.input_size[-1] / 2) - 1) * self.z_range - return keypoints_2d, keypoints, keypoints_simcc, scores + + keypoints[..., 2:3] = (keypoints_z / + (self.input_size[-1] / 2) - 1) * self.z_range + return keypoints, keypoints_simcc, scores def _map_coordinates( self, @@ -285,51 +260,3 @@ def _generate_gaussian( target_z /= norm_value[2] return target_x, target_y, target_z, keypoint_weights - - def _generate_standard( - self, - keypoints: np.ndarray, - keypoints_visible: Optional[np.ndarray] = None - ) -> tuple[ndarray, ndarray, ndarray, Any]: - """Encoding keypoints into SimCC labels with Standard Label Smoothing - strategy. - - Labels will be one-hot vectors if self.label_smooth_weight==0.0 - """ - - N, K, _ = keypoints.shape - w, h, d = self.input_size - w = np.around(w * self.simcc_split_ratio).astype(int) - h = np.around(h * self.simcc_split_ratio).astype(int) - d = np.around(d * self.simcc_split_ratio).astype(int) - - keypoints_split, keypoint_weights = self._map_coordinates( - keypoints, keypoints_visible) - - x = np.zeros((N, K, w), dtype=np.float32) - y = np.zeros((N, K, h), dtype=np.float32) - z = np.zeros((N, K, d), dtype=np.float32) - - for n, k in product(range(N), range(K)): - # skip unlabled keypoints - if keypoints_visible[n, k] < 0.5: - continue - - # get center coordinates - mu_x, mu_y, mu_z = keypoints_split[n, k].astype(np.int64) - - # detect abnormal coords and assign the weight 0 - if mu_x >= w or mu_y >= h or mu_x < 0 or mu_y < 0: - keypoint_weights[n, k] = 0 - continue - - if self.label_smooth_weight > 0: - x[n, k] = self.label_smooth_weight / (w - 1) - y[n, k] = self.label_smooth_weight / (h - 1) - z[n, k] = self.label_smooth_weight / (d - 1) - - x[n, k, mu_x] = 1.0 - self.label_smooth_weight - y[n, k, mu_y] = 1.0 - self.label_smooth_weight - z[n, k, mu_z] = 1.0 - self.label_smooth_weight - - return x, y, z, keypoint_weights From 88ad8e4f2fa6e2b5196b41852a5996602c4c4eb2 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Sun, 28 Apr 2024 16:27:13 +0800 Subject: [PATCH 07/15] --fix=fix 3d training --- .../datasets/base/base_coco_style_dataset.py | 2 + mmpose/datasets/datasets/body/mpii_dataset.py | 3 +- .../wholebody/coco_wholebody_dataset.py | 3 +- .../datasets/wholebody3d/h3wb_dataset.py | 41 ++++++++--- .../datasets/wholebody3d/ubody3d_dataset.py | 68 +++++++++++++------ 5 files changed, 86 insertions(+), 31 deletions(-) diff --git a/mmpose/datasets/datasets/base/base_coco_style_dataset.py b/mmpose/datasets/datasets/base/base_coco_style_dataset.py index ac94961f2c..9a223984e0 100644 --- a/mmpose/datasets/datasets/base/base_coco_style_dataset.py +++ b/mmpose/datasets/datasets/base/base_coco_style_dataset.py @@ -210,6 +210,8 @@ def load_data_list(self) -> List[dict]: data_list = self._get_bottomup_data_infos( instance_list, image_list) + if hasattr(self, 'coco'): + del self.coco return data_list def _load_annotations(self) -> Tuple[List[dict], List[dict]]: diff --git a/mmpose/datasets/datasets/body/mpii_dataset.py b/mmpose/datasets/datasets/body/mpii_dataset.py index 28d53bd8b8..d60338657e 100644 --- a/mmpose/datasets/datasets/body/mpii_dataset.py +++ 
b/mmpose/datasets/datasets/body/mpii_dataset.py @@ -221,5 +221,6 @@ def _load_annotations(self) -> Tuple[List[dict], List[dict]]: instance_list.append(instance_info) ann_id = ann_id + 1 - + del self.anns + self.coco = None return instance_list, image_list diff --git a/mmpose/datasets/datasets/wholebody/coco_wholebody_dataset.py b/mmpose/datasets/datasets/wholebody/coco_wholebody_dataset.py index b0e20e1335..2539a9817e 100644 --- a/mmpose/datasets/datasets/wholebody/coco_wholebody_dataset.py +++ b/mmpose/datasets/datasets/wholebody/coco_wholebody_dataset.py @@ -120,12 +120,13 @@ def parse_data_info(self, raw_data_info: dict) -> Optional[dict]: 'bbox_score': np.ones(1, dtype=np.float32), 'num_keypoints': num_keypoints, 'keypoints': keypoints, + 'keypoints_3d': None, 'keypoints_visible': keypoints_visible, 'iscrowd': ann['iscrowd'], 'segmentation': ann['segmentation'], 'area': area, 'id': ann['id'], - 'category_id': np.array(ann['category_id']), + 'category_id': ann['category_id'], # store the raw annotation of the instance # it is useful for evaluation without providing ann_file 'raw_ann_info': copy.deepcopy(ann), diff --git a/mmpose/datasets/datasets/wholebody3d/h3wb_dataset.py b/mmpose/datasets/datasets/wholebody3d/h3wb_dataset.py index 95e40db4b4..26ffa7d14f 100644 --- a/mmpose/datasets/datasets/wholebody3d/h3wb_dataset.py +++ b/mmpose/datasets/datasets/wholebody3d/h3wb_dataset.py @@ -106,6 +106,7 @@ def _load_ann_file(self, ann_file: str) -> dict: self.ann_data = data['train_data'].item() self.camera_data = data['metadata'].item() + self.bboxes = data['bbox'].item() def get_sequence_indices(self) -> List[List[int]]: return [] @@ -132,19 +133,26 @@ def _load_annotations(self) -> Tuple[List[dict], List[dict]]: 'K': camera_param['K'][0, :2, ...], 'R': camera_param['R'][0], 'T': camera_param['T'].reshape(3, 1), - 'Distortion': camera_param['Distortion'][0] + 'Distortion': camera_param['Distortion'][0], } + camera_param['f'] = (camera_param['K'][0, 0] * 1000, + camera_param['K'][1, 1] * 1000) + camera_param['c'] = (camera_param['K'][0, 2] * 1000, + camera_param['K'][1, 2] * 1000) seq_step = 1 _len = (self.seq_len - 1) * seq_step + 1 _indices = list( range(len(self.ann_data[subject][act]['frame_id']))) + seq_indices = [ _indices[i:(i + _len):seq_step] for i in list(range(0, len(_indices) - _len + 1)) ] + frames = self.ann_data[subject][act]['frame_id'] + for idx, frame_ids in enumerate(seq_indices): expected_num_frames = self.seq_len if self.multiple_target: @@ -163,6 +171,20 @@ def _load_annotations(self) -> Tuple[List[dict], List[dict]]: if self.multiple_target > 0: target_idx = list(range(self.multiple_target)) + bbox = self.bboxes[(subject, act, cam, + frames[frame_ids[-1]])] + bbox = np.array([[ + bbox['x_min'], bbox['y_min'], bbox['x_max'], + bbox['y_max'] + ]], + dtype=np.float32) + + img_path = f'{self.data_root}original/{subject}/Images/{act}.{cam}/frame_{frames[frame_ids[-1]]}.jpg' # noqa + img_paths = [ + f'{self.data_root}original/{subject}/Images/{act}.{cam}/frame_{frames[i]}.jpg' # noqa + for i in frame_ids + ] + instance_info = { 'num_keypoints': num_keypoints, @@ -174,6 +196,10 @@ def _load_annotations(self) -> Tuple[List[dict], List[dict]]: np.ones_like(_kpts_2d[..., 0], dtype=np.float32), 'keypoints_3d_visible': np.ones_like(_kpts_2d[..., 0], dtype=np.float32), + 'bbox': + bbox, + 'bbox_score': + np.ones((len(frame_ids), )), 'scale': np.zeros((1, 1), dtype=np.float32), 'center': @@ -186,12 +212,11 @@ def _load_annotations(self) -> Tuple[List[dict], List[dict]]: 1, 'iscrowd': 
0, - 'camera_param': - camera_param, - 'img_paths': [ - f'{subject}/{act}/{cam}/{i:06d}.jpg' - for i in frame_ids - ], + 'camera_param': [camera_param], + 'img_paths': + img_paths, + 'img_path': + img_path, 'img_ids': frame_ids, 'lifting_target': @@ -209,5 +234,5 @@ def _load_annotations(self) -> Tuple[List[dict], List[dict]]: image_list.append(img_info) instance_id += 1 - + del self.ann_data return instance_list, image_list diff --git a/mmpose/datasets/datasets/wholebody3d/ubody3d_dataset.py b/mmpose/datasets/datasets/wholebody3d/ubody3d_dataset.py index 85b8d893e7..5e6e564168 100644 --- a/mmpose/datasets/datasets/wholebody3d/ubody3d_dataset.py +++ b/mmpose/datasets/datasets/wholebody3d/ubody3d_dataset.py @@ -84,7 +84,7 @@ def __init__(self, super().__init__(multiple_target=multiple_target, **kwargs) - METAINFO: dict = dict(from_file='configs/_base_/datasets/ubody3d.py') + METAINFO: dict = dict(from_file='configs/_base_/datasets/h3wb.py') def _load_ann_file(self, ann_file: str) -> dict: """Load annotation file.""" @@ -167,7 +167,7 @@ def _parse_image_name(self, image_path: str) -> Tuple[str, int]: def _load_annotations(self): """Load data from annotations in COCO format.""" - num_keypoints = self.metainfo['num_keypoints'] + num_keypoints = 133 self._metainfo['CLASSES'] = self.ann_data.loadCats( self.ann_data.getCatIds()) @@ -184,23 +184,37 @@ def _load_annotations(self): f'got {len(_ann_ids)} ') anns = self.ann_data.loadAnns(_ann_ids) + num_anns = len(anns) img_ids = [] - kpts = np.zeros((len(anns), num_keypoints, 2), dtype=np.float32) - kpts_3d = np.zeros((len(anns), num_keypoints, 3), dtype=np.float32) - keypoints_visible = np.zeros((len(anns), num_keypoints, 1), + kpts = np.zeros((num_anns, num_keypoints, 2), dtype=np.float32) + kpts_3d = np.zeros((num_anns, num_keypoints, 3), dtype=np.float32) + keypoints_visible = np.zeros((num_anns, num_keypoints), dtype=np.float32) + scales = np.zeros((num_anns, 2), dtype=np.float32) + centers = np.zeros((num_anns, 2), dtype=np.float32) + bboxes = np.zeros((num_anns, 4), dtype=np.float32) + bbox_scores = np.zeros((num_anns, ), dtype=np.float32) + bbox_scales = np.zeros((num_anns, 2), dtype=np.float32) + for j, ann in enumerate(anns): img_ids.append(ann['image_id']) kpts[j] = np.array(ann['keypoints'], dtype=np.float32) kpts_3d[j] = np.array(ann['keypoints_3d'], dtype=np.float32) keypoints_visible[j] = np.array( ann['keypoints_valid'], dtype=np.float32) + if 'scale' in ann: + scales[j] = np.array(ann['scale']) + if 'center' in ann: + centers[j] = np.array(ann['center']) + bboxes[j] = np.array(ann['bbox'], dtype=np.float32) + bbox_scores[j] = np.array([1], dtype=np.float32) + bbox_scales[j] = np.array([1, 1], dtype=np.float32) + imgs = self.ann_data.loadImgs(img_ids) - keypoints_visible = keypoints_visible.squeeze(-1) - scales = np.zeros(len(imgs), dtype=np.float32) - centers = np.zeros((len(imgs), 2), dtype=np.float32) - img_paths = np.array([img['file_name'] for img in imgs]) + img_paths = np.array([ + f'{self.data_root}/images/' + img['file_name'] for img in imgs + ]) factors = np.zeros((kpts_3d.shape[0], ), dtype=np.float32) target_idx = [-1] if self.causal else [int(self.seq_len // 2)] @@ -212,6 +226,8 @@ def _load_annotations(self): cam_param['w'] = 1000 cam_param['h'] = 1000 + cam_param = {'f': cam_param['focal'], 'c': cam_param['princpt']} + instance_info = { 'num_keypoints': num_keypoints, 'keypoints': kpts, @@ -223,25 +239,35 @@ def _load_annotations(self): 'category_id': 1, 'iscrowd': 0, 'img_paths': list(img_paths), + 'img_path': 
img_paths[-1], 'img_ids': [img['id'] for img in imgs], 'lifting_target': kpts_3d[target_idx], 'lifting_target_visible': keypoints_visible[target_idx], - 'target_img_paths': img_paths[target_idx], - 'camera_param': cam_param, + 'target_img_paths': list(img_paths[target_idx]), + 'camera_param': [cam_param], 'factor': factors, 'target_idx': target_idx, + 'bbox': bboxes, + 'bbox_scales': bbox_scales, + 'bbox_scores': bbox_scores } instance_list.append(instance_info) - for img_id in self.ann_data.getImgIds(): - img = self.ann_data.loadImgs(img_id)[0] - img.update({ - 'img_id': - img_id, - 'img_path': - osp.join(self.data_prefix['img'], img['file_name']), - }) - image_list.append(img) - + if self.data_mode == 'bottomup': + for img_id in self.ann_data.getImgIds(): + img = self.ann_data.loadImgs(img_id)[0] + img.update({ + 'img_id': + img_id, + 'img_path': + osp.join(self.data_prefix['img'], img['file_name']), + }) + image_list.append(img) + del self.ann_data return instance_list, image_list + + def load_data_list(self) -> List[dict]: + data_list = super().load_data_list() + self.ann_data = None + return data_list From 50af983cc9ae8f54763a03588dde3cdf3f52dbb9 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Sun, 28 Apr 2024 16:43:18 +0800 Subject: [PATCH 08/15] --fix=fix configs --- configs/_base_/datasets/h3wb.py | 256 +++++++++--------- .../rtmw3d-l_8xb64_cocktail14-384x288.py | 15 +- .../rtmw3d-x_8xb32_cocktail14-384x288.py | 15 +- 3 files changed, 138 insertions(+), 148 deletions(-) diff --git a/configs/_base_/datasets/h3wb.py b/configs/_base_/datasets/h3wb.py index bb47a1b3f5..5a341cebda 100644 --- a/configs/_base_/datasets/h3wb.py +++ b/configs/_base_/datasets/h3wb.py @@ -168,433 +168,433 @@ dict( name='face-0', id=23, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-16'), 24: dict( name='face-1', id=24, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-15'), 25: dict( name='face-2', id=25, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-14'), 26: dict( name='face-3', id=26, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-13'), 27: dict( name='face-4', id=27, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-12'), 28: dict( name='face-5', id=28, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-11'), 29: dict( name='face-6', id=29, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-10'), 30: dict( name='face-7', id=30, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-9'), 31: - dict(name='face-8', id=31, color=[255, 255, 255], type='', swap=''), + dict(name='face-8', id=31, color=[247, 34, 5], type='upper', swap=''), 32: dict( name='face-9', id=32, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-7'), 33: dict( name='face-10', id=33, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-6'), 34: dict( name='face-11', id=34, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-5'), 35: dict( name='face-12', id=35, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-4'), 36: dict( name='face-13', id=36, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-3'), 37: dict( name='face-14', id=37, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + 
type='upper', swap='face-2'), 38: dict( name='face-15', id=38, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-1'), 39: dict( name='face-16', id=39, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-0'), 40: dict( name='face-17', id=40, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-26'), 41: dict( name='face-18', id=41, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-25'), 42: dict( name='face-19', id=42, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-24'), 43: dict( name='face-20', id=43, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-23'), 44: dict( name='face-21', id=44, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-22'), 45: dict( name='face-22', id=45, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-21'), 46: dict( name='face-23', id=46, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-20'), 47: dict( name='face-24', id=47, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-19'), 48: dict( name='face-25', id=48, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-18'), 49: dict( name='face-26', id=49, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-17'), 50: - dict(name='face-27', id=50, color=[255, 255, 255], type='', swap=''), + dict(name='face-27', id=50, color=[247, 34, 5], type='upper', swap=''), 51: - dict(name='face-28', id=51, color=[255, 255, 255], type='', swap=''), + dict(name='face-28', id=51, color=[247, 34, 5], type='upper', swap=''), 52: - dict(name='face-29', id=52, color=[255, 255, 255], type='', swap=''), + dict(name='face-29', id=52, color=[247, 34, 5], type='upper', swap=''), 53: - dict(name='face-30', id=53, color=[255, 255, 255], type='', swap=''), + dict(name='face-30', id=53, color=[247, 34, 5], type='upper', swap=''), 54: dict( name='face-31', id=54, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-35'), 55: dict( name='face-32', id=55, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-34'), 56: - dict(name='face-33', id=56, color=[255, 255, 255], type='', swap=''), + dict(name='face-33', id=56, color=[247, 34, 5], type='upper', swap=''), 57: dict( name='face-34', id=57, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-32'), 58: dict( name='face-35', id=58, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-31'), 59: dict( name='face-36', id=59, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-45'), 60: dict( name='face-37', id=60, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-44'), 61: dict( name='face-38', id=61, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-43'), 62: dict( name='face-39', id=62, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-42'), 63: dict( name='face-40', id=63, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-47'), 64: dict( name='face-41', id=64, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-46'), 65: dict( name='face-42', id=65, - color=[255, 
255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-39'), 66: dict( name='face-43', id=66, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-38'), 67: dict( name='face-44', id=67, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-37'), 68: dict( name='face-45', id=68, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-36'), 69: dict( name='face-46', id=69, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-41'), 70: dict( name='face-47', id=70, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-40'), 71: dict( name='face-48', id=71, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-54'), 72: dict( name='face-49', id=72, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-53'), 73: dict( name='face-50', id=73, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-52'), 74: - dict(name='face-51', id=74, color=[255, 255, 255], type='', swap=''), + dict(name='face-51', id=74, color=[247, 34, 5], type='upper', swap=''), 75: dict( name='face-52', id=75, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-50'), 76: dict( name='face-53', id=76, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-49'), 77: dict( name='face-54', id=77, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-48'), 78: dict( name='face-55', id=78, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-59'), 79: dict( name='face-56', id=79, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-58'), 80: - dict(name='face-57', id=80, color=[255, 255, 255], type='', swap=''), + dict(name='face-57', id=80, color=[247, 34, 5], type='upper', swap=''), 81: dict( name='face-58', id=81, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-56'), 82: dict( name='face-59', id=82, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-55'), 83: dict( name='face-60', id=83, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-64'), 84: dict( name='face-61', id=84, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-63'), 85: - dict(name='face-62', id=85, color=[255, 255, 255], type='', swap=''), + dict(name='face-62', id=85, color=[247, 34, 5], type='upper', swap=''), 86: dict( name='face-63', id=86, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-61'), 87: dict( name='face-64', id=87, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-60'), 88: dict( name='face-65', id=88, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-67'), 89: - dict(name='face-66', id=89, color=[255, 255, 255], type='', swap=''), + dict(name='face-66', id=89, color=[247, 34, 5], type='upper', swap=''), 90: dict( name='face-67', id=90, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-65'), 91: dict( name='left_hand_root', id=91, - color=[255, 255, 255], + color=[247, 34, 5], type='', swap='right_hand_root'), 92: @@ -741,7 +741,7 @@ dict( name='right_hand_root', id=112, - color=[255, 255, 255], + color=[247, 34, 5], type='', swap='left_hand_root'), 113: diff 
--git a/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py b/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py index 460ae6300d..997d43a6fa 100644 --- a/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py +++ b/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py @@ -139,11 +139,6 @@ # pipelines train_pipeline = [ dict(type='LoadImage', backend_args=backend_args), - dict( - type='RandomBackground', - bg_dir='/mnt/data/oss_beijing/mmseg/obj365v1_images', - bg_prob=0.5, - ), dict(type='GetBBoxCenterScale'), dict(type='RandomFlip', direction='horizontal'), dict(type='RandomHalfBody'), @@ -467,7 +462,7 @@ dataset_wb = dict( type='CombinedDataset', - metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + metainfo=dict(from_file='mmpose::_base_/datasets/coco_wholebody.py'), datasets=[dataset_coco, dataset_halpe], pipeline=[], test_mode=False, @@ -475,7 +470,7 @@ dataset_body = dict( type='CombinedDataset', - metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + metainfo=dict(from_file='mmpose::_base_/datasets/coco_wholebody.py'), datasets=[ dataset_aic, dataset_crowdpose, @@ -490,7 +485,7 @@ dataset_face = dict( type='CombinedDataset', - metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + metainfo=dict(from_file='mmpose::_base_/datasets/coco_wholebody.py'), datasets=[ dataset_wflw, dataset_300w, @@ -558,7 +553,7 @@ dataset_hand = dict( type='CombinedDataset', - metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + metainfo=dict(from_file='mmpose::_base_/datasets/coco_wholebody.py'), datasets=[dataset_interhand3d], pipeline=[], test_mode=False, @@ -614,7 +609,7 @@ type='CombinedDataset', datasets=train_datasets, pipeline=train_pipeline, - metainfo=dict(from_file='configs/_base_/datasets/h3wb.py'), + metainfo=dict(from_file='mmpose::_base_/datasets/h3wb.py'), test_mode=False)) # hooks custom_hooks = [ diff --git a/projects/rtmpose3d/configs/rtmw3d-x_8xb32_cocktail14-384x288.py b/projects/rtmpose3d/configs/rtmw3d-x_8xb32_cocktail14-384x288.py index c171835e0e..0126290826 100644 --- a/projects/rtmpose3d/configs/rtmw3d-x_8xb32_cocktail14-384x288.py +++ b/projects/rtmpose3d/configs/rtmw3d-x_8xb32_cocktail14-384x288.py @@ -140,11 +140,6 @@ # pipelines train_pipeline = [ dict(type='LoadImage', backend_args=backend_args), - dict( - type='RandomBackground', - bg_dir='/mnt/data/oss_beijing/mmseg/obj365v1_images', - bg_prob=0.5, - ), dict(type='GetBBoxCenterScale'), dict(type='RandomFlip', direction='horizontal'), dict(type='RandomHalfBody'), @@ -468,7 +463,7 @@ dataset_wb = dict( type='CombinedDataset', - metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + metainfo=dict(from_file='mmpose::_base_/datasets/coco_wholebody.py'), datasets=[dataset_coco, dataset_halpe], pipeline=[], test_mode=False, @@ -476,7 +471,7 @@ dataset_body = dict( type='CombinedDataset', - metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + metainfo=dict(from_file='mmpose::_base_/datasets/coco_wholebody.py'), datasets=[ dataset_aic, dataset_crowdpose, @@ -491,7 +486,7 @@ dataset_face = dict( type='CombinedDataset', - metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + metainfo=dict(from_file='mmpose::_base_/datasets/coco_wholebody.py'), datasets=[ dataset_wflw, dataset_300w, @@ -559,7 +554,7 @@ dataset_hand = dict( type='CombinedDataset', - metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + 
metainfo=dict(from_file='mmpose::_base_/datasets/coco_wholebody.py'), datasets=[dataset_interhand3d], pipeline=[], test_mode=False, @@ -615,7 +610,7 @@ type='CombinedDataset', datasets=train_datasets, pipeline=train_pipeline, - metainfo=dict(from_file='configs/_base_/datasets/h3wb.py'), + metainfo=dict(from_file='mmpose::_base_/datasets/h3wb.py'), test_mode=False)) # hooks From 907d1cbb6cd55b6ad5647c7c439b3c869ed334c5 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Sun, 28 Apr 2024 17:00:18 +0800 Subject: [PATCH 09/15] --fix=fix pretrain link --- projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py | 2 +- projects/rtmpose3d/configs/rtmw3d-x_8xb32_cocktail14-384x288.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py b/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py index 997d43a6fa..8511a4a039 100644 --- a/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py +++ b/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py @@ -55,7 +55,7 @@ use_dark=False, root_index=(11, 12)) -backbone_path = 'checkpoints/rtmpose-l_simcc-ucoco_dw-ucoco_270e-256x192-4d6dfc62_20230728.pth' # noqa +backbone_path = 'https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-ucoco_dw-ucoco_270e-256x192-4d6dfc62_20230728.pth' # noqa # model settings model = dict( diff --git a/projects/rtmpose3d/configs/rtmw3d-x_8xb32_cocktail14-384x288.py b/projects/rtmpose3d/configs/rtmw3d-x_8xb32_cocktail14-384x288.py index 0126290826..61f93f108a 100644 --- a/projects/rtmpose3d/configs/rtmw3d-x_8xb32_cocktail14-384x288.py +++ b/projects/rtmpose3d/configs/rtmw3d-x_8xb32_cocktail14-384x288.py @@ -55,7 +55,7 @@ use_dark=False, root_index=(11, 12)) -backbone_path = 'checkpoints/rtmpose-x_simcc-ucoco_pt-aic-coco_270e-384x288-f5b50679_20230822.pth' # noqa +backbone_path = 'https://download.openmmlab.com/mmpose/v1/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-x_simcc-ucoco_pt-aic-coco_270e-384x288-f5b50679_20230822.pth' # noqa # model settings model = dict( From cf2420296c587945d30603736f2344bb9f87a672 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Sun, 28 Apr 2024 17:37:54 +0800 Subject: [PATCH 10/15] --fix=fix loss name --- mmpose/models/losses/regression_loss.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/mmpose/models/losses/regression_loss.py b/mmpose/models/losses/regression_loss.py index 948d65bae7..83d03625b1 100644 --- a/mmpose/models/losses/regression_loss.py +++ b/mmpose/models/losses/regression_loss.py @@ -573,7 +573,11 @@ class BoneLoss(nn.Module): loss_weight (float): Weight of the loss. Default: 1.0. """ - def __init__(self, joint_parents, use_target_weight=False, loss_weight=1.): + def __init__(self, + joint_parents, + use_target_weight: bool = False, + loss_weight: float = 1., + loss_name: str = 'loss_bone'): super().__init__() self.joint_parents = joint_parents self.use_target_weight = use_target_weight @@ -584,6 +588,8 @@ def __init__(self, joint_parents, use_target_weight=False, loss_weight=1.): if i != self.joint_parents[i]: self.non_root_indices.append(i) + self._loss_name = loss_name + def forward(self, output, target, target_weight=None): """Forward function. 
@@ -606,6 +612,7 @@ def forward(self, output, target, target_weight=None): dim=-1)[:, self.non_root_indices] if self.use_target_weight: assert target_weight is not None + target_weight = target_weight[:, self.non_root_indices] loss = torch.mean( torch.abs((output_bone * target_weight).mean(dim=0) - (target_bone * target_weight).mean(dim=0))) @@ -615,6 +622,15 @@ def forward(self, output, target, target_weight=None): return loss * self.loss_weight + @property + def loss_name(self): + """Loss Name. + + Returns: + str: The name of this loss item. + """ + return self._loss_name + @MODELS.register_module() class SemiSupervisionLoss(nn.Module): From e0ebd1d6a9c8ab48d3bdec8257a5501f55129d47 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Mon, 6 May 2024 10:17:32 +0800 Subject: [PATCH 11/15] --doc=update readme --- projects/rtmpose3d/README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/projects/rtmpose3d/README.md b/projects/rtmpose3d/README.md index 4f75c44fe2..b680f2a6b7 100644 --- a/projects/rtmpose3d/README.md +++ b/projects/rtmpose3d/README.md @@ -4,9 +4,16 @@ RTMPose3D is a toolkit for real-time 3D pose estimation. It is based on the RTMPose model, which is a 2D pose estimation model that is capable of predicting 2D keypoints and body part associations in real-time. RTMPose3D extends RTMPose by adding a 3D pose estimation branch that can predict 3D keypoints from images directly. +## 🗂️ Model Zoo + +| Model | AP on COCO-Wholebody | MPJPE on H3WB | Download | +| :--------------------------------------------------------- | :------------------: | :-----------: | :-----------------------------------------------------------------------------------------------------------: | +| [RTMW3D-L](./configs/rtmw3d-l_8xb64_cocktail14-384x288.py) | 0.678 | 0.052 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_coco-640x640-8db55a59_20231211.pth) | +| [RTMW3D-X](./configs/rtmw3d-x_8xb32_cocktail14-384x288.py) | 0.687 | 0.056 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_coco-640x640-8db55a59_20231211.pth) | + ## Usage -👉🏼 TRY RTMO NOW +👉🏼 TRY RTMPose3D NOW ```bash cd /path/to/mmpose/projects/rtmpose3d From eb42de036e76ce2b1dbeb12d81f6573521e21790 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Wed, 26 Jun 2024 14:12:15 +0800 Subject: [PATCH 12/15] --update=make path more robust --- mmpose/datasets/datasets/wholebody3d/h3wb_dataset.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mmpose/datasets/datasets/wholebody3d/h3wb_dataset.py b/mmpose/datasets/datasets/wholebody3d/h3wb_dataset.py index 26ffa7d14f..f6df56c1b8 100644 --- a/mmpose/datasets/datasets/wholebody3d/h3wb_dataset.py +++ b/mmpose/datasets/datasets/wholebody3d/h3wb_dataset.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+import os.path as osp from typing import List, Tuple import numpy as np @@ -179,9 +180,10 @@ def _load_annotations(self) -> Tuple[List[dict], List[dict]]: ]], dtype=np.float32) - img_path = f'{self.data_root}original/{subject}/Images/{act}.{cam}/frame_{frames[frame_ids[-1]]}.jpg' # noqa img_paths = [ - f'{self.data_root}original/{subject}/Images/{act}.{cam}/frame_{frames[i]}.jpg' # noqa + osp.join(self.data_root, 'original', subject, + 'Images', f'{act}.{cam}', + f'frame_{frames[i]}.jpg') # noqa for i in frame_ids ] @@ -216,7 +218,7 @@ def _load_annotations(self) -> Tuple[List[dict], List[dict]]: 'img_paths': img_paths, 'img_path': - img_path, + img_paths[-1], 'img_ids': frame_ids, 'lifting_target': From e35ff2376286e999c0efa4eb1907a6fa7d87bb1d Mon Sep 17 00:00:00 2001 From: xiexinch Date: Wed, 26 Jun 2024 21:22:42 +0800 Subject: [PATCH 13/15] --update=update results --- projects/rtmpose3d/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/rtmpose3d/README.md b/projects/rtmpose3d/README.md index b680f2a6b7..d73e710dc0 100644 --- a/projects/rtmpose3d/README.md +++ b/projects/rtmpose3d/README.md @@ -9,7 +9,7 @@ RTMPose3D is a toolkit for real-time 3D pose estimation. It is based on the RTMP | Model | AP on COCO-Wholebody | MPJPE on H3WB | Download | | :--------------------------------------------------------- | :------------------: | :-----------: | :-----------------------------------------------------------------------------------------------------------: | | [RTMW3D-L](./configs/rtmw3d-l_8xb64_cocktail14-384x288.py) | 0.678 | 0.052 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_coco-640x640-8db55a59_20231211.pth) | -| [RTMW3D-X](./configs/rtmw3d-x_8xb32_cocktail14-384x288.py) | 0.687 | 0.056 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_coco-640x640-8db55a59_20231211.pth) | +| [RTMW3D-X](./configs/rtmw3d-x_8xb32_cocktail14-384x288.py) | 0.680 | 0.052 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_coco-640x640-8db55a59_20231211.pth) | ## Usage From 755e515203047d14f6a265eef187f7b10c4c0e35 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Thu, 27 Jun 2024 11:21:05 +0800 Subject: [PATCH 14/15] --fix=fix lint --- projects/rtmpose3d/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/rtmpose3d/README.md b/projects/rtmpose3d/README.md index d73e710dc0..8c9c7542f8 100644 --- a/projects/rtmpose3d/README.md +++ b/projects/rtmpose3d/README.md @@ -9,7 +9,7 @@ RTMPose3D is a toolkit for real-time 3D pose estimation. 
It is based on the RTMP | Model | AP on COCO-Wholebody | MPJPE on H3WB | Download | | :--------------------------------------------------------- | :------------------: | :-----------: | :-----------------------------------------------------------------------------------------------------------: | | [RTMW3D-L](./configs/rtmw3d-l_8xb64_cocktail14-384x288.py) | 0.678 | 0.052 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_coco-640x640-8db55a59_20231211.pth) | -| [RTMW3D-X](./configs/rtmw3d-x_8xb32_cocktail14-384x288.py) | 0.680 | 0.052 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_coco-640x640-8db55a59_20231211.pth) | +| [RTMW3D-X](./configs/rtmw3d-x_8xb32_cocktail14-384x288.py) | 0.680 | 0.052 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_coco-640x640-8db55a59_20231211.pth) | ## Usage From 0f8da98e509bd92112477b86117a69356184f102 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Fri, 12 Jul 2024 10:43:58 +0800 Subject: [PATCH 15/15] --update=add ckpt links --- projects/rtmpose3d/README.md | 7 +++++-- .../rtmpose3d/configs/rtmw3d-x_8xb32_cocktail14-384x288.py | 5 ++--- projects/rtmpose3d/rtmpose3d/pose_estimator.py | 2 +- projects/rtmpose3d/rtmpose3d/simcc_3d_label.py | 3 +++ 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/projects/rtmpose3d/README.md b/projects/rtmpose3d/README.md index 8c9c7542f8..b8cd0adcda 100644 --- a/projects/rtmpose3d/README.md +++ b/projects/rtmpose3d/README.md @@ -4,12 +4,14 @@ RTMPose3D is a toolkit for real-time 3D pose estimation. It is based on the RTMPose model, which is a 2D pose estimation model that is capable of predicting 2D keypoints and body part associations in real-time. RTMPose3D extends RTMPose by adding a 3D pose estimation branch that can predict 3D keypoints from images directly. +Please refer to our [technical report](https://arxiv.org/pdf/2407.08634) for more details. + ## 🗂️ Model Zoo | Model | AP on COCO-Wholebody | MPJPE on H3WB | Download | | :--------------------------------------------------------- | :------------------: | :-----------: | :-----------------------------------------------------------------------------------------------------------: | -| [RTMW3D-L](./configs/rtmw3d-l_8xb64_cocktail14-384x288.py) | 0.678 | 0.052 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_coco-640x640-8db55a59_20231211.pth) | -| [RTMW3D-X](./configs/rtmw3d-x_8xb32_cocktail14-384x288.py) | 0.680 | 0.052 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_coco-640x640-8db55a59_20231211.pth) | +| [RTMW3D-L](./configs/rtmw3d-l_8xb64_cocktail14-384x288.py) | 0.678 | 0.056 | [ckpt](https://download.openmmlab.com/mmpose/v1/wholebody_3d_keypoint/rtmw3d/rtmw3d-l_8xb64_cocktail14-384x288-794dbc78_20240626.pth) | +| [RTMW3D-X](./configs/rtmw3d-x_8xb32_cocktail14-384x288.py) | 0.680 | 0.057 | [ckpt](https://download.openmmlab.com/mmpose/v1/wholebody_3d_keypoint/rtmw3d/rtmw3d-x_8xb64_cocktail14-384x288-b0a0eab7_20240626.pth) | ## Usage @@ -17,5 +19,6 @@ RTMPose3D is a toolkit for real-time 3D pose estimation. 
It is based on the RTMP ```bash cd /path/to/mmpose/projects/rtmpose3d +export PYTHONPATH=$(pwd):$PYTHONPATH python body3d_img2pose_demo.py configs/rtmdet_m_640-8xb32_coco-person.py https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmdet_m_8xb32-100e_coco-obj365-person-235e8209.pth configs\rtmw3d-l_8xb64_cocktail14-384x288.py rtmw3d-l_cock14-0d4ad840_20240422.pth --input /path/to/image --output-root /path/to/output ``` diff --git a/projects/rtmpose3d/configs/rtmw3d-x_8xb32_cocktail14-384x288.py b/projects/rtmpose3d/configs/rtmw3d-x_8xb32_cocktail14-384x288.py index 61f93f108a..f1475e97c9 100644 --- a/projects/rtmpose3d/configs/rtmw3d-x_8xb32_cocktail14-384x288.py +++ b/projects/rtmpose3d/configs/rtmw3d-x_8xb32_cocktail14-384x288.py @@ -128,9 +128,8 @@ loss_weight=2.0) ], decoder=codec), - test_cfg=dict(flip_test=False, mode='2d') - # test_cfg=dict(flip_test=False) -) + # test_cfg=dict(flip_test=False, mode='2d') + test_cfg=dict(flip_test=False)) # base dataset settings data_mode = 'topdown' diff --git a/projects/rtmpose3d/rtmpose3d/pose_estimator.py b/projects/rtmpose3d/rtmpose3d/pose_estimator.py index 438cd2b67b..90ec43cdc7 100644 --- a/projects/rtmpose3d/rtmpose3d/pose_estimator.py +++ b/projects/rtmpose3d/rtmpose3d/pose_estimator.py @@ -67,7 +67,7 @@ def add_pred_to_datasample(self, batch_pred_instances: InstanceList, c = np.array(camera_params['c']) else: f = np.array([1145.04940459, 1143.78109572]) - c = np.array(data_sample.ori_shape) + c = np.array(data_sample.ori_shape) / 2 kpts_pixel = np.concatenate([ keypoints_2d, (keypoints_3d[..., 2] + gt_instances.root_z)[..., None] diff --git a/projects/rtmpose3d/rtmpose3d/simcc_3d_label.py b/projects/rtmpose3d/rtmpose3d/simcc_3d_label.py index ead72c5090..22e42079c8 100644 --- a/projects/rtmpose3d/rtmpose3d/simcc_3d_label.py +++ b/projects/rtmpose3d/rtmpose3d/simcc_3d_label.py @@ -18,6 +18,8 @@ class SimCC3DLabel(BaseKeypointCodec): Human Pose Estimation`_ by Li et al (2022) for more details. Old name: SimDR + We generate the SimCC label for 3D keypoint estimation. + Note: - instance number: N @@ -93,6 +95,7 @@ def __init__(self, root_index, tuple) else [root_index] # Mean value of the root z-axis of datasets + # These values are statistics from the training set self.root_z = [5.14388] self.z_range = z_range if z_range is not None else 2.1744869
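
A note on the depth encoding touched by the last hunk above: `SimCC3DLabel` stores the z coordinate as a root-relative depth mapped linearly into the discrete SimCC bin range, using the dataset statistics `root_z` and `z_range` visible in the diff. The sketch below is a minimal, self-contained illustration of that forward/inverse mapping only; the depth-bin count `D` is a placeholder stand-in for `input_size[2]` and is not taken from the configs.

```python
import numpy as np

# Minimal sketch of the z-axis mapping used by SimCC3DLabel.
# Assumptions: D is a placeholder for input_size[2]; z_range matches the
# default value that appears in the diff (2.1744869).
D = 288
z_range = 2.1744869

def encode_z(z_root_relative: np.ndarray) -> np.ndarray:
    # Map root-relative depth to continuous bin coordinates in [0, D].
    return (z_root_relative / z_range + 1.0) * (D / 2.0)

def decode_z(z_bins: np.ndarray) -> np.ndarray:
    # Inverse mapping from bin coordinates back to root-relative depth.
    return (z_bins / (D / 2.0) - 1.0) * z_range

z = np.array([-0.5, 0.0, 0.75])
assert np.allclose(decode_z(encode_z(z)), z)
```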