From 48a4bd4e53dd91f8cd00369ce44afb6287f90b22 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Wed, 24 Apr 2024 23:13:44 +0800 Subject: [PATCH 01/15] --feat=add rtmpose3d inference demo --- .../datasets/transforms/common_transforms.py | 2 +- .../datasets/transforms/topdown_transforms.py | 3 + projects/rtmpose3d/body3d_img2pose_demo.py | 439 +++++++++++ projects/rtmpose3d/configs/default_runtime.py | 54 ++ .../configs/rtmdet_m_640-8xb32_coco-person.py | 20 + .../rtmw3d-l_8xb64_cocktail14-384x288.py | 706 ++++++++++++++++++ .../rtmw3d-x_8xb64_cocktail14-384x288.py | 706 ++++++++++++++++++ projects/rtmpose3d/rtmpose3d/__init__.py | 6 + projects/rtmpose3d/rtmpose3d/loss.py | 37 + .../rtmpose3d/rtmpose3d/pose_estimator.py | 116 +++ projects/rtmpose3d/rtmpose3d/rtmw3d_head.py | 444 +++++++++++ .../rtmpose3d/rtmpose3d/simcc_3d_label.py | 335 +++++++++ projects/rtmpose3d/rtmpose3d/utils.py | 76 ++ 13 files changed, 2943 insertions(+), 1 deletion(-) create mode 100644 projects/rtmpose3d/body3d_img2pose_demo.py create mode 100644 projects/rtmpose3d/configs/default_runtime.py create mode 100644 projects/rtmpose3d/configs/rtmdet_m_640-8xb32_coco-person.py create mode 100644 projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py create mode 100644 projects/rtmpose3d/configs/rtmw3d-x_8xb64_cocktail14-384x288.py create mode 100644 projects/rtmpose3d/rtmpose3d/__init__.py create mode 100644 projects/rtmpose3d/rtmpose3d/loss.py create mode 100644 projects/rtmpose3d/rtmpose3d/pose_estimator.py create mode 100644 projects/rtmpose3d/rtmpose3d/rtmw3d_head.py create mode 100644 projects/rtmpose3d/rtmpose3d/simcc_3d_label.py create mode 100644 projects/rtmpose3d/rtmpose3d/utils.py diff --git a/mmpose/datasets/transforms/common_transforms.py b/mmpose/datasets/transforms/common_transforms.py index 33f9c560c0..b29417f045 100644 --- a/mmpose/datasets/transforms/common_transforms.py +++ b/mmpose/datasets/transforms/common_transforms.py @@ -973,7 +973,7 @@ def transform(self, results: Dict) -> Optional[dict]: # For single encoding, the encoded items will be directly added # into results. auxiliary_encode_kwargs = { - key: results[key] + key: results.get(key, None) for key in self.encoder.auxiliary_encode_keys } encoded = self.encoder.encode( diff --git a/mmpose/datasets/transforms/topdown_transforms.py b/mmpose/datasets/transforms/topdown_transforms.py index 3480c5b38c..c76d45e46a 100644 --- a/mmpose/datasets/transforms/topdown_transforms.py +++ b/mmpose/datasets/transforms/topdown_transforms.py @@ -126,6 +126,9 @@ def transform(self, results: Dict) -> Optional[dict]: transformed_keypoints[..., :2] = cv2.transform( results['keypoints'][..., :2], warp_mat) results['transformed_keypoints'] = transformed_keypoints + else: + results['transformed_keypoints'] = np.zeros([]) + results['keypoints_visible'] = np.ones((1, 1, 1)) results['input_size'] = (w, h) results['input_center'] = center diff --git a/projects/rtmpose3d/body3d_img2pose_demo.py b/projects/rtmpose3d/body3d_img2pose_demo.py new file mode 100644 index 0000000000..200043d7d4 --- /dev/null +++ b/projects/rtmpose3d/body3d_img2pose_demo.py @@ -0,0 +1,439 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
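+# Example usage (a sketch only: the two ``*.pth`` checkpoint paths and the
+# input/output names below are placeholders, not files added by this patch):
+#   python body3d_img2pose_demo.py \
+#       configs/rtmdet_m_640-8xb32_coco-person.py rtmdet_m_ckpt.pth \
+#       configs/rtmw3d-l_8xb64_cocktail14-384x288.py rtmw3d_l_ckpt.pth \
+#       --input demo.mp4 --output-root vis_results --save-predictions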
+import logging +import mimetypes +import os +import time +from argparse import ArgumentParser +from typing import List + +import cv2 +import json_tricks as json +import mmcv +import mmengine +import numpy as np +from mmengine.logging import print_log + +from mmpose.apis import inference_topdown, init_model +from mmpose.registry import VISUALIZERS +from mmpose.structures import (PoseDataSample, merge_data_samples, + split_instances) +from mmpose.utils import adapt_mmdet_pipeline +from mmpose.visualization import Pose3dLocalVisualizer +from rtmpose3d import * + +try: + from mmdet.apis import inference_detector, init_detector + has_mmdet = True +except (ImportError, ModuleNotFoundError): + has_mmdet = False + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument('det_config', help='Config file for detection') + parser.add_argument('det_checkpoint', help='Checkpoint file for detection') + parser.add_argument( + 'pose3d_estimator_config', + type=str, + default=None, + help='Config file for the 3D pose estimator') + parser.add_argument( + 'pose3d_estimator_checkpoint', + type=str, + default=None, + help='Checkpoint file for the 3D pose estimator') + parser.add_argument('--input', type=str, default='', help='Video path') + parser.add_argument( + '--show', + action='store_true', + default=False, + help='Whether to show visualizations') + parser.add_argument( + '--disable-rebase-keypoint', + action='store_true', + default=False, + help='Whether to disable rebasing the predicted 3D pose so its ' + 'lowest keypoint has a height of 0 (landing on the ground). Rebase ' + 'is useful for visualization when the model do not predict the ' + 'global position of the 3D pose.') + parser.add_argument( + '--disable-norm-pose-2d', + action='store_true', + default=False, + help='Whether to scale the bbox (along with the 2D pose) to the ' + 'average bbox scale of the dataset, and move the bbox (along with the ' + '2D pose) to the average bbox center of the dataset. This is useful ' + 'when bbox is small, especially in multi-person scenarios.') + parser.add_argument( + '--num-instances', + type=int, + default=1, + help='The number of 3D poses to be visualized in every frame. If ' + 'less than 0, it will be set to the number of pose results in the ' + 'first frame.') + parser.add_argument( + '--output-root', + type=str, + default='', + help='Root of the output video file. 
' + 'Default not saving the visualization video.') + parser.add_argument( + '--save-predictions', + action='store_true', + default=False, + help='Whether to save predicted results') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--det-cat-id', + type=int, + default=0, + help='Category id for bounding box detection model') + parser.add_argument( + '--bbox-thr', + type=float, + default=0.5, + help='Bounding box score threshold') + parser.add_argument('--kpt-thr', type=float, default=0.3) + parser.add_argument( + '--use-oks-tracking', action='store_true', help='Using OKS tracking') + parser.add_argument( + '--tracking-thr', type=float, default=0.3, help='Tracking threshold') + parser.add_argument( + '--show-interval', type=int, default=0, help='Sleep seconds per frame') + parser.add_argument( + '--thickness', + type=int, + default=1, + help='Link thickness for visualization') + parser.add_argument( + '--radius', + type=int, + default=3, + help='Keypoint radius for visualization') + parser.add_argument( + '--online', + action='store_true', + default=False, + help='Inference mode. If set to True, can not use future frame' + 'information when using multi frames for inference in the 2D pose' + 'detection stage. Default: False.') + + args = parser.parse_args() + return args + + +def process_one_image(args, detector, frame: np.ndarray, frame_idx: int, + pose_estimator: TopdownPoseEstimator3D, + pose_est_results_last: List[PoseDataSample], + pose_est_results_list: List[List[PoseDataSample]], + next_id: int, visualize_frame: np.ndarray, + visualizer: Pose3dLocalVisualizer): + """Visualize detected and predicted keypoints of one image. + + Pipeline of this function: + + frame + | + V + +-----------------+ + | detector | + +-----------------+ + | det_result + V + +-----------------+ + | pose_estimator | + +-----------------+ + | pose_est_results + V + +-----------------+ + | post-processing | + +-----------------+ + | pred_3d_data_samples + V + +------------+ + | visualizer | + +------------+ + + Args: + args (Argument): Custom command-line arguments. + detector (mmdet.BaseDetector): The mmdet detector. + frame (np.ndarray): The image frame read from input image or video. + frame_idx (int): The index of current frame. + pose_estimator (TopdownPoseEstimator): The pose estimator for 2d pose. + pose_est_results_last (list(PoseDataSample)): The results of pose + estimation from the last frame for tracking instances. + pose_est_results_list (list(list(PoseDataSample))): The list of all + pose estimation results converted by + ``convert_keypoint_definition`` from previous frames. In + pose-lifting stage it is used to obtain the 2d estimation sequence. + next_id (int): The next track id to be used. + pose_lifter (PoseLifter): The pose-lifter for estimating 3d pose. + visualize_frame (np.ndarray): The image for drawing the results on. + visualizer (Visualizer): The visualizer for visualizing the 2d and 3d + pose estimation results. + + Returns: + pose_est_results (list(PoseDataSample)): The pose estimation result of + the current frame. + pose_est_results_list (list(list(PoseDataSample))): The list of all + converted pose estimation results until the current frame. + pred_3d_instances (InstanceData): The result of pose-lifting. + Specifically, the predicted keypoints and scores are saved at + ``pred_3d_instances.keypoints`` and + ``pred_3d_instances.keypoint_scores``. + next_id (int): The next track id to be used. 
+ """ + # pose_dataset = pose_estimator.cfg.test_dataloader.dataset + pose_det_dataset_name = pose_estimator.dataset_meta['dataset_name'] + + # First stage: conduct 2D pose detection in a Topdown manner + # use detector to obtain person bounding boxes + det_result = inference_detector(detector, frame) + pred_instance = det_result.pred_instances.cpu().numpy() + + # filter out the person instances with category and bbox threshold + # e.g. 0 for person in COCO + bboxes = pred_instance.bboxes + bboxes = bboxes[np.logical_and(pred_instance.labels == args.det_cat_id, + pred_instance.scores > args.bbox_thr)] + + # estimate pose results for current image + pose_est_results = inference_topdown(pose_estimator, frame, bboxes) + + # post-processing + for idx, pose_est_result in enumerate(pose_est_results): + pose_est_result.track_id = pose_est_results[idx].get('track_id', 1e4) + + pred_instances = pose_est_result.pred_instances + keypoints = pred_instances.keypoints + keypoint_scores = pred_instances.keypoint_scores + if keypoint_scores.ndim == 3: + keypoint_scores = np.squeeze(keypoint_scores, axis=1) + pose_est_results[ + idx].pred_instances.keypoint_scores = keypoint_scores + if keypoints.ndim == 4: + keypoints = np.squeeze(keypoints, axis=1) + + keypoints = -keypoints[..., [0, 2, 1]] + # keypoints[..., 0] = -keypoints[..., 0] + # keypoints[..., 2] = -keypoints[..., 2] + + # rebase height (z-axis) + if not args.disable_rebase_keypoint: + keypoints[..., 2] -= np.min( + keypoints[..., 2], axis=-1, keepdims=True) + + pose_est_results[idx].pred_instances.keypoints = keypoints + + pose_est_results = sorted( + pose_est_results, key=lambda x: x.get('track_id', 1e4)) + + pred_3d_data_samples = merge_data_samples(pose_est_results) + pred_3d_instances = pred_3d_data_samples.get('pred_instances', None) + + if args.num_instances < 0: + args.num_instances = len(pose_est_results) + + # Visualization + if visualizer is not None: + visualizer.add_datasample( + 'result', + visualize_frame, + data_sample=pred_3d_data_samples, + det_data_sample=pred_3d_data_samples, + draw_gt=False, + draw_2d=True, + dataset_2d=pose_det_dataset_name, + dataset_3d=pose_det_dataset_name, + show=args.show, + draw_bbox=True, + kpt_thr=args.kpt_thr, + convert_keypoint=False, + axis_limit=400, + axis_azimuth=70, + axis_elev=15, + num_instances=args.num_instances, + wait_time=args.show_interval, + root_index=[11, 12]) + + return pose_est_results, pose_est_results_list, pred_3d_instances, next_id + + +def main(): + assert has_mmdet, 'Please install mmdet to run the demo.' 
+ + args = parse_args() + + assert args.show or (args.output_root != '') + assert args.input != '' + assert args.det_config is not None + assert args.det_checkpoint is not None + + detector = init_detector( + args.det_config, args.det_checkpoint, device=args.device.lower()) + detector.cfg = adapt_mmdet_pipeline(detector.cfg) + + pose_estimator = init_model( + args.pose3d_estimator_config, + args.pose3d_estimator_checkpoint, + device=args.device.lower()) + + det_kpt_color = pose_estimator.dataset_meta.get('keypoint_colors', None) + det_dataset_skeleton = pose_estimator.dataset_meta.get( + 'skeleton_links', None) + det_dataset_link_color = pose_estimator.dataset_meta.get( + 'skeleton_link_colors', None) + + pose_estimator.cfg.model.test_cfg.mode = 'simcc' + pose_estimator.cfg.visualizer.radius = args.radius + pose_estimator.cfg.visualizer.line_width = args.thickness + pose_estimator.cfg.visualizer.det_kpt_color = det_kpt_color + pose_estimator.cfg.visualizer.det_dataset_skeleton = det_dataset_skeleton + pose_estimator.cfg.visualizer.det_dataset_link_color = det_dataset_link_color # noqa: E501 + pose_estimator.cfg.visualizer.skeleton = det_dataset_skeleton + pose_estimator.cfg.visualizer.link_color = det_dataset_link_color + pose_estimator.cfg.visualizer.kpt_color = det_kpt_color + visualizer = VISUALIZERS.build(pose_estimator.cfg.visualizer) + + if args.input == 'webcam': + input_type = 'webcam' + else: + input_type = mimetypes.guess_type(args.input)[0].split('/')[0] + + if args.output_root == '': + save_output = False + else: + mmengine.mkdir_or_exist(args.output_root) + output_file = os.path.join(args.output_root, + os.path.basename(args.input)) + if args.input == 'webcam': + output_file += '.mp4' + save_output = True + + if args.save_predictions: + assert args.output_root != '' + args.pred_save_path = f'{args.output_root}/results_' \ + f'{os.path.splitext(os.path.basename(args.input))[0]}.json' + + if save_output: + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + + pose_est_results_list = [] + pred_instances_list = [] + if input_type == 'image': + frame = mmcv.imread(args.input, channel_order='rgb') + _, _, pred_3d_instances, _ = process_one_image( + args=args, + detector=detector, + frame=args.input, + frame_idx=0, + pose_estimator=pose_estimator, + pose_est_results_last=[], + pose_est_results_list=pose_est_results_list, + next_id=0, + visualize_frame=frame, + visualizer=visualizer) + + if args.save_predictions: + # save prediction results + pred_instances_list = split_instances(pred_3d_instances) + + if save_output: + frame_vis = visualizer.get_image() + mmcv.imwrite(mmcv.rgb2bgr(frame_vis), output_file) + + elif input_type in ['webcam', 'video']: + next_id = 0 + pose_est_results = [] + + if args.input == 'webcam': + video = cv2.VideoCapture(0) + else: + video = cv2.VideoCapture(args.input) + + (major_ver, minor_ver, subminor_ver) = (cv2.__version__).split('.') + if int(major_ver) < 3: + fps = video.get(cv2.cv.CV_CAP_PROP_FPS) + else: + fps = video.get(cv2.CAP_PROP_FPS) + + video_writer = None + frame_idx = 0 + + while video.isOpened(): + success, frame = video.read() + frame_idx += 1 + + if not success: + break + + pose_est_results_last = pose_est_results + + # First stage: 2D pose detection + # make person results for current image + (pose_est_results, pose_est_results_list, pred_3d_instances, + next_id) = process_one_image( + args=args, + detector=detector, + frame=frame, + frame_idx=frame_idx, + pose_estimator=pose_estimator, + pose_est_results_last=pose_est_results_last, + 
pose_est_results_list=pose_est_results_list, + next_id=next_id, + visualize_frame=mmcv.bgr2rgb(frame), + visualizer=visualizer) + + if args.save_predictions: + # save prediction results + pred_instances_list.append( + dict( + frame_id=frame_idx, + instances=split_instances(pred_3d_instances))) + + if save_output: + frame_vis = visualizer.get_image() + if video_writer is None: + # the size of the image with visualization may vary + # depending on the presence of heatmaps + video_writer = cv2.VideoWriter(output_file, fourcc, fps, + (frame_vis.shape[1], + frame_vis.shape[0])) + video_writer.write(mmcv.rgb2bgr(frame_vis)) + + if args.show: + # press ESC to exit + if cv2.waitKey(5) & 0xFF == 27: + break + time.sleep(args.show_interval) + + video.release() + + if video_writer: + video_writer.release() + else: + args.save_predictions = False + raise ValueError( + f'file {os.path.basename(args.input)} has invalid format.') + + if args.save_predictions: + with open(args.pred_save_path, 'w') as f: + json.dump( + dict( + meta_info=pose_estimator.dataset_meta, + instance_info=pred_instances_list), + f, + indent='\t') + print(f'predictions have been saved at {args.pred_save_path}') + + if save_output: + input_type = input_type.replace('webcam', 'video') + print_log( + f'the output {input_type} has been saved at {output_file}', + logger='current', + level=logging.INFO) + + +if __name__ == '__main__': + main() diff --git a/projects/rtmpose3d/configs/default_runtime.py b/projects/rtmpose3d/configs/default_runtime.py new file mode 100644 index 0000000000..6f27c0345a --- /dev/null +++ b/projects/rtmpose3d/configs/default_runtime.py @@ -0,0 +1,54 @@ +default_scope = 'mmpose' + +# hooks +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=50), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict(type='CheckpointHook', interval=10), + sampler_seed=dict(type='DistSamplerSeedHook'), + visualization=dict(type='PoseVisualizationHook', enable=False), + badcase=dict( + type='BadCaseAnalysisHook', + enable=False, + out_dir='badcase', + metric_type='loss', + badcase_thr=5)) + +# custom hooks +custom_hooks = [ + # Synchronize model buffers such as running_mean and running_var in BN + # at the end of each epoch + dict(type='SyncBuffersHook') +] + +# multi-processing backend +env_cfg = dict( + cudnn_benchmark=False, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl'), +) + +# visualizer +vis_backends = [ + dict(type='LocalVisBackend'), + # dict(type='TensorboardVisBackend'), + # dict(type='WandbVisBackend'), +] +visualizer = dict( + type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +# logger +log_processor = dict( + type='LogProcessor', window_size=50, by_epoch=True, num_digits=6) +log_level = 'INFO' +load_from = None +resume = False + +# file I/O backend +backend_args = dict(backend='local') + +# training/validation/testing progress +train_cfg = dict(by_epoch=True) +val_cfg = dict() +test_cfg = dict() diff --git a/projects/rtmpose3d/configs/rtmdet_m_640-8xb32_coco-person.py b/projects/rtmpose3d/configs/rtmdet_m_640-8xb32_coco-person.py new file mode 100644 index 0000000000..620de8dc8f --- /dev/null +++ b/projects/rtmpose3d/configs/rtmdet_m_640-8xb32_coco-person.py @@ -0,0 +1,20 @@ +_base_ = 'mmdet::rtmdet/rtmdet_m_8xb32-300e_coco.py' + +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth' # noqa + +model = 
dict( + backbone=dict( + init_cfg=dict( + type='Pretrained', prefix='backbone.', checkpoint=checkpoint)), + bbox_head=dict(num_classes=1), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) + +train_dataloader = dict(dataset=dict(metainfo=dict(classes=('person', )))) + +val_dataloader = dict(dataset=dict(metainfo=dict(classes=('person', )))) +test_dataloader = val_dataloader diff --git a/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py b/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py new file mode 100644 index 0000000000..832742788d --- /dev/null +++ b/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py @@ -0,0 +1,706 @@ +_base_ = ['./default_runtime.py'] + +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +# runtime +max_epochs = 270 +stage2_num_epochs = 10 +base_lr = 5e-4 +num_keypoints = 133 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=2024) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=4096) + +# codec settings +codec = dict( + type='SimCC3DLabel', + input_size=(288, 384, 288), + sigma=(6., 6.93, 6.), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False, + root_index=(11, 12)) + +# model settings +model = dict( + type='TopdownPoseEstimator3D', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1., + widen_factor=1., + channel_attention=True, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='checkpoints/rtmpose-l_simcc-ucoco_dw-ucoco_270e-256x192-4d6dfc62_20230728.pth' # noqa + )), + neck=dict( + type='CSPNeXtPAFPN', + in_channels=[256, 512, 1024], + out_channels=None, + out_indices=( + 1, + 2, + ), + num_csp_blocks=2, + expand_ratio=0.5, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU', inplace=True)), + head=dict( + type='RTMW3DHead', + in_channels=1024, + out_channels=133, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0.1, + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=[ + dict( + type='KLDiscretLoss2', + use_target_weight=True, + beta=10., + label_softmax=True), + dict( + type='BoneLoss', + joint_parents=[0, 1, 2, 3, 4, 5, 6, 5, 6, 7, 8, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 50, 50, 51, 52, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 3, 3, 
3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 91, 92, 93, 94, 91, 96, 97, 98, 91, 100, 101, 102, 91, 104, 105, 106, 91, 108, 109, 110, 8, 112, 113, 114, 113, 112, 117, 118, 117, 112, 121, 122, 123, 112, 125, 126, 127, 112, 129, 130, 131], + use_target_weight=True, + loss_weight=2.0 + ) + ], + decoder=codec), + # test_cfg=dict(flip_test=False, mode='2d') + test_cfg=dict(flip_test=False) +) + +# base dataset settings +data_mode = 'topdown' +dataset_type = 'H36MWholeBodyDataset' +backend_args = dict(backend='local') + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='RandomBackground', + bg_dir='/mnt/data/oss_beijing/mmseg/obj365v1_images', + bg_prob=0.5, + ), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=(288, 384)), + dict(type='YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=(288, 384)), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.5, 1.5], + rotate_factor=90), + dict(type='TopdownAffine', input_size=(288, 384)), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# h3wb dataset +h3wb_dataset = dict( + type='H36MWholeBodyDataset', + ann_file='annotation_body3d/h3wb_train_bbox.npz', + seq_len=1, + causal=True, + data_root='data/h36m/', + data_prefix=dict(img='images/'), + test_mode=False, + pipeline=[]) + + +# dna rendering dataset +dna_rendering_dataset = dict( + type='DNARenderingDataset', + data_root='data/dna_rendering_part1', + data_mode='topdown', + ann_file='instances.npz', + subset_frac=0.1, + pipeline=[ + dict(type='LoadMask', backend_args=backend_args) + ], +) + +# mapping + +aic_coco133 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7), (5, 9), (6, 12), + (7, 14), (8, 16), (9, 11), (10, 13), (11, 15)] + +crowdpose_coco133 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11), + (7, 12), (8, 13), (9, 14), (10, 15), (11, 16)] + +mpii_coco133 = [ + (0, 16), + (1, 14), + (2, 12), + (3, 11), + (4, 13), + (5, 15), + (10, 10), + (11, 8), + (12, 6), + (13, 5), + (14, 7), + (15, 9), +] + +jhmdb_coco133 = [ + (3, 6), + (4, 5), + (5, 12), + (6, 11), + (7, 8), + (8, 7), + (9, 14), + (10, 13), + (11, 10), + (12, 9), + (13, 16), + (14, 15), +] + +halpe_coco133 = [(i, i) + for i in range(17)] + [(20, 17), (21, 20), (22, 18), (23, 21), + (24, 19), + (25, 22)] + [(i, i - 3) + for i in range(26, 136)] + +posetrack_coco133 = [ + (0, 0), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + 
(13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +humanart_coco133 = [(i, i) for i in range(17)] + [(17, 99), (18, 120), + (19, 17), (20, 20)] + +data_mode = 'topdown' +data_root = 'data/' + +# train datasets +dataset_coco = dict( + type='CocoWholeBodyDataset', + data_root='data/coco/', + data_mode='topdown', + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=[], +) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=aic_coco133) + ], +) + +dataset_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=crowdpose_coco133) + ], +) + +dataset_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_train.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=mpii_coco133) + ], +) + +dataset_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_train.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=jhmdb_coco133) + ], +) + +dataset_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_train_v1.json', + data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=halpe_coco133) + ], +) + +dataset_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_train.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=posetrack_coco133) + ], +) + +dataset_humanart = dict( + type='HumanArt21Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='HumanArt/annotations/training_humanart.json', + filter_cfg=dict(scenes=['real_human']), + data_prefix=dict(img='pose/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=humanart_coco133) + ]) + +face_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale', padding=1.25), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[1.5, 2.0], + rotate_factor=0), +] + +wflw_coco133 = [(i * 2, 23 + i) + for i in range(17)] + [(33 + i, 40 + i) for i in range(5)] + [ + (42 + i, 45 + i) for i in range(5) + ] + [(51 + i, 50 + i) + for i in range(9)] + [(60, 59), (61, 60), (63, 61), + (64, 62), (65, 63), (67, 64), + (68, 65), (69, 66), (71, 67), + (72, 68), (73, 69), + (75, 70)] + [(76 + i, 71 + i) + for i in range(20)] +dataset_wflw = dict( + type='WFLWDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='wflw/annotations/face_landmarks_wflw_train.json', + data_prefix=dict(img='pose/WFLW/images/'), + pipeline=[ + dict( + 
type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=wflw_coco133), *face_pipeline + ], +) + +mapping_300w_coco133 = [(i, 23 + i) for i in range(68)] +dataset_300w = dict( + type='Face300WDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='300w/annotations/face_landmarks_300w_train.json', + data_prefix=dict(img='pose/300w/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=mapping_300w_coco133), *face_pipeline + ], +) + +cofw_coco133 = [(0, 40), (2, 44), (4, 42), (1, 49), (3, 45), (6, 47), (8, 59), + (10, 62), (9, 68), (11, 65), (18, 54), (19, 58), (20, 53), + (21, 56), (22, 71), (23, 77), (24, 74), (25, 85), (26, 89), + (27, 80), (28, 31)] +dataset_cofw = dict( + type='COFWDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='cofw/annotations/cofw_train.json', + data_prefix=dict(img='pose/COFW/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=cofw_coco133), *face_pipeline + ], +) + +lapa_coco133 = [(i * 2, 23 + i) for i in range(17)] + [ + (33 + i, 40 + i) for i in range(5) +] + [(42 + i, 45 + i) for i in range(5)] + [ + (51 + i, 50 + i) for i in range(4) +] + [(58 + i, 54 + i) for i in range(5)] + [(66, 59), (67, 60), (69, 61), + (70, 62), (71, 63), (73, 64), + (75, 65), (76, 66), (78, 67), + (79, 68), (80, 69), + (82, 70)] + [(84 + i, 71 + i) + for i in range(20)] +dataset_lapa = dict( + type='LapaDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='LaPa/annotations/lapa_trainval.json', + data_prefix=dict(img='pose/LaPa/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=lapa_coco133), *face_pipeline + ], +) + +dataset_wb = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=[ + dataset_coco, + dataset_halpe + ], + pipeline=[], + test_mode=False, +) + +dataset_body = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=[ + dataset_aic, + dataset_crowdpose, + dataset_mpii, + dataset_jhmdb, + dataset_posetrack, + # dataset_humanart, + ], + pipeline=[], + test_mode=False, +) + +dataset_face = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=[ + dataset_wflw, + dataset_300w, + dataset_cofw, + dataset_lapa, + ], + pipeline=[], + test_mode=False, +) + +hand_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[1.5, 2.0], + rotate_factor=0), +] + +interhand_left = [(21, 95), (22, 94), (23, 93), (24, 92), (25, 99), (26, 98), + (27, 97), (28, 96), (29, 103), (30, 102), (31, 101), + (32, 100), (33, 107), (34, 106), (35, 105), (36, 104), + (37, 111), (38, 110), (39, 109), (40, 108), (41, 91)] +interhand_right = [(i - 21, j + 21) for i, j in interhand_left] +interhand_coco133 = interhand_right + interhand_left + +dataset_interhand2d = dict( + type='InterHand2DDoubleDataset', + data_root='data/interhand2.6m/', + data_mode='topdown', + ann_file='annotations/all/InterHand2.6M_train_data.json', + camera_param_file='annotations/all/InterHand2.6M_train_camera.json', + joint_file='annotations/all/InterHand2.6M_train_joint_3d.json', + data_prefix=dict(img='images/train/'), + sample_interval=10, + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=interhand_coco133, + 
), *hand_pipeline + ], +) + +dataset_interhand3d = dict( + type='InterHand3DDataset', + data_root='data/interhand2.6m/', + data_mode='topdown', + ann_file='annotations/all/InterHand2.6M_train_data.json', + camera_param_file='annotations/all/InterHand2.6M_train_camera.json', + joint_file='annotations/all/InterHand2.6M_train_joint_3d.json', + use_gt_root_depth=True, + rootnet_result_file=None, + data_prefix=dict(img='images/train/'), + sample_interval=10, + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=interhand_coco133, + ), *hand_pipeline + ], +) + +dataset_hand = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=[dataset_interhand3d], + pipeline=[], + test_mode=False, +) + + +# ubody dataset +scenes = [ + 'Magic_show', + 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', + 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', + 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' +] +ubody_datasets = [] +for scene in scenes: + train_ann = f'annotations/{scene}/train_3dkeypoint_annotation.json' + ubody = dict( + type='UBody3dDataset', + data_root='data/UBody/', + ann_file=train_ann, + data_mode='topdown', + causal=True, + seq_len=1, + data_prefix=dict(img='images/'), + subset_frac=0.1, + pipeline=[]) + ubody_datasets.append(ubody) + + +train_datasets = [ + dataset_wb, + dataset_body, + dataset_face, + # dataset_hand, + *ubody_datasets, + h3wb_dataset, + # dna_rendering_dataset +] + + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + datasets=train_datasets, + pipeline=train_pipeline, + metainfo=dict(from_file='configs/_base_/datasets/h3wb.py'), + test_mode=False)) + +# hooks +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + save_best='MPJPE', + rule='less', + max_keep_ckpts=1)) + +# hooks +# default_hooks = dict( +# checkpoint=dict( +# save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# eval h3wb +# val_dataloader = dict( +# batch_size=64, +# num_workers=10, +# persistent_workers=True, +# drop_last=False, +# sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), +# dataset=dict( +# type='H36MWholeBodyDataset', +# ann_file='annotation_body3d/h3wb_train_bbox.npz', +# seq_len=1, +# causal=True, +# data_root='data/h36m/', +# data_prefix=dict(img='images/'), +# test_mode=True, +# pipeline=val_pipeline)) +# test_dataloader = val_dataloader + +# # evaluators +# val_evaluator = [ +# dict(type='SimpleMPJPE', mode='mpjpe'), +# dict(type='SimpleMPJPE', mode='p-mpjpe') +# ] +# test_evaluator = val_evaluator + +# eval coco +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CocoWholeBodyDataset', + data_root='data/coco/', + data_mode='topdown', + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + bbox_file='data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json', + pipeline=val_pipeline, + 
)) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file='data/coco/' + 'annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/projects/rtmpose3d/configs/rtmw3d-x_8xb64_cocktail14-384x288.py b/projects/rtmpose3d/configs/rtmw3d-x_8xb64_cocktail14-384x288.py new file mode 100644 index 0000000000..3a822f50b8 --- /dev/null +++ b/projects/rtmpose3d/configs/rtmw3d-x_8xb64_cocktail14-384x288.py @@ -0,0 +1,706 @@ +_base_ = ['../../_base_/default_runtime.py'] + +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + type='Pose3dLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +# runtime +max_epochs = 270 +stage2_num_epochs = 10 +base_lr = 5e-4 +num_keypoints = 133 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=2024) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=4096) + +# codec settings +codec = dict( + type='SimCC3DLabel', + input_size=(288, 384, 288), + sigma=(6., 6.93, 6.), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False, + root_index=(11, 12)) + +# model settings +model = dict( + type='TopdownPoseEstimator3D', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1.33, + widen_factor=1.25, + channel_attention=True, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='checkpoints/rtmpose-x_simcc-ucoco_pt-aic-coco_270e-384x288-f5b50679_20230822.pth' # noqa + )), + neck=dict( + type='CSPNeXtPAFPN', + in_channels=[320, 640, 1280], + out_channels=None, + out_indices=( + 1, + 2, + ), + num_csp_blocks=2, + expand_ratio=0.5, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU', inplace=True)), + head=dict( + type='RTMW3DHead', + in_channels=1280, + out_channels=133, + input_size=codec['input_size'], + in_featuremap_size=tuple([s // 32 for s in codec['input_size']]), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=[ + dict( + type='KLDiscretLoss2', + use_target_weight=True, + beta=10., + label_softmax=True), + dict( + type='BoneLoss', + joint_parents=[0, 1, 2, 3, 4, 5, 6, 5, 6, 7, 8, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 50, 50, 51, 52, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 91, 92, 93, 94, 91, 96, 97, 98, 91, 100, 101, 102, 91, 104, 105, 106, 91, 108, 109, 110, 8, 112, 113, 114, 113, 112, 117, 118, 117, 112, 121, 122, 123, 112, 
125, 126, 127, 112, 129, 130, 131], + use_target_weight=True, + loss_weight=2.0 + ) + ], + decoder=codec), + test_cfg=dict(flip_test=False, mode='2d') + # test_cfg=dict(flip_test=False) +) + +# base dataset settings +data_mode = 'topdown' + +backend_args = dict(backend='local') + +# pipelines +train_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='RandomBackground', + bg_dir='/mnt/data/oss_beijing/mmseg/obj365v1_images', + bg_prob=0.5, + ), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=(288, 384)), + dict(type='YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=(288, 384)), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +train_pipeline_stage2 = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.5, 1.5], + rotate_factor=90), + dict(type='TopdownAffine', input_size=(288, 384)), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# h3wb dataset +h3wb_dataset = dict( + type='H36MWholeBodyDataset', + ann_file='annotation_body3d/h3wb_train_bbox.npz', + seq_len=1, + causal=True, + data_root='data/h36m/', + data_prefix=dict(img='images/'), + test_mode=False, + pipeline=[]) + + +# dna rendering dataset +dna_rendering_dataset = dict( + type='DNARenderingDataset', + data_root='data/dna_rendering_part1', + data_mode='topdown', + ann_file='instances.npz', + subset_frac=0.1, + pipeline=[ + dict(type='LoadMask', backend_args=backend_args) + ], +) + +# mapping + +aic_coco133 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7), (5, 9), (6, 12), + (7, 14), (8, 16), (9, 11), (10, 13), (11, 15)] + +crowdpose_coco133 = [(0, 5), (1, 6), (2, 7), (3, 8), (4, 9), (5, 10), (6, 11), + (7, 12), (8, 13), (9, 14), (10, 15), (11, 16)] + +mpii_coco133 = [ + (0, 16), + (1, 14), + (2, 12), + (3, 11), + (4, 13), + (5, 15), + (10, 10), + (11, 8), + (12, 6), + (13, 5), + (14, 7), + (15, 9), +] + +jhmdb_coco133 = [ + (3, 6), + (4, 5), + (5, 12), + (6, 11), + (7, 8), + (8, 7), + (9, 14), + (10, 13), + (11, 10), + (12, 9), + (13, 16), + (14, 15), +] + +halpe_coco133 = [(i, i) + for i in range(17)] + [(20, 17), (21, 20), (22, 18), (23, 21), + (24, 19), + (25, 22)] + [(i, i - 3) + for i in range(26, 136)] + +posetrack_coco133 = [ + (0, 0), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +humanart_coco133 = [(i, i) for i in range(17)] + [(17, 99), (18, 120), + (19, 17), (20, 20)] + +data_mode = 'topdown' +data_root = 'data/' + +# train datasets +dataset_coco = dict( + type='CocoWholeBodyDataset', + 
data_root='data/coco/', + data_mode='topdown', + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=[], +) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=aic_coco133) + ], +) + +dataset_crowdpose = dict( + type='CrowdPoseDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=crowdpose_coco133) + ], +) + +dataset_mpii = dict( + type='MpiiDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='mpii/annotations/mpii_train.json', + data_prefix=dict(img='pose/MPI/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=mpii_coco133) + ], +) + +dataset_jhmdb = dict( + type='JhmdbDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='jhmdb/annotations/Sub1_train.json', + data_prefix=dict(img='pose/JHMDB/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=jhmdb_coco133) + ], +) + +dataset_halpe = dict( + type='HalpeDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='halpe/annotations/halpe_train_v1.json', + data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=halpe_coco133) + ], +) + +dataset_posetrack = dict( + type='PoseTrack18Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='posetrack18/annotations/posetrack18_train.json', + data_prefix=dict(img='pose/PoseChallenge2018/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=posetrack_coco133) + ], +) + +dataset_humanart = dict( + type='HumanArt21Dataset', + data_root=data_root, + data_mode=data_mode, + ann_file='HumanArt/annotations/training_humanart.json', + filter_cfg=dict(scenes=['real_human']), + data_prefix=dict(img='pose/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=humanart_coco133) + ]) + +face_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale', padding=1.25), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[1.5, 2.0], + rotate_factor=0), +] + +wflw_coco133 = [(i * 2, 23 + i) + for i in range(17)] + [(33 + i, 40 + i) for i in range(5)] + [ + (42 + i, 45 + i) for i in range(5) + ] + [(51 + i, 50 + i) + for i in range(9)] + [(60, 59), (61, 60), (63, 61), + (64, 62), (65, 63), (67, 64), + (68, 65), (69, 66), (71, 67), + (72, 68), (73, 69), + (75, 70)] + [(76 + i, 71 + i) + for i in range(20)] +dataset_wflw = dict( + type='WFLWDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='wflw/annotations/face_landmarks_wflw_train.json', + data_prefix=dict(img='pose/WFLW/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=wflw_coco133), *face_pipeline + ], +) + +mapping_300w_coco133 = [(i, 23 + i) for i in range(68)] +dataset_300w = dict( + type='Face300WDataset', + data_root=data_root, + data_mode=data_mode, + 
ann_file='300w/annotations/face_landmarks_300w_train.json', + data_prefix=dict(img='pose/300w/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=mapping_300w_coco133), *face_pipeline + ], +) + +cofw_coco133 = [(0, 40), (2, 44), (4, 42), (1, 49), (3, 45), (6, 47), (8, 59), + (10, 62), (9, 68), (11, 65), (18, 54), (19, 58), (20, 53), + (21, 56), (22, 71), (23, 77), (24, 74), (25, 85), (26, 89), + (27, 80), (28, 31)] +dataset_cofw = dict( + type='COFWDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='cofw/annotations/cofw_train.json', + data_prefix=dict(img='pose/COFW/images/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=cofw_coco133), *face_pipeline + ], +) + +lapa_coco133 = [(i * 2, 23 + i) for i in range(17)] + [ + (33 + i, 40 + i) for i in range(5) +] + [(42 + i, 45 + i) for i in range(5)] + [ + (51 + i, 50 + i) for i in range(4) +] + [(58 + i, 54 + i) for i in range(5)] + [(66, 59), (67, 60), (69, 61), + (70, 62), (71, 63), (73, 64), + (75, 65), (76, 66), (78, 67), + (79, 68), (80, 69), + (82, 70)] + [(84 + i, 71 + i) + for i in range(20)] +dataset_lapa = dict( + type='LapaDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='LaPa/annotations/lapa_trainval.json', + data_prefix=dict(img='pose/LaPa/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=lapa_coco133), *face_pipeline + ], +) + +dataset_wb = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=[ + dataset_coco, + dataset_halpe + ], + pipeline=[], + test_mode=False, +) + +dataset_body = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=[ + dataset_aic, + dataset_crowdpose, + dataset_mpii, + dataset_jhmdb, + dataset_posetrack, + # dataset_humanart, + ], + pipeline=[], + test_mode=False, +) + +dataset_face = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=[ + dataset_wflw, + dataset_300w, + dataset_cofw, + dataset_lapa, + ], + pipeline=[], + test_mode=False, +) + +hand_pipeline = [ + dict(type='LoadImage', backend_args=backend_args), + dict(type='GetBBoxCenterScale'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[1.5, 2.0], + rotate_factor=0), +] + +interhand_left = [(21, 95), (22, 94), (23, 93), (24, 92), (25, 99), (26, 98), + (27, 97), (28, 96), (29, 103), (30, 102), (31, 101), + (32, 100), (33, 107), (34, 106), (35, 105), (36, 104), + (37, 111), (38, 110), (39, 109), (40, 108), (41, 91)] +interhand_right = [(i - 21, j + 21) for i, j in interhand_left] +interhand_coco133 = interhand_right + interhand_left + +dataset_interhand2d = dict( + type='InterHand2DDoubleDataset', + data_root='data/interhand2.6m/', + data_mode='topdown', + ann_file='annotations/all/InterHand2.6M_train_data.json', + camera_param_file='annotations/all/InterHand2.6M_train_camera.json', + joint_file='annotations/all/InterHand2.6M_train_joint_3d.json', + data_prefix=dict(img='images/train/'), + sample_interval=10, + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=interhand_coco133, + ), *hand_pipeline + ], +) + +dataset_interhand3d = dict( + type='InterHand3DDataset', + data_root='data/interhand2.6m/', + data_mode='topdown', + ann_file='annotations/all/InterHand2.6M_train_data.json', + 
camera_param_file='annotations/all/InterHand2.6M_train_camera.json', + joint_file='annotations/all/InterHand2.6M_train_joint_3d.json', + use_gt_root_depth=True, + rootnet_result_file=None, + data_prefix=dict(img='images/train/'), + sample_interval=10, + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=num_keypoints, + mapping=interhand_coco133, + ), *hand_pipeline + ], +) + +dataset_hand = dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + datasets=[dataset_interhand3d], + pipeline=[], + test_mode=False, +) + + +# ubody dataset +scenes = [ + 'Magic_show', + 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', + 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', + 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' +] +ubody_datasets = [] +for scene in scenes: + train_ann = f'annotations/{scene}/train_3dkeypoint_annotation.json' + ubody = dict( + type='UBody3dDataset', + data_root='data/UBody/', + ann_file=train_ann, + data_mode='topdown', + causal=True, + seq_len=1, + data_prefix=dict(img='images/'), + subset_frac=0.1, + pipeline=[]) + ubody_datasets.append(ubody) + + +train_datasets = [ + dataset_wb, + dataset_body, + dataset_face, + dataset_hand, + *ubody_datasets, + h3wb_dataset, + # dna_rendering_dataset +] + + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + datasets=train_datasets, + pipeline=train_pipeline, + metainfo=dict(from_file='configs/_base_/datasets/h3wb.py'), + test_mode=False)) + +# hooks +# default_hooks = dict( +# checkpoint=dict( +# type='CheckpointHook', +# save_best='MPJPE', +# rule='less', +# max_keep_ckpts=1)) + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# eval h3wb +# val_dataloader = dict( +# batch_size=64, +# num_workers=10, +# persistent_workers=True, +# drop_last=False, +# sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), +# dataset=dict( +# type='H36MWholeBodyDataset', +# ann_file='annotation_body3d/h3wb_train_bbox.npz', +# seq_len=1, +# causal=True, +# data_root='data/h36m/', +# data_prefix=dict(img='images/'), +# test_mode=True, +# pipeline=val_pipeline)) +# test_dataloader = val_dataloader + +# # evaluators +# val_evaluator = [ +# dict(type='SimpleMPJPE', mode='mpjpe'), +# dict(type='SimpleMPJPE', mode='p-mpjpe') +# ] +# test_evaluator = val_evaluator + +# eval coco +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='CocoWholeBodyDataset', + data_root='data/coco/', + data_mode='topdown', + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + bbox_file='data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json', + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file='data/coco/' + 'annotations/coco_wholebody_val_v1.0.json') +test_evaluator = 
val_evaluator diff --git a/projects/rtmpose3d/rtmpose3d/__init__.py b/projects/rtmpose3d/rtmpose3d/__init__.py new file mode 100644 index 0000000000..eec926b2c8 --- /dev/null +++ b/projects/rtmpose3d/rtmpose3d/__init__.py @@ -0,0 +1,6 @@ +from .pose_estimator import TopdownPoseEstimator3D +from .rtmw3d_head import RTMW3DHead +from .simcc_3d_label import SimCC3DLabel +from .loss import KLDiscretLoss2 + +__all__ = ['TopdownPoseEstimator3D', 'RTMW3DHead', 'SimCC3DLabel', 'KLDiscretLoss2'] diff --git a/projects/rtmpose3d/rtmpose3d/loss.py b/projects/rtmpose3d/rtmpose3d/loss.py new file mode 100644 index 0000000000..499befa5a0 --- /dev/null +++ b/projects/rtmpose3d/rtmpose3d/loss.py @@ -0,0 +1,37 @@ +from mmpose.registry import MODELS +from mmpose.models.losses import KLDiscretLoss + +@MODELS.register_module() +class KLDiscretLoss2(KLDiscretLoss): + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self._loss_name = 'loss_kld' + + def forward(self, pred_simcc, gt_simcc, target_weight): + N, K, _ = pred_simcc[0].shape + loss = 0 + + for pred, target, weight in zip(pred_simcc, gt_simcc, target_weight): + pred = pred.reshape(-1, pred.size(-1)) + target = target.reshape(-1, target.size(-1)) + weight = weight.reshape(-1) + + t_loss = self.criterion(pred, target).mul(weight) + + if self.mask is not None: + t_loss = t_loss.reshape(N, K) + t_loss[:, self.mask] = t_loss[:, self.mask] * self.mask_weight + + loss = loss + t_loss.sum() + + return loss / K + + @property + def loss_name(self): + """Loss Name. + + Returns: + str: The name of this loss item. + """ + return self._loss_name \ No newline at end of file diff --git a/projects/rtmpose3d/rtmpose3d/pose_estimator.py b/projects/rtmpose3d/rtmpose3d/pose_estimator.py new file mode 100644 index 0000000000..6854205b4b --- /dev/null +++ b/projects/rtmpose3d/rtmpose3d/pose_estimator.py @@ -0,0 +1,116 @@ +from itertools import zip_longest +from typing import Optional + +import numpy as np + +from mmpose.utils.typing import InstanceList, PixelDataList, SampleList +from mmpose.registry import MODELS +from mmpose.models.pose_estimators import TopdownPoseEstimator + +@MODELS.register_module() +class TopdownPoseEstimator3D(TopdownPoseEstimator): + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.camera_param = { + 'c': [512.54150496, 515.45148698], + 'f': [1145.04940459, 1143.78109572], + } + + def add_pred_to_datasample(self, batch_pred_instances: InstanceList, + batch_pred_fields: Optional[PixelDataList], + batch_data_samples: SampleList) -> SampleList: + """Add predictions into data samples. + + Args: + batch_pred_instances (List[InstanceData]): The predicted instances + of the input data batch + batch_pred_fields (List[PixelData], optional): The predicted + fields (e.g. heatmaps) of the input batch + batch_data_samples (List[PoseDataSample]): The input data batch + + Returns: + List[PoseDataSample]: A list of data samples where the predictions + are stored in the ``pred_instances`` field of each data sample. 
+ """ + assert len(batch_pred_instances) == len(batch_data_samples) + if batch_pred_fields is None: + batch_pred_fields = [] + output_keypoint_indices = self.test_cfg.get('output_keypoint_indices', + None) + mode = self.test_cfg.get('mode', '3d') + assert mode in ['2d', '3d', 'vis', 'simcc'] + for pred_instances, pred_fields, data_sample in zip_longest( + batch_pred_instances, batch_pred_fields, batch_data_samples): + + gt_instances = data_sample.gt_instances + + # convert keypoint coordinates from input space to image space + input_center = data_sample.metainfo['input_center'] + input_scale = data_sample.metainfo['input_scale'] + input_size = data_sample.metainfo['input_size'] + keypoints_3d = pred_instances.keypoints + keypoints_2d = pred_instances.keypoints_2d + keypoints_simcc = pred_instances.keypoints_simcc + keypoints_2d = keypoints_2d / input_size * input_scale \ + + input_center - 0.5 * input_scale + + if gt_instances.get('camera_params', None) is not None: + camera_params = gt_instances.camera_params[0] + f = np.array(camera_params['f']) + c = np.array(camera_params['c']) + else: + f = np.array([1145.04940459, 1143.78109572]) + c = np.array(data_sample.ori_shape) + + kpts_pixel = np.concatenate([ + keypoints_2d, + (keypoints_3d[..., 2] + gt_instances.root_z)[..., None] + ], + axis=-1) + kpts_cam = kpts_pixel.copy() + kpts_cam[..., :2] = (kpts_pixel[..., :2] - c) / f * kpts_pixel[..., + 2:] + if mode == '3d': + pred_instances.keypoints = kpts_cam + pred_instances.transformed_keypoints = keypoints_2d + elif mode == 'vis': + pred_instances.keypoints = keypoints_3d + pred_instances.transformed_keypoints = keypoints_2d + elif mode == 'simcc': + pred_instances.keypoints = keypoints_simcc + pred_instances.transformed_keypoints = keypoints_2d + else: + pred_instances.keypoints = keypoints_2d + pred_instances.transformed_keypoints = keypoints_2d + + if 'keypoints_visible' not in pred_instances: + pred_instances.keypoints_visible = \ + pred_instances.keypoint_scores + + if output_keypoint_indices is not None: + # select output keypoints with given indices + num_keypoints = pred_instances.keypoints.shape[1] + for key, value in pred_instances.all_items(): + if key.startswith('keypoint'): + pred_instances.set_field( + value[:, output_keypoint_indices], key) + + # add bbox information into pred_instances + pred_instances.bboxes = gt_instances.bboxes + pred_instances.bbox_scores = gt_instances.bbox_scores + + data_sample.pred_instances = pred_instances + + if pred_fields is not None: + if output_keypoint_indices is not None: + # select output heatmap channels with keypoint indices + # when the number of heatmap channel matches num_keypoints + for key, value in pred_fields.all_items(): + if value.shape[0] != num_keypoints: + continue + pred_fields.set_field(value[output_keypoint_indices], + key) + data_sample.pred_fields = pred_fields + + return batch_data_samples diff --git a/projects/rtmpose3d/rtmpose3d/rtmw3d_head.py b/projects/rtmpose3d/rtmpose3d/rtmw3d_head.py new file mode 100644 index 0000000000..bbf6bd2b48 --- /dev/null +++ b/projects/rtmpose3d/rtmpose3d/rtmw3d_head.py @@ -0,0 +1,444 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Optional, Sequence, Tuple, Union + +import numpy as np +import torch +from mmcv.cnn import ConvModule +from mmengine.structures import InstanceData +from torch import Tensor, nn + +from mmpose.codecs.utils import get_simcc_maximum as get_2d_simcc_maximum +from mmpose.evaluation.functional import keypoint_mpjpe +from mmpose.models.utils.rtmcc_block import RTMCCBlock, ScaleNorm +from mmpose.registry import KEYPOINT_CODECS, MODELS +from mmpose.utils.tensor_utils import to_numpy +from mmpose.utils.typing import (ConfigType, InstanceList, OptConfigType, + OptSampleList) +from mmpose.models.heads import BaseHead +from .utils import get_simcc_maximum + +OptIntSeq = Optional[Sequence[int]] + + +@MODELS.register_module() +class RTMW3DHead(BaseHead): + """Top-down head introduced in RTMPose-Wholebody (2023). + + Args: + in_channels (int | sequence[int]): Number of channels in the input + feature map. + out_channels (int): Number of channels in the output heatmap. + input_size (tuple): Size of input image in shape [w, h]. + in_featuremap_size (int | sequence[int]): Size of input feature map. + simcc_split_ratio (float): Split ratio of pixels. + Default: 2.0. + final_layer_kernel_size (int): Kernel size of the convolutional layer. + Default: 1. + gau_cfg (Config): Config dict for the Gated Attention Unit. + Default: dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='ReLU', + use_rel_bias=False, + pos_enc=False). + loss (Config): Config of the keypoint loss. Defaults to use + :class:`KLDiscretLoss` + decoder (Config, optional): The decoder config that controls decoding + keypoint coordinates from the network output. Defaults to ``None`` + init_cfg (Config, optional): Config to control the initialization. 
See + :attr:`default_init_cfg` for default settings + """ + + def __init__( + self, + in_channels: Union[int, Sequence[int]], + out_channels: int, + input_size: Tuple[int, int], + in_featuremap_size: Tuple[int, int], + simcc_split_ratio: float = 2.0, + final_layer_kernel_size: int = 1, + gau_cfg: ConfigType = dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='ReLU', + use_rel_bias=False, + pos_enc=False), + loss: ConfigType = dict(type='KLDiscretLoss', use_target_weight=True), + decoder: OptConfigType = None, + init_cfg: OptConfigType = None, + ): + + if init_cfg is None: + init_cfg = self.default_init_cfg + + super().__init__(init_cfg) + + self.in_channels = in_channels + self.out_channels = out_channels + self.input_size = input_size + self.in_featuremap_size = in_featuremap_size + self.simcc_split_ratio = simcc_split_ratio + + self.loss_module = nn.ModuleList() + if isinstance(loss, dict): + self.loss_module.append(MODELS.build(loss)) + elif isinstance(loss, (list, tuple)): + for cfg in loss: + self.loss_module.append(MODELS.build(cfg)) + else: + raise TypeError(f'loss_decode must be a dict or sequence of dict,\ + but got {type(loss)}') + + if decoder is not None: + self.decoder = KEYPOINT_CODECS.build(decoder) + else: + self.decoder = None + + if isinstance(in_channels, (tuple, list)): + raise ValueError( + f'{self.__class__.__name__} does not support selecting ' + 'multiple input features.') + + # Define SimCC layers + flatten_dims = self.in_featuremap_size[0] * self.in_featuremap_size[1] + + ps = 2 + self.ps = nn.PixelShuffle(ps) + self.conv_dec = ConvModule( + in_channels // ps**2, + in_channels // 4, + kernel_size=final_layer_kernel_size, + stride=1, + padding=final_layer_kernel_size // 2, + norm_cfg=dict(type='BN', requires_grad=True), + act_cfg=dict(type='ReLU')) + + self.final_layer = ConvModule( + in_channels, + out_channels, + kernel_size=final_layer_kernel_size, + stride=1, + padding=final_layer_kernel_size // 2, + norm_cfg=dict(type='BN', requires_grad=True), + act_cfg=dict(type='ReLU')) + self.final_layer2 = ConvModule( + in_channels // ps + in_channels // 4, + out_channels, + kernel_size=final_layer_kernel_size, + stride=1, + padding=final_layer_kernel_size // 2, + norm_cfg=dict(type='BN', requires_grad=True), + act_cfg=dict(type='ReLU')) + + self.mlp = nn.Sequential( + ScaleNorm(flatten_dims), + nn.Linear(flatten_dims, gau_cfg['hidden_dims'] // 2, bias=False)) + + self.mlp2 = nn.Sequential( + ScaleNorm(flatten_dims * ps**2), + nn.Linear( + flatten_dims * ps**2, gau_cfg['hidden_dims'] // 2, bias=False)) + + W = int(self.input_size[0] * self.simcc_split_ratio) + H = int(self.input_size[1] * self.simcc_split_ratio) + D = int(self.input_size[2] * self.simcc_split_ratio) + + self.gau = RTMCCBlock( + self.out_channels, + gau_cfg['hidden_dims'], + gau_cfg['hidden_dims'], + s=gau_cfg['s'], + expansion_factor=gau_cfg['expansion_factor'], + dropout_rate=gau_cfg['dropout_rate'], + drop_path=gau_cfg['drop_path'], + attn_type='self-attn', + act_fn=gau_cfg['act_fn'], + use_rel_bias=gau_cfg['use_rel_bias'], + pos_enc=gau_cfg['pos_enc']) + + self.cls_x = nn.Linear(gau_cfg['hidden_dims'], W, bias=False) + self.cls_y = nn.Linear(gau_cfg['hidden_dims'], H, bias=False) + self.cls_z = nn.Linear(gau_cfg['hidden_dims'], D, bias=False) + + def forward(self, feats: Tuple[Tensor, + Tensor]) -> Tuple[Tensor, Tensor, Tensor]: + """Forward the network. + + The input is the feature map extracted by backbone and the + output is the simcc representation. 
+ + Args: + feats (Tuple[Tensor]): Multi scale feature maps. + + Returns: + pred_x (Tensor): 1d representation of x. + pred_y (Tensor): 1d representation of y. + """ + # enc_b n / 2, h, w + # enc_t n, h, w + enc_b, enc_t = feats + + feats_t = self.final_layer(enc_t) + feats_t = torch.flatten(feats_t, 2) + feats_t = self.mlp(feats_t) + + dec_t = self.ps(enc_t) + dec_t = self.conv_dec(dec_t) + enc_b = torch.cat([dec_t, enc_b], dim=1) + + feats_b = self.final_layer2(enc_b) + feats_b = torch.flatten(feats_b, 2) + feats_b = self.mlp2(feats_b) + + feats = torch.cat([feats_t, feats_b], dim=2) + + feats = self.gau(feats) + + pred_x = self.cls_x(feats) + pred_y = self.cls_y(feats) + pred_z = self.cls_z(feats) + + return pred_x, pred_y, pred_z + + def decode(self, batch_outputs: Union[Tensor, + Tuple[Tensor]]) -> InstanceList: + """Decode keypoints from outputs. + + Args: + batch_outputs (Tensor | Tuple[Tensor]): The network outputs of + a data batch + + Returns: + List[InstanceData]: A list of InstanceData, each contains the + decoded pose information of the instances of one data sample. + """ + + def _pack_and_call(args, func): + if not isinstance(args, tuple): + args = (args, ) + return func(*args) + + if self.decoder is None: + raise RuntimeError( + f'The decoder has not been set in {self.__class__.__name__}. ' + 'Please set the decoder configs in the init parameters to ' + 'enable head methods `head.predict()` and `head.decode()`') + + batch_output_np = to_numpy(batch_outputs, unzip=True) + batch_keypoints = [] + batch_keypoints2d = [] + batch_keypoints_simcc = [] + batch_scores = [] + for outputs in batch_output_np: + keypoints_2d, keypoints, keypoints_simcc, scores = _pack_and_call( + outputs, self.decoder.decode) + batch_keypoints2d.append(keypoints_2d) + batch_keypoints.append(keypoints) + batch_keypoints_simcc.append(keypoints_simcc) + batch_scores.append(scores) + + preds = [] + for keypoints_2d, keypoints, keypoints_simcc, scores in zip(batch_keypoints2d, + batch_keypoints, + batch_keypoints_simcc, + batch_scores): + pred = InstanceData( + keypoints_2d=keypoints_2d, + keypoints=keypoints, + keypoints_simcc=keypoints_simcc, + keypoint_scores=scores) + preds.append(pred) + + return preds + + def predict( + self, + feats: Tuple[Tensor], + batch_data_samples: OptSampleList, + test_cfg: OptConfigType = {}, + ) -> InstanceList: + """Predict results from features. + + Args: + feats (Tuple[Tensor] | List[Tuple[Tensor]]): The multi-stage + features (or multiple multi-stage features in TTA) + batch_data_samples (List[:obj:`PoseDataSample`]): The batch + data samples + test_cfg (dict): The runtime config for testing process. 
Defaults + to {} + + Returns: + List[InstanceData]: The pose predictions, each contains + the following fields: + - keypoints (np.ndarray): predicted keypoint coordinates in + shape (num_instances, K, D) where K is the keypoint number + and D is the keypoint dimension + - keypoint_scores (np.ndarray): predicted keypoint scores in + shape (num_instances, K) + - keypoint_x_labels (np.ndarray, optional): The predicted 1-D + intensity distribution in the x direction + - keypoint_y_labels (np.ndarray, optional): The predicted 1-D + intensity distribution in the y direction + """ + x, y, z = self.forward(feats) + + preds = self.decode((x, y, z)) + + if test_cfg.get('output_heatmaps', False): + raise NotImplementedError + else: + return preds + + def loss( + self, + feats: Tuple[Tensor], + batch_data_samples: OptSampleList, + train_cfg: OptConfigType = {}, + ) -> dict: + """Calculate losses from a batch of inputs and data samples.""" + + pred_x, pred_y, pred_z = self.forward(feats) + + gt_x = torch.cat([ + d.gt_instance_labels.keypoint_x_labels for d in batch_data_samples + ], + dim=0) + gt_y = torch.cat([ + d.gt_instance_labels.keypoint_y_labels for d in batch_data_samples + ], + dim=0) + gt_z = torch.cat([ + d.gt_instance_labels.keypoint_z_labels for d in batch_data_samples + ], + dim=0) + keypoint_weights = torch.cat( + [ + d.gt_instance_labels.keypoint_weights + for d in batch_data_samples + ], + dim=0, + ) + + weight_z = torch.cat( + [d.gt_instance_labels.weight_z for d in batch_data_samples], + dim=0, + ) + + with_z_labels = [ + d.gt_instance_labels.with_z_label[0] for d in batch_data_samples + ] + + N, K, _ = pred_x.shape + keypoint_weights_ = keypoint_weights.clone() + pred_simcc = (pred_x, pred_y, pred_z) + gt_simcc = (gt_x, gt_y, gt_z) + + keypoint_weights = torch.cat([ + keypoint_weights[None, ...], keypoint_weights[None, ...], + weight_z[None, ...] + ]) + + # calculate losses + losses = dict() + for i, loss_ in enumerate(self.loss_module): + if loss_.loss_name == 'loss_bone' or loss_.loss_name == 'loss_mpjpe': + pred_coords = get_3d_coord(pred_x, pred_y, pred_z, + with_z_labels) + gt_coords = get_3d_coord(gt_x, gt_y, gt_z, with_z_labels) + loss = loss_(pred_coords, gt_coords, keypoint_weights_) + else: + loss = loss_(pred_simcc, gt_simcc, keypoint_weights) + losses[loss_.loss_name] = loss + + # calculate accuracy + error = simcc_mpjpe( + output=to_numpy(pred_simcc), + target=to_numpy(gt_simcc), + simcc_split_ratio=self.simcc_split_ratio, + mask=to_numpy(keypoint_weights_) > 0, + ) + + mpjpe = torch.tensor(error, device=gt_x.device) + losses.update(mpjpe=mpjpe) + + return losses + + @property + def default_init_cfg(self): + init_cfg = [ + dict(type='Normal', layer=['Conv2d'], std=0.001), + dict(type='Constant', layer='BatchNorm2d', val=1), + dict(type='Normal', layer=['Linear'], std=0.01, bias=0), + ] + return init_cfg + + +def simcc_mpjpe(output: Tuple[np.ndarray, np.ndarray, np.ndarray], + target: Tuple[np.ndarray, np.ndarray, np.ndarray], + simcc_split_ratio: float, + mask: np.ndarray, + thr: float = 0.05) -> float: + """Calculate the pose accuracy of PCK for each individual keypoint and the + averaged accuracy across all keypoints from 3D SimCC. + + Note: + - PCK metric measures accuracy of the localization of the body joints. + - The distances between predicted positions and the ground-truth ones + are typically normalized by the bounding box size. + + Args: + output (Tuple[np.ndarray, np.ndarray, np.ndarray]): Model predicted + 3D SimCC (x, y, z). 
+        target (Tuple[np.ndarray, np.ndarray, np.ndarray]): Groundtruth
+            3D SimCC (x, y, z).
+        simcc_split_ratio (float): SimCC split ratio for recovering actual
+            coordinates.
+        mask (np.ndarray[N, K]): Visibility mask for the target. False for
+            invisible joints, and True for visible.
+        thr (float): Unused; kept for interface compatibility with the PCK
+            helpers. Default 0.05.
+
+    Returns:
+        float: The MPJPE averaged over all visible keypoints, measured in the
+        coordinate space recovered by dividing the SimCC locations by
+        ``simcc_split_ratio``.
+    """
+    if len(output) == 3:
+        pred_x, pred_y, pred_z = output
+        gt_x, gt_y, gt_z = target
+        pred_coords, _ = get_simcc_maximum(pred_x, pred_y, pred_z)
+        gt_coords, _ = get_simcc_maximum(gt_x, gt_y, gt_z)
+
+    else:
+        pred_x, pred_y = output
+        gt_x, gt_y = target
+        pred_coords, _ = get_2d_simcc_maximum(pred_x, pred_y)
+        gt_coords, _ = get_2d_simcc_maximum(gt_x, gt_y)
+
+    pred_coords /= simcc_split_ratio
+    gt_coords /= simcc_split_ratio
+
+    return keypoint_mpjpe(pred_coords, gt_coords, mask)
+
+
+def get_3d_coord(simcc_x, simcc_y, simcc_z, with_z_labels):
+    N, K, W = simcc_x.shape
+    # zero out the z-axis responses for samples that carry no z labels
+    for i, with_z in enumerate(with_z_labels):
+        if not with_z:
+            simcc_z[i] = torch.zeros_like(simcc_z[i])
+    x_locs = simcc_x.reshape(N * K, -1).argmax(dim=1)
+    y_locs = simcc_y.reshape(N * K, -1).argmax(dim=1)
+    z_locs = simcc_z.reshape(N * K, -1).argmax(dim=1)
+
+    locs = torch.stack((x_locs, y_locs, z_locs),
+                       dim=-1).to(simcc_x).reshape(N, K, 3)
+    return locs
diff --git a/projects/rtmpose3d/rtmpose3d/simcc_3d_label.py b/projects/rtmpose3d/rtmpose3d/simcc_3d_label.py
new file mode 100644
index 0000000000..4440caa667
--- /dev/null
+++ b/projects/rtmpose3d/rtmpose3d/simcc_3d_label.py
@@ -0,0 +1,335 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from itertools import product
+from typing import Any, Optional, Tuple, Union
+
+import numpy as np
+from numpy import ndarray
+
+from mmpose.registry import KEYPOINT_CODECS
+from mmpose.codecs.base import BaseKeypointCodec
+
+from .utils import get_simcc_maximum
+
+
+@KEYPOINT_CODECS.register_module()
+class SimCC3DLabel(BaseKeypointCodec):
+    r"""Generate keypoint representation via "SimCC" approach.
+    See the paper: `SimCC: a Simple Coordinate Classification Perspective for
+    Human Pose Estimation`_ by Li et al (2022) for more details.
+    Old name: SimDR
+
+    Note:
+
+        - instance number: N
+        - keypoint number: K
+        - keypoint dimension: D
+        - input size: [w, h, d]
+
+    Encoded:
+
+        - keypoint_x_labels (np.ndarray): The generated SimCC label for x-axis.
+            The label shape is (N, K, Wx) if ``smoothing_type=='gaussian'``
+            and (N, K) if ``smoothing_type=='standard'``, where
+            :math:`Wx=w*simcc_split_ratio`
+        - keypoint_y_labels (np.ndarray): The generated SimCC label for y-axis.
+            The label shape is (N, K, Wy) if ``smoothing_type=='gaussian'``
+            and (N, K) if ``smoothing_type=='standard'``, where
+            :math:`Wy=h*simcc_split_ratio`
+        - keypoint_z_labels (np.ndarray): The generated SimCC label for the
+            z-axis, analogous to the x- and y-axis labels.
+        - keypoint_weights (np.ndarray): The target weights in shape (N, K)
+
+    Args:
+        input_size (tuple): Input image size in [w, h, d]
+        smoothing_type (str): The SimCC label smoothing strategy. Options are
+            ``'gaussian'`` and ``'standard'``. Defaults to ``'gaussian'``
+        sigma (float | int | tuple): The sigma value in the Gaussian SimCC
+            label. Defaults to 6.0
+        simcc_split_ratio (float): The ratio of the label size to the input
+            size. For example, if the input width is ``w``, the x label size
+            will be :math:`w*simcc_split_ratio`.
Defaults to 2.0 + label_smooth_weight (float): Label Smoothing weight. Defaults to 0.0 + normalize (bool): Whether to normalize the heatmaps. Defaults to True. + use_dark (bool): Whether to use the DARK post processing. Defaults to + False. + decode_visibility (bool): Whether to decode the visibility. Defaults + to False. + decode_beta (float): The beta value for decoding visibility. Defaults + to 150.0. + + .. _`SimCC: a Simple Coordinate Classification Perspective for Human Pose + Estimation`: https://arxiv.org/abs/2107.03332 + """ + + auxiliary_encode_keys = {'keypoints_3d'} + + label_mapping_table = dict( + keypoint_x_labels='keypoint_x_labels', + keypoint_y_labels='keypoint_y_labels', + keypoint_z_labels='keypoint_z_labels', + keypoint_weights='keypoint_weights', + weight_z='weight_z', + with_z_label='with_z_label') + + instance_mapping_table = dict( + bbox='bboxes', + bbox_score='bbox_scores', + bbox_scale='bbox_scales', + lifting_target='lifting_target', + lifting_target_visible='lifting_target_visible', + camera_param='camera_params', + root_z='root_z') + + def __init__(self, + input_size: Tuple[int, int, int], + smoothing_type: str = 'gaussian', + sigma: Union[float, int, Tuple[float]] = 6.0, + simcc_split_ratio: float = 2.0, + label_smooth_weight: float = 0.0, + normalize: bool = True, + use_dark: bool = False, + decode_visibility: bool = False, + decode_beta: float = 150.0, + root_index: Union[int, Tuple[int]] = 0, + z_range: Optional[int] = None, + sigmoid_z: bool = False) -> None: + super().__init__() + + self.input_size = input_size + self.smoothing_type = smoothing_type + self.simcc_split_ratio = simcc_split_ratio + self.label_smooth_weight = label_smooth_weight + self.normalize = normalize + self.use_dark = use_dark + self.decode_visibility = decode_visibility + self.decode_beta = decode_beta + + if isinstance(sigma, (float, int)): + self.sigma = np.array([sigma, sigma, sigma]) + else: + self.sigma = np.array(sigma) + + if self.smoothing_type not in {'gaussian', 'standard'}: + raise ValueError( + f'{self.__class__.__name__} got invalid `smoothing_type` value' + f'{self.smoothing_type}. 
Should be one of ' + '{"gaussian", "standard"}') + + if self.smoothing_type == 'gaussian' and self.label_smooth_weight > 0: + raise ValueError('Attribute `label_smooth_weight` is only ' + 'used for `standard` mode.') + + if self.label_smooth_weight < 0.0 or self.label_smooth_weight > 1.0: + raise ValueError('`label_smooth_weight` should be in range [0, 1]') + + self.root_index = list(root_index) if isinstance( + root_index, tuple) else [root_index] + self.z_range = z_range if z_range is not None else 2.1744869 + self.sigmoid_z = sigmoid_z + self.root_z = [5.14388] + + def encode(self, + keypoints: np.ndarray, + keypoints_3d: Optional[np.ndarray] = None, + keypoints_visible: Optional[np.ndarray] = None) -> dict: + + if keypoints_visible is None: + keypoints_visible = np.ones(keypoints.shape[:2], dtype=np.float32) + lifting_target = [None] + root_z = self.root_z + with_z_label = False + if keypoints_3d is not None: + lifting_target = keypoints_3d.copy() + root_z = keypoints_3d[..., self.root_index, 2].mean(1) + keypoints_3d[..., 2] -= root_z + if self.sigmoid_z: + keypoints_z = (1 / (1 + np.exp(-(3 * keypoints_3d[..., 2]))) + ) * self.input_size[2] + else: + keypoints_z = (keypoints_3d[..., 2] / self.z_range + 1) * ( + self.input_size[2] / 2) + + keypoints_3d = np.concatenate([keypoints, keypoints_z[..., None]], + axis=-1) + x, y, z, keypoint_weights = self._generate_gaussian( + keypoints_3d, keypoints_visible) + weight_z = keypoint_weights + with_z_label = True + else: + if keypoints.shape != np.zeros([]).shape: + keypoints_z = np.ones((keypoints.shape[0], + keypoints.shape[1], 1), dtype=np.float32) + keypoints = np.concatenate([keypoints, keypoints_z], axis=-1) + x, y, z, keypoint_weights = self._generate_gaussian( + keypoints, keypoints_visible) + else: + x, y, z = np.zeros((3, 1), dtype=np.float32) + keypoint_weights = np.ones((1, )) + weight_z = np.zeros_like(keypoint_weights) + with_z_label = False + + encoded = dict( + keypoint_x_labels=x, + keypoint_y_labels=y, + keypoint_z_labels=z, + lifting_target=lifting_target, + root_z=root_z, + keypoint_weights=keypoint_weights, + weight_z=weight_z, + with_z_label=[with_z_label]) + + return encoded + + def decode(self, x: np.ndarray, y: np.ndarray, z: np.ndarray): + """Decode SimCC labels into 3D keypoints. + + Args: + encoded (Tuple[np.ndarray, np.ndarray]): SimCC labels for x-axis, + y-axis and z-axis in shape (N, K, Wx), (N, K, Wy) and (N, K, Wz) + + Returns: + tuple: + - keypoints (np.ndarray): Decoded coordinates in shape (N, K, D) + - scores (np.ndarray): The keypoint scores in shape (N, K). 
+ It usually represents the confidence of the keypoint prediction + """ + + keypoints, scores = get_simcc_maximum(x, y, z) + + # Unsqueeze the instance dimension for single-instance results + if keypoints.ndim == 2: + keypoints = keypoints[None, :] + scores = scores[None, :] + + keypoints /= self.simcc_split_ratio + keypoints_simcc = keypoints.copy() + keypoints_2d = keypoints[..., :2] + keypoints_z = keypoints[..., 2:3] + if self.sigmoid_z: + keypoints_z /= self.input_size[2] + keypoints_z[keypoints_z <= 0] = 1e-8 + scores[(keypoints_z <= 0).squeeze(-1)] = 0 + keypoints[..., 2:3] = np.log(keypoints_z / (1 - keypoints_z)) / 3 + else: + keypoints[..., + 2:3] = (keypoints_z / + (self.input_size[-1] / 2) - 1) * self.z_range + return keypoints_2d, keypoints, keypoints_simcc, scores + + def _map_coordinates( + self, + keypoints: np.ndarray, + keypoints_visible: Optional[np.ndarray] = None + ) -> Tuple[np.ndarray, np.ndarray]: + """Mapping keypoint coordinates into SimCC space.""" + + keypoints_split = keypoints.copy() + keypoints_split = np.around(keypoints_split * self.simcc_split_ratio) + keypoints_split = keypoints_split.astype(np.int64) + keypoint_weights = keypoints_visible.copy() + + return keypoints_split, keypoint_weights + + def _generate_gaussian( + self, + keypoints: np.ndarray, + keypoints_visible: Optional[np.ndarray] = None + ) -> tuple[ndarray, ndarray, ndarray, ndarray]: + """Encoding keypoints into SimCC labels with Gaussian Label Smoothing + strategy.""" + + N, K, _ = keypoints.shape + w, h, d = self.input_size + W = np.around(w * self.simcc_split_ratio).astype(int) + H = np.around(h * self.simcc_split_ratio).astype(int) + D = np.around(d * self.simcc_split_ratio).astype(int) + + keypoints_split, keypoint_weights = self._map_coordinates( + keypoints, keypoints_visible) + + target_x = np.zeros((N, K, W), dtype=np.float32) + target_y = np.zeros((N, K, H), dtype=np.float32) + target_z = np.zeros((N, K, D), dtype=np.float32) + + # 3-sigma rule + radius = self.sigma * 3 + + # xy grid + x = np.arange(0, W, 1, dtype=np.float32) + y = np.arange(0, H, 1, dtype=np.float32) + z = np.arange(0, D, 1, dtype=np.float32) + + for n, k in product(range(N), range(K)): + # skip unlabled keypoints + if keypoints_visible[n, k] < 0.5: + continue + + mu = keypoints_split[n, k] + + # check that the gaussian has in-bounds part + left, top, near = mu - radius + right, bottom, far = mu + radius + 1 + + if left >= W or top >= H or near >= D or right < 0 or bottom < 0 or far < 0: # noqa: E501 + keypoint_weights[n, k] = 0 + continue + + mu_x, mu_y, mu_z = mu + + target_x[n, k] = np.exp(-((x - mu_x)**2) / (2 * self.sigma[0]**2)) + target_y[n, k] = np.exp(-((y - mu_y)**2) / (2 * self.sigma[1]**2)) + target_z[n, k] = np.exp(-((z - mu_z)**2) / (2 * self.sigma[2]**2)) + + if self.normalize: + norm_value = self.sigma * np.sqrt(np.pi * 2) + target_x /= norm_value[0] + target_y /= norm_value[1] + target_z /= norm_value[2] + + return target_x, target_y, target_z, keypoint_weights + + def _generate_standard( + self, + keypoints: np.ndarray, + keypoints_visible: Optional[np.ndarray] = None + ) -> tuple[ndarray, ndarray, ndarray, Any]: + """Encoding keypoints into SimCC labels with Standard Label Smoothing + strategy. 
+ + Labels will be one-hot vectors if self.label_smooth_weight==0.0 + """ + + N, K, _ = keypoints.shape + w, h, d = self.input_size + w = np.around(w * self.simcc_split_ratio).astype(int) + h = np.around(h * self.simcc_split_ratio).astype(int) + d = np.around(d * self.simcc_split_ratio).astype(int) + + keypoints_split, keypoint_weights = self._map_coordinates( + keypoints, keypoints_visible) + + x = np.zeros((N, K, w), dtype=np.float32) + y = np.zeros((N, K, h), dtype=np.float32) + z = np.zeros((N, K, d), dtype=np.float32) + + for n, k in product(range(N), range(K)): + # skip unlabled keypoints + if keypoints_visible[n, k] < 0.5: + continue + + # get center coordinates + mu_x, mu_y, mu_z = keypoints_split[n, k].astype(np.int64) + + # detect abnormal coords and assign the weight 0 + if mu_x >= w or mu_y >= h or mu_x < 0 or mu_y < 0: + keypoint_weights[n, k] = 0 + continue + + if self.label_smooth_weight > 0: + x[n, k] = self.label_smooth_weight / (w - 1) + y[n, k] = self.label_smooth_weight / (h - 1) + z[n, k] = self.label_smooth_weight / (d - 1) + + x[n, k, mu_x] = 1.0 - self.label_smooth_weight + y[n, k, mu_y] = 1.0 - self.label_smooth_weight + z[n, k, mu_z] = 1.0 - self.label_smooth_weight + + return x, y, z, keypoint_weights diff --git a/projects/rtmpose3d/rtmpose3d/utils.py b/projects/rtmpose3d/rtmpose3d/utils.py new file mode 100644 index 0000000000..8dab90de20 --- /dev/null +++ b/projects/rtmpose3d/rtmpose3d/utils.py @@ -0,0 +1,76 @@ +from typing import Tuple + +import numpy as np + + +def get_simcc_maximum(simcc_x: np.ndarray, + simcc_y: np.ndarray, + simcc_z: np.ndarray, + apply_softmax: bool = False + ) -> Tuple[np.ndarray, np.ndarray]: + """Get maximum response location and value from simcc representations. + + Note: + instance number: N + num_keypoints: K + heatmap height: H + heatmap width: W + + Args: + encoded_keypoints (dict): encoded keypoints with simcc representations. + apply_softmax (bool): whether to apply softmax on the heatmap. + Defaults to False. 
+ + Returns: + tuple: + - locs (np.ndarray): locations of maximum heatmap responses in shape + (K, 2) or (N, K, 2) + - vals (np.ndarray): values of maximum heatmap responses in shape + (K,) or (N, K) + """ + assert isinstance(simcc_x, np.ndarray), 'simcc_x should be numpy.ndarray' + assert isinstance(simcc_y, np.ndarray), 'simcc_y should be numpy.ndarray' + assert isinstance(simcc_z, np.ndarray), 'simcc_z should be numpy.ndarray' + assert simcc_x.ndim == 2 or simcc_x.ndim == 3, ( + f'Invalid shape {simcc_x.shape}') + assert simcc_y.ndim == 2 or simcc_y.ndim == 3, ( + f'Invalid shape {simcc_y.shape}') + assert simcc_z.ndim == 2 or simcc_z.ndim == 3, ( + f'Invalid shape {simcc_z.shape}') + assert simcc_x.ndim == simcc_y.ndim == simcc_z.ndim, ( + f'{simcc_x.shape} != {simcc_y.shape} or {simcc_z.shape}') + + if simcc_x.ndim == 3: + n, k, _ = simcc_x.shape + simcc_x = simcc_x.reshape(n * k, -1) + simcc_y = simcc_y.reshape(n * k, -1) + simcc_z = simcc_z.reshape(n * k, -1) + else: + n = None + + if apply_softmax: + simcc_x = simcc_x - np.max(simcc_x, axis=1, keepdims=True) + simcc_y = simcc_y - np.max(simcc_y, axis=1, keepdims=True) + simcc_z = simcc_z - np.max(simcc_z, axis=1, keepdims=True) + ex, ey, ez = np.exp(simcc_x), np.exp(simcc_y), np.exp(simcc_z) + simcc_x = ex / np.sum(ex, axis=1, keepdims=True) + simcc_y = ey / np.sum(ey, axis=1, keepdims=True) + simcc_z = ez / np.sum(ez, axis=1, keepdims=True) + + x_locs = np.argmax(simcc_x, axis=1) + y_locs = np.argmax(simcc_y, axis=1) + z_locs = np.argmax(simcc_z, axis=1) + locs = np.stack((x_locs, y_locs, z_locs), axis=-1).astype(np.float32) + max_val_x = np.amax(simcc_x, axis=1) + max_val_y = np.amax(simcc_y, axis=1) + + mask = max_val_x > max_val_y + max_val_x[mask] = max_val_y[mask] + vals = max_val_x + locs[vals <= 0.] = -1 + + if n is not None: + locs = locs.reshape(n, k, 3) + vals = vals.reshape(n, k) + + return locs, vals \ No newline at end of file From 379358439c1626f9570630c9d5835f62b9b9387a Mon Sep 17 00:00:00 2001 From: xiexinch Date: Wed, 24 Apr 2024 23:33:13 +0800 Subject: [PATCH 02/15] --doc=add rtmpose3d readme --- projects/rtmpose3d/README.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 projects/rtmpose3d/README.md diff --git a/projects/rtmpose3d/README.md b/projects/rtmpose3d/README.md new file mode 100644 index 0000000000..e279078eb0 --- /dev/null +++ b/projects/rtmpose3d/README.md @@ -0,0 +1,16 @@ +# RTMPose3D: Real-Time 3D Pose Estimation toolkit based on RTMPose + +## Abstract + +RTMPose3D is a toolkit for real-time 3D pose estimation. It is based on the RTMPose model, which is a 2D pose estimation model that is capable of predicting 2D keypoints and body part associations in real-time. RTMPose3D extends RTMPose by adding a 3D pose estimation branch that can predict 3D keypoints from images directly. 
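For readers who prefer calling the model from Python rather than through the demo script, the sketch below shows one possible way to drive the released modules with MMPose's high-level APIs. It is a minimal, illustrative example and not part of this patch: the checkpoint file name, image path and printed shape are assumptions, and it presumes the working directory is `projects/rtmpose3d` with the RTMW3D checkpoint downloaded locally. The full demo command is given in the Usage section below.

```python
# Minimal inference sketch (assumed paths; run from projects/rtmpose3d).
from mmpose.apis import inference_topdown, init_model

from rtmpose3d import *  # noqa: F401,F403 -- registers the 3D estimator/head/codec

model = init_model(
    'configs/rtmw3d-l_8xb64_cocktail14-384x288.py',
    'rtmw3d-l_cock14-0d4ad840_20240422.pth',  # assumed local checkpoint path
    device='cpu')

# With no bounding boxes given, the whole image is treated as one instance.
results = inference_topdown(model, 'path/to/image.jpg')
keypoints = results[0].pred_instances.keypoints
print(keypoints.shape)  # expected: (num_instances, K, 3) keypoints in camera space
```

In practice the demo script is the recommended entry point, since it also runs an RTMDet person detector and handles multi-person frames and video input.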
+
+## Usage
+
+👉🏼 TRY RTMPOSE3D NOW
+
+python .\body3d_img2pose_demo.py .\configs\rtmdet_m_640-8xb32_coco-person.py ..\..\..\mmpose\checkpoints\rtmdet_m_8xb32-100e_coco-obj365-person-235e8209.pth .\configs\rtmw3d-l_8xb64_cocktail14-384x288.py ..\..\..\mmpose\checkpoints\rtmw3d-l_cock14-0d4ad840_20240422.pth --input ..\..\tests\data\coco\000000000785.jpg --output-root results
+
+```bash
+cd /path/to/mmpose/projects/rtmpose3d
+python body3d_img2pose_demo.py configs/rtmdet_m_640-8xb32_coco-person.py https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmdet_m_8xb32-100e_coco-obj365-person-235e8209.pth configs/rtmw3d-l_8xb64_cocktail14-384x288.py rtmw3d-l_cock14-0d4ad840_20240422.pth --input /path/to/image --output-root /path/to/output
+```

From dc9114ee478b534e12716a01190fe34035edf9f0 Mon Sep 17 00:00:00 2001
From: xiexinch
Date: Thu, 25 Apr 2024 00:03:44 +0800
Subject: [PATCH 03/15] --doc=remove

---
 projects/rtmpose3d/README.md | 2 --
 1 file changed, 2 deletions(-)

diff --git a/projects/rtmpose3d/README.md b/projects/rtmpose3d/README.md
index e279078eb0..4f75c44fe2 100644
--- a/projects/rtmpose3d/README.md
+++ b/projects/rtmpose3d/README.md
@@ -8,8 +8,6 @@ RTMPose3D is a toolkit for real-time 3D pose estimation. It is based on the RTMP
 ## Usage
 
 👉🏼 TRY RTMPOSE3D NOW
 
-python .\body3d_img2pose_demo.py .\configs\rtmdet_m_640-8xb32_coco-person.py ..\..\..\mmpose\checkpoints\rtmdet_m_8xb32-100e_coco-obj365-person-235e8209.pth .\configs\rtmw3d-l_8xb64_cocktail14-384x288.py ..\..\..\mmpose\checkpoints\rtmw3d-l_cock14-0d4ad840_20240422.pth --input ..\..\tests\data\coco\000000000785.jpg --output-root results
-
 ```bash
 python body3d_img2pose_demo.py configs/rtmdet_m_640-8xb32_coco-person.py https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmdet_m_8xb32-100e_coco-obj365-person-235e8209.pth configs/rtmw3d-l_8xb64_cocktail14-384x288.py rtmw3d-l_cock14-0d4ad840_20240422.pth --input /path/to/image --output-root /path/to/output
 ```

From 9aa8bf5729ec10f2d3e3a390d80f7e8427543766 Mon Sep 17 00:00:00 2001
From: xiexinch
Date: Thu, 25 Apr 2024 00:32:59 +0800
Subject: [PATCH 04/15] --fix=fix infer video

---
 projects/rtmpose3d/body3d_img2pose_demo.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/projects/rtmpose3d/body3d_img2pose_demo.py b/projects/rtmpose3d/body3d_img2pose_demo.py
index 200043d7d4..e32db3323a 100644
--- a/projects/rtmpose3d/body3d_img2pose_demo.py
+++ b/projects/rtmpose3d/body3d_img2pose_demo.py
@@ -216,8 +216,6 @@ def process_one_image(args, detector, frame: np.ndarray, frame_idx: int,
     keypoints = np.squeeze(keypoints, axis=1)
     keypoints = -keypoints[..., [0, 2, 1]]
-    # keypoints[..., 0] = -keypoints[..., 0]
-    # keypoints[..., 2] = -keypoints[..., 2]
 
     # rebase height (z-axis)
     if not args.disable_rebase_keypoint:
@@ -254,8 +252,7 @@ def process_one_image(args, detector, frame: np.ndarray, frame_idx: int,
         axis_azimuth=70,
         axis_elev=15,
         num_instances=args.num_instances,
-        wait_time=args.show_interval,
-        root_index=[11, 12])
+        wait_time=args.show_interval)
 
     return pose_est_results, pose_est_results_list, pred_3d_instances, next_id

From fb15f42e13b0590e22e0e94d47824cfd2543f2be Mon Sep 17 00:00:00 2001
From: xiexinch
Date: Thu, 25 Apr 2024 14:20:27 +0800
Subject: [PATCH 05/15] --fix=fix lint

---
 projects/rtmpose3d/body3d_img2pose_demo.py | 5 +-
 .../rtmw3d-l_8xb64_cocktail14-384x288.py | 51 +++++++++----------
 .../rtmw3d-x_8xb64_cocktail14-384x288.py | 48 ++++++++---------
 projects/rtmpose3d/rtmpose3d/__init__.py | 6 ++-
projects/rtmpose3d/rtmpose3d/loss.py | 5 +- .../rtmpose3d/rtmpose3d/pose_estimator.py | 5 +- projects/rtmpose3d/rtmpose3d/rtmw3d_head.py | 11 ++-- .../rtmpose3d/rtmpose3d/simcc_3d_label.py | 8 +-- projects/rtmpose3d/rtmpose3d/utils.py | 4 +- 9 files changed, 73 insertions(+), 70 deletions(-) diff --git a/projects/rtmpose3d/body3d_img2pose_demo.py b/projects/rtmpose3d/body3d_img2pose_demo.py index e32db3323a..4a0e90040c 100644 --- a/projects/rtmpose3d/body3d_img2pose_demo.py +++ b/projects/rtmpose3d/body3d_img2pose_demo.py @@ -19,7 +19,6 @@ split_instances) from mmpose.utils import adapt_mmdet_pipeline from mmpose.visualization import Pose3dLocalVisualizer -from rtmpose3d import * try: from mmdet.apis import inference_detector, init_detector @@ -27,6 +26,8 @@ except (ImportError, ModuleNotFoundError): has_mmdet = False +from rtmpose3d import * # noqa: F401, F403 + def parse_args(): parser = ArgumentParser() @@ -124,7 +125,7 @@ def parse_args(): def process_one_image(args, detector, frame: np.ndarray, frame_idx: int, - pose_estimator: TopdownPoseEstimator3D, + pose_estimator, pose_est_results_last: List[PoseDataSample], pose_est_results_list: List[List[PoseDataSample]], next_id: int, visualize_frame: np.ndarray, diff --git a/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py b/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py index 832742788d..950216c9bd 100644 --- a/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py +++ b/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py @@ -53,6 +53,8 @@ use_dark=False, root_index=(11, 12)) +backbone_path = 'checkpoints/rtmpose-l_simcc-ucoco_dw-ucoco_270e-256x192-4d6dfc62_20230728.pth' # noqa + # model settings model = dict( type='TopdownPoseEstimator3D', @@ -71,10 +73,7 @@ norm_cfg=dict(type='BN'), act_cfg=dict(type='SiLU'), init_cfg=dict( - type='Pretrained', - prefix='backbone.', - checkpoint='checkpoints/rtmpose-l_simcc-ucoco_dw-ucoco_270e-256x192-4d6dfc62_20230728.pth' # noqa - )), + type='Pretrained', prefix='backbone.', checkpoint=backbone_path)), neck=dict( type='CSPNeXtPAFPN', in_channels=[256, 512, 1024], @@ -112,15 +111,23 @@ label_softmax=True), dict( type='BoneLoss', - joint_parents=[0, 1, 2, 3, 4, 5, 6, 5, 6, 7, 8, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 50, 50, 51, 52, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 91, 92, 93, 94, 91, 96, 97, 98, 91, 100, 101, 102, 91, 104, 105, 106, 91, 108, 109, 110, 8, 112, 113, 114, 113, 112, 117, 118, 117, 112, 121, 122, 123, 112, 125, 126, 127, 112, 129, 130, 131], + joint_parents=[ + 0, 1, 2, 3, 4, 5, 6, 5, 6, 7, 8, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, 2, 2, 2, 2, 2, 3, 3, 3, 3, + 3, 50, 50, 51, 52, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 3, 3, + 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 7, 91, 92, 93, 94, 91, 96, 97, 98, 91, 100, + 101, 102, 91, 104, 105, 106, 91, 108, 109, 110, 8, 112, + 113, 114, 113, 112, 117, 118, 117, 112, 121, 122, 123, 112, + 125, 126, 127, 112, 129, 130, 131 + ], use_target_weight=True, - loss_weight=2.0 - ) + loss_weight=2.0) ], decoder=codec), # test_cfg=dict(flip_test=False, mode='2d') - test_cfg=dict(flip_test=False) -) + test_cfg=dict(flip_test=False)) # base dataset settings data_mode = 'topdown' @@ -130,9 +137,10 @@ # pipelines 
train_pipeline = [ dict(type='LoadImage', backend_args=backend_args), - dict(type='RandomBackground', - bg_dir='/mnt/data/oss_beijing/mmseg/obj365v1_images', - bg_prob=0.5, + dict( + type='RandomBackground', + bg_dir='/mnt/data/oss_beijing/mmseg/obj365v1_images', + bg_prob=0.5, ), dict(type='GetBBoxCenterScale'), dict(type='RandomFlip', direction='horizontal'), @@ -198,7 +206,6 @@ test_mode=False, pipeline=[]) - # dna rendering dataset dna_rendering_dataset = dict( type='DNARenderingDataset', @@ -206,9 +213,7 @@ data_mode='topdown', ann_file='instances.npz', subset_frac=0.1, - pipeline=[ - dict(type='LoadMask', backend_args=backend_args) - ], + pipeline=[dict(type='LoadMask', backend_args=backend_args)], ) # mapping @@ -482,10 +487,7 @@ dataset_wb = dict( type='CombinedDataset', metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), - datasets=[ - dataset_coco, - dataset_halpe - ], + datasets=[dataset_coco, dataset_halpe], pipeline=[], test_mode=False, ) @@ -581,11 +583,9 @@ test_mode=False, ) - # ubody dataset scenes = [ - 'Magic_show', - 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', + 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' ] @@ -604,7 +604,6 @@ pipeline=[]) ubody_datasets.append(ubody) - train_datasets = [ dataset_wb, dataset_body, @@ -615,7 +614,6 @@ # dna_rendering_dataset ] - # data loaders train_dataloader = dict( batch_size=64, @@ -694,7 +692,8 @@ ann_file='annotations/coco_wholebody_val_v1.0.json', data_prefix=dict(img='val2017/'), test_mode=True, - bbox_file='data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', pipeline=val_pipeline, )) test_dataloader = val_dataloader diff --git a/projects/rtmpose3d/configs/rtmw3d-x_8xb64_cocktail14-384x288.py b/projects/rtmpose3d/configs/rtmw3d-x_8xb64_cocktail14-384x288.py index 3a822f50b8..2352072b62 100644 --- a/projects/rtmpose3d/configs/rtmw3d-x_8xb64_cocktail14-384x288.py +++ b/projects/rtmpose3d/configs/rtmw3d-x_8xb64_cocktail14-384x288.py @@ -53,6 +53,8 @@ use_dark=False, root_index=(11, 12)) +backbone_path = 'checkpoints/rtmpose-x_simcc-ucoco_pt-aic-coco_270e-384x288-f5b50679_20230822.pth' # noqa + # model settings model = dict( type='TopdownPoseEstimator3D', @@ -71,10 +73,7 @@ norm_cfg=dict(type='BN'), act_cfg=dict(type='SiLU'), init_cfg=dict( - type='Pretrained', - prefix='backbone.', - checkpoint='checkpoints/rtmpose-x_simcc-ucoco_pt-aic-coco_270e-384x288-f5b50679_20230822.pth' # noqa - )), + type='Pretrained', prefix='backbone.', checkpoint=backbone_path)), neck=dict( type='CSPNeXtPAFPN', in_channels=[320, 640, 1280], @@ -112,10 +111,19 @@ label_softmax=True), dict( type='BoneLoss', - joint_parents=[0, 1, 2, 3, 4, 5, 6, 5, 6, 7, 8, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 50, 50, 51, 52, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 91, 92, 93, 94, 91, 96, 97, 98, 91, 100, 101, 102, 91, 104, 105, 106, 91, 108, 109, 110, 8, 112, 113, 114, 113, 112, 117, 118, 117, 112, 121, 122, 123, 112, 125, 126, 127, 112, 129, 130, 131], + joint_parents=[ + 0, 1, 2, 3, 4, 5, 6, 5, 6, 7, 8, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 23, 24, 25, 26, 27, 28, 29, 
30, + 31, 32, 33, 34, 35, 36, 37, 38, 2, 2, 2, 2, 2, 3, 3, 3, 3, + 3, 50, 50, 51, 52, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 3, 3, + 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 7, 91, 92, 93, 94, 91, 96, 97, 98, 91, 100, + 101, 102, 91, 104, 105, 106, 91, 108, 109, 110, 8, 112, + 113, 114, 113, 112, 117, 118, 117, 112, 121, 122, 123, 112, + 125, 126, 127, 112, 129, 130, 131 + ], use_target_weight=True, - loss_weight=2.0 - ) + loss_weight=2.0) ], decoder=codec), test_cfg=dict(flip_test=False, mode='2d') @@ -130,9 +138,10 @@ # pipelines train_pipeline = [ dict(type='LoadImage', backend_args=backend_args), - dict(type='RandomBackground', - bg_dir='/mnt/data/oss_beijing/mmseg/obj365v1_images', - bg_prob=0.5, + dict( + type='RandomBackground', + bg_dir='/mnt/data/oss_beijing/mmseg/obj365v1_images', + bg_prob=0.5, ), dict(type='GetBBoxCenterScale'), dict(type='RandomFlip', direction='horizontal'), @@ -198,7 +207,6 @@ test_mode=False, pipeline=[]) - # dna rendering dataset dna_rendering_dataset = dict( type='DNARenderingDataset', @@ -206,9 +214,7 @@ data_mode='topdown', ann_file='instances.npz', subset_frac=0.1, - pipeline=[ - dict(type='LoadMask', backend_args=backend_args) - ], + pipeline=[dict(type='LoadMask', backend_args=backend_args)], ) # mapping @@ -482,10 +488,7 @@ dataset_wb = dict( type='CombinedDataset', metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), - datasets=[ - dataset_coco, - dataset_halpe - ], + datasets=[dataset_coco, dataset_halpe], pipeline=[], test_mode=False, ) @@ -581,11 +584,9 @@ test_mode=False, ) - # ubody dataset scenes = [ - 'Magic_show', - 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', + 'Magic_show', 'Entertainment', 'ConductMusic', 'Online_class', 'TalkShow', 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow', 'Singing', 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference' ] @@ -604,7 +605,6 @@ pipeline=[]) ubody_datasets.append(ubody) - train_datasets = [ dataset_wb, dataset_body, @@ -615,7 +615,6 @@ # dna_rendering_dataset ] - # data loaders train_dataloader = dict( batch_size=32, @@ -694,7 +693,8 @@ ann_file='annotations/coco_wholebody_val_v1.0.json', data_prefix=dict(img='val2017/'), test_mode=True, - bbox_file='data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', pipeline=val_pipeline, )) test_dataloader = val_dataloader diff --git a/projects/rtmpose3d/rtmpose3d/__init__.py b/projects/rtmpose3d/rtmpose3d/__init__.py index eec926b2c8..8bbd120d68 100644 --- a/projects/rtmpose3d/rtmpose3d/__init__.py +++ b/projects/rtmpose3d/rtmpose3d/__init__.py @@ -1,6 +1,8 @@ +from .loss import KLDiscretLoss2 from .pose_estimator import TopdownPoseEstimator3D from .rtmw3d_head import RTMW3DHead from .simcc_3d_label import SimCC3DLabel -from .loss import KLDiscretLoss2 -__all__ = ['TopdownPoseEstimator3D', 'RTMW3DHead', 'SimCC3DLabel', 'KLDiscretLoss2'] +__all__ = [ + 'TopdownPoseEstimator3D', 'RTMW3DHead', 'SimCC3DLabel', 'KLDiscretLoss2' +] diff --git a/projects/rtmpose3d/rtmpose3d/loss.py b/projects/rtmpose3d/rtmpose3d/loss.py index 499befa5a0..9869fd00ff 100644 --- a/projects/rtmpose3d/rtmpose3d/loss.py +++ b/projects/rtmpose3d/rtmpose3d/loss.py @@ -1,5 +1,6 @@ -from mmpose.registry import MODELS from mmpose.models.losses import KLDiscretLoss +from mmpose.registry import MODELS + @MODELS.register_module() class KLDiscretLoss2(KLDiscretLoss): @@ -34,4 +35,4 @@ def loss_name(self): 
Returns: str: The name of this loss item. """ - return self._loss_name \ No newline at end of file + return self._loss_name diff --git a/projects/rtmpose3d/rtmpose3d/pose_estimator.py b/projects/rtmpose3d/rtmpose3d/pose_estimator.py index 6854205b4b..3ef7411738 100644 --- a/projects/rtmpose3d/rtmpose3d/pose_estimator.py +++ b/projects/rtmpose3d/rtmpose3d/pose_estimator.py @@ -3,9 +3,10 @@ import numpy as np -from mmpose.utils.typing import InstanceList, PixelDataList, SampleList -from mmpose.registry import MODELS from mmpose.models.pose_estimators import TopdownPoseEstimator +from mmpose.registry import MODELS +from mmpose.utils.typing import InstanceList, PixelDataList, SampleList + @MODELS.register_module() class TopdownPoseEstimator3D(TopdownPoseEstimator): diff --git a/projects/rtmpose3d/rtmpose3d/rtmw3d_head.py b/projects/rtmpose3d/rtmpose3d/rtmw3d_head.py index bbf6bd2b48..c56db1c9b9 100644 --- a/projects/rtmpose3d/rtmpose3d/rtmw3d_head.py +++ b/projects/rtmpose3d/rtmpose3d/rtmw3d_head.py @@ -9,12 +9,12 @@ from mmpose.codecs.utils import get_simcc_maximum as get_2d_simcc_maximum from mmpose.evaluation.functional import keypoint_mpjpe +from mmpose.models.heads import BaseHead from mmpose.models.utils.rtmcc_block import RTMCCBlock, ScaleNorm from mmpose.registry import KEYPOINT_CODECS, MODELS from mmpose.utils.tensor_utils import to_numpy from mmpose.utils.typing import (ConfigType, InstanceList, OptConfigType, OptSampleList) -from mmpose.models.heads import BaseHead from .utils import get_simcc_maximum OptIntSeq = Optional[Sequence[int]] @@ -244,10 +244,9 @@ def _pack_and_call(args, func): batch_scores.append(scores) preds = [] - for keypoints_2d, keypoints, keypoints_simcc, scores in zip(batch_keypoints2d, - batch_keypoints, - batch_keypoints_simcc, - batch_scores): + for keypoints_2d, keypoints, keypoints_simcc, scores in zip( + batch_keypoints2d, batch_keypoints, batch_keypoints_simcc, + batch_scores): pred = InstanceData( keypoints_2d=keypoints_2d, keypoints=keypoints, @@ -347,7 +346,7 @@ def loss( # calculate losses losses = dict() for i, loss_ in enumerate(self.loss_module): - if loss_.loss_name == 'loss_bone' or loss_.loss_name == 'loss_mpjpe': + if loss_.loss_name in ['loss_bone', 'loss_mpjpe']: pred_coords = get_3d_coord(pred_x, pred_y, pred_z, with_z_labels) gt_coords = get_3d_coord(gt_x, gt_y, gt_z, with_z_labels) diff --git a/projects/rtmpose3d/rtmpose3d/simcc_3d_label.py b/projects/rtmpose3d/rtmpose3d/simcc_3d_label.py index 4440caa667..41f2064d15 100644 --- a/projects/rtmpose3d/rtmpose3d/simcc_3d_label.py +++ b/projects/rtmpose3d/rtmpose3d/simcc_3d_label.py @@ -5,9 +5,8 @@ import numpy as np from numpy import ndarray -from mmpose.registry import KEYPOINT_CODECS from mmpose.codecs.base import BaseKeypointCodec - +from mmpose.registry import KEYPOINT_CODECS from .utils import get_simcc_maximum @@ -155,8 +154,9 @@ def encode(self, with_z_label = True else: if keypoints.shape != np.zeros([]).shape: - keypoints_z = np.ones((keypoints.shape[0], - keypoints.shape[1], 1), dtype=np.float32) + keypoints_z = np.ones( + (keypoints.shape[0], keypoints.shape[1], 1), + dtype=np.float32) keypoints = np.concatenate([keypoints, keypoints_z], axis=-1) x, y, z, keypoint_weights = self._generate_gaussian( keypoints, keypoints_visible) diff --git a/projects/rtmpose3d/rtmpose3d/utils.py b/projects/rtmpose3d/rtmpose3d/utils.py index 8dab90de20..62837bde4c 100644 --- a/projects/rtmpose3d/rtmpose3d/utils.py +++ b/projects/rtmpose3d/rtmpose3d/utils.py @@ -63,7 +63,7 @@ def 
get_simcc_maximum(simcc_x: np.ndarray, locs = np.stack((x_locs, y_locs, z_locs), axis=-1).astype(np.float32) max_val_x = np.amax(simcc_x, axis=1) max_val_y = np.amax(simcc_y, axis=1) - + mask = max_val_x > max_val_y max_val_x[mask] = max_val_y[mask] vals = max_val_x @@ -73,4 +73,4 @@ def get_simcc_maximum(simcc_x: np.ndarray, locs = locs.reshape(n, k, 3) vals = vals.reshape(n, k) - return locs, vals \ No newline at end of file + return locs, vals From fc87ebd60ad6e364ad8d52cc44d4a735562864ba Mon Sep 17 00:00:00 2001 From: xiexinch Date: Sun, 28 Apr 2024 15:55:20 +0800 Subject: [PATCH 06/15] --update=refactor --- projects/rtmpose3d/configs/default_runtime.py | 54 ------- .../rtmw3d-l_8xb64_cocktail14-384x288.py | 137 ++++++++---------- ...y => rtmw3d-x_8xb32_cocktail14-384x288.py} | 136 ++++++++--------- .../{ => demo}/body3d_img2pose_demo.py | 2 +- .../rtmdet_m_640-8xb32_coco-person.py | 0 projects/rtmpose3d/rtmpose3d/__init__.py | 5 +- projects/rtmpose3d/rtmpose3d/loss.py | 2 +- .../rtmpose3d/rtmpose3d/pose_estimator.py | 17 ++- projects/rtmpose3d/rtmpose3d/rtmw3d_head.py | 11 +- .../rtmpose3d/rtmpose3d/simcc_3d_label.py | 135 ++++------------- 10 files changed, 177 insertions(+), 322 deletions(-) delete mode 100644 projects/rtmpose3d/configs/default_runtime.py rename projects/rtmpose3d/configs/{rtmw3d-x_8xb64_cocktail14-384x288.py => rtmw3d-x_8xb32_cocktail14-384x288.py} (92%) rename projects/rtmpose3d/{ => demo}/body3d_img2pose_demo.py (99%) rename projects/rtmpose3d/{configs => demo}/rtmdet_m_640-8xb32_coco-person.py (100%) diff --git a/projects/rtmpose3d/configs/default_runtime.py b/projects/rtmpose3d/configs/default_runtime.py deleted file mode 100644 index 6f27c0345a..0000000000 --- a/projects/rtmpose3d/configs/default_runtime.py +++ /dev/null @@ -1,54 +0,0 @@ -default_scope = 'mmpose' - -# hooks -default_hooks = dict( - timer=dict(type='IterTimerHook'), - logger=dict(type='LoggerHook', interval=50), - param_scheduler=dict(type='ParamSchedulerHook'), - checkpoint=dict(type='CheckpointHook', interval=10), - sampler_seed=dict(type='DistSamplerSeedHook'), - visualization=dict(type='PoseVisualizationHook', enable=False), - badcase=dict( - type='BadCaseAnalysisHook', - enable=False, - out_dir='badcase', - metric_type='loss', - badcase_thr=5)) - -# custom hooks -custom_hooks = [ - # Synchronize model buffers such as running_mean and running_var in BN - # at the end of each epoch - dict(type='SyncBuffersHook') -] - -# multi-processing backend -env_cfg = dict( - cudnn_benchmark=False, - mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), - dist_cfg=dict(backend='nccl'), -) - -# visualizer -vis_backends = [ - dict(type='LocalVisBackend'), - # dict(type='TensorboardVisBackend'), - # dict(type='WandbVisBackend'), -] -visualizer = dict( - type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer') - -# logger -log_processor = dict( - type='LogProcessor', window_size=50, by_epoch=True, num_digits=6) -log_level = 'INFO' -load_from = None -resume = False - -# file I/O backend -backend_args = dict(backend='local') - -# training/validation/testing progress -train_cfg = dict(by_epoch=True) -val_cfg = dict() -test_cfg = dict() diff --git a/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py b/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py index 950216c9bd..460ae6300d 100644 --- a/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py +++ b/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py @@ -1,4 +1,6 @@ -_base_ = 
['./default_runtime.py'] +_base_ = ['mmpose::_base_/default_runtime.py'] + +custom_imports = dict(imports=['rtmpose3d'], allow_failed_imports=False) vis_backends = [ dict(type='LocalVisBackend'), @@ -105,7 +107,7 @@ pos_enc=False), loss=[ dict( - type='KLDiscretLoss2', + type='KLDiscretLossWithWeight', use_target_weight=True, beta=10., label_softmax=True), @@ -195,27 +197,6 @@ dict(type='PackPoseInputs') ] -# h3wb dataset -h3wb_dataset = dict( - type='H36MWholeBodyDataset', - ann_file='annotation_body3d/h3wb_train_bbox.npz', - seq_len=1, - causal=True, - data_root='data/h36m/', - data_prefix=dict(img='images/'), - test_mode=False, - pipeline=[]) - -# dna rendering dataset -dna_rendering_dataset = dict( - type='DNARenderingDataset', - data_root='data/dna_rendering_part1', - data_mode='topdown', - ann_file='instances.npz', - subset_frac=0.1, - pipeline=[dict(type='LoadMask', backend_args=backend_args)], -) - # mapping aic_coco133 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7), (5, 9), (6, 12), @@ -604,14 +585,23 @@ pipeline=[]) ubody_datasets.append(ubody) +# h3wb dataset +h3wb_dataset = dict( + type='H36MWholeBodyDataset', + ann_file='annotation_body3d/h3wb_train_bbox.npz', + seq_len=1, + causal=True, + data_root='data/h36m/', + data_prefix=dict(img='images/'), + test_mode=False, + pipeline=[]) + train_datasets = [ dataset_wb, dataset_body, dataset_face, - # dataset_hand, *ubody_datasets, h3wb_dataset, - # dna_rendering_dataset ] # data loaders @@ -626,20 +616,7 @@ pipeline=train_pipeline, metainfo=dict(from_file='configs/_base_/datasets/h3wb.py'), test_mode=False)) - -# hooks -default_hooks = dict( - checkpoint=dict( - type='CheckpointHook', - save_best='MPJPE', - rule='less', - max_keep_ckpts=1)) - # hooks -# default_hooks = dict( -# checkpoint=dict( -# save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) - custom_hooks = [ dict( type='EMAHook', @@ -653,7 +630,39 @@ switch_pipeline=train_pipeline_stage2) ] +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + save_best='MPJPE', + rule='less', + max_keep_ckpts=1)) + # eval h3wb +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='H36MWholeBodyDataset', + ann_file='annotation_body3d/h3wb_train_bbox.npz', + seq_len=1, + causal=True, + data_root='data/h36m/', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline)) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='SimpleMPJPE', mode='mpjpe'), + dict(type='SimpleMPJPE', mode='p-mpjpe') +] +test_evaluator = val_evaluator + +# eval coco # val_dataloader = dict( # batch_size=64, # num_workers=10, @@ -661,45 +670,25 @@ # drop_last=False, # sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), # dataset=dict( -# type='H36MWholeBodyDataset', -# ann_file='annotation_body3d/h3wb_train_bbox.npz', -# seq_len=1, -# causal=True, -# data_root='data/h36m/', -# data_prefix=dict(img='images/'), +# type='CocoWholeBodyDataset', +# data_root='data/coco/', +# data_mode='topdown', +# ann_file='annotations/coco_wholebody_val_v1.0.json', +# data_prefix=dict(img='val2017/'), # test_mode=True, -# pipeline=val_pipeline)) +# bbox_file='data/coco/person_detection_results/' +# 'COCO_val2017_detections_AP_H_56_person.json', +# pipeline=val_pipeline, +# )) # test_dataloader = val_dataloader # # evaluators -# val_evaluator = [ -# dict(type='SimpleMPJPE', mode='mpjpe'), -# 
dict(type='SimpleMPJPE', mode='p-mpjpe') -# ] +# val_evaluator = dict( +# type='CocoWholeBodyMetric', +# ann_file='data/coco/' + 'annotations/coco_wholebody_val_v1.0.json') # test_evaluator = val_evaluator -# eval coco -val_dataloader = dict( - batch_size=64, - num_workers=10, - persistent_workers=True, - drop_last=False, - sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), - dataset=dict( - type='CocoWholeBodyDataset', - data_root='data/coco/', - data_mode='topdown', - ann_file='annotations/coco_wholebody_val_v1.0.json', - data_prefix=dict(img='val2017/'), - test_mode=True, - bbox_file='data/coco/person_detection_results/' - 'COCO_val2017_detections_AP_H_56_person.json', - pipeline=val_pipeline, - )) -test_dataloader = val_dataloader - -# evaluators -val_evaluator = dict( - type='CocoWholeBodyMetric', - ann_file='data/coco/' + 'annotations/coco_wholebody_val_v1.0.json') -test_evaluator = val_evaluator +# hooks +# default_hooks = dict( +# checkpoint=dict( +# save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) diff --git a/projects/rtmpose3d/configs/rtmw3d-x_8xb64_cocktail14-384x288.py b/projects/rtmpose3d/configs/rtmw3d-x_8xb32_cocktail14-384x288.py similarity index 92% rename from projects/rtmpose3d/configs/rtmw3d-x_8xb64_cocktail14-384x288.py rename to projects/rtmpose3d/configs/rtmw3d-x_8xb32_cocktail14-384x288.py index 2352072b62..c171835e0e 100644 --- a/projects/rtmpose3d/configs/rtmw3d-x_8xb64_cocktail14-384x288.py +++ b/projects/rtmpose3d/configs/rtmw3d-x_8xb32_cocktail14-384x288.py @@ -1,4 +1,6 @@ -_base_ = ['../../_base_/default_runtime.py'] +_base_ = ['mmpose::_base_/default_runtime.py'] + +custom_imports = dict(imports=['rtmpose3d'], allow_failed_imports=False) vis_backends = [ dict(type='LocalVisBackend'), @@ -105,7 +107,7 @@ pos_enc=False), loss=[ dict( - type='KLDiscretLoss2', + type='KLDiscretLossWithWeight', use_target_weight=True, beta=10., label_softmax=True), @@ -196,27 +198,6 @@ dict(type='PackPoseInputs') ] -# h3wb dataset -h3wb_dataset = dict( - type='H36MWholeBodyDataset', - ann_file='annotation_body3d/h3wb_train_bbox.npz', - seq_len=1, - causal=True, - data_root='data/h36m/', - data_prefix=dict(img='images/'), - test_mode=False, - pipeline=[]) - -# dna rendering dataset -dna_rendering_dataset = dict( - type='DNARenderingDataset', - data_root='data/dna_rendering_part1', - data_mode='topdown', - ann_file='instances.npz', - subset_frac=0.1, - pipeline=[dict(type='LoadMask', backend_args=backend_args)], -) - # mapping aic_coco133 = [(0, 6), (1, 8), (2, 10), (3, 5), (4, 7), (5, 9), (6, 12), @@ -605,14 +586,23 @@ pipeline=[]) ubody_datasets.append(ubody) +# h3wb dataset +h3wb_dataset = dict( + type='H36MWholeBodyDataset', + ann_file='annotation_body3d/h3wb_train_bbox.npz', + seq_len=1, + causal=True, + data_root='data/h36m/', + data_prefix=dict(img='images/'), + test_mode=False, + pipeline=[]) + train_datasets = [ dataset_wb, dataset_body, dataset_face, - dataset_hand, *ubody_datasets, h3wb_dataset, - # dna_rendering_dataset ] # data loaders @@ -629,18 +619,6 @@ test_mode=False)) # hooks -# default_hooks = dict( -# checkpoint=dict( -# type='CheckpointHook', -# save_best='MPJPE', -# rule='less', -# max_keep_ckpts=1)) - -# hooks -default_hooks = dict( - checkpoint=dict( - save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) - custom_hooks = [ dict( type='EMAHook', @@ -654,7 +632,39 @@ switch_pipeline=train_pipeline_stage2) ] +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + save_best='MPJPE', + rule='less', + 
max_keep_ckpts=1)) + # eval h3wb +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type='H36MWholeBodyDataset', + ann_file='annotation_body3d/h3wb_train_bbox.npz', + seq_len=1, + causal=True, + data_root='data/h36m/', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline)) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = [ + dict(type='SimpleMPJPE', mode='mpjpe'), + dict(type='SimpleMPJPE', mode='p-mpjpe') +] +test_evaluator = val_evaluator + +# eval coco # val_dataloader = dict( # batch_size=64, # num_workers=10, @@ -662,45 +672,25 @@ # drop_last=False, # sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), # dataset=dict( -# type='H36MWholeBodyDataset', -# ann_file='annotation_body3d/h3wb_train_bbox.npz', -# seq_len=1, -# causal=True, -# data_root='data/h36m/', -# data_prefix=dict(img='images/'), +# type='CocoWholeBodyDataset', +# data_root='data/coco/', +# data_mode='topdown', +# ann_file='annotations/coco_wholebody_val_v1.0.json', +# data_prefix=dict(img='val2017/'), # test_mode=True, -# pipeline=val_pipeline)) +# bbox_file='data/coco/person_detection_results/' +# 'COCO_val2017_detections_AP_H_56_person.json', +# pipeline=val_pipeline, +# )) # test_dataloader = val_dataloader # # evaluators -# val_evaluator = [ -# dict(type='SimpleMPJPE', mode='mpjpe'), -# dict(type='SimpleMPJPE', mode='p-mpjpe') -# ] +# val_evaluator = dict( +# type='CocoWholeBodyMetric', +# ann_file='data/coco/' + 'annotations/coco_wholebody_val_v1.0.json') # test_evaluator = val_evaluator -# eval coco -val_dataloader = dict( - batch_size=64, - num_workers=10, - persistent_workers=True, - drop_last=False, - sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), - dataset=dict( - type='CocoWholeBodyDataset', - data_root='data/coco/', - data_mode='topdown', - ann_file='annotations/coco_wholebody_val_v1.0.json', - data_prefix=dict(img='val2017/'), - test_mode=True, - bbox_file='data/coco/person_detection_results/' - 'COCO_val2017_detections_AP_H_56_person.json', - pipeline=val_pipeline, - )) -test_dataloader = val_dataloader - -# evaluators -val_evaluator = dict( - type='CocoWholeBodyMetric', - ann_file='data/coco/' + 'annotations/coco_wholebody_val_v1.0.json') -test_evaluator = val_evaluator +# hooks +# default_hooks = dict( +# checkpoint=dict( +# save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) diff --git a/projects/rtmpose3d/body3d_img2pose_demo.py b/projects/rtmpose3d/demo/body3d_img2pose_demo.py similarity index 99% rename from projects/rtmpose3d/body3d_img2pose_demo.py rename to projects/rtmpose3d/demo/body3d_img2pose_demo.py index 4a0e90040c..72f259414b 100644 --- a/projects/rtmpose3d/body3d_img2pose_demo.py +++ b/projects/rtmpose3d/demo/body3d_img2pose_demo.py @@ -283,7 +283,7 @@ def main(): det_dataset_link_color = pose_estimator.dataset_meta.get( 'skeleton_link_colors', None) - pose_estimator.cfg.model.test_cfg.mode = 'simcc' + pose_estimator.cfg.model.test_cfg.mode = 'vis' pose_estimator.cfg.visualizer.radius = args.radius pose_estimator.cfg.visualizer.line_width = args.thickness pose_estimator.cfg.visualizer.det_kpt_color = det_kpt_color diff --git a/projects/rtmpose3d/configs/rtmdet_m_640-8xb32_coco-person.py b/projects/rtmpose3d/demo/rtmdet_m_640-8xb32_coco-person.py similarity index 100% rename from projects/rtmpose3d/configs/rtmdet_m_640-8xb32_coco-person.py rename to 
projects/rtmpose3d/demo/rtmdet_m_640-8xb32_coco-person.py diff --git a/projects/rtmpose3d/rtmpose3d/__init__.py b/projects/rtmpose3d/rtmpose3d/__init__.py index 8bbd120d68..740b92d7cb 100644 --- a/projects/rtmpose3d/rtmpose3d/__init__.py +++ b/projects/rtmpose3d/rtmpose3d/__init__.py @@ -1,8 +1,9 @@ -from .loss import KLDiscretLoss2 +from .loss import KLDiscretLossWithWeight from .pose_estimator import TopdownPoseEstimator3D from .rtmw3d_head import RTMW3DHead from .simcc_3d_label import SimCC3DLabel __all__ = [ - 'TopdownPoseEstimator3D', 'RTMW3DHead', 'SimCC3DLabel', 'KLDiscretLoss2' + 'TopdownPoseEstimator3D', 'RTMW3DHead', 'SimCC3DLabel', + 'KLDiscretLossWithWeight' ] diff --git a/projects/rtmpose3d/rtmpose3d/loss.py b/projects/rtmpose3d/rtmpose3d/loss.py index 9869fd00ff..87289762d8 100644 --- a/projects/rtmpose3d/rtmpose3d/loss.py +++ b/projects/rtmpose3d/rtmpose3d/loss.py @@ -3,7 +3,7 @@ @MODELS.register_module() -class KLDiscretLoss2(KLDiscretLoss): +class KLDiscretLossWithWeight(KLDiscretLoss): def __init__(self, **kwargs): super().__init__(**kwargs) diff --git a/projects/rtmpose3d/rtmpose3d/pose_estimator.py b/projects/rtmpose3d/rtmpose3d/pose_estimator.py index 3ef7411738..438cd2b67b 100644 --- a/projects/rtmpose3d/rtmpose3d/pose_estimator.py +++ b/projects/rtmpose3d/rtmpose3d/pose_estimator.py @@ -13,6 +13,8 @@ class TopdownPoseEstimator3D(TopdownPoseEstimator): def __init__(self, **kwargs): super().__init__(**kwargs) + + # a default camera parameter for 3D pose estimation self.camera_param = { 'c': [512.54150496, 515.45148698], 'f': [1145.04940459, 1143.78109572], @@ -40,7 +42,7 @@ def add_pred_to_datasample(self, batch_pred_instances: InstanceList, output_keypoint_indices = self.test_cfg.get('output_keypoint_indices', None) mode = self.test_cfg.get('mode', '3d') - assert mode in ['2d', '3d', 'vis', 'simcc'] + assert mode in ['2d', '3d', 'vis'] for pred_instances, pred_fields, data_sample in zip_longest( batch_pred_instances, batch_pred_fields, batch_data_samples): @@ -51,11 +53,14 @@ def add_pred_to_datasample(self, batch_pred_instances: InstanceList, input_scale = data_sample.metainfo['input_scale'] input_size = data_sample.metainfo['input_size'] keypoints_3d = pred_instances.keypoints - keypoints_2d = pred_instances.keypoints_2d keypoints_simcc = pred_instances.keypoints_simcc + + # convert keypoints from input space to image space + keypoints_2d = keypoints_3d[..., :2].copy() keypoints_2d = keypoints_2d / input_size * input_scale \ + input_center - 0.5 * input_scale + # convert keypoints from image space to camera space if gt_instances.get('camera_params', None) is not None: camera_params = gt_instances.camera_params[0] f = np.array(camera_params['f']) @@ -63,7 +68,6 @@ def add_pred_to_datasample(self, batch_pred_instances: InstanceList, else: f = np.array([1145.04940459, 1143.78109572]) c = np.array(data_sample.ori_shape) - kpts_pixel = np.concatenate([ keypoints_2d, (keypoints_3d[..., 2] + gt_instances.root_z)[..., None] @@ -72,16 +76,17 @@ def add_pred_to_datasample(self, batch_pred_instances: InstanceList, kpts_cam = kpts_pixel.copy() kpts_cam[..., :2] = (kpts_pixel[..., :2] - c) / f * kpts_pixel[..., 2:] + if mode == '3d': + # Evaluation with 3D keypoint coordinates pred_instances.keypoints = kpts_cam pred_instances.transformed_keypoints = keypoints_2d elif mode == 'vis': - pred_instances.keypoints = keypoints_3d - pred_instances.transformed_keypoints = keypoints_2d - elif mode == 'simcc': + # Visualization with SimCC keypoints pred_instances.keypoints = 
keypoints_simcc pred_instances.transformed_keypoints = keypoints_2d else: + # Evaluation with 2D keypoint coordinates pred_instances.keypoints = keypoints_2d pred_instances.transformed_keypoints = keypoints_2d diff --git a/projects/rtmpose3d/rtmpose3d/rtmw3d_head.py b/projects/rtmpose3d/rtmpose3d/rtmw3d_head.py index c56db1c9b9..90e73b7255 100644 --- a/projects/rtmpose3d/rtmpose3d/rtmw3d_head.py +++ b/projects/rtmpose3d/rtmpose3d/rtmw3d_head.py @@ -232,23 +232,20 @@ def _pack_and_call(args, func): batch_output_np = to_numpy(batch_outputs, unzip=True) batch_keypoints = [] - batch_keypoints2d = [] batch_keypoints_simcc = [] batch_scores = [] for outputs in batch_output_np: - keypoints_2d, keypoints, keypoints_simcc, scores = _pack_and_call( + keypoints, keypoints_simcc, scores = _pack_and_call( outputs, self.decoder.decode) - batch_keypoints2d.append(keypoints_2d) batch_keypoints.append(keypoints) batch_keypoints_simcc.append(keypoints_simcc) batch_scores.append(scores) preds = [] - for keypoints_2d, keypoints, keypoints_simcc, scores in zip( - batch_keypoints2d, batch_keypoints, batch_keypoints_simcc, - batch_scores): + for keypoints, keypoints_simcc, scores in zip(batch_keypoints, + batch_keypoints_simcc, + batch_scores): pred = InstanceData( - keypoints_2d=keypoints_2d, keypoints=keypoints, keypoints_simcc=keypoints_simcc, keypoint_scores=scores) diff --git a/projects/rtmpose3d/rtmpose3d/simcc_3d_label.py b/projects/rtmpose3d/rtmpose3d/simcc_3d_label.py index 41f2064d15..ead72c5090 100644 --- a/projects/rtmpose3d/rtmpose3d/simcc_3d_label.py +++ b/projects/rtmpose3d/rtmpose3d/simcc_3d_label.py @@ -1,11 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. from itertools import product -from typing import Any, Optional, Tuple, Union +from typing import Optional, Tuple, Union import numpy as np from numpy import ndarray from mmpose.codecs.base import BaseKeypointCodec +from mmpose.codecs.utils.refinement import refine_simcc_dark from mmpose.registry import KEYPOINT_CODECS from .utils import get_simcc_maximum @@ -27,32 +28,23 @@ class SimCC3DLabel(BaseKeypointCodec): Encoded: - keypoint_x_labels (np.ndarray): The generated SimCC label for x-axis. - The label shape is (N, K, Wx) if ``smoothing_type=='gaussian'`` - and (N, K) if `smoothing_type=='standard'``, where - :math:`Wx=w*simcc_split_ratio` - keypoint_y_labels (np.ndarray): The generated SimCC label for y-axis. - The label shape is (N, K, Wy) if ``smoothing_type=='gaussian'`` - and (N, K) if `smoothing_type=='standard'``, where - :math:`Wy=h*simcc_split_ratio` + - keypoint_z_labels (np.ndarray): The generated SimCC label for z-axis. - keypoint_weights (np.ndarray): The target weights in shape (N, K) Args: input_size (tuple): Input image size in [w, h] - smoothing_type (str): The SimCC label smoothing strategy. Options are - ``'gaussian'`` and ``'standard'``. Defaults to ``'gaussian'`` sigma (float | int | tuple): The sigma value in the Gaussian SimCC label. Defaults to 6.0 simcc_split_ratio (float): The ratio of the label size to the input size. For example, if the input width is ``w``, the x label size will be :math:`w*simcc_split_ratio`. Defaults to 2.0 - label_smooth_weight (float): Label Smoothing weight. Defaults to 0.0 normalize (bool): Whether to normalize the heatmaps. Defaults to True. use_dark (bool): Whether to use the DARK post processing. Defaults to False. - decode_visibility (bool): Whether to decode the visibility. Defaults - to False. - decode_beta (float): The beta value for decoding visibility. Defaults - to 150.0. 
+ root_index (int | tuple): The index of the root keypoint. Defaults to + 0. + z_range (float): The range of the z-axis. Defaults to None. .. _`SimCC: a Simple Coordinate Classification Perspective for Human Pose Estimation`: https://arxiv.org/abs/2107.03332 @@ -79,51 +71,30 @@ class SimCC3DLabel(BaseKeypointCodec): def __init__(self, input_size: Tuple[int, int, int], - smoothing_type: str = 'gaussian', sigma: Union[float, int, Tuple[float]] = 6.0, simcc_split_ratio: float = 2.0, - label_smooth_weight: float = 0.0, normalize: bool = True, use_dark: bool = False, - decode_visibility: bool = False, - decode_beta: float = 150.0, root_index: Union[int, Tuple[int]] = 0, - z_range: Optional[int] = None, - sigmoid_z: bool = False) -> None: + z_range: Optional[int] = None) -> None: super().__init__() self.input_size = input_size - self.smoothing_type = smoothing_type self.simcc_split_ratio = simcc_split_ratio - self.label_smooth_weight = label_smooth_weight self.normalize = normalize self.use_dark = use_dark - self.decode_visibility = decode_visibility - self.decode_beta = decode_beta if isinstance(sigma, (float, int)): self.sigma = np.array([sigma, sigma, sigma]) else: self.sigma = np.array(sigma) - if self.smoothing_type not in {'gaussian', 'standard'}: - raise ValueError( - f'{self.__class__.__name__} got invalid `smoothing_type` value' - f'{self.smoothing_type}. Should be one of ' - '{"gaussian", "standard"}') - - if self.smoothing_type == 'gaussian' and self.label_smooth_weight > 0: - raise ValueError('Attribute `label_smooth_weight` is only ' - 'used for `standard` mode.') - - if self.label_smooth_weight < 0.0 or self.label_smooth_weight > 1.0: - raise ValueError('`label_smooth_weight` should be in range [0, 1]') - self.root_index = list(root_index) if isinstance( root_index, tuple) else [root_index] - self.z_range = z_range if z_range is not None else 2.1744869 - self.sigmoid_z = sigmoid_z + + # Mean value of the root z-axis of datasets self.root_z = [5.14388] + self.z_range = z_range if z_range is not None else 2.1744869 def encode(self, keypoints: np.ndarray, @@ -139,12 +110,8 @@ def encode(self, lifting_target = keypoints_3d.copy() root_z = keypoints_3d[..., self.root_index, 2].mean(1) keypoints_3d[..., 2] -= root_z - if self.sigmoid_z: - keypoints_z = (1 / (1 + np.exp(-(3 * keypoints_3d[..., 2]))) - ) * self.input_size[2] - else: - keypoints_z = (keypoints_3d[..., 2] / self.z_range + 1) * ( - self.input_size[2] / 2) + keypoints_z = (keypoints_3d[..., 2] / self.z_range + 1) * ( + self.input_size[2] / 2) keypoints_3d = np.concatenate([keypoints, keypoints_z[..., None]], axis=-1) @@ -161,6 +128,7 @@ def encode(self, x, y, z, keypoint_weights = self._generate_gaussian( keypoints, keypoints_visible) else: + # placeholder for empty keypoints x, y, z = np.zeros((3, 1), dtype=np.float32) keypoint_weights = np.ones((1, )) weight_z = np.zeros_like(keypoint_weights) @@ -199,20 +167,27 @@ def decode(self, x: np.ndarray, y: np.ndarray, z: np.ndarray): keypoints = keypoints[None, :] scores = scores[None, :] + if self.use_dark: + x_blur = int((self.sigma[0] * 20 - 7) // 3) + y_blur = int((self.sigma[1] * 20 - 7) // 3) + z_blur = int((self.sigma[2] * 20 - 7) // 3) + x_blur -= int((x_blur % 2) == 0) + y_blur -= int((y_blur % 2) == 0) + z_blur -= int((z_blur % 2) == 0) + keypoints[:, :, 0] = refine_simcc_dark(keypoints[:, :, 0], x, + x_blur) + keypoints[:, :, 1] = refine_simcc_dark(keypoints[:, :, 1], y, + y_blur) + keypoints[:, :, 2] = refine_simcc_dark(keypoints[:, :, 2], z, + z_blur) + keypoints /= 
self.simcc_split_ratio keypoints_simcc = keypoints.copy() - keypoints_2d = keypoints[..., :2] keypoints_z = keypoints[..., 2:3] - if self.sigmoid_z: - keypoints_z /= self.input_size[2] - keypoints_z[keypoints_z <= 0] = 1e-8 - scores[(keypoints_z <= 0).squeeze(-1)] = 0 - keypoints[..., 2:3] = np.log(keypoints_z / (1 - keypoints_z)) / 3 - else: - keypoints[..., - 2:3] = (keypoints_z / - (self.input_size[-1] / 2) - 1) * self.z_range - return keypoints_2d, keypoints, keypoints_simcc, scores + + keypoints[..., 2:3] = (keypoints_z / + (self.input_size[-1] / 2) - 1) * self.z_range + return keypoints, keypoints_simcc, scores def _map_coordinates( self, @@ -285,51 +260,3 @@ def _generate_gaussian( target_z /= norm_value[2] return target_x, target_y, target_z, keypoint_weights - - def _generate_standard( - self, - keypoints: np.ndarray, - keypoints_visible: Optional[np.ndarray] = None - ) -> tuple[ndarray, ndarray, ndarray, Any]: - """Encoding keypoints into SimCC labels with Standard Label Smoothing - strategy. - - Labels will be one-hot vectors if self.label_smooth_weight==0.0 - """ - - N, K, _ = keypoints.shape - w, h, d = self.input_size - w = np.around(w * self.simcc_split_ratio).astype(int) - h = np.around(h * self.simcc_split_ratio).astype(int) - d = np.around(d * self.simcc_split_ratio).astype(int) - - keypoints_split, keypoint_weights = self._map_coordinates( - keypoints, keypoints_visible) - - x = np.zeros((N, K, w), dtype=np.float32) - y = np.zeros((N, K, h), dtype=np.float32) - z = np.zeros((N, K, d), dtype=np.float32) - - for n, k in product(range(N), range(K)): - # skip unlabled keypoints - if keypoints_visible[n, k] < 0.5: - continue - - # get center coordinates - mu_x, mu_y, mu_z = keypoints_split[n, k].astype(np.int64) - - # detect abnormal coords and assign the weight 0 - if mu_x >= w or mu_y >= h or mu_x < 0 or mu_y < 0: - keypoint_weights[n, k] = 0 - continue - - if self.label_smooth_weight > 0: - x[n, k] = self.label_smooth_weight / (w - 1) - y[n, k] = self.label_smooth_weight / (h - 1) - z[n, k] = self.label_smooth_weight / (d - 1) - - x[n, k, mu_x] = 1.0 - self.label_smooth_weight - y[n, k, mu_y] = 1.0 - self.label_smooth_weight - z[n, k, mu_z] = 1.0 - self.label_smooth_weight - - return x, y, z, keypoint_weights From 88ad8e4f2fa6e2b5196b41852a5996602c4c4eb2 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Sun, 28 Apr 2024 16:27:13 +0800 Subject: [PATCH 07/15] --fix=fix 3d training --- .../datasets/base/base_coco_style_dataset.py | 2 + mmpose/datasets/datasets/body/mpii_dataset.py | 3 +- .../wholebody/coco_wholebody_dataset.py | 3 +- .../datasets/wholebody3d/h3wb_dataset.py | 41 ++++++++--- .../datasets/wholebody3d/ubody3d_dataset.py | 68 +++++++++++++------ 5 files changed, 86 insertions(+), 31 deletions(-) diff --git a/mmpose/datasets/datasets/base/base_coco_style_dataset.py b/mmpose/datasets/datasets/base/base_coco_style_dataset.py index ac94961f2c..9a223984e0 100644 --- a/mmpose/datasets/datasets/base/base_coco_style_dataset.py +++ b/mmpose/datasets/datasets/base/base_coco_style_dataset.py @@ -210,6 +210,8 @@ def load_data_list(self) -> List[dict]: data_list = self._get_bottomup_data_infos( instance_list, image_list) + if hasattr(self, 'coco'): + del self.coco return data_list def _load_annotations(self) -> Tuple[List[dict], List[dict]]: diff --git a/mmpose/datasets/datasets/body/mpii_dataset.py b/mmpose/datasets/datasets/body/mpii_dataset.py index 28d53bd8b8..d60338657e 100644 --- a/mmpose/datasets/datasets/body/mpii_dataset.py +++ 
b/mmpose/datasets/datasets/body/mpii_dataset.py @@ -221,5 +221,6 @@ def _load_annotations(self) -> Tuple[List[dict], List[dict]]: instance_list.append(instance_info) ann_id = ann_id + 1 - + del self.anns + self.coco = None return instance_list, image_list diff --git a/mmpose/datasets/datasets/wholebody/coco_wholebody_dataset.py b/mmpose/datasets/datasets/wholebody/coco_wholebody_dataset.py index b0e20e1335..2539a9817e 100644 --- a/mmpose/datasets/datasets/wholebody/coco_wholebody_dataset.py +++ b/mmpose/datasets/datasets/wholebody/coco_wholebody_dataset.py @@ -120,12 +120,13 @@ def parse_data_info(self, raw_data_info: dict) -> Optional[dict]: 'bbox_score': np.ones(1, dtype=np.float32), 'num_keypoints': num_keypoints, 'keypoints': keypoints, + 'keypoints_3d': None, 'keypoints_visible': keypoints_visible, 'iscrowd': ann['iscrowd'], 'segmentation': ann['segmentation'], 'area': area, 'id': ann['id'], - 'category_id': np.array(ann['category_id']), + 'category_id': ann['category_id'], # store the raw annotation of the instance # it is useful for evaluation without providing ann_file 'raw_ann_info': copy.deepcopy(ann), diff --git a/mmpose/datasets/datasets/wholebody3d/h3wb_dataset.py b/mmpose/datasets/datasets/wholebody3d/h3wb_dataset.py index 95e40db4b4..26ffa7d14f 100644 --- a/mmpose/datasets/datasets/wholebody3d/h3wb_dataset.py +++ b/mmpose/datasets/datasets/wholebody3d/h3wb_dataset.py @@ -106,6 +106,7 @@ def _load_ann_file(self, ann_file: str) -> dict: self.ann_data = data['train_data'].item() self.camera_data = data['metadata'].item() + self.bboxes = data['bbox'].item() def get_sequence_indices(self) -> List[List[int]]: return [] @@ -132,19 +133,26 @@ def _load_annotations(self) -> Tuple[List[dict], List[dict]]: 'K': camera_param['K'][0, :2, ...], 'R': camera_param['R'][0], 'T': camera_param['T'].reshape(3, 1), - 'Distortion': camera_param['Distortion'][0] + 'Distortion': camera_param['Distortion'][0], } + camera_param['f'] = (camera_param['K'][0, 0] * 1000, + camera_param['K'][1, 1] * 1000) + camera_param['c'] = (camera_param['K'][0, 2] * 1000, + camera_param['K'][1, 2] * 1000) seq_step = 1 _len = (self.seq_len - 1) * seq_step + 1 _indices = list( range(len(self.ann_data[subject][act]['frame_id']))) + seq_indices = [ _indices[i:(i + _len):seq_step] for i in list(range(0, len(_indices) - _len + 1)) ] + frames = self.ann_data[subject][act]['frame_id'] + for idx, frame_ids in enumerate(seq_indices): expected_num_frames = self.seq_len if self.multiple_target: @@ -163,6 +171,20 @@ def _load_annotations(self) -> Tuple[List[dict], List[dict]]: if self.multiple_target > 0: target_idx = list(range(self.multiple_target)) + bbox = self.bboxes[(subject, act, cam, + frames[frame_ids[-1]])] + bbox = np.array([[ + bbox['x_min'], bbox['y_min'], bbox['x_max'], + bbox['y_max'] + ]], + dtype=np.float32) + + img_path = f'{self.data_root}original/{subject}/Images/{act}.{cam}/frame_{frames[frame_ids[-1]]}.jpg' # noqa + img_paths = [ + f'{self.data_root}original/{subject}/Images/{act}.{cam}/frame_{frames[i]}.jpg' # noqa + for i in frame_ids + ] + instance_info = { 'num_keypoints': num_keypoints, @@ -174,6 +196,10 @@ def _load_annotations(self) -> Tuple[List[dict], List[dict]]: np.ones_like(_kpts_2d[..., 0], dtype=np.float32), 'keypoints_3d_visible': np.ones_like(_kpts_2d[..., 0], dtype=np.float32), + 'bbox': + bbox, + 'bbox_score': + np.ones((len(frame_ids), )), 'scale': np.zeros((1, 1), dtype=np.float32), 'center': @@ -186,12 +212,11 @@ def _load_annotations(self) -> Tuple[List[dict], List[dict]]: 1, 'iscrowd': 
0, - 'camera_param': - camera_param, - 'img_paths': [ - f'{subject}/{act}/{cam}/{i:06d}.jpg' - for i in frame_ids - ], + 'camera_param': [camera_param], + 'img_paths': + img_paths, + 'img_path': + img_path, 'img_ids': frame_ids, 'lifting_target': @@ -209,5 +234,5 @@ def _load_annotations(self) -> Tuple[List[dict], List[dict]]: image_list.append(img_info) instance_id += 1 - + del self.ann_data return instance_list, image_list diff --git a/mmpose/datasets/datasets/wholebody3d/ubody3d_dataset.py b/mmpose/datasets/datasets/wholebody3d/ubody3d_dataset.py index 85b8d893e7..5e6e564168 100644 --- a/mmpose/datasets/datasets/wholebody3d/ubody3d_dataset.py +++ b/mmpose/datasets/datasets/wholebody3d/ubody3d_dataset.py @@ -84,7 +84,7 @@ def __init__(self, super().__init__(multiple_target=multiple_target, **kwargs) - METAINFO: dict = dict(from_file='configs/_base_/datasets/ubody3d.py') + METAINFO: dict = dict(from_file='configs/_base_/datasets/h3wb.py') def _load_ann_file(self, ann_file: str) -> dict: """Load annotation file.""" @@ -167,7 +167,7 @@ def _parse_image_name(self, image_path: str) -> Tuple[str, int]: def _load_annotations(self): """Load data from annotations in COCO format.""" - num_keypoints = self.metainfo['num_keypoints'] + num_keypoints = 133 self._metainfo['CLASSES'] = self.ann_data.loadCats( self.ann_data.getCatIds()) @@ -184,23 +184,37 @@ def _load_annotations(self): f'got {len(_ann_ids)} ') anns = self.ann_data.loadAnns(_ann_ids) + num_anns = len(anns) img_ids = [] - kpts = np.zeros((len(anns), num_keypoints, 2), dtype=np.float32) - kpts_3d = np.zeros((len(anns), num_keypoints, 3), dtype=np.float32) - keypoints_visible = np.zeros((len(anns), num_keypoints, 1), + kpts = np.zeros((num_anns, num_keypoints, 2), dtype=np.float32) + kpts_3d = np.zeros((num_anns, num_keypoints, 3), dtype=np.float32) + keypoints_visible = np.zeros((num_anns, num_keypoints), dtype=np.float32) + scales = np.zeros((num_anns, 2), dtype=np.float32) + centers = np.zeros((num_anns, 2), dtype=np.float32) + bboxes = np.zeros((num_anns, 4), dtype=np.float32) + bbox_scores = np.zeros((num_anns, ), dtype=np.float32) + bbox_scales = np.zeros((num_anns, 2), dtype=np.float32) + for j, ann in enumerate(anns): img_ids.append(ann['image_id']) kpts[j] = np.array(ann['keypoints'], dtype=np.float32) kpts_3d[j] = np.array(ann['keypoints_3d'], dtype=np.float32) keypoints_visible[j] = np.array( ann['keypoints_valid'], dtype=np.float32) + if 'scale' in ann: + scales[j] = np.array(ann['scale']) + if 'center' in ann: + centers[j] = np.array(ann['center']) + bboxes[j] = np.array(ann['bbox'], dtype=np.float32) + bbox_scores[j] = np.array([1], dtype=np.float32) + bbox_scales[j] = np.array([1, 1], dtype=np.float32) + imgs = self.ann_data.loadImgs(img_ids) - keypoints_visible = keypoints_visible.squeeze(-1) - scales = np.zeros(len(imgs), dtype=np.float32) - centers = np.zeros((len(imgs), 2), dtype=np.float32) - img_paths = np.array([img['file_name'] for img in imgs]) + img_paths = np.array([ + f'{self.data_root}/images/' + img['file_name'] for img in imgs + ]) factors = np.zeros((kpts_3d.shape[0], ), dtype=np.float32) target_idx = [-1] if self.causal else [int(self.seq_len // 2)] @@ -212,6 +226,8 @@ def _load_annotations(self): cam_param['w'] = 1000 cam_param['h'] = 1000 + cam_param = {'f': cam_param['focal'], 'c': cam_param['princpt']} + instance_info = { 'num_keypoints': num_keypoints, 'keypoints': kpts, @@ -223,25 +239,35 @@ def _load_annotations(self): 'category_id': 1, 'iscrowd': 0, 'img_paths': list(img_paths), + 'img_path': 
img_paths[-1], 'img_ids': [img['id'] for img in imgs], 'lifting_target': kpts_3d[target_idx], 'lifting_target_visible': keypoints_visible[target_idx], - 'target_img_paths': img_paths[target_idx], - 'camera_param': cam_param, + 'target_img_paths': list(img_paths[target_idx]), + 'camera_param': [cam_param], 'factor': factors, 'target_idx': target_idx, + 'bbox': bboxes, + 'bbox_scales': bbox_scales, + 'bbox_scores': bbox_scores } instance_list.append(instance_info) - for img_id in self.ann_data.getImgIds(): - img = self.ann_data.loadImgs(img_id)[0] - img.update({ - 'img_id': - img_id, - 'img_path': - osp.join(self.data_prefix['img'], img['file_name']), - }) - image_list.append(img) - + if self.data_mode == 'bottomup': + for img_id in self.ann_data.getImgIds(): + img = self.ann_data.loadImgs(img_id)[0] + img.update({ + 'img_id': + img_id, + 'img_path': + osp.join(self.data_prefix['img'], img['file_name']), + }) + image_list.append(img) + del self.ann_data return instance_list, image_list + + def load_data_list(self) -> List[dict]: + data_list = super().load_data_list() + self.ann_data = None + return data_list From 50af983cc9ae8f54763a03588dde3cdf3f52dbb9 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Sun, 28 Apr 2024 16:43:18 +0800 Subject: [PATCH 08/15] --fix=fix configs --- configs/_base_/datasets/h3wb.py | 256 +++++++++--------- .../rtmw3d-l_8xb64_cocktail14-384x288.py | 15 +- .../rtmw3d-x_8xb32_cocktail14-384x288.py | 15 +- 3 files changed, 138 insertions(+), 148 deletions(-) diff --git a/configs/_base_/datasets/h3wb.py b/configs/_base_/datasets/h3wb.py index bb47a1b3f5..5a341cebda 100644 --- a/configs/_base_/datasets/h3wb.py +++ b/configs/_base_/datasets/h3wb.py @@ -168,433 +168,433 @@ dict( name='face-0', id=23, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-16'), 24: dict( name='face-1', id=24, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-15'), 25: dict( name='face-2', id=25, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-14'), 26: dict( name='face-3', id=26, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-13'), 27: dict( name='face-4', id=27, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-12'), 28: dict( name='face-5', id=28, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-11'), 29: dict( name='face-6', id=29, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-10'), 30: dict( name='face-7', id=30, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-9'), 31: - dict(name='face-8', id=31, color=[255, 255, 255], type='', swap=''), + dict(name='face-8', id=31, color=[247, 34, 5], type='upper', swap=''), 32: dict( name='face-9', id=32, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-7'), 33: dict( name='face-10', id=33, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-6'), 34: dict( name='face-11', id=34, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-5'), 35: dict( name='face-12', id=35, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-4'), 36: dict( name='face-13', id=36, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-3'), 37: dict( name='face-14', id=37, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + 
type='upper', swap='face-2'), 38: dict( name='face-15', id=38, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-1'), 39: dict( name='face-16', id=39, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-0'), 40: dict( name='face-17', id=40, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-26'), 41: dict( name='face-18', id=41, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-25'), 42: dict( name='face-19', id=42, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-24'), 43: dict( name='face-20', id=43, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-23'), 44: dict( name='face-21', id=44, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-22'), 45: dict( name='face-22', id=45, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-21'), 46: dict( name='face-23', id=46, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-20'), 47: dict( name='face-24', id=47, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-19'), 48: dict( name='face-25', id=48, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-18'), 49: dict( name='face-26', id=49, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-17'), 50: - dict(name='face-27', id=50, color=[255, 255, 255], type='', swap=''), + dict(name='face-27', id=50, color=[247, 34, 5], type='upper', swap=''), 51: - dict(name='face-28', id=51, color=[255, 255, 255], type='', swap=''), + dict(name='face-28', id=51, color=[247, 34, 5], type='upper', swap=''), 52: - dict(name='face-29', id=52, color=[255, 255, 255], type='', swap=''), + dict(name='face-29', id=52, color=[247, 34, 5], type='upper', swap=''), 53: - dict(name='face-30', id=53, color=[255, 255, 255], type='', swap=''), + dict(name='face-30', id=53, color=[247, 34, 5], type='upper', swap=''), 54: dict( name='face-31', id=54, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-35'), 55: dict( name='face-32', id=55, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-34'), 56: - dict(name='face-33', id=56, color=[255, 255, 255], type='', swap=''), + dict(name='face-33', id=56, color=[247, 34, 5], type='upper', swap=''), 57: dict( name='face-34', id=57, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-32'), 58: dict( name='face-35', id=58, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-31'), 59: dict( name='face-36', id=59, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-45'), 60: dict( name='face-37', id=60, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-44'), 61: dict( name='face-38', id=61, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-43'), 62: dict( name='face-39', id=62, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-42'), 63: dict( name='face-40', id=63, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-47'), 64: dict( name='face-41', id=64, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-46'), 65: dict( name='face-42', id=65, - color=[255, 
255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-39'), 66: dict( name='face-43', id=66, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-38'), 67: dict( name='face-44', id=67, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-37'), 68: dict( name='face-45', id=68, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-36'), 69: dict( name='face-46', id=69, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-41'), 70: dict( name='face-47', id=70, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-40'), 71: dict( name='face-48', id=71, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-54'), 72: dict( name='face-49', id=72, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-53'), 73: dict( name='face-50', id=73, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-52'), 74: - dict(name='face-51', id=74, color=[255, 255, 255], type='', swap=''), + dict(name='face-51', id=74, color=[247, 34, 5], type='upper', swap=''), 75: dict( name='face-52', id=75, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-50'), 76: dict( name='face-53', id=76, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-49'), 77: dict( name='face-54', id=77, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-48'), 78: dict( name='face-55', id=78, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-59'), 79: dict( name='face-56', id=79, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-58'), 80: - dict(name='face-57', id=80, color=[255, 255, 255], type='', swap=''), + dict(name='face-57', id=80, color=[247, 34, 5], type='upper', swap=''), 81: dict( name='face-58', id=81, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-56'), 82: dict( name='face-59', id=82, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-55'), 83: dict( name='face-60', id=83, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-64'), 84: dict( name='face-61', id=84, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-63'), 85: - dict(name='face-62', id=85, color=[255, 255, 255], type='', swap=''), + dict(name='face-62', id=85, color=[247, 34, 5], type='upper', swap=''), 86: dict( name='face-63', id=86, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-61'), 87: dict( name='face-64', id=87, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-60'), 88: dict( name='face-65', id=88, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-67'), 89: - dict(name='face-66', id=89, color=[255, 255, 255], type='', swap=''), + dict(name='face-66', id=89, color=[247, 34, 5], type='upper', swap=''), 90: dict( name='face-67', id=90, - color=[255, 255, 255], - type='', + color=[247, 34, 5], + type='upper', swap='face-65'), 91: dict( name='left_hand_root', id=91, - color=[255, 255, 255], + color=[247, 34, 5], type='', swap='right_hand_root'), 92: @@ -741,7 +741,7 @@ dict( name='right_hand_root', id=112, - color=[255, 255, 255], + color=[247, 34, 5], type='', swap='left_hand_root'), 113: diff 
--git a/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py b/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py index 460ae6300d..997d43a6fa 100644 --- a/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py +++ b/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py @@ -139,11 +139,6 @@ # pipelines train_pipeline = [ dict(type='LoadImage', backend_args=backend_args), - dict( - type='RandomBackground', - bg_dir='/mnt/data/oss_beijing/mmseg/obj365v1_images', - bg_prob=0.5, - ), dict(type='GetBBoxCenterScale'), dict(type='RandomFlip', direction='horizontal'), dict(type='RandomHalfBody'), @@ -467,7 +462,7 @@ dataset_wb = dict( type='CombinedDataset', - metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + metainfo=dict(from_file='mmpose::_base_/datasets/coco_wholebody.py'), datasets=[dataset_coco, dataset_halpe], pipeline=[], test_mode=False, @@ -475,7 +470,7 @@ dataset_body = dict( type='CombinedDataset', - metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + metainfo=dict(from_file='mmpose::_base_/datasets/coco_wholebody.py'), datasets=[ dataset_aic, dataset_crowdpose, @@ -490,7 +485,7 @@ dataset_face = dict( type='CombinedDataset', - metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + metainfo=dict(from_file='mmpose::_base_/datasets/coco_wholebody.py'), datasets=[ dataset_wflw, dataset_300w, @@ -558,7 +553,7 @@ dataset_hand = dict( type='CombinedDataset', - metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + metainfo=dict(from_file='mmpose::_base_/datasets/coco_wholebody.py'), datasets=[dataset_interhand3d], pipeline=[], test_mode=False, @@ -614,7 +609,7 @@ type='CombinedDataset', datasets=train_datasets, pipeline=train_pipeline, - metainfo=dict(from_file='configs/_base_/datasets/h3wb.py'), + metainfo=dict(from_file='mmpose::_base_/datasets/h3wb.py'), test_mode=False)) # hooks custom_hooks = [ diff --git a/projects/rtmpose3d/configs/rtmw3d-x_8xb32_cocktail14-384x288.py b/projects/rtmpose3d/configs/rtmw3d-x_8xb32_cocktail14-384x288.py index c171835e0e..0126290826 100644 --- a/projects/rtmpose3d/configs/rtmw3d-x_8xb32_cocktail14-384x288.py +++ b/projects/rtmpose3d/configs/rtmw3d-x_8xb32_cocktail14-384x288.py @@ -140,11 +140,6 @@ # pipelines train_pipeline = [ dict(type='LoadImage', backend_args=backend_args), - dict( - type='RandomBackground', - bg_dir='/mnt/data/oss_beijing/mmseg/obj365v1_images', - bg_prob=0.5, - ), dict(type='GetBBoxCenterScale'), dict(type='RandomFlip', direction='horizontal'), dict(type='RandomHalfBody'), @@ -468,7 +463,7 @@ dataset_wb = dict( type='CombinedDataset', - metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + metainfo=dict(from_file='mmpose::_base_/datasets/coco_wholebody.py'), datasets=[dataset_coco, dataset_halpe], pipeline=[], test_mode=False, @@ -476,7 +471,7 @@ dataset_body = dict( type='CombinedDataset', - metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + metainfo=dict(from_file='mmpose::_base_/datasets/coco_wholebody.py'), datasets=[ dataset_aic, dataset_crowdpose, @@ -491,7 +486,7 @@ dataset_face = dict( type='CombinedDataset', - metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + metainfo=dict(from_file='mmpose::_base_/datasets/coco_wholebody.py'), datasets=[ dataset_wflw, dataset_300w, @@ -559,7 +554,7 @@ dataset_hand = dict( type='CombinedDataset', - metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'), + 
metainfo=dict(from_file='mmpose::_base_/datasets/coco_wholebody.py'), datasets=[dataset_interhand3d], pipeline=[], test_mode=False, @@ -615,7 +610,7 @@ type='CombinedDataset', datasets=train_datasets, pipeline=train_pipeline, - metainfo=dict(from_file='configs/_base_/datasets/h3wb.py'), + metainfo=dict(from_file='mmpose::_base_/datasets/h3wb.py'), test_mode=False)) # hooks From 907d1cbb6cd55b6ad5647c7c439b3c869ed334c5 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Sun, 28 Apr 2024 17:00:18 +0800 Subject: [PATCH 09/15] --fix=fix pretrain link --- projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py | 2 +- projects/rtmpose3d/configs/rtmw3d-x_8xb32_cocktail14-384x288.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py b/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py index 997d43a6fa..8511a4a039 100644 --- a/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py +++ b/projects/rtmpose3d/configs/rtmw3d-l_8xb64_cocktail14-384x288.py @@ -55,7 +55,7 @@ use_dark=False, root_index=(11, 12)) -backbone_path = 'checkpoints/rtmpose-l_simcc-ucoco_dw-ucoco_270e-256x192-4d6dfc62_20230728.pth' # noqa +backbone_path = 'https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-l_simcc-ucoco_dw-ucoco_270e-256x192-4d6dfc62_20230728.pth' # noqa # model settings model = dict( diff --git a/projects/rtmpose3d/configs/rtmw3d-x_8xb32_cocktail14-384x288.py b/projects/rtmpose3d/configs/rtmw3d-x_8xb32_cocktail14-384x288.py index 0126290826..61f93f108a 100644 --- a/projects/rtmpose3d/configs/rtmw3d-x_8xb32_cocktail14-384x288.py +++ b/projects/rtmpose3d/configs/rtmw3d-x_8xb32_cocktail14-384x288.py @@ -55,7 +55,7 @@ use_dark=False, root_index=(11, 12)) -backbone_path = 'checkpoints/rtmpose-x_simcc-ucoco_pt-aic-coco_270e-384x288-f5b50679_20230822.pth' # noqa +backbone_path = 'https://download.openmmlab.com/mmpose/v1/wholebody_2d_keypoint/rtmpose/ubody/rtmpose-x_simcc-ucoco_pt-aic-coco_270e-384x288-f5b50679_20230822.pth' # noqa # model settings model = dict( From cf2420296c587945d30603736f2344bb9f87a672 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Sun, 28 Apr 2024 17:37:54 +0800 Subject: [PATCH 10/15] --fix=fix loss name --- mmpose/models/losses/regression_loss.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/mmpose/models/losses/regression_loss.py b/mmpose/models/losses/regression_loss.py index 948d65bae7..83d03625b1 100644 --- a/mmpose/models/losses/regression_loss.py +++ b/mmpose/models/losses/regression_loss.py @@ -573,7 +573,11 @@ class BoneLoss(nn.Module): loss_weight (float): Weight of the loss. Default: 1.0. """ - def __init__(self, joint_parents, use_target_weight=False, loss_weight=1.): + def __init__(self, + joint_parents, + use_target_weight: bool = False, + loss_weight: float = 1., + loss_name: str = 'loss_bone'): super().__init__() self.joint_parents = joint_parents self.use_target_weight = use_target_weight @@ -584,6 +588,8 @@ def __init__(self, joint_parents, use_target_weight=False, loss_weight=1.): if i != self.joint_parents[i]: self.non_root_indices.append(i) + self._loss_name = loss_name + def forward(self, output, target, target_weight=None): """Forward function. 
@@ -606,6 +612,7 @@ def forward(self, output, target, target_weight=None): dim=-1)[:, self.non_root_indices] if self.use_target_weight: assert target_weight is not None + target_weight = target_weight[:, self.non_root_indices] loss = torch.mean( torch.abs((output_bone * target_weight).mean(dim=0) - (target_bone * target_weight).mean(dim=0))) @@ -615,6 +622,15 @@ def forward(self, output, target, target_weight=None): return loss * self.loss_weight + @property + def loss_name(self): + """Loss Name. + + Returns: + str: The name of this loss item. + """ + return self._loss_name + @MODELS.register_module() class SemiSupervisionLoss(nn.Module): From e0ebd1d6a9c8ab48d3bdec8257a5501f55129d47 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Mon, 6 May 2024 10:17:32 +0800 Subject: [PATCH 11/15] --doc=update readme --- projects/rtmpose3d/README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/projects/rtmpose3d/README.md b/projects/rtmpose3d/README.md index 4f75c44fe2..b680f2a6b7 100644 --- a/projects/rtmpose3d/README.md +++ b/projects/rtmpose3d/README.md @@ -4,9 +4,16 @@ RTMPose3D is a toolkit for real-time 3D pose estimation. It is based on the RTMPose model, which is a 2D pose estimation model that is capable of predicting 2D keypoints and body part associations in real-time. RTMPose3D extends RTMPose by adding a 3D pose estimation branch that can predict 3D keypoints from images directly. +## 🗂️ Model Zoo + +| Model | AP on COCO-Wholebody | MPJPE on H3WB | Download | +| :--------------------------------------------------------- | :------------------: | :-----------: | :-----------------------------------------------------------------------------------------------------------: | +| [RTMW3D-L](./configs/rtmw3d-l_8xb64_cocktail14-384x288.py) | 0.678 | 0.052 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_coco-640x640-8db55a59_20231211.pth) | +| [RTMW3D-X](./configs/rtmw3d-x_8xb32_cocktail14-384x288.py) | 0.687 | 0.056 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_coco-640x640-8db55a59_20231211.pth) | + ## Usage -👉🏼 TRY RTMO NOW +👉🏼 TRY RTMPose3D NOW ```bash cd /path/to/mmpose/projects/rtmpose3d From eb42de036e76ce2b1dbeb12d81f6573521e21790 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Wed, 26 Jun 2024 14:12:15 +0800 Subject: [PATCH 12/15] --update=make path more robust --- mmpose/datasets/datasets/wholebody3d/h3wb_dataset.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mmpose/datasets/datasets/wholebody3d/h3wb_dataset.py b/mmpose/datasets/datasets/wholebody3d/h3wb_dataset.py index 26ffa7d14f..f6df56c1b8 100644 --- a/mmpose/datasets/datasets/wholebody3d/h3wb_dataset.py +++ b/mmpose/datasets/datasets/wholebody3d/h3wb_dataset.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+import os.path as osp from typing import List, Tuple import numpy as np @@ -179,9 +180,10 @@ def _load_annotations(self) -> Tuple[List[dict], List[dict]]: ]], dtype=np.float32) - img_path = f'{self.data_root}original/{subject}/Images/{act}.{cam}/frame_{frames[frame_ids[-1]]}.jpg' # noqa img_paths = [ - f'{self.data_root}original/{subject}/Images/{act}.{cam}/frame_{frames[i]}.jpg' # noqa + osp.join(self.data_root, 'original', subject, + 'Images', f'{act}.{cam}', + f'frame_{frames[i]}.jpg') # noqa for i in frame_ids ] @@ -216,7 +218,7 @@ def _load_annotations(self) -> Tuple[List[dict], List[dict]]: 'img_paths': img_paths, 'img_path': - img_path, + img_paths[-1], 'img_ids': frame_ids, 'lifting_target': From e35ff2376286e999c0efa4eb1907a6fa7d87bb1d Mon Sep 17 00:00:00 2001 From: xiexinch Date: Wed, 26 Jun 2024 21:22:42 +0800 Subject: [PATCH 13/15] --update=update results --- projects/rtmpose3d/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/rtmpose3d/README.md b/projects/rtmpose3d/README.md index b680f2a6b7..d73e710dc0 100644 --- a/projects/rtmpose3d/README.md +++ b/projects/rtmpose3d/README.md @@ -9,7 +9,7 @@ RTMPose3D is a toolkit for real-time 3D pose estimation. It is based on the RTMP | Model | AP on COCO-Wholebody | MPJPE on H3WB | Download | | :--------------------------------------------------------- | :------------------: | :-----------: | :-----------------------------------------------------------------------------------------------------------: | | [RTMW3D-L](./configs/rtmw3d-l_8xb64_cocktail14-384x288.py) | 0.678 | 0.052 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_coco-640x640-8db55a59_20231211.pth) | -| [RTMW3D-X](./configs/rtmw3d-x_8xb32_cocktail14-384x288.py) | 0.687 | 0.056 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_coco-640x640-8db55a59_20231211.pth) | +| [RTMW3D-X](./configs/rtmw3d-x_8xb32_cocktail14-384x288.py) | 0.680 | 0.052 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_coco-640x640-8db55a59_20231211.pth) | ## Usage From 755e515203047d14f6a265eef187f7b10c4c0e35 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Thu, 27 Jun 2024 11:21:05 +0800 Subject: [PATCH 14/15] --fix=fix lint --- projects/rtmpose3d/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/rtmpose3d/README.md b/projects/rtmpose3d/README.md index d73e710dc0..8c9c7542f8 100644 --- a/projects/rtmpose3d/README.md +++ b/projects/rtmpose3d/README.md @@ -9,7 +9,7 @@ RTMPose3D is a toolkit for real-time 3D pose estimation. 
It is based on the RTMP | Model | AP on COCO-Wholebody | MPJPE on H3WB | Download | | :--------------------------------------------------------- | :------------------: | :-----------: | :-----------------------------------------------------------------------------------------------------------: | | [RTMW3D-L](./configs/rtmw3d-l_8xb64_cocktail14-384x288.py) | 0.678 | 0.052 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_coco-640x640-8db55a59_20231211.pth) | -| [RTMW3D-X](./configs/rtmw3d-x_8xb32_cocktail14-384x288.py) | 0.680 | 0.052 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_coco-640x640-8db55a59_20231211.pth) | +| [RTMW3D-X](./configs/rtmw3d-x_8xb32_cocktail14-384x288.py) | 0.680 | 0.052 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_coco-640x640-8db55a59_20231211.pth) | ## Usage From 0f8da98e509bd92112477b86117a69356184f102 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Fri, 12 Jul 2024 10:43:58 +0800 Subject: [PATCH 15/15] --update=add ckpt links --- projects/rtmpose3d/README.md | 7 +++++-- .../rtmpose3d/configs/rtmw3d-x_8xb32_cocktail14-384x288.py | 5 ++--- projects/rtmpose3d/rtmpose3d/pose_estimator.py | 2 +- projects/rtmpose3d/rtmpose3d/simcc_3d_label.py | 3 +++ 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/projects/rtmpose3d/README.md b/projects/rtmpose3d/README.md index 8c9c7542f8..b8cd0adcda 100644 --- a/projects/rtmpose3d/README.md +++ b/projects/rtmpose3d/README.md @@ -4,12 +4,14 @@ RTMPose3D is a toolkit for real-time 3D pose estimation. It is based on the RTMPose model, which is a 2D pose estimation model that is capable of predicting 2D keypoints and body part associations in real-time. RTMPose3D extends RTMPose by adding a 3D pose estimation branch that can predict 3D keypoints from images directly. +Please refer to our [technical report](https://arxiv.org/pdf/2407.08634) for more details. + ## 🗂️ Model Zoo | Model | AP on COCO-Wholebody | MPJPE on H3WB | Download | | :--------------------------------------------------------- | :------------------: | :-----------: | :-----------------------------------------------------------------------------------------------------------: | -| [RTMW3D-L](./configs/rtmw3d-l_8xb64_cocktail14-384x288.py) | 0.678 | 0.052 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_coco-640x640-8db55a59_20231211.pth) | -| [RTMW3D-X](./configs/rtmw3d-x_8xb32_cocktail14-384x288.py) | 0.680 | 0.052 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmo/rtmo-s_8xb32-600e_coco-640x640-8db55a59_20231211.pth) | +| [RTMW3D-L](./configs/rtmw3d-l_8xb64_cocktail14-384x288.py) | 0.678 | 0.056 | [ckpt](https://download.openmmlab.com/mmpose/v1/wholebody_3d_keypoint/rtmw3d/rtmw3d-l_8xb64_cocktail14-384x288-794dbc78_20240626.pth) | +| [RTMW3D-X](./configs/rtmw3d-x_8xb32_cocktail14-384x288.py) | 0.680 | 0.057 | [ckpt](https://download.openmmlab.com/mmpose/v1/wholebody_3d_keypoint/rtmw3d/rtmw3d-x_8xb64_cocktail14-384x288-b0a0eab7_20240626.pth) | ## Usage @@ -17,5 +19,6 @@ RTMPose3D is a toolkit for real-time 3D pose estimation. 
It is based on the RTMP ```bash cd /path/to/mmpose/projects/rtmpose3d +export PYTHONPATH=$(pwd):$PYTHONPATH python body3d_img2pose_demo.py configs/rtmdet_m_640-8xb32_coco-person.py https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmdet_m_8xb32-100e_coco-obj365-person-235e8209.pth configs\rtmw3d-l_8xb64_cocktail14-384x288.py rtmw3d-l_cock14-0d4ad840_20240422.pth --input /path/to/image --output-root /path/to/output ``` diff --git a/projects/rtmpose3d/configs/rtmw3d-x_8xb32_cocktail14-384x288.py b/projects/rtmpose3d/configs/rtmw3d-x_8xb32_cocktail14-384x288.py index 61f93f108a..f1475e97c9 100644 --- a/projects/rtmpose3d/configs/rtmw3d-x_8xb32_cocktail14-384x288.py +++ b/projects/rtmpose3d/configs/rtmw3d-x_8xb32_cocktail14-384x288.py @@ -128,9 +128,8 @@ loss_weight=2.0) ], decoder=codec), - test_cfg=dict(flip_test=False, mode='2d') - # test_cfg=dict(flip_test=False) -) + # test_cfg=dict(flip_test=False, mode='2d') + test_cfg=dict(flip_test=False)) # base dataset settings data_mode = 'topdown' diff --git a/projects/rtmpose3d/rtmpose3d/pose_estimator.py b/projects/rtmpose3d/rtmpose3d/pose_estimator.py index 438cd2b67b..90ec43cdc7 100644 --- a/projects/rtmpose3d/rtmpose3d/pose_estimator.py +++ b/projects/rtmpose3d/rtmpose3d/pose_estimator.py @@ -67,7 +67,7 @@ def add_pred_to_datasample(self, batch_pred_instances: InstanceList, c = np.array(camera_params['c']) else: f = np.array([1145.04940459, 1143.78109572]) - c = np.array(data_sample.ori_shape) + c = np.array(data_sample.ori_shape) / 2 kpts_pixel = np.concatenate([ keypoints_2d, (keypoints_3d[..., 2] + gt_instances.root_z)[..., None] diff --git a/projects/rtmpose3d/rtmpose3d/simcc_3d_label.py b/projects/rtmpose3d/rtmpose3d/simcc_3d_label.py index ead72c5090..22e42079c8 100644 --- a/projects/rtmpose3d/rtmpose3d/simcc_3d_label.py +++ b/projects/rtmpose3d/rtmpose3d/simcc_3d_label.py @@ -18,6 +18,8 @@ class SimCC3DLabel(BaseKeypointCodec): Human Pose Estimation`_ by Li et al (2022) for more details. Old name: SimDR + We generate the SimCC label for 3D keypoint estimation. + Note: - instance number: N @@ -93,6 +95,7 @@ def __init__(self, root_index, tuple) else [root_index] # Mean value of the root z-axis of datasets + # These values are statistics from the training set self.root_z = [5.14388] self.z_range = z_range if z_range is not None else 2.1744869
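
A note on the depth encoding touched by the last hunk above: `SimCC3DLabel` stores the z coordinate as a root-relative depth mapped linearly into the discrete SimCC bin range, using the dataset statistics `root_z` and `z_range` visible in the diff. The sketch below is a minimal, self-contained illustration of that forward/inverse mapping only; the depth-bin count `D` is a placeholder stand-in for `input_size[2]` and is not taken from the configs.

```python
import numpy as np

# Minimal sketch of the z-axis mapping used by SimCC3DLabel.
# Assumptions: D is a placeholder for input_size[2]; z_range matches the
# default value that appears in the diff (2.1744869).
D = 288
z_range = 2.1744869

def encode_z(z_root_relative: np.ndarray) -> np.ndarray:
    # Map root-relative depth to continuous bin coordinates in [0, D].
    return (z_root_relative / z_range + 1.0) * (D / 2.0)

def decode_z(z_bins: np.ndarray) -> np.ndarray:
    # Inverse mapping from bin coordinates back to root-relative depth.
    return (z_bins / (D / 2.0) - 1.0) * z_range

z = np.array([-0.5, 0.0, 0.75])
assert np.allclose(decode_z(encode_z(z)), z)
```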