diff --git a/.circleci/test.yml b/.circleci/test.yml index 9e24535f7..149d6cac1 100644 --- a/.circleci/test.yml +++ b/.circleci/test.yml @@ -67,7 +67,7 @@ jobs: command: | pip install -U openmim mim install git+https://github.com/open-mmlab/mmengine.git@main - mim install 'mmcv >= 2.0.0rc4' + mim install 'mmcv >= 2.0.0' mim install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x pip install -r requirements/albu.txt pip install -r requirements/tests.txt @@ -90,16 +90,17 @@ jobs: name: Run unittests command: | export LD_LIBRARY_PATH=/home/circleci/project/onnxruntime-linux-x64-1.8.1/lib:${LD_LIBRARY_PATH} - coverage run --branch --source mmyolo -m pytest tests/ - coverage xml - coverage report -m + pytest tests/ +# coverage run --branch --source mmyolo -m pytest tests/ +# coverage xml +# coverage report -m build_cuda: parameters: torch: type: string cuda: type: enum - enum: ["10.1", "10.2", "11.1", "11.0"] + enum: ["10.1", "10.2", "11.0", "11.7"] cudnn: type: integer default: 7 @@ -125,7 +126,7 @@ jobs: command: | docker exec mmyolo pip install -U openmim docker exec mmyolo mim install -e /mmengine - docker exec mmyolo mim install 'mmcv >= 2.0.0rc4' + docker exec mmyolo mim install 'mmcv >= 2.0.0' docker exec mmyolo pip install -e /mmdetection docker exec mmyolo pip install -r requirements/albu.txt docker exec mmyolo pip install -r requirements/tests.txt @@ -168,6 +169,9 @@ workflows: - lint - build_cpu: name: maximum_version_cpu + # mmdeploy not supported +# torch: 2.0.0 +# torchvision: 0.15.1 torch: 1.12.1 torchvision: 0.13.1 python: 3.9.0 @@ -185,6 +189,13 @@ workflows: cuda: "10.2" requires: - hold + - build_cuda: + name: maximum_version_gpu + torch: 2.0.0 + cuda: "11.7" + cudnn: 8 + requires: + - hold merge_stage_test: when: not: << pipeline.parameters.lint_only >> diff --git a/.dev_scripts/gather_models.py b/.dev_scripts/gather_models.py index ba5039c22..f05e2b5b3 100644 --- a/.dev_scripts/gather_models.py +++ b/.dev_scripts/gather_models.py @@ -108,6 +108,7 @@ def get_dataset_name(config): name_map = dict( CityscapesDataset='Cityscapes', CocoDataset='COCO', + PoseCocoDataset='COCO Person', YOLOv5CocoDataset='COCO', CocoPanopticDataset='COCO', YOLOv5DOTADataset='DOTA 1.0', diff --git a/README.md b/README.md index 96e0c08f5..b799a759c 100644 --- a/README.md +++ b/README.md @@ -77,17 +77,13 @@ English | [简体中文](README_zh-CN.md) ## 🥳 🚀 What's New [🔝](#-table-of-contents) -💎 **v0.5.0** was released on 2/3/2023: - -1. Support [RTMDet-R](https://github.com/open-mmlab/mmyolo/blob/dev/configs/rtmdet/README.md#rotated-object-detection) rotated object detection -2. Support for using mask annotation to improve [YOLOv8](https://github.com/open-mmlab/mmyolo/blob/dev/configs/yolov8/README.md) object detection performance -3. Support [MMRazor](https://github.com/open-mmlab/mmyolo/blob/dev/configs/razor/subnets/README.md) searchable NAS sub-network as the backbone of YOLO series algorithm -4. Support calling [MMRazor](https://github.com/open-mmlab/mmyolo/blob/dev/configs/rtmdet/distillation/README.md) to distill the knowledge of RTMDet -5. [MMYOLO](https://mmyolo.readthedocs.io/zh_CN/dev/) document structure optimization, comprehensive content upgrade -6. Improve YOLOX mAP and training speed based on RTMDet training hyperparameters -7. Support calculation of model parameters and FLOPs, provide GPU latency data on T4 devices, and update [Model Zoo](https://github.com/open-mmlab/mmyolo/blob/dev/docs/en/model_zoo.md) -8. Support test-time augmentation (TTA) -9. 
Support RTMDet, YOLOv8 and YOLOv7 assigner visualization +💎 **v0.6.0** was released on 15/8/2023: + +- Support YOLOv5 instance segmentation +- Support YOLOX-Pose based on MMPose +- Add 15 minutes instance segmentation tutorial. +- YOLOv5 supports using mask annotation to optimize bbox +- Add Multi-scale training and testing docs For release history and update details, please refer to [changelog](https://mmyolo.readthedocs.io/en/latest/notes/changelog.html). @@ -150,7 +146,7 @@ conda activate mmyolo pip install openmim mim install "mmengine>=0.6.0" mim install "mmcv>=2.0.0rc4,<2.1.0" -mim install "mmdet>=3.0.0rc6,<3.1.0" +mim install "mmdet>=3.0.0,<4.0.0" git clone https://github.com/open-mmlab/mmyolo.git cd mmyolo # Install albumentations @@ -184,6 +180,7 @@ For different parts from MMDetection, we have also prepared user guides and adva Recommended Topics - [How to contribute code to MMYOLO](docs/en/recommended_topics/contributing.md) +- [Training testing tricks](docs/en/recommended_topics/training_testing_tricks.md) - [MMYOLO model design](docs/en/recommended_topics/model_design.md) - [Algorithm principles and implementation](docs/en/recommended_topics/algorithm_descriptions/) - [Replace the backbone network](docs/en/recommended_topics/replace_backbone.md) @@ -192,7 +189,7 @@ For different parts from MMDetection, we have also prepared user guides and adva - [Visualization](docs/en/recommended_topics/visualization.md) - [Model deployment](docs/en/recommended_topics/deploy/) - [Troubleshooting steps](docs/en/recommended_topics/troubleshooting_steps.md) -- [MMYOLO industry examples](docs/en/recommended_topics/industry_examples.md) +- [MMYOLO application examples](docs/en/recommended_topics/application_examples/) - [MM series repo essential basics](docs/en/recommended_topics/mm_basics.md) - [Dataset preparation and description](docs/en/recommended_topics/dataset_preparation.md) @@ -204,6 +201,7 @@ For different parts from MMDetection, we have also prepared user guides and adva - [Resume training](docs/en/common_usage/resume_training.md) - [Enabling and disabling SyncBatchNorm](docs/en/common_usage/syncbn.md) - [Enabling AMP](docs/en/common_usage/amp_training.md) +- [Multi-scale training and testing](docs/en/common_usage/ms_training_testing.md) - [TTA Related Notes](docs/en/common_usage/tta.md) - [Add plugins to the backbone network](docs/en/common_usage/plugins.md) - [Freeze layers](docs/en/common_usage/freeze_layers.md) @@ -283,6 +281,7 @@ Results and models are available in the [model zoo](docs/en/model_zoo.md). Supported Algorithms - [x] [YOLOv5](configs/yolov5) +- [ ] [YOLOv5u](configs/yolov5/yolov5u) (Inference only) - [x] [YOLOX](configs/yolox) - [x] [RTMDet](configs/rtmdet) - [x] [RTMDet-Rotated](configs/rtmdet) @@ -405,8 +404,8 @@ This project is released under the [GPL 3.0 license](LICENSE). - [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab foundational library for training deep learning models. - [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab foundational library for computer vision. -- [MIM](https://github.com/open-mmlab/mim): MIM installs OpenMMLab packages. -- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab image classification toolbox and benchmark. +- [MMPreTrain](https://github.com/open-mmlab/mmpretrain): OpenMMLab pre-training toolbox and benchmark. +- [MMagic](https://github.com/open-mmlab/mmagic): Open**MM**Lab **A**dvanced, **G**enerative and **I**ntelligent **C**reation toolbox. 
- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark. - [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection. - [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab rotated object detection toolbox and benchmark. @@ -424,4 +423,6 @@ This project is released under the [GPL 3.0 license](LICENSE). - [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab image and video editing toolbox. - [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab image and video generative models toolbox. - [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab model deployment framework. +- [MIM](https://github.com/open-mmlab/mim): MIM installs OpenMMLab packages. - [MMEval](https://github.com/open-mmlab/mmeval): OpenMMLab machine learning evaluation library. +- [Playground](https://github.com/open-mmlab/playground): A central hub for gathering and showcasing amazing projects built upon OpenMMLab. diff --git a/README_zh-CN.md b/README_zh-CN.md index 16c55a65c..6eb4d95fe 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -78,17 +78,13 @@ ## 🥳 🚀 最新进展 [🔝](#-table-of-contents) -💎 **v0.5.0** 版本已经在 2023.3.2 发布: - -1. 支持了 [RTMDet-R](https://github.com/open-mmlab/mmyolo/blob/dev/configs/rtmdet/README.md#rotated-object-detection) 旋转框目标检测任务和算法 -2. [YOLOv8](https://github.com/open-mmlab/mmyolo/blob/dev/configs/yolov8/README.md) 支持使用 mask 标注提升目标检测模型性能 -3. 支持 [MMRazor](https://github.com/open-mmlab/mmyolo/blob/dev/configs/razor/subnets/README.md) 搜索的 NAS 子网络作为 YOLO 系列算法的 backbone -4. 支持调用 [MMRazor](https://github.com/open-mmlab/mmyolo/blob/dev/configs/rtmdet/distillation/README.md) 对 RTMDet 进行知识蒸馏 -5. [MMYOLO](https://mmyolo.readthedocs.io/zh_CN/dev/) 文档结构优化,内容全面升级 -6. 基于 RTMDet 训练超参提升 YOLOX 精度和训练速度 -7. 支持模型参数量、FLOPs 计算和提供 T4 设备上 GPU 延时数据,并更新了 [Model Zoo](https://github.com/open-mmlab/mmyolo/blob/dev/docs/zh_cn/model_zoo.md) -8. 支持测试时增强 TTA -9. 
支持 RTMDet、YOLOv8 和 YOLOv7 assigner 可视化 +💎 **v0.6.0** 版本已经在 2023.8.15 发布: + +- 支持 YOLOv5 实例分割 +- 基于 MMPose 支持 YOLOX-Pose +- 添加 15 分钟的实例分割教程 +- YOLOv5 支持使用 mask 标注来优化边界框 +- 添加多尺度训练和测试文档 我们提供了实用的**脚本命令速查表** @@ -108,7 +104,7 @@ | 🌟 | 自定义数据集从标注到部署保姆级教程 | [![Link](https://i2.hdslb.com/bfs/archive/13f566c89a18c9c881713b63ec14da952d4c0b14.jpg@112w_63h_1c.webp)](https://www.bilibili.com/video/BV1RG4y137i5) [![bilibili](https://img.shields.io/badge/dynamic/json?label=views&style=social&logo=bilibili&query=data.stat.view&url=https%3A%2F%2Fapi.bilibili.com%2Fx%2Fweb-interface%2Fview%3Fbvid%3DBV1RG4y137i5)](https://www.bilibili.com/video/BV1JG4y1d7GC) | [自定义数据集从标注到部署保姆级教程](https://github.com/open-mmlab/mmyolo/blob/dev/docs/zh_cn/user_guides/custom_dataset.md) | | 🌟 | 顶会第一步 · 模块自定义 | [![Link](http://i2.hdslb.com/bfs/archive/5b23d41ac57466824eaf185ef806ef734414e93b.jpg@112w_63h_1c.webp)](https://www.bilibili.com/video/BV1yd4y1j7VD) [![bilibili](https://img.shields.io/badge/dynamic/json?label=views&style=social&logo=bilibili&query=data.stat.view&url=https%3A%2F%2Fapi.bilibili.com%2Fx%2Fweb-interface%2Fview%3Fbvid%3DBV1yd4y1j7VD)](https://www.bilibili.com/video/BV1yd4y1j7VD) | [顶会第一步·模块自定义.ipynb](https://github.com/open-mmlab/OpenMMLabCourse/blob/main/codes/MMYOLO_tutorials/[实用类第四期]顶会第一步·模块自定义.ipynb) | -完整视频列表请参考 [资源汇总页面](https://mmyolo.readthedocs.io/zh_CN/latest/article.html) +完整视频列表请参考 [中文解读资源汇总 - 视频](https://mmyolo.readthedocs.io/zh_CN/latest/get_started/article.html) 发布历史和更新细节请参考 [更新日志](https://mmyolo.readthedocs.io/zh_CN/latest/notes/changelog.html) @@ -171,7 +167,7 @@ conda activate mmyolo pip install openmim mim install "mmengine>=0.6.0" mim install "mmcv>=2.0.0rc4,<2.1.0" -mim install "mmdet>=3.0.0rc6,<3.1.0" +mim install "mmdet>=3.0.0,<4.0.0" git clone https://github.com/open-mmlab/mmyolo.git cd mmyolo # Install albumentations @@ -205,6 +201,7 @@ MMYOLO 用法和 MMDetection 几乎一致,所有教程都是通用的,你也 推荐专题 - [如何给 MMYOLO 贡献代码](docs/zh_cn/recommended_topics/contributing.md) +- [训练和测试技巧](docs/zh_cn/recommended_topics/training_testing_tricks.md) - [MMYOLO 模型结构设计](docs/zh_cn/recommended_topics/model_design.md) - [原理和实现全解析](docs/zh_cn/recommended_topics/algorithm_descriptions/) - [轻松更换主干网络](docs/zh_cn/recommended_topics/replace_backbone.md) @@ -213,7 +210,7 @@ MMYOLO 用法和 MMDetection 几乎一致,所有教程都是通用的,你也 - [关于可视化的一切](docs/zh_cn/recommended_topics/visualization.md) - [模型部署流程](docs/zh_cn/recommended_topics/deploy/) - [常见错误排查步骤](docs/zh_cn/recommended_topics/troubleshooting_steps.md) -- [MMYOLO 产业范例介绍](docs/zh_cn/recommended_topics/industry_examples.md) +- [MMYOLO 应用范例介绍](docs/zh_cn/recommended_topics/application_examples/) - [MM 系列 Repo 必备基础](docs/zh_cn/recommended_topics/mm_basics.md) - [数据集准备和说明](docs/zh_cn/recommended_topics/dataset_preparation.md) @@ -225,9 +222,10 @@ MMYOLO 用法和 MMDetection 几乎一致,所有教程都是通用的,你也 - [恢复训练](docs/zh_cn/common_usage/resume_training.md) - [开启和关闭 SyncBatchNorm](docs/zh_cn/common_usage/syncbn.md) - [开启混合精度训练](docs/zh_cn/common_usage/amp_training.md) +- [多尺度训练和测试](docs/zh_cn/common_usage/ms_training_testing.md) - [测试时增强相关说明](docs/zh_cn/common_usage/tta.md) - [给主干网络增加插件](docs/zh_cn/common_usage/plugins.md) -- [冻结指定网络层权重](docs/zh_cn/common_usage/common_usage/freeze_layers.md) +- [冻结指定网络层权重](docs/zh_cn/common_usage/freeze_layers.md) - [输出模型预测结果](docs/zh_cn/common_usage/output_predictions.md) - [设置随机种子](docs/zh_cn/common_usage/set_random_seed.md) - [算法组合替换教程](docs/zh_cn/common_usage/module_combination.md) @@ -305,6 +303,7 @@ MMYOLO 用法和 MMDetection 几乎一致,所有教程都是通用的,你也 支持的算法 - [x] 
[YOLOv5](configs/yolov5) +- [ ] [YOLOv5u](configs/yolov5/yolov5u) (仅推理) - [x] [YOLOX](configs/yolox) - [x] [RTMDet](configs/rtmdet) - [x] [RTMDet-Rotated](configs/rtmdet) @@ -426,8 +425,8 @@ MMYOLO 是一款由来自不同高校和企业的研发人员共同参与贡献 - [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab 深度学习模型训练基础库 - [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab 计算机视觉基础库 -- [MIM](https://github.com/open-mmlab/mim): MIM 是 OpenMMlab 项目、算法、模型的统一入口 -- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab 图像分类工具箱 +- [MMPreTrain](https://github.com/open-mmlab/mmpretrain): OpenMMLab 深度学习预训练工具箱 +- [MMagic](https://github.com/open-mmlab/mmagic): OpenMMLab 新一代人工智能内容生成(AIGC)工具箱 - [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab 目标检测工具箱 - [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab 新一代通用 3D 目标检测平台 - [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab 旋转框检测工具箱与测试基准 @@ -445,7 +444,9 @@ MMYOLO 是一款由来自不同高校和企业的研发人员共同参与贡献 - [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab 图像视频编辑工具箱 - [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab 图片视频生成模型工具箱 - [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab 模型部署框架 +- [MIM](https://github.com/open-mmlab/mim): MIM 是 OpenMMlab 项目、算法、模型的统一入口 - [MMEval](https://github.com/open-mmlab/mmeval): OpenMMLab 机器学习算法评测库 +- [Playground](https://github.com/open-mmlab/playground): 收集和展示 OpenMMLab 相关的前沿、有趣的社区项目 ## ❤️ 欢迎加入 OpenMMLab 社区 [🔝](#-table-of-contents) diff --git a/configs/_base_/default_runtime.py b/configs/_base_/default_runtime.py index 2f0db2e3d..098f22057 100644 --- a/configs/_base_/default_runtime.py +++ b/configs/_base_/default_runtime.py @@ -25,10 +25,19 @@ load_from = None resume = False -# file_client_args = dict( -# backend='petrel', -# path_mapping=dict({ -# './data/': 's3://openmmlab/datasets/detection/', -# 'data/': 's3://openmmlab/datasets/detection/' -# })) -file_client_args = dict(backend='disk') +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions +# before MMDet 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) + +backend_args = None diff --git a/configs/_base_/det_p5_tta.py b/configs/_base_/det_p5_tta.py index cbbaf2e68..8df0d5ea8 100644 --- a/configs/_base_/det_p5_tta.py +++ b/configs/_base_/det_p5_tta.py @@ -1,11 +1,12 @@ -# TODO: Need to solve the problem of multiple file_client_args parameters -# _file_client_args = dict( +# TODO: Need to solve the problem of multiple backend_args parameters +# _backend_args = dict( # backend='petrel', # path_mapping=dict({ # './data/': 's3://openmmlab/datasets/detection/', # 'data/': 's3://openmmlab/datasets/detection/' # })) -_file_client_args = dict(backend='disk') + +_backend_args = None tta_model = dict( type='mmdet.DetTTAModel', @@ -37,7 +38,7 @@ ] tta_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_file_client_args), + dict(type='LoadImageFromFile', backend_args=_backend_args), dict( type='TestTimeAug', transforms=[ diff --git a/configs/_base_/pose/coco.py b/configs/_base_/pose/coco.py new file mode 100644 index 000000000..865a95bc0 --- /dev/null +++ 
b/configs/_base_/pose/coco.py @@ -0,0 +1,181 @@ +dataset_info = dict( + dataset_name='coco', + paper_info=dict( + author='Lin, Tsung-Yi and Maire, Michael and ' + 'Belongie, Serge and Hays, James and ' + 'Perona, Pietro and Ramanan, Deva and ' + r'Doll{\'a}r, Piotr and Zitnick, C Lawrence', + title='Microsoft coco: Common objects in context', + container='European conference on computer vision', + year='2014', + homepage='http://cocodataset.org/', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict( + name='left_eye', + id=1, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 2: + dict( + name='right_eye', + id=2, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]), + 13: + dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]), + 14: + dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]), + 15: + dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]), + 16: + dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]), + 17: + dict(link=('left_ear', 
'left_shoulder'), id=17, color=[51, 153, 255]), + 18: + dict( + link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]) + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5, + 1.5 + ], + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089 + ]) diff --git a/configs/deploy/model/yolov5_s-static.py b/configs/deploy/model/yolov5_s-static.py index 470807e82..11b7f6a04 100644 --- a/configs/deploy/model/yolov5_s-static.py +++ b/configs/deploy/model/yolov5_s-static.py @@ -1,7 +1,7 @@ _base_ = '../../yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py' test_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict( type='LetterResize', scale=_base_.img_scale, diff --git a/configs/deploy/model/yolov6_s-static.py b/configs/deploy/model/yolov6_s-static.py index d9044aba7..4f64438ca 100644 --- a/configs/deploy/model/yolov6_s-static.py +++ b/configs/deploy/model/yolov6_s-static.py @@ -1,7 +1,7 @@ _base_ = '../../yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py' test_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict( type='LetterResize', scale=_base_.img_scale, diff --git a/configs/ppyoloe/README.md b/configs/ppyoloe/README.md index 6ea827ec0..70a5b2055 100644 --- a/configs/ppyoloe/README.md +++ b/configs/ppyoloe/README.md @@ -19,12 +19,12 @@ PPYOLOE-PLUS-l model structure ### PPYOLOE+ COCO -| Backbone | Arch | Size | Epoch | SyncBN | Mem (GB) | Box AP | Config | Download | -| :---------: | :--: | :--: | :---: | :----: | :------: | :----: | :-------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| PPYOLOE+ -s | P5 | 640 | 80 | Yes | 4.7 | 43.5 | [config](../ppyoloe/ppyoloe_plus_s_fast_8xb8-80e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_s_fast_8xb8-80e_coco/ppyoloe_plus_s_fast_8xb8-80e_coco_20230101_154052-9fee7619.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_s_fast_8xb8-80e_coco/ppyoloe_plus_s_fast_8xb8-80e_coco_20230101_154052.log.json) | -| PPYOLOE+ -m | P5 | 640 | 80 | Yes | 8.4 | 49.5 | [config](../ppyoloe/ppyoloe_plus_m_fast_8xb8-80e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_m_fast_8xb8-80e_coco/ppyoloe_plus_m_fast_8xb8-80e_coco_20230104_193132-e4325ada.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_m_fast_8xb8-80e_coco/ppyoloe_plus_m_fast_8xb8-80e_coco_20230104_193132.log.json) | -| PPYOLOE+ -l | P5 | 640 | 80 | Yes | 13.2 | 52.6 | [config](../ppyoloe/ppyoloe_plus_l_fast_8xb8-80e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_l_fast_8xb8-80e_coco/ppyoloe_plus_l_fast_8xb8-80e_coco_20230102_203825-1864e7b3.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_l_fast_8xb8-80e_coco/ppyoloe_plus_l_fast_8xb8-80e_coco_20230102_203825.log.json) | -| PPYOLOE+ -x | P5 | 640 | 80 | Yes | 19.1 | 54.2 | [config](../ppyoloe/ppyoloe_plus_x_fast_8xb8-80e_coco.py) | 
[model](https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_x_fast_8xb8-80e_coco/ppyoloe_plus_x_fast_8xb8-80e_coco_20230104_194921-8c953949.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_x_fast_8xb8-80e_coco/ppyoloe_plus_x_fast_8xb8-80e_coco_20230104_194921.log.json) | +| Backbone | Arch | Size | Epoch | SyncBN | Mem (GB) | Box AP | Config | Download | +| :---------: | :--: | :--: | :---: | :----: | :------: | :----: | :----------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| PPYOLOE+ -s | P5 | 640 | 80 | Yes | 4.7 | 43.5 | [config](./ppyoloe_plus_s_fast_8xb8-80e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_s_fast_8xb8-80e_coco/ppyoloe_plus_s_fast_8xb8-80e_coco_20230101_154052-9fee7619.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_s_fast_8xb8-80e_coco/ppyoloe_plus_s_fast_8xb8-80e_coco_20230101_154052.log.json) | +| PPYOLOE+ -m | P5 | 640 | 80 | Yes | 8.4 | 49.5 | [config](./ppyoloe_plus_m_fast_8xb8-80e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_m_fast_8xb8-80e_coco/ppyoloe_plus_m_fast_8xb8-80e_coco_20230104_193132-e4325ada.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_m_fast_8xb8-80e_coco/ppyoloe_plus_m_fast_8xb8-80e_coco_20230104_193132.log.json) | +| PPYOLOE+ -l | P5 | 640 | 80 | Yes | 13.2 | 52.6 | [config](./ppyoloe_plus_l_fast_8xb8-80e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_l_fast_8xb8-80e_coco/ppyoloe_plus_l_fast_8xb8-80e_coco_20230102_203825-1864e7b3.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_l_fast_8xb8-80e_coco/ppyoloe_plus_l_fast_8xb8-80e_coco_20230102_203825.log.json) | +| PPYOLOE+ -x | P5 | 640 | 80 | Yes | 19.1 | 54.2 | [config](./ppyoloe_plus_x_fast_8xb8-80e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_x_fast_8xb8-80e_coco/ppyoloe_plus_x_fast_8xb8-80e_coco_20230104_194921-8c953949.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_x_fast_8xb8-80e_coco/ppyoloe_plus_x_fast_8xb8-80e_coco_20230104_194921.log.json) | **Note**: diff --git a/configs/ppyoloe/ppyoloe_plus_s_fast_8xb8-80e_coco.py b/configs/ppyoloe/ppyoloe_plus_s_fast_8xb8-80e_coco.py index e44dc34a5..3d98252cc 100644 --- a/configs/ppyoloe/ppyoloe_plus_s_fast_8xb8-80e_coco.py +++ b/configs/ppyoloe/ppyoloe_plus_s_fast_8xb8-80e_coco.py @@ -129,7 +129,7 @@ max_per_img=300)) train_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict(type='LoadAnnotations', with_bbox=True), dict(type='PPYOLOERandomDistort'), dict(type='mmdet.Expand', mean=(103.53, 116.28, 123.675)), @@ -157,7 +157,7 @@ pipeline=train_pipeline)) test_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict( type='mmdet.FixShapeResize', width=img_scale[0], diff --git a/configs/razor/subnets/README.md b/configs/razor/subnets/README.md index 093dedd70..456021bdd 100644 --- a/configs/razor/subnets/README.md +++ b/configs/razor/subnets/README.md @@ 
-62,14 +62,14 @@ CUDA_VISIBLE_DEVICES=0 PORT=29500 ./tools/dist_test.sh configs/razor/subnets/yol Here we provide the baseline version of YOLO Series with NAS backbone. -| Model | size | box AP | Params(M) | FLOPs(G) | Config | Download | -| :------------------------: | :--: | :----: | :----------: | :------: | :---------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| yolov5-s | 640 | 37.7 | 7.235 | 8.265 | [config](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700.log.json) | -| yolov5_s_spos_shufflenetv2 | 640 | 38.0 | 7.04(-2.7%) | 7.03 | [config](https://github.com/open-mmlab/mmyolo/tree/dev/configs/razor/subnets/yolov5_s_spos_shufflenetv2_syncbn_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmrazor/v1/yolo_nas_backbone/yolov5_s_spos_shufflenetv2_syncbn_8xb16-300e_coco_20230211_220635-578be9a9.pth) \| log | -| yolov6-s | 640 | 44.0 | 18.869 | 24.253 | [config](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco/yolov6_s_syncbn_fast_8xb32-400e_coco_20221102_203035-932e1d91.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco/yolov6_s_syncbn_fast_8xb32-400e_coco_20221102_203035.log.json) | -| yolov6_l_attentivenas_a6 | 640 | 45.3 | 18.38(-2.6%) | 8.49 | [config](https://github.com/open-mmlab/mmyolo/tree/dev/configs/razor/subnets/yolov6_l_attentivenas_a6_d12_syncbn_fast_8xb32-300e_coco.py) | [model](https://download.openmmlab.com/mmrazor/v1/yolo_nas_backbone/yolov6_l_attentivenas_a6_d12_syncbn_fast_8xb32-300e_coco_20230211_222409-dcc72668.pth) \| log | -| RTMDet-tiny | 640 | 41.0 | 4.8 | 8.1 | [config](./rtmdet_l_syncbn_fast_8xb32-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco/rtmdet_tiny_syncbn_fast_8xb32-300e_coco_20230102_140117-dbb1dc83.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco/rtmdet_tiny_syncbn_fast_8xb32-300e_coco_20230102_140117.log.json) | -| rtmdet_tiny_ofa_lat31 | 960 | 41.3 | 3.91(-18.5%) | 6.09 | [config](https://github.com/open-mmlab/mmyolo/tree/dev/configs/razor/subnets/rtmdet_tiny_ofa_lat31_syncbn_16xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmrazor/v1/yolo_nas_backbone/rtmdet_tiny_ofa_lat31_syncbn_16xb16-300e_coco_20230214_210623-449bb2a0.pth) \| log | +| Model | size | box AP | Params(M) | FLOPs(G) | Config | Download | +| :------------------------: | :--: | :----: | :----------: | :------: | :---------------------------------------------------------------------: | 
:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| yolov5-s | 640 | 37.7 | 7.235 | 8.265 | [config](../../yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700.log.json) | +| yolov5_s_spos_shufflenetv2 | 640 | 38.0 | 7.04(-2.7%) | 7.03 | [config](./yolov5_s_spos_shufflenetv2_syncbn_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmrazor/v1/yolo_nas_backbone/yolov5_s_spos_shufflenetv2_syncbn_8xb16-300e_coco_20230211_220635-578be9a9.pth) \| log | +| yolov6-s | 640 | 44.0 | 18.869 | 24.253 | [config](../../yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco/yolov6_s_syncbn_fast_8xb32-400e_coco_20221102_203035-932e1d91.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco/yolov6_s_syncbn_fast_8xb32-400e_coco_20221102_203035.log.json) | +| yolov6_l_attentivenas_a6 | 640 | 45.3 | 18.38(-2.6%) | 8.49 | [config](./yolov6_l_attentivenas_a6_d12_syncbn_fast_8xb32-300e_coco.py) | [model](https://download.openmmlab.com/mmrazor/v1/yolo_nas_backbone/yolov6_l_attentivenas_a6_d12_syncbn_fast_8xb32-300e_coco_20230211_222409-dcc72668.pth) \| log | +| RTMDet-tiny | 640 | 41.0 | 4.8 | 8.1 | [config](../../rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco/rtmdet_tiny_syncbn_fast_8xb32-300e_coco_20230102_140117-dbb1dc83.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco/rtmdet_tiny_syncbn_fast_8xb32-300e_coco_20230102_140117.log.json) | +| rtmdet_tiny_ofa_lat31 | 960 | 41.3 | 3.91(-18.5%) | 6.09 | [config](./rtmdet_tiny_ofa_lat31_syncbn_16xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmrazor/v1/yolo_nas_backbone/rtmdet_tiny_ofa_lat31_syncbn_16xb16-300e_coco_20230214_210623-449bb2a0.pth) \| log | **Note**: diff --git a/configs/razor/subnets/rtmdet_tiny_ofa_lat31_syncbn_16xb16-300e_coco.py b/configs/razor/subnets/rtmdet_tiny_ofa_lat31_syncbn_16xb16-300e_coco.py index 04d8c2d8c..2f9da6685 100644 --- a/configs/razor/subnets/rtmdet_tiny_ofa_lat31_syncbn_16xb16-300e_coco.py +++ b/configs/razor/subnets/rtmdet_tiny_ofa_lat31_syncbn_16xb16-300e_coco.py @@ -41,7 +41,7 @@ max_per_img=100) train_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict(type='LoadAnnotations', with_bbox=True), dict( type='Mosaic', @@ -73,7 +73,7 @@ ] train_pipeline_stage2 = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict(type='LoadAnnotations', with_bbox=True), dict( type='mmdet.RandomResize', @@ -92,7 +92,7 @@ batch_size=train_batch_size_per_gpu, dataset=dict(pipeline=train_pipeline)) test_pipeline = [ - dict(type='LoadImageFromFile', 
file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict(type='mmdet.Resize', scale=(960, 960), keep_ratio=True), dict(type='mmdet.Pad', size=(960, 960), pad_val=dict(img=(114, 114, 114))), dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), diff --git a/configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-36e_dota.py b/configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-36e_dota.py index 50f58e501..cbb2ae77a 100644 --- a/configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-36e_dota.py +++ b/configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-36e_dota.py @@ -169,7 +169,7 @@ ) train_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict(type='LoadAnnotations', with_bbox=True, box_type='qbox'), dict( type='mmrotate.ConvertBoxType', @@ -191,7 +191,7 @@ ] val_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict(type='mmdet.Resize', scale=img_scale, keep_ratio=True), dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), dict( @@ -209,7 +209,7 @@ ] test_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict(type='mmdet.Resize', scale=img_scale, keep_ratio=True), dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), dict( diff --git a/configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-aug-100e_dota.py b/configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-aug-100e_dota.py index 45bbaa41b..dcafa55db 100644 --- a/configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-aug-100e_dota.py +++ b/configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-aug-100e_dota.py @@ -38,7 +38,7 @@ # =======================Unmodified in most cases================== train_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict(type='LoadAnnotations', with_bbox=True, box_type='qbox'), dict( type='mmrotate.ConvertBoxType', @@ -77,7 +77,7 @@ ] train_pipeline_stage2 = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict(type='LoadAnnotations', with_bbox=True, box_type='qbox'), dict( type='mmrotate.ConvertBoxType', diff --git a/configs/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco.py b/configs/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco.py index fecd0d314..c36ac38ce 100644 --- a/configs/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco.py +++ b/configs/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco.py @@ -148,7 +148,7 @@ ) train_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict(type='LoadAnnotations', with_bbox=True), dict( type='Mosaic', @@ -175,7 +175,7 @@ ] train_pipeline_stage2 = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict(type='LoadAnnotations', with_bbox=True), dict( type='mmdet.RandomResize', @@ -191,7 +191,7 @@ ] test_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), 
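    # `backend_args` supersedes the older `file_client_args`: `None` lets the file
    # backend be inferred from the path prefix (plain local paths fall back to the
    # disk backend), while a remote backend can be configured roughly like the
    # commented example in configs/_base_/default_runtime.py, e.g.
    #   backend_args = dict(
    #       backend='petrel',
    #       path_mapping=dict({'./data/': 's3://openmmlab/datasets/detection/'}))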
dict(type='YOLOv5KeepRatioResize', scale=img_scale), dict( type='LetterResize', diff --git a/configs/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py b/configs/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py index 47733ae65..8cead7805 100644 --- a/configs/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py +++ b/configs/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py @@ -33,7 +33,7 @@ bbox_head=dict(head_module=dict(widen_factor=widen_factor))) train_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict(type='LoadAnnotations', with_bbox=True), dict( type='Mosaic', @@ -60,7 +60,7 @@ ] train_pipeline_stage2 = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict(type='LoadAnnotations', with_bbox=True), dict( type='mmdet.RandomResize', diff --git a/configs/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py b/configs/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py index 27d6762ae..257110d22 100644 --- a/configs/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py +++ b/configs/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py @@ -26,7 +26,7 @@ bbox_head=dict(head_module=dict(widen_factor=widen_factor))) train_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict(type='LoadAnnotations', with_bbox=True), dict( type='Mosaic', diff --git a/configs/yolov5/README.md b/configs/yolov5/README.md index b22d880fc..bd33e83f4 100644 --- a/configs/yolov5/README.md +++ b/configs/yolov5/README.md @@ -20,19 +20,29 @@ YOLOv5-l-P6 model structure ### COCO -| Backbone | Arch | size | SyncBN | AMP | Mem (GB) | box AP | TTA box AP | Config | Download | -| :------: | :--: | :--: | :----: | :-: | :------: | :----: | :--------: | :--------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| YOLOv5-n | P5 | 640 | Yes | Yes | 1.5 | 28.0 | 30.7 | [config](https://github.com/open-mmlab/mmyolo/tree/main/configs/yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco/yolov5_n-v61_syncbn_fast_8xb16-300e_coco_20220919_090739-b804c1ad.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco/yolov5_n-v61_syncbn_fast_8xb16-300e_coco_20220919_090739.log.json) | -| YOLOv5-s | P5 | 640 | Yes | Yes | 2.7 | 37.7 | 40.2 | [config](https://github.com/open-mmlab/mmyolo/tree/main/configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700.log.json) | -| YOLOv5-m | P5 | 640 | Yes | Yes | 5.0 | 45.3 | 46.9 | 
[config](https://github.com/open-mmlab/mmyolo/tree/main/configs/yolov5/yolov5_m-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-v61_syncbn_fast_8xb16-300e_coco/yolov5_m-v61_syncbn_fast_8xb16-300e_coco_20220917_204944-516a710f.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-v61_syncbn_fast_8xb16-300e_coco/yolov5_m-v61_syncbn_fast_8xb16-300e_coco_20220917_204944.log.json) | -| YOLOv5-l | P5 | 640 | Yes | Yes | 8.1 | 48.8 | 49.9 | [config](https://github.com/open-mmlab/mmyolo/tree/main/configs/yolov5/yolov5_l-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_l-v61_syncbn_fast_8xb16-300e_coco/yolov5_l-v61_syncbn_fast_8xb16-300e_coco_20220917_031007-096ef0eb.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_l-v61_syncbn_fast_8xb16-300e_coco/yolov5_l-v61_syncbn_fast_8xb16-300e_coco_20220917_031007.log.json) | -| YOLOv5-n | P6 | 1280 | Yes | Yes | 5.8 | 35.9 | | [config](https://github.com/open-mmlab/mmyolo/tree/main/configs/yolov5/yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_224705-d493c5f3.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_224705.log.json) | -| YOLOv5-s | P6 | 1280 | Yes | Yes | 10.5 | 44.4 | | [config](https://github.com/open-mmlab/mmyolo/tree/main/configs/yolov5/yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_215044-58865c19.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_215044.log.json) | -| YOLOv5-m | P6 | 1280 | Yes | Yes | 19.1 | 51.3 | | [config](https://github.com/open-mmlab/mmyolo/tree/main/configs/yolov5/yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_230453-49564d58.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_230453.log.json) | -| YOLOv5-l | P6 | 1280 | Yes | Yes | 30.5 | 53.7 | | [config](https://github.com/open-mmlab/mmyolo/tree/main/configs/yolov5/yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_234308-7a2ba6bf.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_234308.log.json) | +| Backbone | Arch | size | Mask Refine | SyncBN | AMP | Mem (GB) | box AP | TTA box AP | Config | Download | +| :-------: | :--: | :--: | :---------: | :----: | :-: | :------: | :---------: | :--------: | :-----------------------------------------------------------------------------: | 
:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| YOLOv5-n | P5 | 640 | No | Yes | Yes | 1.5 | 28.0 | 30.7 | [config](./yolov5_n-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco/yolov5_n-v61_syncbn_fast_8xb16-300e_coco_20220919_090739-b804c1ad.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco/yolov5_n-v61_syncbn_fast_8xb16-300e_coco_20220919_090739.log.json) | +| YOLOv5-n | P5 | 640 | Yes | Yes | Yes | 1.5 | 28.0 | | [config](./mask_refine/yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230305_152706-712fb1b2.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230305_152706.log.json) | +| YOLOv5u-n | P5 | 640 | Yes | Yes | Yes | | | | [config](./yolov5/yolov5u/yolov5_n_mask-refine_syncbn_fast_8xb16-300e_coco.py) | [model](<>) \| [log](<>) | +| YOLOv5-s | P5 | 640 | No | Yes | Yes | 2.7 | 37.7 | 40.2 | [config](./yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700.log.json) | +| YOLOv5-s | P5 | 640 | Yes | Yes | Yes | 2.7 | 38.0 (+0.3) | | [config](./mask_refine/yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230304_033134-8e0cd271.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230304_033134.log.json) | +| YOLOv5u-s | P5 | 640 | Yes | Yes | Yes | | | | [config](./yolov5/yolov5u/yolov5_s_mask-refine_syncbn_fast_8xb16-300e_coco.py) | [model](<>) \| [log](<>) | +| YOLOv5-m | P5 | 640 | No | Yes | Yes | 5.0 | 45.3 | 46.9 | [config](./yolov5_m-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-v61_syncbn_fast_8xb16-300e_coco/yolov5_m-v61_syncbn_fast_8xb16-300e_coco_20220917_204944-516a710f.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-v61_syncbn_fast_8xb16-300e_coco/yolov5_m-v61_syncbn_fast_8xb16-300e_coco_20220917_204944.log.json) | +| YOLOv5-m | P5 | 640 | Yes | Yes | Yes | 5.0 | 45.3 | | [config](./mask_refine/yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230305_153946-44e96155.pth) \| 
[log](https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230305_153946.log.json) | +| YOLOv5u-m | P5 | 640 | Yes | Yes | Yes | | | | [config](./yolov5/yolov5u/yolov5_m_mask-refine_syncbn_fast_8xb16-300e_coco.py) | [model](<>) \| [log](<>) | +| YOLOv5-l | P5 | 640 | No | Yes | Yes | 8.1 | 48.8 | 49.9 | [config](./yolov5_l-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_l-v61_syncbn_fast_8xb16-300e_coco/yolov5_l-v61_syncbn_fast_8xb16-300e_coco_20220917_031007-096ef0eb.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_l-v61_syncbn_fast_8xb16-300e_coco/yolov5_l-v61_syncbn_fast_8xb16-300e_coco_20220917_031007.log.json) | +| YOLOv5-l | P5 | 640 | Yes | Yes | Yes | 8.1 | 49.3 (+0.5) | | [config](./mask_refine/yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230305_154301-2c1d912a.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230305_154301.log.json) | +| YOLOv5u-l | P5 | 640 | Yes | Yes | Yes | | | | [config](./yolov5/yolov5u/yolov5_l_mask-refine_syncbn_fast_8xb16-300e_coco.py) | [model](<>) \| [log](<>) | +| YOLOv5-x | P5 | 640 | No | Yes | Yes | 12.2 | 50.2 | | [config](./yolov5_x-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_x-v61_syncbn_fast_8xb16-300e_coco/yolov5_x-v61_syncbn_fast_8xb16-300e_coco_20230305_152943-00776a4b.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_x-v61_syncbn_fast_8xb16-300e_coco/yolov5_x-v61_syncbn_fast_8xb16-300e_coco_20230305_152943.log.json) | +| YOLOv5-x | P5 | 640 | Yes | Yes | Yes | 12.2 | 50.9 (+0.7) | | [config](./mask_refine/yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230305_154321-07edeb62.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230305_154321.log.json) | +| YOLOv5u-x | P5 | 640 | Yes | Yes | Yes | | | | [config](./yolov5/yolov5u/yolov5_x_mask-refine_syncbn_fast_8xb16-300e_coco.py) | [model](<>) \| [log](<>) | +| YOLOv5-n | P6 | 1280 | No | Yes | Yes | 5.8 | 35.9 | | [config](./yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_224705-d493c5f3.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_224705.log.json) | +| YOLOv5-s | P6 | 1280 | No | Yes | Yes | 10.5 | 44.4 | | [config](./yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_215044-58865c19.pth) \| 
[log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_215044.log.json) |
+| YOLOv5-m | P6 | 1280 | No | Yes | Yes | 19.1 | 51.3 | | [config](./yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_230453-49564d58.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_230453.log.json) |
+| YOLOv5-l | P6 | 1280 | No | Yes | Yes | 30.5 | 53.7 | | [config](./yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_234308-7a2ba6bf.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_234308.log.json) |
**Note**:
-In the official YOLOv5 code, the `random_perspective` data augmentation in COCO object detection task training uses mask annotation information, which leads to higher performance. Object detection should not use mask annotation, so only box annotation information is used in `MMYOLO`. We will use the mask annotation information in the instance segmentation task. See https://github.com/ultralytics/yolov5/issues/9917 for details.
1. `fast` means that `YOLOv5DetDataPreprocessor` and `yolov5_collate` are used for data preprocessing, which is faster for training but less flexible for multitasking. It is recommended to use the fast version config if you only care about object detection.
2. `detect` means that the network input is fixed to `640x640` and the post-processing thresholds are modified.
@@ -40,15 +50,33 @@ In the official YOLOv5 code, the `random_perspective` data augmentation in COCO
4. We use 8x A100 for training, and the single-GPU batch size is 16. This is different from the official code.
5. The performance is unstable and may fluctuate by about 0.4 mAP, and the highest-performance weight in `COCO` training of `YOLOv5` may not be from the last epoch.
6. `TTA` means Test Time Augmentation. It performs 3 multi-scaling transformations on the image, followed by 2 flipping transformations (flipping and not flipping). You only need to specify `--tta` when testing to enable it. See [TTA](https://github.com/open-mmlab/mmyolo/blob/dev/docs/en/common_usage/tta.md) for details.
+7. The `Mask Refine` training performance is aligned with the performance of the weights officially released by YOLOv5. `Mask Refine` means refining bboxes with the mask annotations while loading annotations and after the `YOLOv5RandomAffine` transform (see the sketch after this list); `Copy Paste` means using `YOLOv5CopyPaste`.
+8. `YOLOv5u` models use the same loss functions and split Detect head as `YOLOv8` models for improved performance, but only require 300 epochs.
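To make note 7 more concrete, below is a minimal, hedged sketch of how a mask-refine style training pipeline could be assembled in an MMEngine-style config. The transform names (`LoadAnnotations`, `Mosaic`, `YOLOv5RandomAffine`, `YOLOv5CopyPaste`) appear in this patch, but the exact arguments, in particular `use_mask_refine`, `with_mask` and the scale values, are illustrative assumptions rather than the repository's verified API; the authoritative versions live in the `mask_refine` configs referenced in the table above.

```python
# Hedged sketch of a "Mask Refine" train pipeline (see note 7): instance masks are
# loaded together with boxes so that, after the geometric transforms, tighter
# bboxes can be re-derived from the transformed masks. Arguments marked as
# assumed are illustrative only.
img_scale = (640, 640)  # assumed input size for this sketch

train_pipeline = [
    dict(type='LoadImageFromFile', backend_args=None),
    # load masks in addition to boxes (with_mask=True is assumed here)
    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
    dict(type='Mosaic', img_scale=img_scale, pad_val=114.0),
    dict(
        type='YOLOv5RandomAffine',
        scaling_ratio_range=(0.5, 1.5),  # illustrative range
        border=(-img_scale[0] // 2, -img_scale[1] // 2),
        use_mask_refine=True),  # assumed flag: refine bboxes from transformed masks
    dict(type='YOLOv5CopyPaste', prob=0.5),  # optional `Copy Paste`, per note 7
    dict(type='mmdet.PackDetInputs'),
]
```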
+ +### COCO Instance segmentation + +| Backbone | Arch | size | SyncBN | AMP | Mem (GB) | Box AP | Mask AP | Config | Download | +| :-------------------: | :--: | :--: | :----: | :-: | :------: | :----: | :-----: | :--------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| YOLOv5-n | P5 | 640 | Yes | Yes | 3.3 | 27.9 | 23.7 | [config](./ins_seg/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance_20230424_104807-84cc9240.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance_20230424_104807.log.json) | +| YOLOv5-s | P5 | 640 | Yes | Yes | 4.8 | 38.1 | 32.0 | [config](./ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance_20230426_012542-3e570436.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance_20230426_012542.log.json) | +| YOLOv5-s(non-overlap) | P5 | 640 | Yes | Yes | 4.8 | 38.0 | 32.1 | [config](./ins_seg/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance_20230424_104642-6780d34e.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance_20230424_104642.log.json) | +| YOLOv5-m | P5 | 640 | Yes | Yes | 7.3 | 45.1 | 37.3 | [config](./ins_seg/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance_20230424_111529-ef5ba1a9.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance_20230424_111529.log.json) | +| YOLOv5-l | P5 | 640 | Yes | Yes | 10.7 | 48.8 | 39.9 | [config](./ins_seg/yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance_20230508_104049-daa09f70.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance_20230508_104049.log.json) | +| YOLOv5-x | P5 | 640 | Yes 
| Yes | 15.0 | 50.6 | 41.4 | [config](./ins_seg/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance_20230508_103925-a260c798.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance_20230508_103925.log.json) | + +**Note**: + +1. `Non-overlap` refers to the instance-level masks being stored in the format (num_instances, h, w) instead of (h, w). Storing masks in the overlap format consumes less CPU and GPU memory. +2. For the M model, the `affine_scale` parameter should be 0.9, but it was accidentally set to 0.5 during training, and we found that the mAP did not change. Therefore, the released M model uses an `affine_scale` of 0.5, which is inconsistent with the value of 0.9 in the configuration. ### VOC -| Backbone | size | Batchsize | AMP | Mem (GB) | box AP(COCO metric) | Config | Download | -| :------: | :--: | :-------: | :-: | :------: | :-----------------: | :------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| YOLOv5-n | 512 | 64 | Yes | 3.5 | 51.2 | [config](https://github.com/open-mmlab/mmyolo/tree/main/configs/yolov5/voc/yolov5_n-v61_fast_1xb64-50e_voc.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-v61_fast_1xb64-50e_voc/yolov5_n-v61_fast_1xb64-50e_voc_20221017_234254-f1493430.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-v61_fast_1xb64-50e_voc/yolov5_n-v61_fast_1xb64-50e_voc_20221017_234254.log.json) | -| YOLOv5-s | 512 | 64 | Yes | 6.5 | 62.7 | [config](https://github.com/open-mmlab/mmyolo/tree/main/configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_fast_1xb64-50e_voc/yolov5_s-v61_fast_1xb64-50e_voc_20221017_234156-0009b33e.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_fast_1xb64-50e_voc/yolov5_s-v61_fast_1xb64-50e_voc_20221017_234156.log.json) | -| YOLOv5-m | 512 | 64 | Yes | 12.0 | 70.1 | [config](https://github.com/open-mmlab/mmyolo/tree/main/configs/yolov5/voc/yolov5_m-v61_fast_1xb64-50e_voc.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-v61_fast_1xb64-50e_voc/yolov5_m-v61_fast_1xb64-50e_voc_20221017_114138-815c143a.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-v61_fast_1xb64-50e_voc/yolov5_m-v61_fast_1xb64-50e_voc_20221017_114138.log.json) | -| YOLOv5-l | 512 | 32 | Yes | 10.0 | 73.1 | [config](https://github.com/open-mmlab/mmyolo/tree/main/configs/yolov5/voc/yolov5_l-v61_fast_1xb32-50e_voc.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_l-v61_fast_1xb32-50e_voc/yolov5_l-v61_fast_1xb32-50e_voc_20221017_045500-edc7e0d8.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_l-v61_fast_1xb32-50e_voc/yolov5_l-v61_fast_1xb32-50e_voc_20221017_045500.log.json) | +| Backbone | size | Batchsize | AMP | Mem (GB) | box AP(COCO metric) | Config | Download | +| :------: | :--: | :-------: | :-:
| :------: | :-----------------: | :-------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| YOLOv5-n | 512 | 64 | Yes | 3.5 | 51.2 | [config](./yolov5/voc/yolov5_n-v61_fast_1xb64-50e_voc.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-v61_fast_1xb64-50e_voc/yolov5_n-v61_fast_1xb64-50e_voc_20221017_234254-f1493430.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-v61_fast_1xb64-50e_voc/yolov5_n-v61_fast_1xb64-50e_voc_20221017_234254.log.json) | +| YOLOv5-s | 512 | 64 | Yes | 6.5 | 62.7 | [config](./yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_fast_1xb64-50e_voc/yolov5_s-v61_fast_1xb64-50e_voc_20221017_234156-0009b33e.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_fast_1xb64-50e_voc/yolov5_s-v61_fast_1xb64-50e_voc_20221017_234156.log.json) | +| YOLOv5-m | 512 | 64 | Yes | 12.0 | 70.1 | [config](./yolov5/voc/yolov5_m-v61_fast_1xb64-50e_voc.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-v61_fast_1xb64-50e_voc/yolov5_m-v61_fast_1xb64-50e_voc_20221017_114138-815c143a.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-v61_fast_1xb64-50e_voc/yolov5_m-v61_fast_1xb64-50e_voc_20221017_114138.log.json) | +| YOLOv5-l | 512 | 32 | Yes | 10.0 | 73.1 | [config](./yolov5/voc/yolov5_l-v61_fast_1xb32-50e_voc.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_l-v61_fast_1xb32-50e_voc/yolov5_l-v61_fast_1xb32-50e_voc_20221017_045500-edc7e0d8.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_l-v61_fast_1xb32-50e_voc/yolov5_l-v61_fast_1xb32-50e_voc_20221017_045500.log.json) | **Note**: @@ -62,10 +90,10 @@ In the official YOLOv5 code, the `random_perspective` data augmentation in COCO Since the `iscrowd` annotation of the COCO dataset is not equivalent to `ignore`, we use the CrowdHuman dataset to verify that the YOLOv5 ignore logic is correct. 
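As a rough illustration only (this is an editor-added sketch under assumptions, not the actual MMYOLO assigner code, and `iof`/`keep_for_loss` are hypothetical helper names), the `ignore_iof_thr` column in the table below can be read as follows: priors whose intersection-over-foreground with any `iscrowd`/ignored box exceeds the threshold are dropped from the loss, and `-1` disables the check.

```python
# Illustrative sketch of an `ignore_iof_thr`-style rule (not the MMYOLO
# implementation); boxes are (N, 4) / (M, 4) tensors in xyxy format.
import torch


def iof(boxes: torch.Tensor, ignore_boxes: torch.Tensor) -> torch.Tensor:
    """Intersection over foreground: intersection area / box area, shape (N, M)."""
    lt = torch.max(boxes[:, None, :2], ignore_boxes[None, :, :2])
    rb = torch.min(boxes[:, None, 2:], ignore_boxes[None, :, 2:])
    inter = (rb - lt).clamp(min=0).prod(dim=-1)
    area = (boxes[:, 2:] - boxes[:, :2]).clamp(min=0).prod(dim=-1)
    return inter / area[:, None].clamp(min=1e-6)


def keep_for_loss(priors: torch.Tensor, ignore_boxes: torch.Tensor,
                  ignore_iof_thr: float) -> torch.Tensor:
    """Boolean mask of priors that still contribute to the loss."""
    if ignore_iof_thr < 0 or ignore_boxes.numel() == 0:
        return torch.ones(priors.size(0), dtype=torch.bool)
    return iof(priors, ignore_boxes).amax(dim=1) <= ignore_iof_thr
```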
-| Backbone | size | SyncBN | AMP | Mem (GB) | ignore_iof_thr | box AP50(CrowDHuman Metric) | MR | JI | Config | Download | -| :------: | :--: | :----: | :-: | :------: | :------------: | :-------------------------: | :--: | :---: | :-----------------------------------------------------------------------------------------------------------------------------: | :------: | -| YOLOv5-s | 640 | Yes | Yes | 2.6 | -1 | 85.79 | 48.7 | 75.33 | [config](https://github.com/open-mmlab/mmyolo/tree/main/configs/yolov5/crowdhuman/yolov5_s-v61_fast_8xb16-300e_crowdhuman.py) | | -| YOLOv5-s | 640 | Yes | Yes | 2.6 | 0.5 | 86.17 | 48.8 | 75.87 | [config](https://github.com/open-mmlab/mmyolo/tree/main/configs/yolov5/crowdhuman/yolov5_s-v61_8xb16-300e_ignore_crowdhuman.py) | | +| Backbone | size | SyncBN | AMP | Mem (GB) | ignore_iof_thr | box AP50(CrowdHuman Metric) | MR | JI | Config | Download | +| :------: | :--: | :----: | :-: | :------: | :------------: | :-------------------------: | :--: | :---: | :------------------------------------------------------------------------: | :------: | +| YOLOv5-s | 640 | Yes | Yes | 2.6 | -1 | 85.79 | 48.7 | 75.33 | [config](./yolov5/crowdhuman/yolov5_s-v61_fast_8xb16-300e_crowdhuman.py) | | +| YOLOv5-s | 640 | Yes | Yes | 2.6 | 0.5 | 86.17 | 48.8 | 75.87 | [config](./yolov5/crowdhuman/yolov5_s-v61_8xb16-300e_ignore_crowdhuman.py) | | **Note**: diff --git a/configs/yolov5/crowdhuman/yolov5_s-v61_8xb16-300e_ignore_crowdhuman.py b/configs/yolov5/crowdhuman/yolov5_s-v61_8xb16-300e_ignore_crowdhuman.py index 90ba758a5..85b371929 100644 --- a/configs/yolov5/crowdhuman/yolov5_s-v61_8xb16-300e_ignore_crowdhuman.py +++ b/configs/yolov5/crowdhuman/yolov5_s-v61_8xb16-300e_ignore_crowdhuman.py @@ -19,7 +19,7 @@ ] pre_transform = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), # only change this dict(type='mmdet.LoadAnnotations', with_bbox=True) ] diff --git a/configs/yolov5/ins_seg/yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance.py b/configs/yolov5/ins_seg/yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance.py new file mode 100644 index 000000000..6b27c7647 --- /dev/null +++ b/configs/yolov5/ins_seg/yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance.py @@ -0,0 +1,81 @@ +_base_ = './yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa + +# This config uses refining bbox and `YOLOv5CopyPaste`.
+# Refining bbox means refining bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` +# ========================modified parameters====================== +deepen_factor = 1.0 +widen_factor = 1.0 + +mixup_prob = 0.1 +copypaste_prob = 0.1 + +# =======================Unmodified in most cases================== +img_scale = _base_.img_scale + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +pre_transform = _base_.pre_transform +albu_train_transforms = _base_.albu_train_transforms +mosaic_affine_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict(type='YOLOv5CopyPaste', prob=copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + max_aspect_ratio=_base_.max_aspect_ratio, + use_mask_refine=_base_.use_mask2refine), +] + +# enable mixup +train_pipeline = [ + *pre_transform, + *mosaic_affine_pipeline, + dict( + type='YOLOv5MixUp', + prob=mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_pipeline]), + # TODO: support mask transform in albu + # Geometric transformations are not supported in albu now. + dict( + type='mmdet.Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='Polygon2Mask', + downsample_ratio=_base_.downsample_ratio, + mask_overlap=_base_.mask_overlap), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/configs/yolov5/ins_seg/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance.py b/configs/yolov5/ins_seg/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance.py new file mode 100644 index 000000000..831e815cb --- /dev/null +++ b/configs/yolov5/ins_seg/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance.py @@ -0,0 +1,89 @@ +_base_ = './yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa + +# ========================modified parameters====================== +deepen_factor = 0.67 +widen_factor = 0.75 +lr_factor = 0.1 +loss_cls_weight = 0.3 +loss_obj_weight = 0.7 + +affine_scale = 0.9 +mixup_prob = 0.1 + +# =======================Unmodified in most cases================== +num_classes = _base_.num_classes +num_det_layers = _base_.num_det_layers +img_scale = _base_.img_scale + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict( + head_module=dict(widen_factor=widen_factor), + loss_cls=dict(loss_weight=loss_cls_weight * + (num_classes / 80 * 3 / num_det_layers)), + loss_obj=dict(loss_weight=loss_obj_weight * + ((img_scale[0] / 640)**2 * 3 / num_det_layers)))) + +pre_transform = _base_.pre_transform +albu_train_transforms = _base_.albu_train_transforms + +mosaic_affine_pipeline = [ + dict( + 
type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + max_aspect_ratio=_base_.max_aspect_ratio, + use_mask_refine=_base_.use_mask2refine), +] + +# enable mixup +train_pipeline = [ + *pre_transform, + *mosaic_affine_pipeline, + dict( + type='YOLOv5MixUp', + prob=mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_pipeline]), + # TODO: support mask transform in albu + # Geometric transformations are not supported in albu now. + dict( + type='mmdet.Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='Polygon2Mask', + downsample_ratio=_base_.downsample_ratio, + mask_overlap=_base_.mask_overlap), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +default_hooks = dict(param_scheduler=dict(lr_factor=lr_factor)) diff --git a/configs/yolov5/ins_seg/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance.py b/configs/yolov5/ins_seg/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance.py new file mode 100644 index 000000000..e06130bd3 --- /dev/null +++ b/configs/yolov5/ins_seg/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance.py @@ -0,0 +1,15 @@ +_base_ = './yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa + +deepen_factor = 0.33 +widen_factor = 0.25 + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py b/configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py new file mode 100644 index 000000000..82e2ae6d0 --- /dev/null +++ b/configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py @@ -0,0 +1,42 @@ +_base_ = './yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa + +data_root = 'data/balloon/' +# Path of train annotation file +train_ann_file = 'train.json' +train_data_prefix = 'train/' # Prefix of train image path +# Path of val annotation file +val_ann_file = 'val.json' +val_data_prefix = 'val/' # Prefix of val image path +metainfo = { + 'classes': ('balloon', ), + 'palette': [ + (220, 20, 60), + ] +} +num_classes = 1 + +train_batch_size_per_gpu = 4 +train_num_workers = 2 +log_interval = 1 +##################### +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + data_root=data_root, + metainfo=metainfo, + data_prefix=dict(img=train_data_prefix), + ann_file=train_ann_file)) +val_dataloader = dict( + dataset=dict( + data_root=data_root, + metainfo=metainfo, + data_prefix=dict(img=val_data_prefix), + ann_file=val_ann_file)) +test_dataloader = val_dataloader +val_evaluator = dict(ann_file=data_root + val_ann_file) +test_evaluator = val_evaluator +default_hooks = 
dict(logger=dict(interval=log_interval)) +##################### + +model = dict(bbox_head=dict(head_module=dict(num_classes=num_classes))) diff --git a/configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py b/configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py new file mode 100644 index 000000000..0ab980ca7 --- /dev/null +++ b/configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py @@ -0,0 +1,126 @@ +_base_ = '../yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' # noqa + +# ========================modified parameters====================== +# YOLOv5RandomAffine +use_mask2refine = True +max_aspect_ratio = 100 +min_area_ratio = 0.01 +# Polygon2Mask +downsample_ratio = 4 +mask_overlap = True +# LetterResize +# half_pad_param: if set to True, left and right pad_param will +# be given by dividing padding_h by 2. If set to False, pad_param is +# in int format. We recommend setting this to False for object +# detection tasks, and True for instance segmentation tasks. +# Defaults to False. +half_pad_param = True + +# Testing takes a long time due to model_test_cfg. +# If you want to speed it up, you can increase score_thr +# or decrease nms_pre and max_per_img +model_test_cfg = dict( + multi_label=True, + nms_pre=30000, + min_bbox_size=0, + score_thr=0.001, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=300, + mask_thr_binary=0.5, + # fast_test: Whether to use the fast test method. When set + # to False, the implementation here is the same as the + # official one, with higher mAP. If set to True, the mask is first + # upsampled to the original image shape with PyTorch, and + # then mask_thr_binary is used to determine which pixels belong + # to the object. If set to False, mask_thr_binary is applied first + # to determine which pixels belong to the object, and then + # OpenCV upsamples the mask to the original image shape. + # Defaults to False. + fast_test=True) + +# ===============================Unmodified in most cases==================== +model = dict( + type='YOLODetector', + bbox_head=dict( + type='YOLOv5InsHead', + head_module=dict( + type='YOLOv5InsHeadModule', mask_channels=32, proto_channels=256), + mask_overlap=mask_overlap, + loss_mask=dict( + type='mmdet.CrossEntropyLoss', use_sigmoid=True, reduction='none'), + loss_mask_weight=0.05), + test_cfg=model_test_cfg) + +pre_transform = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + mask2bbox=use_mask2refine) +] + +train_pipeline = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=min_area_ratio, + max_aspect_ratio=max_aspect_ratio, + use_mask_refine=use_mask2refine), + # TODO: support mask transform in albu + # Geometric transformations are not supported in albu now.
+ dict( + type='mmdet.Albu', + transforms=_base_.albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='Polygon2Mask', + downsample_ratio=downsample_ratio, + mask_overlap=mask_overlap), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=_base_.img_scale), + dict( + type='LetterResize', + scale=_base_.img_scale, + allow_scale_up=False, + half_pad_param=half_pad_param, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict(metric=['bbox', 'segm']) +test_evaluator = val_evaluator diff --git a/configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance.py b/configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance.py new file mode 100644 index 000000000..83b48cab6 --- /dev/null +++ b/configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance.py @@ -0,0 +1,49 @@ +_base_ = './yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa + +# ========================modified parameters====================== +mask_overlap = False # Polygon2Mask + +# ===============================Unmodified in most cases==================== +model = dict(bbox_head=dict(mask_overlap=mask_overlap)) + +train_pipeline = [ + *_base_.pre_transform, + dict( + type='Mosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + max_aspect_ratio=_base_.max_aspect_ratio, + use_mask_refine=True), + dict( + type='mmdet.Albu', + transforms=_base_.albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes', + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='Polygon2Mask', + downsample_ratio=_base_.downsample_ratio, + mask_overlap=mask_overlap), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/configs/yolov5/ins_seg/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance.py b/configs/yolov5/ins_seg/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance.py new file mode 100644 index 000000000..a18170ccc --- /dev/null +++ b/configs/yolov5/ins_seg/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance.py @@ -0,0 +1,15 @@ +_base_ = './yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa + 
+deepen_factor = 1.33 +widen_factor = 1.25 + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/configs/yolov5/mask_refine/yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py b/configs/yolov5/mask_refine/yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 000000000..206eec3c4 --- /dev/null +++ b/configs/yolov5/mask_refine/yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,77 @@ +_base_ = './yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py' + +# This config use refining bbox and `YOLOv5CopyPaste`. +# Refining bbox means refining bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` + +# ========================modified parameters====================== +deepen_factor = 1.0 +widen_factor = 1.0 + +mixup_prob = 0.1 +copypaste_prob = 0.1 + +# =======================Unmodified in most cases================== +img_scale = _base_.img_scale + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +pre_transform = _base_.pre_transform +albu_train_transforms = _base_.albu_train_transforms + +mosaic_affine_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict(type='YOLOv5CopyPaste', prob=copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine), + dict(type='RemoveDataElement', keys=['gt_masks']) +] + +# enable mixup and copypaste +train_pipeline = [ + *pre_transform, *mosaic_affine_pipeline, + dict( + type='YOLOv5MixUp', + prob=mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_pipeline]), + dict( + type='mmdet.Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/configs/yolov5/mask_refine/yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py b/configs/yolov5/mask_refine/yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 000000000..4af27a917 --- /dev/null +++ b/configs/yolov5/mask_refine/yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,86 @@ +_base_ = './yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py' + +# This config will refine bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` + +# ========================modified parameters====================== +deepen_factor = 0.67 +widen_factor = 0.75 +lr_factor = 0.1 +loss_cls_weight = 0.3 +loss_obj_weight = 0.7 + +affine_scale = 0.9 +mixup_prob = 0.1 + +# 
=======================Unmodified in most cases================== +num_classes = _base_.num_classes +num_det_layers = _base_.num_det_layers +img_scale = _base_.img_scale + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict( + head_module=dict(widen_factor=widen_factor), + loss_cls=dict(loss_weight=loss_cls_weight * + (num_classes / 80 * 3 / num_det_layers)), + loss_obj=dict(loss_weight=loss_obj_weight * + ((img_scale[0] / 640)**2 * 3 / num_det_layers)))) + +pre_transform = _base_.pre_transform +albu_train_transforms = _base_.albu_train_transforms + +mosaic_affine_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine), + dict(type='RemoveDataElement', keys=['gt_masks']) +] + +# enable mixup +train_pipeline = [ + *pre_transform, *mosaic_affine_pipeline, + dict( + type='YOLOv5MixUp', + prob=mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_pipeline]), + dict( + type='mmdet.Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +default_hooks = dict(param_scheduler=dict(lr_factor=lr_factor)) diff --git a/configs/yolov5/mask_refine/yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py b/configs/yolov5/mask_refine/yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 000000000..3fe8dc32c --- /dev/null +++ b/configs/yolov5/mask_refine/yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,20 @@ +_base_ = './yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py' + +# This config will refine bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` + +# ========================modified parameters====================== +deepen_factor = 0.33 +widen_factor = 0.25 + +# ===============================Unmodified in most cases==================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/configs/yolov5/mask_refine/yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py b/configs/yolov5/mask_refine/yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 000000000..74febbb77 --- /dev/null +++ b/configs/yolov5/mask_refine/yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,62 @@ +_base_ = '../yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' + +# This config will refine bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` + +# ========================modified parameters====================== 
+use_mask2refine = True +min_area_ratio = 0.01 # YOLOv5RandomAffine + +# ===============================Unmodified in most cases==================== +pre_transform = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + mask2bbox=use_mask2refine) +] + +last_transform = [ + # Delete gt_masks to avoid more computation + dict(type='RemoveDataElement', keys=['gt_masks']), + dict( + type='mmdet.Albu', + transforms=_base_.albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_pipeline = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=min_area_ratio, + use_mask_refine=use_mask2refine), + *last_transform +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/configs/yolov5/mask_refine/yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py b/configs/yolov5/mask_refine/yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 000000000..fb76f1057 --- /dev/null +++ b/configs/yolov5/mask_refine/yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,21 @@ +_base_ = './yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py' + +# This config use refining bbox and `YOLOv5CopyPaste`. 
+# Refining bbox means refining bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` + +# ========================modified parameters====================== +deepen_factor = 1.33 +widen_factor = 1.25 + +# ===============================Unmodified in most cases==================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/configs/yolov5/metafile.yml b/configs/yolov5/metafile.yml index c64f38e5b..bfe5add4f 100644 --- a/configs/yolov5/metafile.yml +++ b/configs/yolov5/metafile.yml @@ -80,6 +80,18 @@ Models: Metrics: box AP: 48.8 Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_l-v61_syncbn_fast_8xb16-300e_coco/yolov5_l-v61_syncbn_fast_8xb16-300e_coco_20220917_031007-096ef0eb.pth + - Name: yolov5_x-v61_syncbn_fast_8xb16-300e_coco + In Collection: YOLOv5 + Config: configs/yolov5/yolov5_x-v61_syncbn_fast_8xb16-300e_coco.py + Metadata: + Training Memory (GB): 12.2 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 50.2 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_x-v61_syncbn_fast_8xb16-300e_coco/yolov5_x-v61_syncbn_fast_8xb16-300e_coco_20230305_152943-00776a4b.pth - Name: yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco In Collection: YOLOv5 Config: configs/yolov5/yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco.py @@ -176,3 +188,159 @@ Models: Metrics: box AP: 73.1 Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_l-v61_fast_1xb32-50e_voc/yolov5_l-v61_fast_1xb32-50e_voc_20221017_045500-edc7e0d8.pth + - Name: yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco + In Collection: YOLOv5 + Config: configs/yolov5/mask_refine/yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py + Metadata: + Training Memory (GB): 1.5 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 28.0 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230305_152706-712fb1b2.pth + - Name: yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco + In Collection: YOLOv5 + Config: configs/yolov5/mask_refine/yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py + Metadata: + Training Memory (GB): 2.7 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.0 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230304_033134-8e0cd271.pth + - Name: yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco + In Collection: YOLOv5 + Config: configs/yolov5/mask_refine/yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py + Metadata: + Training Memory (GB): 5.0 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.3 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230305_153946-44e96155.pth + - Name: yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco + In Collection: YOLOv5 + Config: configs/yolov5/mask_refine/yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py + Metadata: + Training Memory (GB): 8.1 + Epochs: 300 + Results: + - Task: 
Object Detection + Dataset: COCO + Metrics: + box AP: 49.3 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230305_154301-2c1d912a.pth + - Name: yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco + In Collection: YOLOv5 + Config: configs/yolov5/mask_refine/yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py + Metadata: + Training Memory (GB): 12.2 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 50.9 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230305_154321-07edeb62.pth + - Name: yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance + In Collection: YOLOv5 + Config: configs/yolov5/ins_seg/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance.py + Metadata: + Training Memory (GB): 3.3 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 27.9 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 23.7 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance_20230424_104807-84cc9240.pth + - Name: yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance + In Collection: YOLOv5 + Config: configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py + Metadata: + Training Memory (GB): 4.8 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 32.0 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance_20230426_012542-3e570436.pth + - Name: yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance + In Collection: YOLOv5 + Config: configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance.py + Metadata: + Training Memory (GB): 4.8 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 32.1 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance_20230424_104642-6780d34e.pth + - Name: yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance + In Collection: YOLOv5 + Config: configs/yolov5/ins_seg/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance.py + Metadata: + Training Memory (GB): 7.3 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.3 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance_20230424_111529-ef5ba1a9.pth + - Name: yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance + In Collection: YOLOv5 + Config: configs/yolov5/ins_seg/yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance.py + Metadata: + Training Memory (GB): 10.7 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 48.8 + - Task: Instance Segmentation + Dataset: 
COCO + Metrics: + mask AP: 39.9 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance_20230508_104049-daa09f70.pth + - Name: yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance + In Collection: YOLOv5 + Config: configs/yolov5/ins_seg/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance.py + Metadata: + Training Memory (GB): 15.0 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 50.6 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 41.4 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance_20230508_103925-a260c798.pth diff --git a/configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py b/configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py index 9585b51fd..f777fff96 100644 --- a/configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py +++ b/configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py @@ -168,7 +168,7 @@ collate_fn=dict(type='yolov5_collate')) test_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict(type='YOLOv5KeepRatioResize', scale=img_scale), dict( type='LetterResize', @@ -250,7 +250,7 @@ ] tta_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict( type='TestTimeAug', transforms=[ diff --git a/configs/yolov5/yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco.py b/configs/yolov5/yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco.py index 0af1fcb84..f64df69fd 100644 --- a/configs/yolov5/yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco.py +++ b/configs/yolov5/yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco.py @@ -84,7 +84,7 @@ train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) test_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict(type='YOLOv5KeepRatioResize', scale=img_scale), dict( type='LetterResize', @@ -118,7 +118,7 @@ ] tta_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict( type='TestTimeAug', transforms=[ diff --git a/configs/yolov5/yolov5_s-v61_fast_1xb12-40e_608x352_cat.py b/configs/yolov5/yolov5_s-v61_fast_1xb12-40e_608x352_cat.py index a7ea4f44c..5bbd13e08 100644 --- a/configs/yolov5/yolov5_s-v61_fast_1xb12-40e_608x352_cat.py +++ b/configs/yolov5/yolov5_s-v61_fast_1xb12-40e_608x352_cat.py @@ -51,7 +51,7 @@ _base_.train_dataloader.dataset.pipeline = train_pipeline test_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict(type='YOLOv5KeepRatioResize', scale=img_scale), dict( type='LetterResize', diff --git a/configs/yolov5/yolov5_s-v61_fast_1xb12-ms-40e_cat.py b/configs/yolov5/yolov5_s-v61_fast_1xb12-ms-40e_cat.py new file mode 100644 index 000000000..dc460fa98 --- /dev/null +++ b/configs/yolov5/yolov5_s-v61_fast_1xb12-ms-40e_cat.py @@ -0,0 +1,13 @@ +_base_ = 'yolov5_s-v61_fast_1xb12-40e_cat.py' + +model = dict( + data_preprocessor=dict( + type='YOLOv5DetDataPreprocessor', + pad_size_divisor=32, + batch_augments=[ + dict( + 
type='YOLOXBatchSyncRandomResize', + random_size_range=(480, 800), + size_divisor=32, + interval=1) + ])) diff --git a/configs/yolov5/yolov5_s-v61_syncbn-detect_8xb16-300e_coco.py b/configs/yolov5/yolov5_s-v61_syncbn-detect_8xb16-300e_coco.py index 627f98592..d8238c137 100644 --- a/configs/yolov5/yolov5_s-v61_syncbn-detect_8xb16-300e_coco.py +++ b/configs/yolov5/yolov5_s-v61_syncbn-detect_8xb16-300e_coco.py @@ -1,7 +1,7 @@ _base_ = 'yolov5_s-v61_syncbn_8xb16-300e_coco.py' test_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict( type='LetterResize', scale=_base_.img_scale, diff --git a/configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py b/configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py index 305034132..7e81a0385 100644 --- a/configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py +++ b/configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py @@ -158,7 +158,7 @@ ] pre_transform = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict(type='LoadAnnotations', with_bbox=True) ] @@ -211,7 +211,7 @@ pipeline=train_pipeline)) test_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict(type='YOLOv5KeepRatioResize', scale=img_scale), dict( type='LetterResize', diff --git a/configs/yolov5/yolov5u/yolov5u_l_mask-refine_syncbn_fast_8xb16-300e_coco.py b/configs/yolov5/yolov5u/yolov5u_l_mask-refine_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 000000000..60c11feb3 --- /dev/null +++ b/configs/yolov5/yolov5u/yolov5u_l_mask-refine_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,59 @@ +_base_ = './yolov5u_m_mask-refine_syncbn_fast_8xb16-300e_coco.py' + +# This config will refine bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` + +# ========================modified parameters====================== +deepen_factor = 1.00 +widen_factor = 1.00 + +mixup_prob = 0.15 +copypaste_prob = 0.3 + +# =======================Unmodified in most cases================== +img_scale = _base_.img_scale +pre_transform = _base_.pre_transform +last_transform = _base_.last_transform +affine_scale = _base_.affine_scale + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +mosaic_affine_transform = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict(type='YOLOv5CopyPaste', prob=copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine) +] + +train_pipeline = [ + *pre_transform, *mosaic_affine_transform, + dict( + type='YOLOv5MixUp', + prob=mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_transform]), + *last_transform +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/configs/yolov5/yolov5u/yolov5u_l_syncbn_fast_8xb16-300e_coco.py 
b/configs/yolov5/yolov5u/yolov5u_l_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 000000000..22b9e881d --- /dev/null +++ b/configs/yolov5/yolov5u/yolov5u_l_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,18 @@ +_base_ = './yolov5u_s_syncbn_fast_8xb16-300e_coco.py' + +# ========================modified parameters====================== +# TODO: Update the training hyperparameters +deepen_factor = 1.0 +widen_factor = 1.0 + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/configs/yolov5/yolov5u/yolov5u_m_mask-refine_syncbn_fast_8xb16-300e_coco.py b/configs/yolov5/yolov5u/yolov5u_m_mask-refine_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 000000000..ecc86fdd2 --- /dev/null +++ b/configs/yolov5/yolov5u/yolov5u_m_mask-refine_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,79 @@ +_base_ = './yolov5u_s_mask-refine_syncbn_fast_8xb16-300e_coco.py' + +# This config will refine bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` + +# ========================modified parameters====================== +deepen_factor = 0.67 +widen_factor = 0.75 + +affine_scale = 0.9 +mixup_prob = 0.1 +copypaste_prob = 0.1 + +# =======================Unmodified in most cases================== +img_scale = _base_.img_scale +pre_transform = _base_.pre_transform +last_transform = _base_.last_transform + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +mosaic_affine_transform = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict(type='YOLOv5CopyPaste', prob=copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine) +] + +train_pipeline = [ + *pre_transform, *mosaic_affine_transform, + dict( + type='YOLOv5MixUp', + prob=mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_transform]), + *last_transform +] + +train_pipeline_stage2 = [ + *pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=True, + pad_val=dict(img=114.0)), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine), *last_transform +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +_base_.custom_hooks[1].switch_pipeline = train_pipeline_stage2 diff --git a/configs/yolov5/yolov5u/yolov5u_m_syncbn_fast_8xb16-300e_coco.py b/configs/yolov5/yolov5u/yolov5u_m_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 000000000..0cfb33248 --- /dev/null +++ b/configs/yolov5/yolov5u/yolov5u_m_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,18 @@ +_base_ = 
'./yolov5u_s_syncbn_fast_8xb16-300e_coco.py' + +# ========================modified parameters====================== +# TODO: Update the training hyperparameters +deepen_factor = 0.67 +widen_factor = 0.75 + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/configs/yolov5/yolov5u/yolov5u_n_mask-refine_syncbn_fast_8xb16-300e_coco.py b/configs/yolov5/yolov5u/yolov5u_n_mask-refine_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 000000000..1ca21b651 --- /dev/null +++ b/configs/yolov5/yolov5u/yolov5u_n_mask-refine_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,20 @@ +_base_ = './yolov5u_s_mask-refine_syncbn_fast_8xb16-300e_coco.py' + +# This config will refine bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` + +# ========================modified parameters====================== +deepen_factor = 0.33 +widen_factor = 0.25 + +# ===============================Unmodified in most cases==================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/configs/yolov5/yolov5u/yolov5u_n_syncbn_fast_8xb16-300e_coco.py b/configs/yolov5/yolov5u/yolov5u_n_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 000000000..ad6a9f2eb --- /dev/null +++ b/configs/yolov5/yolov5u/yolov5u_n_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,17 @@ +_base_ = './yolov5u_s_syncbn_fast_8xb16-300e_coco.py' + +# ========================modified parameters====================== +deepen_factor = 0.33 +widen_factor = 0.25 + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/configs/yolov5/yolov5u/yolov5u_s_mask-refine_syncbn_fast_8xb16-300e_coco.py b/configs/yolov5/yolov5u/yolov5u_s_mask-refine_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 000000000..d6840bc28 --- /dev/null +++ b/configs/yolov5/yolov5u/yolov5u_s_mask-refine_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,80 @@ +_base_ = './yolov5u_s_syncbn_fast_8xb16-300e_coco.py' + +# This config will refine bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` + +# ========================modified parameters====================== +use_mask2refine = True +min_area_ratio = 0.01 # YOLOv5RandomAffine + +# ===============================Unmodified in most cases==================== +pre_transform = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + mask2bbox=use_mask2refine) +] + +last_transform = [ + # Delete gt_masks to avoid more computation + dict(type='RemoveDataElement', keys=['gt_masks']), + dict( + type='mmdet.Albu', + transforms=_base_.albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + 
dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_pipeline = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=min_area_ratio, + use_mask_refine=use_mask2refine), + *last_transform +] + +train_pipeline_stage2 = [ + *pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=_base_.img_scale), + dict( + type='LetterResize', + scale=_base_.img_scale, + allow_scale_up=True, + pad_val=dict(img=114.0)), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border_val=(114, 114, 114)), *last_transform +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +_base_.custom_hooks[1].switch_pipeline = train_pipeline_stage2 diff --git a/configs/yolov5/yolov5u/yolov5u_s_syncbn_fast_8xb16-300e_coco.py b/configs/yolov5/yolov5u/yolov5u_s_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 000000000..81d3a981c --- /dev/null +++ b/configs/yolov5/yolov5u/yolov5u_s_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,326 @@ +_base_ = ['../../_base_/default_runtime.py', '../../_base_/det_p5_tta.py'] + +# ========================Frequently modified parameters====================== +# -----data related----- +data_root = 'data/coco/' # Root path of data +# Path of train annotation file +train_ann_file = 'annotations/instances_train2017.json' +train_data_prefix = 'train2017/' # Prefix of train image path +# Path of val annotation file +val_ann_file = 'annotations/instances_val2017.json' +val_data_prefix = 'val2017/' # Prefix of val image path + +num_classes = 80 # Number of classes for classification +# Batch size of a single GPU during training +train_batch_size_per_gpu = 16 +# Worker to pre-fetch data for each single GPU during training +train_num_workers = 8 +# persistent_workers must be False if num_workers is 0 +persistent_workers = True + +# -----train val related----- +# Base learning rate for optim_wrapper. Corresponding to 8xb16=128 bs +base_lr = 0.01 +max_epochs = 300 # Maximum training epochs +# Disable mosaic augmentation for final 10 epochs (stage 2) +close_mosaic_epochs = 10 + +model_test_cfg = dict( + # The config of multi-label for multi-class prediction. + multi_label=True, + # The number of boxes before NMS + nms_pre=30000, + score_thr=0.001, # Threshold to filter out boxes. + nms=dict(type='nms', iou_threshold=0.7), # NMS type and threshold + max_per_img=300) # Max number of detections of each image + +# ========================Possible modified parameters======================== +# -----data related----- +img_scale = (640, 640) # width, height +# Dataset type, this will be used to define the dataset +dataset_type = 'YOLOv5CocoDataset' +# Batch size of a single GPU during validation +val_batch_size_per_gpu = 1 +# Worker to pre-fetch data for each single GPU during validation +val_num_workers = 2 + +# Config of batch shapes. Only on val. +# It means not used if batch_shapes_cfg is None. 
+batch_shapes_cfg = dict( + type='BatchShapePolicy', + batch_size=val_batch_size_per_gpu, + img_size=img_scale[0], + # The image scale of padding should be divided by pad_size_divisor + size_divisor=32, + # Additional paddings for pixel scale + extra_pad_ratio=0.5) + +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 0.33 +# The scaling factor that controls the width of the network structure +widen_factor = 0.5 +# Strides of multi-scale prior box +strides = [8, 16, 32] +num_det_layers = 3 # The number of model output scales +norm_cfg = dict(type='BN', momentum=0.03, eps=0.001) # Normalization config + +# -----train val related----- +tal_topk = 10 # Number of bbox selected in each level +tal_alpha = 0.5 # A Hyper-parameter related to alignment_metrics +tal_beta = 6.0 # A Hyper-parameter related to alignment_metrics + +affine_scale = 0.5 # YOLOv5RandomAffine scaling ratio +# YOLOv5RandomAffine aspect ratio of width and height thres to filter bboxes +max_aspect_ratio = 100 +# TODO: Automatically scale loss_weight based on number of detection layers +loss_cls_weight = 0.5 +loss_bbox_weight = 7.5 +# Since the dfloss is implemented differently in the official +# and mmdet, we're going to divide loss_weight by 4. +loss_dfl_weight = 1.5 / 4 +lr_factor = 0.01 # Learning rate scaling factor +weight_decay = 0.001 +# Save model checkpoint and validation intervals +save_checkpoint_intervals = 10 +# The maximum checkpoints to keep. +max_keep_ckpts = 3 +# Single-scale training is recommended to +# be turned on, which can speed up training. +env_cfg = dict(cudnn_benchmark=True) + +# ===============================Unmodified in most cases==================== +model = dict( + type='YOLODetector', + data_preprocessor=dict( + type='YOLOv5DetDataPreprocessor', + mean=[0., 0., 0.], + std=[255., 255., 255.], + bgr_to_rgb=True), + backbone=dict( + type='YOLOv5CSPDarknet', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True)), + neck=dict( + type='YOLOv5PAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + in_channels=[256, 512, 1024], + out_channels=[256, 512, 1024], + num_csp_blocks=3, + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True)), + bbox_head=dict( + type='YOLOv8Head', + head_module=dict( + type='YOLOv8HeadModule', + num_classes=num_classes, + in_channels=[256, 512, 1024], + widen_factor=widen_factor, + reg_max=16, + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True), + featmap_strides=strides), + prior_generator=dict( + type='mmdet.MlvlPointGenerator', offset=0.5, strides=strides), + bbox_coder=dict(type='DistancePointBBoxCoder'), + # scaled based on number of detection layers + loss_cls=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='none', + loss_weight=loss_cls_weight), + loss_bbox=dict( + type='IoULoss', + iou_mode='ciou', + bbox_format='xyxy', + reduction='sum', + loss_weight=loss_bbox_weight, + return_iou=False), + loss_dfl=dict( + type='mmdet.DistributionFocalLoss', + reduction='mean', + loss_weight=loss_dfl_weight)), + train_cfg=dict( + assigner=dict( + type='BatchTaskAlignedAssigner', + num_classes=num_classes, + use_ciou=True, + topk=tal_topk, + alpha=tal_alpha, + beta=tal_beta, + eps=1e-9)), + test_cfg=model_test_cfg) + +albu_train_transforms = [ + dict(type='Blur', p=0.01), + dict(type='MedianBlur', p=0.01), + dict(type='ToGray', p=0.01), + dict(type='CLAHE', p=0.01) +] + +pre_transform = [ + 
dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True) +] + +last_transform = [ + dict( + type='mmdet.Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_pipeline = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + max_aspect_ratio=max_aspect_ratio, + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), + *last_transform +] + +train_pipeline_stage2 = [ + *pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=True, + pad_val=dict(img=114.0)), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + max_aspect_ratio=max_aspect_ratio, + border_val=(114, 114, 114)), *last_transform +] + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + sampler=dict(type='DefaultSampler', shuffle=True), + collate_fn=dict(type='yolov5_collate'), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=train_ann_file, + data_prefix=dict(img=train_data_prefix), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline)) + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +val_dataloader = dict( + batch_size=val_batch_size_per_gpu, + num_workers=val_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + test_mode=True, + data_prefix=dict(img=val_data_prefix), + ann_file=val_ann_file, + pipeline=test_pipeline, + batch_shapes_cfg=batch_shapes_cfg)) + +test_dataloader = val_dataloader + +param_scheduler = None +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict( + type='SGD', + lr=base_lr, + momentum=0.937, + weight_decay=weight_decay, + nesterov=True, + batch_size_per_gpu=train_batch_size_per_gpu), + constructor='YOLOv5OptimizerConstructor') + +default_hooks = dict( + param_scheduler=dict( + type='YOLOv5ParamSchedulerHook', + scheduler_type='linear', + lr_factor=lr_factor, + max_epochs=max_epochs, + warmup_epochs=3.0, + warmup_momentum=0.8, + warmup_bias_lr=0.1), + checkpoint=dict( + type='CheckpointHook', + interval=save_checkpoint_intervals, + save_best='auto', + max_keep_ckpts=max_keep_ckpts)) + +custom_hooks = [ + dict( + type='EMAHook', + 
ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] + +val_evaluator = dict( + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file=data_root + val_ann_file, + metric='bbox') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + val_interval=save_checkpoint_intervals) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff --git a/configs/yolov5/yolov5u/yolov5u_x_mask-refine_syncbn_fast_8xb16-300e_coco.py b/configs/yolov5/yolov5u/yolov5u_x_mask-refine_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 000000000..33092aa6a --- /dev/null +++ b/configs/yolov5/yolov5u/yolov5u_x_mask-refine_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,17 @@ +_base_ = './yolov5u_l_mask-refine_syncbn_fast_8xb16-300e_coco.py' + +# ========================modified parameters====================== +deepen_factor = 1.33 +widen_factor = 1.25 + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/configs/yolov5/yolov5u/yolov5u_x_syncbn_fast_8xb16-300e_coco.py b/configs/yolov5/yolov5u/yolov5u_x_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 000000000..fd471fd46 --- /dev/null +++ b/configs/yolov5/yolov5u/yolov5u_x_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,18 @@ +_base_ = './yolov5u_l_syncbn_fast_8xb16-300e_coco.py' + +# ========================modified parameters====================== +# TODO: Update the training hyperparameters +deepen_factor = 1.33 +widen_factor = 1.25 + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/configs/yolov6/README.md b/configs/yolov6/README.md index 4dac37f4a..7ecda2769 100644 --- a/configs/yolov6/README.md +++ b/configs/yolov6/README.md @@ -26,13 +26,13 @@ YOLOv6-l model structure ### COCO -| Backbone | Arch | Size | Epoch | SyncBN | AMP | Mem (GB) | Box AP | Config | Download | -| :------: | :--: | :--: | :---: | :----: | :-: | :------: | :----: | :---------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| YOLOv6-n | P5 | 640 | 400 | Yes | Yes | 6.04 | 36.2 | [config](../yolov6/yolov6_n_syncbn_fast_8xb32-400e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_n_syncbn_fast_8xb32-400e_coco/yolov6_n_syncbn_fast_8xb32-400e_coco_20221030_202726-d99b2e82.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_n_syncbn_fast_8xb32-400e_coco/yolov6_n_syncbn_fast_8xb32-400e_coco_20221030_202726.log.json) | -| YOLOv6-t | P5 | 640 | 400 | Yes | Yes | 8.13 | 41.0 | [config](../yolov6/yolov6_t_syncbn_fast_8xb32-400e_coco.py) | 
[model](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_t_syncbn_fast_8xb32-400e_coco/yolov6_t_syncbn_fast_8xb32-400e_coco_20221030_143755-cf0d278f.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_t_syncbn_fast_8xb32-400e_coco/yolov6_t_syncbn_fast_8xb32-400e_coco_20221030_143755.log.json) | -| YOLOv6-s | P5 | 640 | 400 | Yes | Yes | 8.88 | 44.0 | [config](../yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco/yolov6_s_syncbn_fast_8xb32-400e_coco_20221102_203035-932e1d91.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco/yolov6_s_syncbn_fast_8xb32-400e_coco_20221102_203035.log.json) | -| YOLOv6-m | P5 | 640 | 300 | Yes | Yes | 16.69 | 48.4 | [config](../yolov6/yolov6_m_syncbn_fast_8xb32-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_m_syncbn_fast_8xb32-300e_coco/yolov6_m_syncbn_fast_8xb32-300e_coco_20221109_182658-85bda3f4.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_m_syncbn_fast_8xb32-300e_coco/yolov6_m_syncbn_fast_8xb32-300e_coco_20221109_182658.log.json) | -| YOLOv6-l | P5 | 640 | 300 | Yes | Yes | 20.86 | 51.0 | [config](../yolov6/yolov6_l_syncbn_fast_8xb32-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_l_syncbn_fast_8xb32-300e_coco/yolov6_l_syncbn_fast_8xb32-300e_coco_20221109_183156-91e3c447.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_l_syncbn_fast_8xb32-300e_coco/yolov6_l_syncbn_fast_8xb32-300e_coco_20221109_183156.log.json) | +| Backbone | Arch | Size | Epoch | SyncBN | AMP | Mem (GB) | Box AP | Config | Download | +| :------: | :--: | :--: | :---: | :----: | :-: | :------: | :----: | :-------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| YOLOv6-n | P5 | 640 | 400 | Yes | Yes | 6.04 | 36.2 | [config](./yolov6_n_syncbn_fast_8xb32-400e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_n_syncbn_fast_8xb32-400e_coco/yolov6_n_syncbn_fast_8xb32-400e_coco_20221030_202726-d99b2e82.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_n_syncbn_fast_8xb32-400e_coco/yolov6_n_syncbn_fast_8xb32-400e_coco_20221030_202726.log.json) | +| YOLOv6-t | P5 | 640 | 400 | Yes | Yes | 8.13 | 41.0 | [config](./yolov6_t_syncbn_fast_8xb32-400e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_t_syncbn_fast_8xb32-400e_coco/yolov6_t_syncbn_fast_8xb32-400e_coco_20221030_143755-cf0d278f.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_t_syncbn_fast_8xb32-400e_coco/yolov6_t_syncbn_fast_8xb32-400e_coco_20221030_143755.log.json) | +| YOLOv6-s | P5 | 640 | 400 | Yes | Yes | 8.88 | 44.0 | [config](./yolov6_s_syncbn_fast_8xb32-400e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco/yolov6_s_syncbn_fast_8xb32-400e_coco_20221102_203035-932e1d91.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco/yolov6_s_syncbn_fast_8xb32-400e_coco_20221102_203035.log.json) | +| YOLOv6-m | P5 | 640 | 300 | Yes | Yes | 16.69 | 48.4 | 
[config](./yolov6_m_syncbn_fast_8xb32-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_m_syncbn_fast_8xb32-300e_coco/yolov6_m_syncbn_fast_8xb32-300e_coco_20221109_182658-85bda3f4.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_m_syncbn_fast_8xb32-300e_coco/yolov6_m_syncbn_fast_8xb32-300e_coco_20221109_182658.log.json) | +| YOLOv6-l | P5 | 640 | 300 | Yes | Yes | 20.86 | 51.0 | [config](./yolov6_l_syncbn_fast_8xb32-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_l_syncbn_fast_8xb32-300e_coco/yolov6_l_syncbn_fast_8xb32-300e_coco_20221109_183156-91e3c447.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_l_syncbn_fast_8xb32-300e_coco/yolov6_l_syncbn_fast_8xb32-300e_coco_20221109_183156.log.json) | **Note**: diff --git a/configs/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py b/configs/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py index 0b5fa560a..eb564c07a 100644 --- a/configs/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py +++ b/configs/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py @@ -126,7 +126,7 @@ # The training pipeline of YOLOv6 is basically the same as YOLOv5. # The difference is that Mosaic and RandomAffine will be closed in the last 15 epochs. # noqa pre_transform = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict(type='LoadAnnotations', with_bbox=True) ] @@ -193,7 +193,7 @@ pipeline=train_pipeline)) test_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict(type='YOLOv5KeepRatioResize', scale=img_scale), dict( type='LetterResize', diff --git a/configs/yolov6/yolov6_v3_l_syncbn_fast_8xb32-300e_coco.py b/configs/yolov6/yolov6_v3_l_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 000000000..7ed4b0553 --- /dev/null +++ b/configs/yolov6/yolov6_v3_l_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,28 @@ +_base_ = './yolov6_v3_m_syncbn_fast_8xb32-300e_coco.py' + +# ======================= Possible modified parameters ======================= +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 1 +# The scaling factor that controls the width of the network structure +widen_factor = 1 + +# ============================== Unmodified in most cases =================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + hidden_ratio=1. / 2, + block_cfg=dict( + type='ConvWrapper', + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001)), + act_cfg=dict(type='SiLU', inplace=True)), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + hidden_ratio=1. 
/ 2, + block_cfg=dict( + type='ConvWrapper', + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001)), + block_act_cfg=dict(type='SiLU', inplace=True)), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/configs/yolov6/yolov6_v3_m_syncbn_fast_8xb32-300e_coco.py b/configs/yolov6/yolov6_v3_m_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 000000000..982b0c886 --- /dev/null +++ b/configs/yolov6/yolov6_v3_m_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,63 @@ +_base_ = './yolov6_v3_s_syncbn_fast_8xb32-300e_coco.py' + +# ======================= Possible modified parameters ======================= +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 0.6 +# The scaling factor that controls the width of the network structure +widen_factor = 0.75 + +# -----train val related----- +affine_scale = 0.9 # YOLOv5RandomAffine scaling ratio + +# ============================== Unmodified in most cases =================== +model = dict( + backbone=dict( + type='YOLOv6CSPBep', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + hidden_ratio=2. / 3, + block_cfg=dict(type='RepVGGBlock'), + act_cfg=dict(type='ReLU', inplace=True)), + neck=dict( + type='YOLOv6CSPRepBiPAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + block_cfg=dict(type='RepVGGBlock'), + hidden_ratio=2. / 3, + block_act_cfg=dict(type='ReLU', inplace=True)), + bbox_head=dict( + type='YOLOv6Head', + head_module=dict(reg_max=16, widen_factor=widen_factor))) + +mosaic_affine_pipeline = [ + dict( + type='Mosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114)) +] + +train_pipeline = [ + *_base_.pre_transform, *mosaic_affine_pipeline, + dict( + type='YOLOv5MixUp', + prob=0.1, + pre_transform=[*_base_.pre_transform, *mosaic_affine_pipeline]), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/configs/yolov6/yolov6_v3_n_syncbn_fast_8xb32-300e_coco.py b/configs/yolov6/yolov6_v3_n_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 000000000..96469f026 --- /dev/null +++ b/configs/yolov6/yolov6_v3_n_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,21 @@ +_base_ = './yolov6_v3_s_syncbn_fast_8xb32-300e_coco.py' + +# ======================= Possible modified parameters ======================= +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 0.33 +# The scaling factor that controls the width of the network structure +widen_factor = 0.25 + +# -----train val related----- +lr_factor = 0.02 # Learning rate scaling factor + +# ============================== Unmodified in most cases =================== +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict( + head_module=dict(widen_factor=widen_factor), + loss_bbox=dict(iou_mode='siou'))) + +default_hooks = dict(param_scheduler=dict(lr_factor=lr_factor)) diff --git 
a/configs/yolov6/yolov6_v3_s_syncbn_fast_8xb32-300e_coco.py b/configs/yolov6/yolov6_v3_s_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 000000000..8b0ad1901 --- /dev/null +++ b/configs/yolov6/yolov6_v3_s_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,282 @@ +_base_ = ['../_base_/default_runtime.py', '../_base_/det_p5_tta.py'] + +# ======================= Frequently modified parameters ===================== +# -----data related----- +data_root = 'data/coco/' # Root path of data +# Path of train annotation file +train_ann_file = 'annotations/instances_train2017.json' +train_data_prefix = 'train2017/' # Prefix of train image path +# Path of val annotation file +val_ann_file = 'annotations/instances_val2017.json' +val_data_prefix = 'val2017/' # Prefix of val image path + +num_classes = 80 # Number of classes for classification +# Batch size of a single GPU during training +train_batch_size_per_gpu = 32 +# Worker to pre-fetch data for each single GPU during training +train_num_workers = 8 +# persistent_workers must be False if num_workers is 0 +persistent_workers = True + +# -----train val related----- +# Base learning rate for optim_wrapper +base_lr = 0.01 +max_epochs = 300 # Maximum training epochs +num_last_epochs = 15 # Last epoch number to switch training pipeline + +# ======================= Possible modified parameters ======================= +# -----data related----- +img_scale = (640, 640) # width, height +# Dataset type, this will be used to define the dataset +dataset_type = 'YOLOv5CocoDataset' +# Batch size of a single GPU during validation +val_batch_size_per_gpu = 1 +# Worker to pre-fetch data for each single GPU during validation +val_num_workers = 2 + +# Config of batch shapes. Only on val. +# It means not used if batch_shapes_cfg is None. +batch_shapes_cfg = dict( + type='BatchShapePolicy', + batch_size=val_batch_size_per_gpu, + img_size=img_scale[0], + size_divisor=32, + extra_pad_ratio=0.5) + +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 0.33 +# The scaling factor that controls the width of the network structure +widen_factor = 0.5 + +# -----train val related----- +affine_scale = 0.5 # YOLOv5RandomAffine scaling ratio +lr_factor = 0.01 # Learning rate scaling factor +weight_decay = 0.0005 +# Save model checkpoint and validation intervals +save_epoch_intervals = 10 +# The maximum checkpoints to keep. +max_keep_ckpts = 3 +# Single-scale training is recommended to +# be turned on, which can speed up training. 
+env_cfg = dict(cudnn_benchmark=True) + +# ============================== Unmodified in most cases =================== +model = dict( + type='YOLODetector', + data_preprocessor=dict( + type='YOLOv5DetDataPreprocessor', + mean=[0., 0., 0.], + std=[255., 255., 255.], + bgr_to_rgb=True), + backbone=dict( + type='YOLOv6EfficientRep', + out_indices=[1, 2, 3, 4], + use_cspsppf=True, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='ReLU', inplace=True)), + neck=dict( + type='YOLOv6RepBiPAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + in_channels=[128, 256, 512, 1024], + out_channels=[128, 256, 512], + num_csp_blocks=12, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='ReLU', inplace=True), + ), + bbox_head=dict( + type='YOLOv6Head', + head_module=dict( + type='YOLOv6HeadModule', + num_classes=num_classes, + in_channels=[128, 256, 512], + widen_factor=widen_factor, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='SiLU', inplace=True), + featmap_strides=[8, 16, 32]), + loss_bbox=dict( + type='IoULoss', + iou_mode='giou', + bbox_format='xyxy', + reduction='mean', + loss_weight=2.5, + return_iou=False)), + train_cfg=dict( + initial_epoch=4, + initial_assigner=dict( + type='BatchATSSAssigner', + num_classes=num_classes, + topk=9, + iou_calculator=dict(type='mmdet.BboxOverlaps2D')), + assigner=dict( + type='BatchTaskAlignedAssigner', + num_classes=num_classes, + topk=13, + alpha=1, + beta=6), + ), + test_cfg=dict( + multi_label=True, + nms_pre=30000, + score_thr=0.001, + nms=dict(type='nms', iou_threshold=0.65), + max_per_img=300)) + +# The training pipeline of YOLOv6 is basically the same as YOLOv5. +# The difference is that Mosaic and RandomAffine will be closed in the last 15 epochs. 
# noqa +pre_transform = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True) +] + +train_pipeline = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_translate_ratio=0.1, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114), + max_shear_degree=0.0), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_pipeline_stage2 = [ + *pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=True, + pad_val=dict(img=114)), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_translate_ratio=0.1, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + max_shear_degree=0.0, + ), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + collate_fn=dict(type='yolov5_collate'), + persistent_workers=persistent_workers, + pin_memory=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=train_ann_file, + data_prefix=dict(img=train_data_prefix), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline)) + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +val_dataloader = dict( + batch_size=val_batch_size_per_gpu, + num_workers=val_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + test_mode=True, + data_prefix=dict(img=val_data_prefix), + ann_file=val_ann_file, + pipeline=test_pipeline, + batch_shapes_cfg=batch_shapes_cfg)) + +test_dataloader = val_dataloader + +# Optimizer and learning rate scheduler of YOLOv6 are basically the same as YOLOv5. # noqa +# The difference is that the scheduler_type of YOLOv6 is cosine. 
+optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict( + type='SGD', + lr=base_lr, + momentum=0.937, + weight_decay=weight_decay, + nesterov=True, + batch_size_per_gpu=train_batch_size_per_gpu), + constructor='YOLOv5OptimizerConstructor') + +default_hooks = dict( + param_scheduler=dict( + type='YOLOv5ParamSchedulerHook', + scheduler_type='cosine', + lr_factor=lr_factor, + max_epochs=max_epochs), + checkpoint=dict( + type='CheckpointHook', + interval=save_epoch_intervals, + max_keep_ckpts=max_keep_ckpts, + save_best='auto')) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - num_last_epochs, + switch_pipeline=train_pipeline_stage2) +] + +val_evaluator = dict( + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file=data_root + val_ann_file, + metric='bbox') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + val_interval=save_epoch_intervals, + dynamic_intervals=[(max_epochs - num_last_epochs, 1)]) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff --git a/configs/yolov6/yolov6_v3_t_syncbn_fast_8xb32-300e_coco.py b/configs/yolov6/yolov6_v3_t_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 000000000..d088b6b66 --- /dev/null +++ b/configs/yolov6/yolov6_v3_t_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,17 @@ +_base_ = './yolov6_v3_s_syncbn_fast_8xb32-300e_coco.py' + +# ======================= Possible modified parameters ======================= +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 0.33 +# The scaling factor that controls the width of the network structure +widen_factor = 0.375 + +# ============================== Unmodified in most cases =================== +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict( + type='YOLOv6Head', + head_module=dict(widen_factor=widen_factor), + loss_bbox=dict(iou_mode='siou'))) diff --git a/configs/yolov7/README.md b/configs/yolov7/README.md index e4f812616..f8f87f835 100644 --- a/configs/yolov7/README.md +++ b/configs/yolov7/README.md @@ -13,7 +13,7 @@ YOLOv7 surpasses all known object detectors in both speed and accuracy in the ra
-YOLOv7-l +YOLOv7-l YOLOv7-l-P5 model structure
@@ -21,13 +21,13 @@ YOLOv7-l-P5 model structure ### COCO -| Backbone | Arch | Size | SyncBN | AMP | Mem (GB) | Box AP | Config | Download | -| :---------: | :--: | :--: | :----: | :-: | :------: | :----: | :----------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| YOLOv7-tiny | P5 | 640 | Yes | Yes | 2.7 | 37.5 | [config](https://github.com/open-mmlab/mmyolo/tree/main/configs/yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco/yolov7_tiny_syncbn_fast_8x16b-300e_coco_20221126_102719-0ee5bbdf.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco/yolov7_tiny_syncbn_fast_8x16b-300e_coco_20221126_102719.log.json) | -| YOLOv7-l | P5 | 640 | Yes | Yes | 10.3 | 50.9 | [config](https://github.com/open-mmlab/mmyolo/tree/main/configs/yolov7/yolov7_l_syncbn_fast_8x16b-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_l_syncbn_fast_8x16b-300e_coco/yolov7_l_syncbn_fast_8x16b-300e_coco_20221123_023601-8113c0eb.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_l_syncbn_fast_8x16b-300e_coco/yolov7_l_syncbn_fast_8x16b-300e_coco_20221123_023601.log.json) | -| YOLOv7-x | P5 | 640 | Yes | Yes | 13.7 | 52.8 | [config](https://github.com/open-mmlab/mmyolo/tree/main/configs/yolov7/yolov7_x_syncbn_fast_8x16b-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_x_syncbn_fast_8x16b-300e_coco/yolov7_x_syncbn_fast_8x16b-300e_coco_20221124_215331-ef949a68.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_x_syncbn_fast_8x16b-300e_coco/yolov7_x_syncbn_fast_8x16b-300e_coco_20221124_215331.log.json) | -| YOLOv7-w | P6 | 1280 | Yes | Yes | 27.0 | 54.1 | [config](https://github.com/open-mmlab/mmyolo/tree/main/configs/yolov7/yolov7_w-p6_syncbn_fast_8x16b-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_w-p6_syncbn_fast_8x16b-300e_coco/yolov7_w-p6_syncbn_fast_8x16b-300e_coco_20221123_053031-a68ef9d2.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_w-p6_syncbn_fast_8x16b-300e_coco/yolov7_w-p6_syncbn_fast_8x16b-300e_coco_20221123_053031.log.json) | -| YOLOv7-e | P6 | 1280 | Yes | Yes | 42.5 | 55.1 | [config](https://github.com/open-mmlab/mmyolo/tree/main/configs/yolov7/yolov7_e-p6_syncbn_fast_8x16b-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_e-p6_syncbn_fast_8x16b-300e_coco/yolov7_e-p6_syncbn_fast_8x16b-300e_coco_20221126_102636-34425033.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_e-p6_syncbn_fast_8x16b-300e_coco/yolov7_e-p6_syncbn_fast_8x16b-300e_coco_20221126_102636.log.json) | +| Backbone | Arch | Size | SyncBN | AMP | Mem (GB) | Box AP | Config | Download | +| :---------: | :--: | :--: | :----: | :-: | :------: | :----: | :----------------------------------------------------: | 
:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| YOLOv7-tiny | P5 | 640 | Yes | Yes | 2.7 | 37.5 | [config](./yolov7_tiny_syncbn_fast_8x16b-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco/yolov7_tiny_syncbn_fast_8x16b-300e_coco_20221126_102719-0ee5bbdf.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco/yolov7_tiny_syncbn_fast_8x16b-300e_coco_20221126_102719.log.json) | +| YOLOv7-l | P5 | 640 | Yes | Yes | 10.3 | 50.9 | [config](./yolov7_l_syncbn_fast_8x16b-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_l_syncbn_fast_8x16b-300e_coco/yolov7_l_syncbn_fast_8x16b-300e_coco_20221123_023601-8113c0eb.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_l_syncbn_fast_8x16b-300e_coco/yolov7_l_syncbn_fast_8x16b-300e_coco_20221123_023601.log.json) | +| YOLOv7-x | P5 | 640 | Yes | Yes | 13.7 | 52.8 | [config](./yolov7_x_syncbn_fast_8x16b-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_x_syncbn_fast_8x16b-300e_coco/yolov7_x_syncbn_fast_8x16b-300e_coco_20221124_215331-ef949a68.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_x_syncbn_fast_8x16b-300e_coco/yolov7_x_syncbn_fast_8x16b-300e_coco_20221124_215331.log.json) | +| YOLOv7-w | P6 | 1280 | Yes | Yes | 27.0 | 54.1 | [config](./yolov7_w-p6_syncbn_fast_8x16b-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_w-p6_syncbn_fast_8x16b-300e_coco/yolov7_w-p6_syncbn_fast_8x16b-300e_coco_20221123_053031-a68ef9d2.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_w-p6_syncbn_fast_8x16b-300e_coco/yolov7_w-p6_syncbn_fast_8x16b-300e_coco_20221123_053031.log.json) | +| YOLOv7-e | P6 | 1280 | Yes | Yes | 42.5 | 55.1 | [config](./yolov7_e-p6_syncbn_fast_8x16b-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_e-p6_syncbn_fast_8x16b-300e_coco/yolov7_e-p6_syncbn_fast_8x16b-300e_coco_20221126_102636-34425033.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_e-p6_syncbn_fast_8x16b-300e_coco/yolov7_e-p6_syncbn_fast_8x16b-300e_coco_20221126_102636.log.json) | **Note**: In the official YOLOv7 code, the `random_perspective` data augmentation in COCO object detection task training uses mask annotation information, which leads to higher performance. Object detection should not use mask annotation, so only box annotation information is used in `MMYOLO`. We will use the mask annotation information in the instance segmentation task. 
diff --git a/configs/yolov7/yolov7_l_syncbn_fast_8x16b-300e_coco.py b/configs/yolov7/yolov7_l_syncbn_fast_8x16b-300e_coco.py index 6712002c2..e8a756c27 100644 --- a/configs/yolov7/yolov7_l_syncbn_fast_8x16b-300e_coco.py +++ b/configs/yolov7/yolov7_l_syncbn_fast_8x16b-300e_coco.py @@ -165,7 +165,7 @@ test_cfg=model_test_cfg) pre_transform = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict(type='LoadAnnotations', with_bbox=True) ] @@ -241,7 +241,7 @@ pipeline=train_pipeline)) test_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict(type='YOLOv5KeepRatioResize', scale=img_scale), dict( type='LetterResize', diff --git a/configs/yolov7/yolov7_w-p6_syncbn_fast_8x16b-300e_coco.py b/configs/yolov7/yolov7_w-p6_syncbn_fast_8x16b-300e_coco.py index 11164d217..9758b8717 100644 --- a/configs/yolov7/yolov7_w-p6_syncbn_fast_8x16b-300e_coco.py +++ b/configs/yolov7/yolov7_w-p6_syncbn_fast_8x16b-300e_coco.py @@ -128,7 +128,7 @@ train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) test_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict(type='YOLOv5KeepRatioResize', scale=img_scale), dict( type='LetterResize', @@ -162,7 +162,7 @@ ] tta_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict( type='TestTimeAug', transforms=[ diff --git a/configs/yolov8/README.md b/configs/yolov8/README.md index a284e237e..766aa9916 100644 --- a/configs/yolov8/README.md +++ b/configs/yolov8/README.md @@ -8,11 +8,11 @@ Ultralytics YOLOv8, developed by Ultralytics, is a cutting-edge, state-of-the-ar
-performance +YOLOv8 performance
- + YOLOv8-P5 model structure
@@ -20,18 +20,18 @@ YOLOv8-P5 model structure ### COCO -| Backbone | Arch | size | Mask Refine | SyncBN | AMP | Mem (GB) | box AP | TTA box AP | Config | Download | -| :------: | :--: | :--: | :---------: | :----: | :-: | :------: | :---------: | :--------: | :---------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| YOLOv8-n | P5 | 640 | No | Yes | Yes | 2.8 | 37.2 | | [config](../yolov8/yolov8_n_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_n_syncbn_fast_8xb16-500e_coco/yolov8_n_syncbn_fast_8xb16-500e_coco_20230114_131804-88c11cdb.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_n_syncbn_fast_8xb16-500e_coco/yolov8_n_syncbn_fast_8xb16-500e_coco_20230114_131804.log.json) | -| YOLOv8-n | P5 | 640 | Yes | Yes | Yes | 2.5 | 37.4 (+0.2) | 39.9 | [config](../yolov8/yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco_20230216_101206-b975b1cd.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco_20230216_101206.log.json) | -| YOLOv8-s | P5 | 640 | No | Yes | Yes | 4.0 | 44.2 | | [config](../yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco/yolov8_s_syncbn_fast_8xb16-500e_coco_20230117_180101-5aa5f0f1.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco/yolov8_s_syncbn_fast_8xb16-500e_coco_20230117_180101.log.json) | -| YOLOv8-s | P5 | 640 | Yes | Yes | Yes | 4.0 | 45.1 (+0.9) | 46.8 | [config](../yolov8/yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco_20230216_095938-ce3c1b3f.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco_20230216_095938.log.json) | -| YOLOv8-m | P5 | 640 | No | Yes | Yes | 7.2 | 49.8 | | [config](../yolov8/yolov8_m_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_m_syncbn_fast_8xb16-500e_coco/yolov8_m_syncbn_fast_8xb16-500e_coco_20230115_192200-c22e560a.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_m_syncbn_fast_8xb16-500e_coco/yolov8_m_syncbn_fast_8xb16-500e_coco_20230115_192200.log.json) | -| YOLOv8-m | P5 | 640 | Yes | Yes | Yes | 7.0 | 50.6 (+0.8) | 52.3 | [config](../yolov8/yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco_20230216_223400-f40abfcd.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco_20230216_223400.log.json) | -| 
YOLOv8-l | P5 | 640 | No | Yes | Yes | 9.8 | 52.1 | | [config](../yolov8/yolov8_l_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_l_syncbn_fast_8xb16-500e_coco/yolov8_l_syncbn_fast_8xb16-500e_coco_20230217_182526-189611b6.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_l_syncbn_fast_8xb16-500e_coco/yolov8_l_syncbn_fast_8xb16-500e_coco_20230217_182526.log.json) | -| YOLOv8-l | P5 | 640 | Yes | Yes | Yes | 9.1 | 53.0 (+0.9) | 54.4 | [config](../yolov8/yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco_20230217_120100-5881dec4.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco_20230217_120100.log.json) | -| YOLOv8-x | P5 | 640 | No | Yes | Yes | 12.2 | 52.7 | | [config](../yolov8/yolov8_x_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_x_syncbn_fast_8xb16-500e_coco/yolov8_x_syncbn_fast_8xb16-500e_coco_20230218_023338-5674673c.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_x_syncbn_fast_8xb16-500e_coco/yolov8_x_syncbn_fast_8xb16-500e_coco_20230218_023338.log.json) | -| YOLOv8-x | P5 | 640 | Yes | Yes | Yes | 12.4 | 54.0 (+1.3) | 55.0 | [config](../yolov8/yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco_20230217_120411-079ca8d1.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco_20230217_120411.log.json) | +| Backbone | Arch | size | Mask Refine | SyncBN | AMP | Mem (GB) | box AP | TTA box AP | Config | Download | +| :------: | :--: | :--: | :---------: | :----: | :-: | :------: | :---------: | :--------: | :-------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| YOLOv8-n | P5 | 640 | No | Yes | Yes | 2.8 | 37.2 | | [config](./yolov8_n_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_n_syncbn_fast_8xb16-500e_coco/yolov8_n_syncbn_fast_8xb16-500e_coco_20230114_131804-88c11cdb.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_n_syncbn_fast_8xb16-500e_coco/yolov8_n_syncbn_fast_8xb16-500e_coco_20230114_131804.log.json) | +| YOLOv8-n | P5 | 640 | Yes | Yes | Yes | 2.5 | 37.4 (+0.2) | 39.9 | [config](./yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco_20230216_101206-b975b1cd.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco_20230216_101206.log.json) | +| YOLOv8-s | P5 | 640 | No | Yes | Yes | 4.0 | 44.2 | | 
[config](./yolov8_s_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco/yolov8_s_syncbn_fast_8xb16-500e_coco_20230117_180101-5aa5f0f1.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco/yolov8_s_syncbn_fast_8xb16-500e_coco_20230117_180101.log.json) | +| YOLOv8-s | P5 | 640 | Yes | Yes | Yes | 4.0 | 45.1 (+0.9) | 46.8 | [config](./yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco_20230216_095938-ce3c1b3f.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco_20230216_095938.log.json) | +| YOLOv8-m | P5 | 640 | No | Yes | Yes | 7.2 | 49.8 | | [config](./yolov8_m_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_m_syncbn_fast_8xb16-500e_coco/yolov8_m_syncbn_fast_8xb16-500e_coco_20230115_192200-c22e560a.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_m_syncbn_fast_8xb16-500e_coco/yolov8_m_syncbn_fast_8xb16-500e_coco_20230115_192200.log.json) | +| YOLOv8-m | P5 | 640 | Yes | Yes | Yes | 7.0 | 50.6 (+0.8) | 52.3 | [config](./yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco_20230216_223400-f40abfcd.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco_20230216_223400.log.json) | +| YOLOv8-l | P5 | 640 | No | Yes | Yes | 9.8 | 52.1 | | [config](./yolov8_l_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_l_syncbn_fast_8xb16-500e_coco/yolov8_l_syncbn_fast_8xb16-500e_coco_20230217_182526-189611b6.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_l_syncbn_fast_8xb16-500e_coco/yolov8_l_syncbn_fast_8xb16-500e_coco_20230217_182526.log.json) | +| YOLOv8-l | P5 | 640 | Yes | Yes | Yes | 9.1 | 53.0 (+0.9) | 54.4 | [config](./yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco_20230217_120100-5881dec4.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco_20230217_120100.log.json) | +| YOLOv8-x | P5 | 640 | No | Yes | Yes | 12.2 | 52.7 | | [config](./yolov8_x_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_x_syncbn_fast_8xb16-500e_coco/yolov8_x_syncbn_fast_8xb16-500e_coco_20230218_023338-5674673c.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_x_syncbn_fast_8xb16-500e_coco/yolov8_x_syncbn_fast_8xb16-500e_coco_20230218_023338.log.json) | +| YOLOv8-x | P5 | 640 | Yes | Yes | Yes | 12.4 | 54.0 (+1.3) | 55.0 | [config](./yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco_20230217_120411-079ca8d1.pth) \| 
[log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco_20230217_120411.log.json) | **Note** diff --git a/configs/yolov8/yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py b/configs/yolov8/yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py index 3ab3a2bc8..769a698e4 100644 --- a/configs/yolov8/yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py +++ b/configs/yolov8/yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py @@ -9,7 +9,7 @@ # ===============================Unmodified in most cases==================== pre_transform = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict( type='LoadAnnotations', with_bbox=True, diff --git a/configs/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco.py b/configs/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco.py index adb9c7feb..7e4127efb 100644 --- a/configs/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco.py +++ b/configs/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco.py @@ -169,7 +169,7 @@ ] pre_transform = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict(type='LoadAnnotations', with_bbox=True) ] @@ -245,7 +245,7 @@ pipeline=train_pipeline)) test_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict(type='YOLOv5KeepRatioResize', scale=img_scale), dict( type='LetterResize', diff --git a/configs/yolox/README.md b/configs/yolox/README.md index 11f9e3070..7d5dc683c 100644 --- a/configs/yolox/README.md +++ b/configs/yolox/README.md @@ -19,16 +19,16 @@ YOLOX-l model structure ## 🥳 🚀 Results and Models -| Backbone | Size | Batch Size | AMP | RTMDet-Hyp | Mem (GB) | Box AP | Config | Download | -| :--------: | :--: | :--------: | :-: | :--------: | :------: | :---------: | :-----------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| YOLOX-tiny | 416 | 8xb8 | No | No | 2.8 | 32.7 | [config](https://github.com/open-mmlab/mmyolo/tree/dev/configs/yolox/yolox_tiny_fast_8xb8-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_tiny_8xb8-300e_coco/yolox_tiny_8xb8-300e_coco_20220919_090908-0e40a6fc.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_tiny_8xb8-300e_coco/yolox_tiny_8xb8-300e_coco_20220919_090908.log.json) | -| YOLOX-tiny | 416 | 8xb32 | Yes | Yes | 4.9 | 34.3 (+1.6) | [config](https://github.com/open-mmlab/mmyolo/tree/dev/configs/yolox/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco_20230210_143637-4c338102.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco_20230210_143637.log.json) | -| YOLOX-s | 640 | 8xb8 | Yes | No | 2.9 | 40.7 | 
[config](https://github.com/open-mmlab/mmyolo/tree/dev/configs/yolox/yolox_s_fast_8xb8-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_s_fast_8xb8-300e_coco/yolox_s_fast_8xb8-300e_coco_20230213_142600-2b224d8b.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_s_fast_8xb8-300e_coco/yolox_s_fast_8xb8-300e_coco_20230213_142600.log.json) | -| YOLOX-s | 640 | 8xb32 | Yes | Yes | 9.8 | 41.9 (+1.2) | [config](https://github.com/open-mmlab/mmyolo/tree/dev/configs/yolox/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco_20230210_134645-3a8dfbd7.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco_20230210_134645.log.json) | -| YOLOX-m | 640 | 8xb8 | Yes | No | 4.9 | 46.9 | [config](https://github.com/open-mmlab/mmyolo/tree/dev/configs/yolox/yolox_m_fast_8xb8-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_m_fast_8xb8-300e_coco/yolox_m_fast_8xb8-300e_coco_20230213_160218-a71a6b25.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_m_fast_8xb8-300e_coco/yolox_m_fast_8xb8-300e_coco_20230213_160218.log.json) | -| YOLOX-m | 640 | 8xb32 | Yes | Yes | 17.6 | 47.5 (+0.6) | [config](https://github.com/open-mmlab/mmyolo/tree/dev/configs/yolox/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco_20230210_144328-e657e182.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco_20230210_144328.log.json) | -| YOLOX-l | 640 | 8xb8 | Yes | No | 8.0 | 50.1 | [config](https://github.com/open-mmlab/mmyolo/tree/dev/configs/yolox/yolox_l_fast_8xb8-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_l_fast__8xb8-300e_coco/yolox_l_fast_8xb8-300e_coco_20230213_160715-c731eb1c.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_l_fast_8xb8-300e_coco/yolox_l_fast_8xb8-300e_coco_20230213_160715.log.json) | -| YOLOX-x | 640 | 8xb8 | Yes | No | 9.8 | 51.4 | [config](https://github.com/open-mmlab/mmyolo/tree/dev/configs/yolox/yolox_x_fast_8xb8-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_x_fast_8xb8-300e_coco/yolox_x_fast_8xb8-300e_coco_20230215_133950-1d509fab.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_x_fast_8xb8-300e_coco/yolox_x_fast_8xb8-300e_coco_20230215_133950.log.json) | +| Backbone | Size | Batch Size | AMP | RTMDet-Hyp | Mem (GB) | Box AP | Config | Download | +| :--------: | :--: | :--------: | :-: | :--------: | :------: | :---------: | :-------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| YOLOX-tiny | 416 | 8xb8 | No | No | 2.8 | 32.7 | [config](./yolox_tiny_fast_8xb8-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_tiny_8xb8-300e_coco/yolox_tiny_8xb8-300e_coco_20220919_090908-0e40a6fc.pth) \| 
[log](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_tiny_8xb8-300e_coco/yolox_tiny_8xb8-300e_coco_20220919_090908.log.json) | +| YOLOX-tiny | 416 | 8xb32 | Yes | Yes | 4.9 | 34.3 (+1.6) | [config](./yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco_20230210_143637-4c338102.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco_20230210_143637.log.json) | +| YOLOX-s | 640 | 8xb8 | Yes | No | 2.9 | 40.7 | [config](./yolox_s_fast_8xb8-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_s_fast_8xb8-300e_coco/yolox_s_fast_8xb8-300e_coco_20230213_142600-2b224d8b.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_s_fast_8xb8-300e_coco/yolox_s_fast_8xb8-300e_coco_20230213_142600.log.json) | +| YOLOX-s | 640 | 8xb32 | Yes | Yes | 9.8 | 41.9 (+1.2) | [config](./yolox_s_fast_8xb32-300e-rtmdet-hyp_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco_20230210_134645-3a8dfbd7.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco_20230210_134645.log.json) | +| YOLOX-m | 640 | 8xb8 | Yes | No | 4.9 | 46.9 | [config](./yolox_m_fast_8xb8-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_m_fast_8xb8-300e_coco/yolox_m_fast_8xb8-300e_coco_20230213_160218-a71a6b25.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_m_fast_8xb8-300e_coco/yolox_m_fast_8xb8-300e_coco_20230213_160218.log.json) | +| YOLOX-m | 640 | 8xb32 | Yes | Yes | 17.6 | 47.5 (+0.6) | [config](./yolox_m_fast_8xb32-300e-rtmdet-hyp_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco_20230210_144328-e657e182.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco_20230210_144328.log.json) | +| YOLOX-l | 640 | 8xb8 | Yes | No | 8.0 | 50.1 | [config](./yolox_l_fast_8xb8-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_l_fast_8xb8-300e_coco/yolox_l_fast_8xb8-300e_coco_20230213_160715-c731eb1c.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_l_fast_8xb8-300e_coco/yolox_l_fast_8xb8-300e_coco_20230213_160715.log.json) | +| YOLOX-x | 640 | 8xb8 | Yes | No | 9.8 | 51.4 | [config](./yolox_x_fast_8xb8-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_x_fast_8xb8-300e_coco/yolox_x_fast_8xb8-300e_coco_20230215_133950-1d509fab.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_x_fast_8xb8-300e_coco/yolox_x_fast_8xb8-300e_coco_20230215_133950.log.json) | YOLOX uses a default training configuration of `8xbs8` which results in a long training time, we expect it to use `8xbs32` to speed up the training and not cause a decrease in mAP. We modified `train_batch_size_per_gpu` from 8 to 32, `batch_augments_interval` from 10 to 1 and `base_lr` from 0.01 to 0.04 under YOLOX-s default configuration based on the linear scaling rule, which resulted in mAP degradation. 
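For reference, a minimal sketch of the linear scaling rule arithmetic behind the `base_lr` change described above (assuming the 8-GPU setup implied by the `8xb32` naming; the variable names are illustrative only):

```python
# Linear scaling rule: the base learning rate is scaled in proportion to the
# total batch size (number of GPUs x images per GPU).
default_total_batch_size = 8 * 8    # default YOLOX-s setting: 8 GPUs x 8 images
new_total_batch_size = 8 * 32       # modified setting: 8 GPUs x 32 images
default_base_lr = 0.01

scaled_base_lr = default_base_lr * new_total_batch_size / default_total_batch_size
print(scaled_base_lr)  # 0.04, matching the modified `base_lr` above
```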
Finally, we found that using RTMDet's training hyperparameters can improve performance for YOLOX Tiny/S/M, which also validates the superiority of RTMDet's training hyperparameters. @@ -45,6 +45,35 @@ The modified training parameters are as follows: 1. The test score threshold is 0.001. 2. Due to the need for pre-training weights, we cannot reproduce the performance of the `yolox-nano` model. Please refer to https://github.com/Megvii-BaseDetection/YOLOX/issues/674 for more information. +## YOLOX-Pose + +Based on [MMPose](https://github.com/open-mmlab/mmpose/blob/main/projects/yolox-pose/README.md), we have implemented a YOLOX-based human pose estimator, utilizing the approach outlined in **YOLO-Pose: Enhancing YOLO for Multi Person Pose Estimation Using Object Keypoint Similarity Loss (CVPRW 2022)**. This pose estimator is lightweight and fast, making it well-suited for crowded scenes. + +
+ +
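For reference, the object keypoint similarity (OKS) that gives YOLO-Pose its name, and that the `OksLoss` entries in the configs below optimize, can be sketched as follows. This is only a minimal illustration of the COCO-style OKS formula, not MMYOLO's actual `OksLoss` implementation; the three-keypoint example values are made up (COCO person poses use 17 keypoints with fixed per-keypoint sigmas).

```python
import numpy as np

def object_keypoint_similarity(pred, gt, visible, area, sigmas):
    """COCO-style OKS between one predicted and one ground-truth pose.

    pred, gt: (K, 2) keypoint coordinates; visible: (K,) bool mask of
    annotated keypoints; area: object scale term (e.g. bbox area);
    sigmas: (K,) per-keypoint tolerance constants.
    """
    d2 = np.sum((pred - gt) ** 2, axis=-1)            # squared distances
    e = d2 / (2.0 * area * (2.0 * sigmas) ** 2 + 1e-9)
    ks = np.exp(-e)                                    # per-keypoint similarity
    return float((ks * visible).sum() / max(visible.sum(), 1))

# Hypothetical 3-keypoint pose, only to show the call.
sigmas = np.array([0.026, 0.025, 0.025])
gt = np.array([[100.0, 100.0], [120.0, 98.0], [80.0, 102.0]])
pred = gt + np.array([[2.0, -1.0], [0.0, 3.0], [-4.0, 1.0]])
print(object_keypoint_similarity(pred, gt, np.ones(3, bool), area=90 * 60,
                                 sigmas=sigmas))
```

The `PoseSimOTAAssigner` in the configs further below also appears to mix an OKS term into label assignment (see its `oks_weight` and `oks_calculator` fields), which is why the metric is worth keeping in mind when reading the results table.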
+ +### Results + +| Backbone | Size | Batch Size | AMP | RTMDet-Hyp | Mem (GB) | AP | Config | Download | +| :--------: | :--: | :--------: | :-: | :--------: | :------: | :--: | :------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| YOLOX-tiny | 416 | 8xb32 | Yes | Yes | 5.3 | 52.8 | [config](./pose/yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/pose/yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco/yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco_20230427_080351-2117af67.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/pose/yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco/yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco_20230427_080351.log.json) | +| YOLOX-s | 640 | 8xb32 | Yes | Yes | 10.7 | 63.7 | [config](./pose/yolox-pose_s_8xb32-300e-rtmdet-hyp_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/pose/yolox-pose_s_8xb32-300e-rtmdet-hyp_coco/yolox-pose_s_8xb32-300e-rtmdet-hyp_coco_20230427_005150-e87d843a.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/pose/yolox-pose_s_8xb32-300e-rtmdet-hyp_coco/yolox-pose_s_8xb32-300e-rtmdet-hyp_coco_20230427_005150.log.json) | +| YOLOX-m | 640 | 8xb32 | Yes | Yes | 19.2 | 69.3 | [config](./pose/yolox-pose_m_8xb32-300e-rtmdet-hyp_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/pose/yolox-pose_m_8xb32-300e-rtmdet-hyp_coco/yolox-pose_m_8xb32-300e-rtmdet-hyp_coco_20230427_094024-bbeacc1c.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/pose/yolox-pose_m_8xb32-300e-rtmdet-hyp_coco/yolox-pose_m_8xb32-300e-rtmdet-hyp_coco_20230427_094024.log.json) | +| YOLOX-l | 640 | 8xb32 | Yes | Yes | 30.3 | 71.1 | [config](./pose/yolox-pose_l_8xb32-300e-rtmdet-hyp_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/pose/yolox-pose_l_8xb32-300e-rtmdet-hyp_coco/yolox-pose_l_8xb32-300e-rtmdet-hyp_coco_20230427_041140-82d65ac8.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/pose/yolox-pose_l_8xb32-300e-rtmdet-hyp_coco/yolox-pose_l_8xb32-300e-rtmdet-hyp_coco_20230427_041140.log.json) | + +**Note** + +1. The performance is unstable and may fluctuate and the highest performance weight in `COCO` training may not be the last epoch. The performance shown above is the best model. 
+ +### Installation + +Install MMPose + +``` +mim install -r requirements/mmpose.txt +``` + ## Citation ```latex diff --git a/configs/yolox/metafile.yml b/configs/yolox/metafile.yml index 0926519ec..78ede704a 100644 --- a/configs/yolox/metafile.yml +++ b/configs/yolox/metafile.yml @@ -116,3 +116,51 @@ Models: Metrics: box AP: 47.5 Weights: https://download.openmmlab.com/mmyolo/v0/yolox/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco_20230210_144328-e657e182.pth + - Name: yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco + In Collection: YOLOX + Config: yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco.py + Metadata: + Training Memory (GB): 5.3 + Epochs: 300 + Results: + - Task: Human Pose Estimation + Dataset: COCO + Metrics: + AP: 52.8 + Weights: https://download.openmmlab.com/mmyolo/v0/yolox/pose/yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco/yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco_20230427_080351-2117af67.pth + - Name: yolox-pose_s_8xb32-300e-rtmdet-hyp_coco + In Collection: YOLOX + Config: yolox-pose_s_8xb32-300e-rtmdet-hyp_coco.py + Metadata: + Training Memory (GB): 10.7 + Epochs: 300 + Results: + - Task: Human Pose Estimation + Dataset: COCO + Metrics: + AP: 63.7 + Weights: https://download.openmmlab.com/mmyolo/v0/yolox/pose/yolox-pose_s_8xb32-300e-rtmdet-hyp_coco/yolox-pose_s_8xb32-300e-rtmdet-hyp_coco_20230427_005150-e87d843a.pth + - Name: yolox-pose_m_8xb32-300e-rtmdet-hyp_coco + In Collection: YOLOX + Config: yolox-pose_m_8xb32-300e-rtmdet-hyp_coco.py + Metadata: + Training Memory (GB): 19.2 + Epochs: 300 + Results: + - Task: Human Pose Estimation + Dataset: COCO + Metrics: + AP: 69.3 + Weights: https://download.openmmlab.com/mmyolo/v0/yolox/pose/yolox-pose_m_8xb32-300e-rtmdet-hyp_coco/yolox-pose_m_8xb32-300e-rtmdet-hyp_coco_20230427_094024-bbeacc1c.pth + - Name: yolox-pose_l_8xb32-300e-rtmdet-hyp_coco + In Collection: YOLOX + Config: yolox-pose_l_8xb32-300e-rtmdet-hyp_coco.py + Metadata: + Training Memory (GB): 30.3 + Epochs: 300 + Results: + - Task: Human Pose Estimation + Dataset: COCO + Metrics: + AP: 71.1 + Weights: https://download.openmmlab.com/mmyolo/v0/yolox/pose/yolox-pose_l_8xb32-300e-rtmdet-hyp_coco/yolox-pose_l_8xb32-300e-rtmdet-hyp_coco_20230427_041140-82d65ac8.pth diff --git a/configs/yolox/pose/yolox-pose_l_8xb32-300e-rtmdet-hyp_coco.py b/configs/yolox/pose/yolox-pose_l_8xb32-300e-rtmdet-hyp_coco.py new file mode 100644 index 000000000..96de5e981 --- /dev/null +++ b/configs/yolox/pose/yolox-pose_l_8xb32-300e-rtmdet-hyp_coco.py @@ -0,0 +1,14 @@ +_base_ = ['./yolox-pose_m_8xb32-300e-rtmdet-hyp_coco.py'] + +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolox/yolox_l_fast_8xb8-300e_coco/yolox_l_fast_8xb8-300e_coco_20230213_160715-c731eb1c.pth' # noqa + +# ========================modified parameters====================== +deepen_factor = 1.0 +widen_factor = 1.0 + +# =======================Unmodified in most cases================== +# model settings +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/configs/yolox/pose/yolox-pose_m_8xb32-300e-rtmdet-hyp_coco.py b/configs/yolox/pose/yolox-pose_m_8xb32-300e-rtmdet-hyp_coco.py new file mode 100644 index 000000000..f78d6a3a2 --- /dev/null +++ b/configs/yolox/pose/yolox-pose_m_8xb32-300e-rtmdet-hyp_coco.py @@ -0,0 +1,14 @@ +_base_ = ['./yolox-pose_s_8xb32-300e-rtmdet-hyp_coco.py'] + +load_from = 
'https://download.openmmlab.com/mmyolo/v0/yolox/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco_20230210_144328-e657e182.pth' # noqa + +# ========================modified parameters====================== +deepen_factor = 0.67 +widen_factor = 0.75 + +# =======================Unmodified in most cases================== +# model settings +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/configs/yolox/pose/yolox-pose_s_8xb32-300e-rtmdet-hyp_coco.py b/configs/yolox/pose/yolox-pose_s_8xb32-300e-rtmdet-hyp_coco.py new file mode 100644 index 000000000..8fa2172c9 --- /dev/null +++ b/configs/yolox/pose/yolox-pose_s_8xb32-300e-rtmdet-hyp_coco.py @@ -0,0 +1,136 @@ +_base_ = '../yolox_s_fast_8xb32-300e-rtmdet-hyp_coco.py' + +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolox/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco_20230210_134645-3a8dfbd7.pth' # noqa + +num_keypoints = 17 +scaling_ratio_range = (0.75, 1.0) +mixup_ratio_range = (0.8, 1.6) +num_last_epochs = 20 + +# model settings +model = dict( + bbox_head=dict( + type='YOLOXPoseHead', + head_module=dict( + type='YOLOXPoseHeadModule', + num_classes=1, + num_keypoints=num_keypoints, + ), + loss_pose=dict( + type='OksLoss', + metainfo='configs/_base_/pose/coco.py', + loss_weight=30.0)), + train_cfg=dict( + assigner=dict( + type='PoseSimOTAAssigner', + center_radius=2.5, + oks_weight=3.0, + iou_calculator=dict(type='mmdet.BboxOverlaps2D'), + oks_calculator=dict( + type='OksLoss', metainfo='configs/_base_/pose/coco.py'))), + test_cfg=dict(score_thr=0.01)) + +# pipelines +pre_transform = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_keypoints=True) +] + +img_scale = _base_.img_scale + +train_pipeline_stage1 = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='RandomAffine', + scaling_ratio_range=scaling_ratio_range, + border=(-img_scale[0] // 2, -img_scale[1] // 2)), + dict( + type='YOLOXMixUp', + img_scale=img_scale, + ratio_range=mixup_ratio_range, + pad_val=114.0, + pre_transform=pre_transform), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict(type='FilterAnnotations', by_keypoints=True, keep_empty=False), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape')) +] + +train_pipeline_stage2 = [ + *pre_transform, + dict(type='Resize', scale=img_scale, keep_ratio=True), + dict( + type='mmdet.Pad', + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict(type='FilterAnnotations', by_keypoints=True, keep_empty=False), + dict(type='PackDetInputs') +] + +test_pipeline = [ + *pre_transform, + dict(type='Resize', scale=img_scale, keep_ratio=True), + dict( + type='mmdet.Pad', + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict( + type='PackDetInputs', + meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip_indices')) +] + +# dataset settings +dataset_type = 'PoseCocoDataset' + +train_dataloader = dict( + dataset=dict( + type=dataset_type, + data_mode='bottomup', + ann_file='annotations/person_keypoints_train2017.json', + 
pipeline=train_pipeline_stage1)) + +val_dataloader = dict( + dataset=dict( + type=dataset_type, + data_mode='bottomup', + ann_file='annotations/person_keypoints_val2017.json', + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + _delete_=True, + type='mmpose.CocoMetric', + ann_file=_base_.data_root + 'annotations/person_keypoints_val2017.json', + score_mode='bbox') +test_evaluator = val_evaluator + +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +visualizer = dict(type='mmpose.PoseLocalVisualizer') + +custom_hooks = [ + dict( + type='YOLOXModeSwitchHook', + num_last_epochs=num_last_epochs, + new_train_pipeline=train_pipeline_stage2, + priority=48), + dict(type='mmdet.SyncNormHook', priority=48), + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49) +] diff --git a/configs/yolox/pose/yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco.py b/configs/yolox/pose/yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco.py new file mode 100644 index 000000000..a7399065e --- /dev/null +++ b/configs/yolox/pose/yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco.py @@ -0,0 +1,70 @@ +_base_ = './yolox-pose_s_8xb32-300e-rtmdet-hyp_coco.py' + +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolox/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco_20230210_143637-4c338102.pth' # noqa + +deepen_factor = 0.33 +widen_factor = 0.375 +scaling_ratio_range = (0.75, 1.0) + +# model settings +model = dict( + data_preprocessor=dict(batch_augments=[ + dict( + type='YOLOXBatchSyncRandomResize', + random_size_range=(320, 640), + size_divisor=32, + interval=1) + ]), + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +# data settings +img_scale = _base_.img_scale +pre_transform = _base_.pre_transform + +train_pipeline_stage1 = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='RandomAffine', + scaling_ratio_range=scaling_ratio_range, + border=(-img_scale[0] // 2, -img_scale[1] // 2)), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict( + type='FilterAnnotations', + by_keypoints=True, + min_gt_bbox_wh=(1, 1), + keep_empty=False), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape')) +] + +test_pipeline = [ + *pre_transform, + dict(type='Resize', scale=(416, 416), keep_ratio=True), + dict( + type='mmdet.Pad', + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict( + type='PackDetInputs', + meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip_indices')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline_stage1)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader diff --git a/configs/yolox/yolox_p5_tta.py b/configs/yolox/yolox_p5_tta.py index 3a5b4652f..7ffe3490c 100644 --- a/configs/yolox/yolox_p5_tta.py +++ b/configs/yolox/yolox_p5_tta.py @@ -1,11 +1,12 @@ -# TODO: Need to solve the problem of multiple file_client_args parameters -# _file_client_args = dict( +# TODO: Need to solve the problem of multiple backend_args parameters +# _backend_args = dict( # backend='petrel', # path_mapping=dict({ # './data/': 
's3://openmmlab/datasets/detection/', # 'data/': 's3://openmmlab/datasets/detection/' # })) -_file_client_args = dict(backend='disk') + +_backend_args = None tta_model = dict( type='mmdet.DetTTAModel', @@ -24,7 +25,7 @@ # PackDetIn PackDetIn PackDetIn PackDetIn PackDetIn PackDetIn # noqa tta_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_file_client_args), + dict(type='LoadImageFromFile', backend_args=_backend_args), dict( type='TestTimeAug', transforms=[ diff --git a/configs/yolox/yolox_s_fast_8xb8-300e_coco.py b/configs/yolox/yolox_s_fast_8xb8-300e_coco.py index e751b0d6a..b371ea11d 100644 --- a/configs/yolox/yolox_s_fast_8xb8-300e_coco.py +++ b/configs/yolox/yolox_s_fast_8xb8-300e_coco.py @@ -149,7 +149,7 @@ test_cfg=model_test_cfg) pre_transform = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict(type='LoadAnnotations', with_bbox=True) ] @@ -217,7 +217,7 @@ pipeline=train_pipeline_stage1)) test_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict(type='mmdet.Resize', scale=img_scale, keep_ratio=True), dict( type='mmdet.Pad', diff --git a/configs/yolox/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco.py b/configs/yolox/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco.py index d133c95f0..28e539c94 100644 --- a/configs/yolox/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco.py +++ b/configs/yolox/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco.py @@ -52,7 +52,7 @@ ] test_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict(type='mmdet.Resize', scale=(416, 416), keep_ratio=True), # note dict( type='mmdet.Pad', diff --git a/configs/yolox/yolox_tiny_fast_8xb8-300e_coco.py b/configs/yolox/yolox_tiny_fast_8xb8-300e_coco.py index 141873421..fd175a6c7 100644 --- a/configs/yolox/yolox_tiny_fast_8xb8-300e_coco.py +++ b/configs/yolox/yolox_tiny_fast_8xb8-300e_coco.py @@ -50,7 +50,7 @@ ] test_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict(type='mmdet.Resize', scale=test_img_scale, keep_ratio=True), # note dict( type='mmdet.Pad', @@ -69,7 +69,7 @@ # Config for Test Time Augmentation. (TTA) tta_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict( type='TestTimeAug', transforms=[ diff --git a/demo/15_minutes_instance_segmentation.ipynb b/demo/15_minutes_instance_segmentation.ipynb new file mode 100644 index 000000000..a09a1a105 --- /dev/null +++ b/demo/15_minutes_instance_segmentation.ipynb @@ -0,0 +1,658 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "x7seefPduh36" + }, + "source": [ + "
\n", + " \n", + "
 
\n", + "
\n", + " OpenMMLab website\n", + " \n", + " \n", + " HOT\n", + " \n", + " \n", + "     \n", + " OpenMMLab platform\n", + " \n", + " \n", + " TRY IT OUT\n", + " \n", + " \n", + "
\n", + "
 
\n", + "\n", + "\"Open\n", + "\n", + "[![PyPI](https://img.shields.io/pypi/v/mmyolo)](https://pypi.org/project/mmyolo)\n", + "[![docs](https://img.shields.io/badge/docs-latest-blue)](https://mmyolo.readthedocs.io/en/latest/)\n", + "[![deploy](https://github.com/open-mmlab/mmyolo/workflows/deploy/badge.svg)](https://github.com/open-mmlab/mmyolo/actions)\n", + "[![codecov](https://codecov.io/gh/open-mmlab/mmyolo/branch/main/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmyolo)\n", + "[![license](https://img.shields.io/github/license/open-mmlab/mmyolo.svg)](https://github.com/open-mmlab/mmyolo/blob/main/LICENSE)\n", + "[![open issues](https://isitmaintained.com/badge/open/open-mmlab/mmyolo.svg)](https://github.com/open-mmlab/mmyolo/issues)\n", + "[![issue resolution](https://isitmaintained.com/badge/resolution/open-mmlab/mmyolo.svg)](https://github.com/open-mmlab/mmyolo/issues)\n", + "\n", + "[📘Documentation](https://mmyolo.readthedocs.io/en/latest/) |\n", + "[🛠️Installation](https://mmyolo.readthedocs.io/en/latest/get_started/installation.html) |\n", + "[👀Model Zoo](https://mmyolo.readthedocs.io/en/latest/model_zoo.html) |\n", + "[🆕Update News](https://mmyolo.readthedocs.io/en/latest/notes/changelog.html) |\n", + "[🤔Reporting Issues](https://github.com/open-mmlab/mmyolo/issues/new/choose)\n", + "\n", + "
\n", + "\n", + "
\n", + " \n", + " \"\"\n", + " \"\"\n", + " \n", + " \"\"\n", + " \"\"\n", + " \n", + " \"\"\n", + " \"\"\n", + " \n", + " \"\"\n", + " \"\"\n", + " \n", + " \"\"\n", + " \"\"\n", + " \n", + " \"\"\n", + "
" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "V6W8P5XEJGoc" + }, + "source": [ + "# 15 minutes to get started with MMYOLO instance segmentation\n", + "\n", + "Instance segmentation is a task in computer vision that aims to segment each object in an image and assign each object a unique identifier.\n", + "\n", + "Unlike semantic segmentation, instance segmentation not only segments out different categories in an image, but also separates different instances of the same category.\n", + "\n", + "
\n", + "\"Instance\n", + "
\n", + "\n", + "Taking the downloadable balloon dataset as an example, I will guide you through a 15-minute easy introduction to MMYOLO instance segmentation. The entire process includes the following steps:\n", + "\n", + "- [Installation](#installation)\n", + "- [Dataset](#dataset)\n", + "- [Config](#config)\n", + "- [Training](#training)\n", + "- [Testing](#testing)\n", + "- [EasyDeploy](#easydeploy-deployment)\n", + "\n", + "In this tutorial, we will use YOLOv5-s as an example. For the demo configuration of the balloon dataset with other YOLO series algorithms, please refer to the corresponding algorithm configuration folder." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "Ae5SqsA7wYGQ" + }, + "source": [ + "## Installation\n", + "\n", + "Assuming you've already installed Conda in advance, then install PyTorch using the following commands." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "XVLRaEIzwW-6", + "outputId": "901b5db6-b1d7-4830-e746-485ee76d6648" + }, + "outputs": [], + "source": [ + "# -----------------------------------------------------------------------------------------\n", + "# If you are using colab, you can skip this cell for PyTorch is pre-installed on the colab.\n", + "# -----------------------------------------------------------------------------------------\n", + "!python -V\n", + "# Check nvcc version\n", + "!nvcc -V\n", + "# Check GCC version\n", + "!gcc --version\n", + "# Create a new Conda environment\n", + "%conda create -n mmyolo python=3.8 -y\n", + "%conda activate mmyolo\n", + "# If you have GPU\n", + "%conda install pytorch torchvision -c pytorch\n", + "# If you only have CPU\n", + "# %conda install pytorch torchvision cpuonly -c pytorch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check PyTorch version\n", + "import torch\n", + "print(torch.__version__)\n", + "print(torch.cuda.is_available())" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install MMYOLO and dependency libraries using the following commands.\n", + "For details about how to configure the environment, see [Installation and verification](https://mmyolo.readthedocs.io/en/latest/get_started/installation.html).\n", + "```{note}\n", + "Note: Since this repo uses OpenMMLab 2.0, it is better to create a new conda virtual environment to prevent conflicts with the repo installed in OpenMMLab 1.0.\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "-qATUuntwmfD", + "outputId": "24be577b-efce-46f2-8b2f-a65d02824467" + }, + "outputs": [], + "source": [ + "!git clone https://github.com/open-mmlab/mmyolo.git\n", + "%cd mmyolo\n", + "%pip install -U openmim\n", + "!mim install -r requirements/mminstall.txt\n", + "# Install albumentations\n", + "!mim install -r requirements/albu.txt\n", + "# Install MMYOLO\n", + "!mim install -v -e .\n", + "# \"-v\" means verbose, or more output\n", + "# \"-e\" means installing a project in editable mode,\n", + "# thus any local modifications made to the code will take effect without reinstallation." 
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dataset\n", + "\n", + "The Balloon dataset is a single-class dataset that consists of 74 images and includes annotated information required for training. Here is an example image from the dataset:\n", + "\n", + "
\n", + "\"balloon\n", + "
\n", + "\n", + "You can download and use it directly by the following command:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "gMQXwWuIw3ef", + "outputId": "c8efeac7-5b0c-4342-b5af-d3e790e358c3" + }, + "outputs": [], + "source": [ + "!python tools/misc/download_dataset.py --dataset-name balloon --save-dir ./data/balloon --unzip --delete\n", + "!python ./tools/dataset_converters/balloon2coco.py" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "covQskXXw2ul" + }, + "source": [ + "The data for the MMYOLO project is located in the MMYOLO project directory. The `train.json` and `val.json` files store the annotations in COCO format, while the `data/balloon/train` and `data/balloon/val` directories contain all the images for the dataset.\n", + "\n", + "## Config\n", + "\n", + "Taking YOLOv5 algorithm as an example, considering the limited GPU memory of users, we need to modify some default training parameters to make them run smoothly. The key parameters to be modified are as follows:\n", + "\n", + "- YOLOv5 is an Anchor-Based algorithm, and different datasets need to calculate suitable anchors adaptively.\n", + "- The default config uses 8 GPUs with a batch size of 16 per GPU. Now change it to a single GPU with a batch size of 12.\n", + "- In principle, the learning rate should be linearly scaled accordingly when the batch size is changed, but actual measurements have found that this is not necessary.\n", + "\n", + "To perform the specific operation, create a new configuration file named `yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py` in the `configs/yolov5/ins_seg` folder. For convenience, we have already provided this configuration file. 
Copy the following contents into the configuration file.\n", + "\n", + "```python\n", + "_base_ = './yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa\n", + "\n", + "data_root = 'data/balloon/' # dataset root\n", + "# Training set annotation file of json path\n", + "train_ann_file = 'train.json'\n", + "train_data_prefix = 'train/' # Dataset prefix\n", + "# Validation set annotation file of json path\n", + "val_ann_file = 'val.json'\n", + "val_data_prefix = 'val/'\n", + "metainfo = {\n", + " 'classes': ('balloon', ), # dataset category name\n", + " 'palette': [\n", + " (220, 20, 60),\n", + " ]\n", + "}\n", + "num_classes = 1\n", + "# Set batch size to 4\n", + "train_batch_size_per_gpu = 4\n", + "# dataloader num workers\n", + "train_num_workers = 2\n", + "log_interval = 1\n", + "#####################\n", + "train_dataloader = dict(\n", + " batch_size=train_batch_size_per_gpu,\n", + " num_workers=train_num_workers,\n", + " dataset=dict(\n", + " data_root=data_root,\n", + " metainfo=metainfo,\n", + " data_prefix=dict(img=train_data_prefix),\n", + " ann_file=train_ann_file))\n", + "val_dataloader = dict(\n", + " dataset=dict(\n", + " data_root=data_root,\n", + " metainfo=metainfo,\n", + " data_prefix=dict(img=val_data_prefix),\n", + " ann_file=val_ann_file))\n", + "test_dataloader = val_dataloader\n", + "val_evaluator = dict(ann_file=data_root + val_ann_file)\n", + "test_evaluator = val_evaluator\n", + "default_hooks = dict(logger=dict(interval=log_interval))\n", + "#####################\n", + "\n", + "model = dict(bbox_head=dict(head_module=dict(num_classes=num_classes)))\n", + "```\n", + "\n", + "The above configuration inherits from `yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py` and updates configurations such as `data_root`, `metainfo`, `train_dataloader`, `val_dataloader`, `num_classes`, etc., based on the characteristics of the balloon dataset.\n", + "\n", + "## Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python tools/train.py configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "TQ0h6sv_rJxq" + }, + "source": [ + "After running the training command mentioned above, the folder `work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance` will be automatically generated. The weight files and the training configuration file for this session will be saved in this folder. On a lower-end GPU like the GTX 1660, the entire training process will take approximately 30 minutes.\n", + "\n", + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "The performance on `val.json` is as follows:\n", + "\n", + "```text\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.330\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.509\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.317\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.103\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.417\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.150\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.396\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.454\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.317\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.525\n", + "```\n", + "\n", + "The above performance is obtained by printing using the COCO API, where -1 indicates the absence of objects of that scale.\n", + "\n", + "### Some Notes\n", + "\n", + "Two key warnings are printed during training:\n", + "\n", + "- You are using `YOLOv5Head` with num_classes == 1. The loss_cls will be 0. This is a normal phenomenon.\n", + "\n", + "The warning is because the `num_classes` currently trained is 1, the loss of the classification branch is always 0 according to the community of the YOLOv5 algorithm, which is a normal phenomenon.\n", + "\n", + "### Training is resumed after the interruption\n", + "\n", + "If you stop training, you can add `--resume` to the end of the training command and the program will automatically resume training with the latest weights file from `work_dirs`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python tools/train.py configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py --resume" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "3sJxvQoUrMhX" + }, + "source": [ + "### Save GPU memory strategy\n", + "\n", + "The above config requires about 3G RAM, so if you don't have enough, consider turning on mixed-precision training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python tools/train.py configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py --amp" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "jVJdyHTxrQ9a" + }, + "source": [ + "### Training visualization\n", + "\n", + "MMYOLO currently supports local, TensorBoard, WandB and other back-end visualization. The default is to use local visualization, and you can switch to WandB and other real-time visualization of various indicators in the training process.\n", + "\n", + "#### 1 WandB\n", + "\n", + "WandB visualization need registered in website, and in the https://wandb.ai/settings for wandb API Keys.\n", + "\n", + "
\n", + "\"image\"/\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install wandb\n", + "# After running wandb login, enter the API Keys obtained above, and the login is successful.\n", + "!wandb login" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "Yu0_4YYRrbyY" + }, + "source": [ + "Add the wandb config at the end of config file we just created: `configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py`.\n", + "\n", + "```python\n", + "visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')])\n", + "```\n", + "\n", + "Running the training command and you will see the loss, learning rate, and coco/bbox_mAP visualizations in the link." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python tools/train.py configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "f_DyzfDIzwMa" + }, + "source": [ + "
\n", + "\"image\"/\n", + "
\n", + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "#### 2 Tensorboard\n", + "\n", + "Install Tensorboard using the following command." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "gHkGlii3n29Q" + }, + "outputs": [], + "source": [ + "%pip install tensorboard" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "bE-nx9TY1P-M" + }, + "source": [ + "Add the `tensorboard` config at the end of config file we just created: `configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py`.\n", + "\n", + "```python\n", + "visualizer = dict(vis_backends=[dict(type='LocalVisBackend'),dict(type='TensorboardVisBackend')])\n", + "```\n", + "\n", + "After re-running the training command, Tensorboard file will be generated in the visualization folder `work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance/{timestamp}/vis_data`.\n", + "We can use Tensorboard to view the loss, learning rate, and coco/bbox_mAP visualizations from a web link by running the following command:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "g8fZgokho5CE" + }, + "outputs": [], + "source": [ + "!tensorboard --logdir=work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "GUZ7MPoaro-o" + }, + "source": [ + "## Testing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "VYmxtE0GunTB", + "outputId": "f440807c-1931-4810-b76d-617f73fde227" + }, + "outputs": [], + "source": [ + "!python tools/test.py configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance best_coco_bbox_mAP_epoch_300.pth --show-dir show_results" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "_cFocUqN0BCb" + }, + "source": [ + "Run the above test command, you can not only get the AP performance printed in the **Training** section, You can also automatically save the result images to the `work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance/{timestamp}/show_results` folder. Below is one of the result images, the left image is the actual annotation, and the right image is the inference result of the model.\n", + "\n", + "
\n", + "\"result_img\"/\n", + "
\n", + "\n", + "You can also visualize model inference results in a browser window if you use `WandbVisBackend` or `TensorboardVisBackend`.\n", + "\n", + "## Feature map visualization\n", + "\n", + "MMYOLO provides visualization scripts for feature map to analyze the current model training. Please refer to [Feature Map Visualization](../recommended_topics/visualization.md)\n", + "\n", + "Due to the bias of direct visualization of `test_pipeline`, we need to modify the `test_pipeline` of `configs/yolov5/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py`\n", + "\n", + "```python\n", + "test_pipeline = [\n", + " dict(\n", + " type='LoadImageFromFile',\n", + " file_client_args=_base_.file_client_args),\n", + " dict(type='YOLOv5KeepRatioResize', scale=img_scale),\n", + " dict(\n", + " type='LetterResize',\n", + " scale=img_scale,\n", + " allow_scale_up=False,\n", + " pad_val=dict(img=114)),\n", + " dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'),\n", + " dict(\n", + " type='mmdet.PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor', 'pad_param'))\n", + "]\n", + "```\n", + "\n", + "to the following config:\n", + "\n", + "```python\n", + "test_pipeline = [\n", + " dict(\n", + " type='LoadImageFromFile',\n", + " file_client_args=_base_.file_client_args),\n", + " dict(type='mmdet.Resize', scale=img_scale, keep_ratio=False), # modify the LetterResize to mmdet.Resize\n", + " dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'),\n", + " dict(\n", + " type='mmdet.PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor'))\n", + "]\n", + "```\n", + "\n", + "Let's choose the `data/balloon/train/3927754171_9011487133_b.jpg` image as an example to visualize the output feature maps of YOLOv5 backbone and neck layers.\n", + "\n", + "**1. Visualize the three channels of YOLOv5 backbone**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python demo/featmap_vis_demo.py data/balloon/train/3927754171_9011487133_b.jpg onfigs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance/best_coco_bbox_mAP_epoch_300.pth --target-layers backbone --channel-reduction squeeze_mean" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "The result will be saved to the output folder in current path. Three output feature maps plotted in the above figure correspond to small, medium and large output feature maps.\n", + "\n", + "**2. Visualize the three channels of YOLOv5 neck**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python demo/featmap_vis_demo.py data/balloon/train/3927754171_9011487133_b.jpg \\\n", + " configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py \\\n", + " work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance/best_coco_bbox_mAP_epoch_300.pth \\\n", + " --target-layers neck \\\n", + " --channel-reduction squeeze_mean" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "**3. Grad-Based CAM visualization**\n", + "TODO" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## EasyDeploy deployment\n", + "TODO\n", + "\n", + "This completes the transformation deployment of the trained model and checks the inference results. This is the end of the tutorial.\n", + "\n", + "If you encounter problems during training or testing, please check the [common troubleshooting steps](https://mmyolo.readthedocs.io/en/dev/recommended_topics/troubleshooting_steps.html) first and feel free to open an [issue](https://github.com/open-mmlab/mmyolo/issues/new/choose) if you still can't solve it." + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "provenance": [], + "toc_visible": true + }, + "gpuClass": "standard", + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/demo/15_minutes_object_detection.ipynb b/demo/15_minutes_object_detection.ipynb new file mode 100644 index 000000000..47e0ccbd8 --- /dev/null +++ b/demo/15_minutes_object_detection.ipynb @@ -0,0 +1,1002 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "x7seefPduh36" + }, + "source": [ + "
\n", + " \n", + "
 
\n", + "
\n", + " OpenMMLab website\n", + " \n", + " \n", + " HOT\n", + " \n", + " \n", + "     \n", + " OpenMMLab platform\n", + " \n", + " \n", + " TRY IT OUT\n", + " \n", + " \n", + "
\n", + "
 
\n", + "\n", + "\"Open\n", + "\n", + "[![PyPI](https://img.shields.io/pypi/v/mmyolo)](https://pypi.org/project/mmyolo)\n", + "[![docs](https://img.shields.io/badge/docs-latest-blue)](https://mmyolo.readthedocs.io/en/latest/)\n", + "[![deploy](https://github.com/open-mmlab/mmyolo/workflows/deploy/badge.svg)](https://github.com/open-mmlab/mmyolo/actions)\n", + "[![codecov](https://codecov.io/gh/open-mmlab/mmyolo/branch/main/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmyolo)\n", + "[![license](https://img.shields.io/github/license/open-mmlab/mmyolo.svg)](https://github.com/open-mmlab/mmyolo/blob/main/LICENSE)\n", + "[![open issues](https://isitmaintained.com/badge/open/open-mmlab/mmyolo.svg)](https://github.com/open-mmlab/mmyolo/issues)\n", + "[![issue resolution](https://isitmaintained.com/badge/resolution/open-mmlab/mmyolo.svg)](https://github.com/open-mmlab/mmyolo/issues)\n", + "\n", + "[📘Documentation](https://mmyolo.readthedocs.io/en/latest/) |\n", + "[🛠️Installation](https://mmyolo.readthedocs.io/en/latest/get_started/installation.html) |\n", + "[👀Model Zoo](https://mmyolo.readthedocs.io/en/latest/model_zoo.html) |\n", + "[🆕Update News](https://mmyolo.readthedocs.io/en/latest/notes/changelog.html) |\n", + "[🤔Reporting Issues](https://github.com/open-mmlab/mmyolo/issues/new/choose)\n", + "\n", + "
\n", + "\n", + "
\n", + " \n", + " \"\"\n", + " \"\"\n", + " \n", + " \"\"\n", + " \"\"\n", + " \n", + " \"\"\n", + " \"\"\n", + " \n", + " \"\"\n", + " \"\"\n", + " \n", + " \"\"\n", + " \"\"\n", + " \n", + " \"\"\n", + "
" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "V6W8P5XEJGoc" + }, + "source": [ + "# 15 minutes to get started with MMYOLO object detection\n", + "\n", + "Object detection task refers to that given a picture, the network predicts all the categories of objects included in the picture and the corresponding boundary boxes\n", + "\n", + "
\n", + "\"object\n", + "
\n", + "\n", + "Take the small dataset of cat as an example, you can easily learn MMYOLO object detection in 15 minutes. The whole process consists of the following steps:\n", + "\n", + "- [Installation](#installation)\n", + "- [Dataset](#dataset)\n", + "- [Config](#config)\n", + "- [Training](#training)\n", + "- [Testing](#testing)\n", + "- [EasyDeploy](#easydeploy-deployment)\n", + "\n", + "In this tutorial, we take YOLOv5-s as an example. For the rest of the YOLO series algorithms, please see the corresponding algorithm configuration folder." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "Ae5SqsA7wYGQ" + }, + "source": [ + "## Installation\n", + "\n", + "Assuming you've already installed Conda in advance, then install PyTorch using the following commands." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "XVLRaEIzwW-6", + "outputId": "901b5db6-b1d7-4830-e746-485ee76d6648" + }, + "outputs": [], + "source": [ + "# -----------------------------------------------------------------------------------------\n", + "# If you are using colab, you can skip this cell for PyTorch is pre-installed on the colab.\n", + "# -----------------------------------------------------------------------------------------\n", + "!python -V\n", + "# Check nvcc version\n", + "!nvcc -V\n", + "# Check GCC version\n", + "!gcc --version\n", + "# Create a new Conda environment\n", + "%conda create -n mmyolo python=3.8 -y\n", + "%conda activate mmyolo\n", + "# If you have GPU\n", + "%conda install pytorch torchvision -c pytorch\n", + "# If you only have CPU\n", + "# %conda install pytorch torchvision cpuonly -c pytorch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check PyTorch version\n", + "import torch\n", + "print(torch.__version__)\n", + "print(torch.cuda.is_available())" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install MMYOLO and dependency libraries using the following commands.\n", + "For details about how to configure the environment, see [Installation and verification](https://mmyolo.readthedocs.io/en/latest/get_started/installation.html).\n", + "```{note}\n", + "Note: Since this repo uses OpenMMLab 2.0, it is better to create a new conda virtual environment to prevent conflicts with the repo installed in OpenMMLab 1.0.\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "-qATUuntwmfD", + "outputId": "24be577b-efce-46f2-8b2f-a65d02824467" + }, + "outputs": [], + "source": [ + "!git clone https://github.com/open-mmlab/mmyolo.git\n", + "%cd mmyolo\n", + "%pip install -U openmim\n", + "!mim install -r requirements/mminstall.txt\n", + "# Install albumentations\n", + "!mim install -r requirements/albu.txt\n", + "# Install MMYOLO\n", + "!mim install -v -e .\n", + "# \"-v\" means verbose, or more output\n", + "# \"-e\" means installing a project in editable mode,\n", + "# thus any local modifications made to the code will take effect without reinstallation." 
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dataset\n", + "\n", + "The Cat dataset is a single-category dataset consisting of 144 pictures (the original pictures are provided by @RangeKing, and cleaned by @PeterH0323), which contains the annotation information required for training. The sample image is shown below:\n", + "\n", + "
\n", + "\"cat\n", + "
\n", + "\n", + "You can download and use it directly by the following command:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "gMQXwWuIw3ef", + "outputId": "c8efeac7-5b0c-4342-b5af-d3e790e358c3" + }, + "outputs": [], + "source": [ + "!python tools/misc/download_dataset.py --dataset-name cat --save-dir ./data/cat --unzip --delete" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "covQskXXw2ul" + }, + "source": [ + "This dataset is automatically downloaded to the `./data/cat` dir with the following directory structure:\n", + "\n", + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "The cat dataset is located in the mmyolo project dir, and `data/cat/annotations` stores annotations in COCO format, and `data/cat/images` stores all images\n", + "\n", + "## Config\n", + "\n", + "Taking YOLOv5 algorithm as an example, considering the limited GPU memory of users, we need to modify some default training parameters to make them run smoothly. The key parameters to be modified are as follows:\n", + "\n", + "- YOLOv5 is an Anchor-Based algorithm, and different datasets need to calculate suitable anchors adaptively\n", + "- The default config uses 8 GPUs with a batch size of 16 per GPU. Now change it to a single GPU with a batch size of 12.\n", + "- The default training epoch is 300. Change it to 40 epoch\n", + "- Given the small size of the dataset, we opted to use fixed backbone weights\n", + "- In principle, the learning rate should be linearly scaled accordingly when the batch size is changed, but actual measurements have found that this is not necessary\n", + "\n", + "Create a `yolov5_s-v61_fast_1xb12-40e_cat.py` config file in the `configs/yolov5` folder (we have provided this config for you to use directly) and copy the following into the config file.\n", + "\n", + "```python\n", + "# Inherit and overwrite part of the config based on this config\n", + "_base_ = 'yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py'\n", + "\n", + "data_root = './data/cat/' # dataset root\n", + "class_name = ('cat', ) # dataset category name\n", + "num_classes = len(class_name) # dataset category number\n", + "# metainfo is a configuration that must be passed to the dataloader, otherwise it is invalid\n", + "# palette is a display color for category at visualization\n", + "# The palette length must be greater than or equal to the length of the classes\n", + "metainfo = dict(classes=class_name, palette=[(20, 220, 60)])\n", + "\n", + "# Adaptive anchor based on tools/analysis_tools/optimize_anchors.py\n", + "anchors = [\n", + " [(68, 69), (154, 91), (143, 162)], # P3/8\n", + " [(242, 160), (189, 287), (391, 207)], # P4/16\n", + " [(353, 337), (539, 341), (443, 432)] # P5/32\n", + "]\n", + "# Max training 40 epoch\n", + "max_epochs = 40\n", + "# bs = 12\n", + "train_batch_size_per_gpu = 12\n", + "# dataloader num workers\n", + "train_num_workers = 4\n", + "\n", + "# load COCO pre-trained weight\n", + "load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth' # noqa\n", + "\n", + "model = dict(\n", + " # Fixed the weight of the entire backbone without training\n", + " backbone=dict(frozen_stages=4),\n", + " bbox_head=dict(\n", + " head_module=dict(num_classes=num_classes),\n", + " prior_generator=dict(base_sizes=anchors)\n", + " ))\n", + "\n", + "train_dataloader = dict(\n", + " batch_size=train_batch_size_per_gpu,\n", + " num_workers=train_num_workers,\n", + " dataset=dict(\n", + " data_root=data_root,\n", + " metainfo=metainfo,\n", + " # Dataset annotation file of json path\n", + " ann_file='annotations/trainval.json',\n", + " # Dataset prefix\n", + " data_prefix=dict(img='images/')))\n", + "\n", + "val_dataloader = dict(\n", + " dataset=dict(\n", + " metainfo=metainfo,\n", + " data_root=data_root,\n", + " ann_file='annotations/test.json',\n", + " data_prefix=dict(img='images/')))\n", + "\n", + "test_dataloader = val_dataloader\n", + "\n", + "_base_.optim_wrapper.optimizer.batch_size_per_gpu = train_batch_size_per_gpu\n", + "\n", + "val_evaluator = dict(ann_file=data_root + 
'annotations/test.json')\n", + "test_evaluator = val_evaluator\n", + "\n", + "default_hooks = dict(\n", + " # Save weights every 10 epochs, and keep at most two weight files.\n", + " # The best model is saved automatically during model evaluation\n", + " checkpoint=dict(interval=10, max_keep_ckpts=2, save_best='auto'),\n", + " # The warmup_mim_iter parameter is critical.\n", + " # The default value is 1000, which is not suitable for the cat dataset.\n", + " param_scheduler=dict(max_epochs=max_epochs, warmup_mim_iter=10),\n", + " # The log printing interval is 5\n", + " logger=dict(type='LoggerHook', interval=5))\n", + "# The evaluation interval is 10\n", + "train_cfg = dict(max_epochs=max_epochs, val_interval=10)\n", + "```\n", + "\n", + "The above config is inherited from `yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py`. Based on the characteristics of the cat dataset, it updates `data_root`, `metainfo`, `train_dataloader`, `val_dataloader`, `num_classes`, and other settings.\n", + "\n", + "## Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python tools/train.py configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "TQ0h6sv_rJxq" + }, + "source": [ + "After running the above training command, the `work_dirs/yolov5_s-v61_fast_1xb12-40e_cat` folder will be generated automatically; the checkpoint files and the training config file will be saved in this folder. On a low-end 1660 GPU, the entire training process takes about eight minutes.\n", + "\n", + "
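A side note on the `anchors` values in the config above: per the config comment, they come from clustering the dataset's box widths and heights with `tools/analysis_tools/optimize_anchors.py`. The toy sketch below only illustrates the underlying k-means idea on made-up box sizes; the real tool adds IoU-aware refinements and should be preferred in practice.

```python
import numpy as np

def kmeans_anchors(wh, k=9, iters=50, seed=0):
    """Toy k-means over (width, height) pairs to pick anchor shapes."""
    rng = np.random.default_rng(seed)
    centers = wh[rng.choice(len(wh), size=k, replace=False)].copy()
    for _ in range(iters):
        # Assign each box to its closest center, then recompute the centers.
        assign = ((wh[:, None] - centers[None]) ** 2).sum(-1).argmin(1)
        for i in range(k):
            if np.any(assign == i):
                centers[i] = wh[assign == i].mean(0)
    return centers[np.argsort(centers.prod(1))]  # sorted small -> large

# Made-up box sizes; in practice collect them from the training annotations.
rng = np.random.default_rng(1)
wh = np.abs(rng.normal(200, 80, size=(500, 2)))
print(kmeans_anchors(wh).round(1))  # 9 anchors -> 3 per pyramid level
```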
\n", + "\"image\"/\n", + "
\n", + "\n", + "The performance on `test.json` is as follows:\n", + "\n", + "```text\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.631\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.909\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.747\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.631\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.627\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.703\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.703\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.703\n", + "```\n", + "\n", + "The above properties are printed via the COCO API, where -1 indicates that no object exists for the scale. According to the rules defined by COCO, the Cat dataset contains all large sized objects, and there are no small or medium-sized objects.\n", + "\n", + "### Some Notes\n", + "\n", + "Two key warnings are printed during training:\n", + "\n", + "- You are using `YOLOv5Head` with num_classes == 1. The loss_cls will be 0. This is a normal phenomenon.\n", + "- The model and loaded state dict do not match exactly\n", + "\n", + "Neither of these warnings will have any impact on performance. The first warning is because the `num_classes` currently trained is 1, the loss of the classification branch is always 0 according to the community of the YOLOv5 algorithm, which is a normal phenomenon. The second warning is because we are currently training in fine-tuning mode, we load the COCO pre-trained weights for 80 classes,\n", + "This will lead to the final Head module convolution channel number does not correspond, resulting in this part of the weight can not be loaded, which is also a normal phenomenon.\n", + "\n", + "### Training is resumed after the interruption\n", + "\n", + "If you stop training, you can add `--resume` to the end of the training command and the program will automatically resume training with the latest weights file from `work_dirs`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python tools/train.py configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py --resume" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "3sJxvQoUrMhX" + }, + "source": [ + "### Save GPU memory strategy\n", + "\n", + "The above config requires about 3G RAM, so if you don't have enough, consider turning on mixed-precision training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python tools/train.py configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py --amp" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "jVJdyHTxrQ9a" + }, + "source": [ + "### Training visualization\n", + "\n", + "MMYOLO currently supports local, TensorBoard, WandB and other back-end visualization. 
Local visualization is used by default, and you can also switch to WandB or another backend to monitor metrics such as the loss, learning rate, and mAP in real time during training.\n", + "\n", + "#### 1 WandB\n", + "\n", + "To use WandB visualization, you need to register on the WandB website first and obtain your API key from https://wandb.ai/settings.\n", + "\n", + "
\n", + "\"image\"/\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install wandb\n", + "# After running wandb login, enter the API Keys obtained above, and the login is successful.\n", + "!wandb login" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "Yu0_4YYRrbyY" + }, + "source": [ + "Add the wandb config at the end of config file we just created: `configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py`.\n", + "\n", + "```python\n", + "visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')])\n", + "```\n", + "\n", + "Running the training command and you will see the loss, learning rate, and coco/bbox_mAP visualizations in the link." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python tools/train.py configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "f_DyzfDIzwMa" + }, + "source": [ + "
\n", + "\"image\"/\n", + "
\n", + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "#### 2 Tensorboard\n", + "\n", + "Install Tensorboard using the following command." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "gHkGlii3n29Q" + }, + "outputs": [], + "source": [ + "%pip install tensorboard" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "bE-nx9TY1P-M" + }, + "source": [ + "Add the `tensorboard` config at the end of config file we just created: `configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py`.\n", + "\n", + "```python\n", + "visualizer = dict(vis_backends=[dict(type='LocalVisBackend'),dict(type='TensorboardVisBackend')])\n", + "```\n", + "\n", + "After re-running the training command, Tensorboard file will be generated in the visualization folder `work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/{timestamp}/vis_data`.\n", + "We can use Tensorboard to view the loss, learning rate, and coco/bbox_mAP visualizations from a web link by running the following command:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "g8fZgokho5CE" + }, + "outputs": [], + "source": [ + "!tensorboard --logdir=work_dirs/yolov5_s-v61_fast_1xb12-40e_cat" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "GUZ7MPoaro-o" + }, + "source": [ + "## Testing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "VYmxtE0GunTB", + "outputId": "f440807c-1931-4810-b76d-617f73fde227" + }, + "outputs": [], + "source": [ + "!python tools/test.py configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \\\n", + " work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \\\n", + " --show-dir show_results" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "_cFocUqN0BCb" + }, + "source": [ + "Run the above test command, you can not only get the AP performance printed in the **Training** section, You can also automatically save the result images to the `work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/{timestamp}/show_results` folder. Below is one of the result images, the left image is the actual annotation, and the right image is the inference result of the model.\n", + "\n", + "
\n", + "\"result_img\"/\n", + "
\n", + "\n", + "You can also visualize model inference results in a browser window if you use 'WandbVisBackend' or 'TensorboardVisBackend'.\n", + "\n", + "## Feature map visualization\n", + "\n", + "MMYOLO provides visualization scripts for feature map to analyze the current model training. Please refer to [Feature Map Visualization](../recommended_topics/visualization.md)\n", + "\n", + "Due to the bias of direct visualization of `test_pipeline`, we need modify the `test_pipeline` of `configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py`,\n", + "\n", + "```python\n", + "test_pipeline = [\n", + " dict(\n", + " type='LoadImageFromFile',\n", + " file_client_args=_base_.file_client_args),\n", + " dict(type='YOLOv5KeepRatioResize', scale=img_scale),\n", + " dict(\n", + " type='LetterResize',\n", + " scale=img_scale,\n", + " allow_scale_up=False,\n", + " pad_val=dict(img=114)),\n", + " dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'),\n", + " dict(\n", + " type='mmdet.PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor', 'pad_param'))\n", + "]\n", + "```\n", + "\n", + "to the following config:\n", + "\n", + "```python\n", + "test_pipeline = [\n", + " dict(\n", + " type='LoadImageFromFile',\n", + " file_client_args=_base_.file_client_args),\n", + " dict(type='mmdet.Resize', scale=img_scale, keep_ratio=False), # modify the LetterResize to mmdet.Resize\n", + " dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'),\n", + " dict(\n", + " type='mmdet.PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor'))\n", + "]\n", + "```\n", + "\n", + "Let's choose the `data/cat/images/IMG_20221020_112705.jpg` image as an example to visualize the output feature maps of YOLOv5 backbone and neck layers.\n", + "\n", + "**1. Visualize the three channels of YOLOv5 backbone**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python demo/featmap_vis_demo.py data/cat/images/IMG_20221020_112705.jpg \\\n", + " configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \\\n", + " work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \\\n", + " --target-layers backbone \\\n", + " --channel-reduction squeeze_mean" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "The result will be saved to the output folder in the current path. Three output feature maps plotted in the above figure correspond to small, medium and large output feature maps. Since the backbone is not actually involved in training in this run, it can be seen from the figure that the large cat object is predicted on the small feature map, which is consistent with the hierarchical detection idea of object detection.\n", + "\n", + "**2. Visualize the three channels of YOLOv5 neck**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python demo/featmap_vis_demo.py data/cat/images/IMG_20221020_112705.jpg \\\n", + " configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \\\n", + " work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \\\n", + " --target-layers neck \\\n", + " --channel-reduction squeeze_mean" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "As can be seen from the above figure, because the neck is involved in training and we also reset the anchors, the three output feature maps are forced to simulate objects of the same scale, so the three neck outputs look similar; this destroys the original pre-training distribution of the backbone. At the same time, it can also be seen that 40 epochs are not enough to train on this dataset, and the feature maps still do not perform well.\n", + "\n", + "**3. Grad-Based CAM visualization**\n", + "\n", + "Based on the above feature map visualization, we can analyze bbox-level Grad CAM at the feature layers.\n", + "\n", + "Install the `grad-cam` package:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install \"grad-cam\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "(a) View Grad CAM of the minimum output feature map of the neck" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python demo/boxam_vis_demo.py data/cat/images/IMG_20221020_112705.jpg \\\n", + " configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \\\n", + " work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \\\n", + " --target-layer neck.out_layers[2]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "9v-dMkePvHMg" + }, + "source": [ + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "(b) View Grad CAM of the medium output feature map of the neck" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "p9H9u0A-3KAD", + "outputId": "32ca5a56-052f-4930-f53c-41cc3a9dc619" + }, + "outputs": [], + "source": [ + "!python demo/boxam_vis_demo.py data/cat/images/IMG_20221020_112705.jpg \\\n", + " configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \\\n", + " work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \\\n", + " --target-layer neck.out_layers[1]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "(c) View Grad CAM of the maximum output feature map of the neck" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "MrKan1U43uUY", + "outputId": "690f8414-a76b-4fa6-e600-7cc874ce1914" + }, + "outputs": [], + "source": [ + "!python demo/boxam_vis_demo.py data/cat/images/IMG_20221020_112705.jpg \\\n", + " configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \\\n", + " work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \\\n", + " --target-layer neck.out_layers[0]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "## EasyDeploy deployment\n", + "\n", + "Here we'll use MMYOLO's [EasyDeploy](../../../projects/easydeploy/) to demonstrate the transformation deployment and basic inference of model.\n", + "\n", + "First you need to follow EasyDeploy's [basic documentation](../../../projects/easydeploy/docs/model_convert.md) controls own equipment installed for each library.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install onnx\n", + "%pip install onnx-simplifier # Install if you want to use simplify\n", + "%pip install tensorrt # If you have GPU environment and need to output TensorRT model you need to continue execution" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once installed, you can use the following command to transform and deploy the trained model on the cat dataset with one click. The current ONNX version is 1.13.0 and TensorRT version is 8.5.3.1, so keep the `--opset` value of 11. The remaining parameters need to be adjusted according to the config used. Here we export the CPU version of ONNX with the `--backend` set to 1." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 534 + }, + "id": "YsRFEecU5C0w", + "outputId": "c26011d4-2836-4715-cd6b-68836294db33" + }, + "outputs": [], + "source": [ + "!python projects/easydeploy/tools/export.py \\\n", + "\t configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \\\n", + "\t work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \\\n", + "\t --work-dir work_dirs/yolov5_s-v61_fast_1xb12-40e_cat \\\n", + " --img-size 640 640 \\\n", + " --batch 1 \\\n", + " --device cpu \\\n", + " --simplify \\\n", + "\t --opset 11 \\\n", + "\t --backend 1 \\\n", + "\t --pre-topk 1000 \\\n", + "\t --keep-topk 100 \\\n", + "\t --iou-threshold 0.65 \\\n", + "\t --score-threshold 0.25\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "q1EY415x3Idx" + }, + "source": [ + "On success, you will get the converted ONNX model under `work-dir`, which is named `end2end.onnx` by default.\n", + "\n", + "Let's use `end2end.onnx` model to perform a basic image inference:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python projects/easydeploy/tools/image-demo.py \\\n", + " data/cat/images/IMG_20210728_205312.jpg \\\n", + " configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \\\n", + " work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/end2end.onnx \\\n", + " --device cpu" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "IrjiBa5YwDQM" + }, + "source": [ + "After successful inference, the result image will be generated in the `output` folder of the default MMYOLO root directory. If you want to see the result without saving it, you can add `--show` to the end of the above command. For convenience, the following is the generated result.\n", + "\n", + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "Let's go on to convert the engine file for TensorRT, because TensorRT needs to be specific to the current environment and deployment version, so make sure to export the parameters, here we export the TensorRT8 file, the `--backend` is 2." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "d8zxczqiBLoB" + }, + "outputs": [], + "source": [ + "!python projects/easydeploy/tools/export.py \\\n", + " configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \\\n", + " work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \\\n", + " --work-dir work_dirs/yolov5_s-v61_fast_1xb12-40e_cat \\\n", + " --img-size 640 640 \\\n", + " --batch 1 \\\n", + " --device cuda:0 \\\n", + " --simplify \\\n", + " --opset 11 \\\n", + " --backend 2 \\\n", + " --pre-topk 1000 \\\n", + " --keep-topk 100 \\\n", + " --iou-threshold 0.65 \\\n", + " --score-threshold 0.25" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The resulting `end2end.onnx` is the ONNX file for the TensorRT8 deployment, which we will use to complete the TensorRT engine transformation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "QFh8rIsX_kVw", + "outputId": "c5bd6929-03a8-400e-be1e-581f32b23f61" + }, + "outputs": [], + "source": [ + "!python projects/easydeploy/tools/build_engine.py \\\n", + " work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/end2end.onnx \\\n", + " --img-size 640 640 \\\n", + " --device cuda:0" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Successful execution will generate the `end2end.engine` file under `work-dir`:\n", + "\n", + "```shell\n", + "work_dirs/yolov5_s-v61_fast_1xb12-40e_cat\n", + "├── 202302XX_XXXXXX\n", + "│ ├── 202302XX_XXXXXX.log\n", + "│ └── vis_data\n", + "│ ├── 202302XX_XXXXXX.json\n", + "│ ├── config.py\n", + "│ └── scalars.json\n", + "├── best_coco\n", + "│ └── bbox_mAP_epoch_40.pth\n", + "├── end2end.engine\n", + "├── end2end.onnx\n", + "├── epoch_30.pth\n", + "├── epoch_40.pth\n", + "├── last_checkpoint\n", + "└── yolov5_s-v61_fast_1xb12-40e_cat.py\n", + "```\n", + "\n", + "Let's continue use `image-demo.py` for image inference:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "rOqXEi-jAI7Y", + "outputId": "2a21aaaa-d4ba-498a-f985-2a6a2b8d348f" + }, + "outputs": [], + "source": [ + "!python projects/easydeploy/tools/image-demo.py \\\n", + " data/cat/images/IMG_20210728_205312.jpg \\\n", + " configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \\\n", + " work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/end2end.engine \\\n", + " --device cuda:0" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "ocHGUUEA_TjI" + }, + "source": [ + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "This completes the transformation deployment of the trained model and checks the inference results. This is the end of the tutorial.\n", + "\n", + "If you encounter problems during training or testing, please check the [common troubleshooting steps](https://mmyolo.readthedocs.io/en/dev/recommended_topics/troubleshooting_steps.html) first and feel free to open an [issue](https://github.com/open-mmlab/mmyolo/issues/new/choose) if you still can't solve it.\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "provenance": [], + "toc_visible": true + }, + "gpuClass": "standard", + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/demo/boxam_vis_demo.py b/demo/boxam_vis_demo.py index e5c5df6c9..278574f89 100644 --- a/demo/boxam_vis_demo.py +++ b/demo/boxam_vis_demo.py @@ -18,18 +18,18 @@ from mmengine import Config, DictAction, MessageHub from mmengine.utils import ProgressBar -from mmyolo.utils.boxam_utils import (BoxAMDetectorVisualizer, - BoxAMDetectorWrapper, DetAblationLayer, - DetBoxScoreTarget, GradCAM, - GradCAMPlusPlus, reshape_transform) -from mmyolo.utils.misc import get_file_list - try: from pytorch_grad_cam import AblationCAM, EigenCAM except ImportError: raise ImportError('Please run `pip install "grad-cam"` to install ' 'pytorch_grad_cam package.') +from mmyolo.utils.boxam_utils import (BoxAMDetectorVisualizer, + BoxAMDetectorWrapper, DetAblationLayer, + DetBoxScoreTarget, GradCAM, + GradCAMPlusPlus, reshape_transform) +from mmyolo.utils.misc import get_file_list + GRAD_FREE_METHOD_MAP = { 'ablationcam': AblationCAM, 'eigencam': EigenCAM, diff --git a/docker/Dockerfile b/docker/Dockerfile index 65689dd56..fc65431a2 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -26,7 +26,7 @@ RUN apt-get update \ # Install MMEngine , MMCV and MMDet RUN pip install --no-cache-dir openmim && \ - mim install --no-cache-dir "mmengine>=0.6.0" "mmcv>=2.0.0rc4,<2.1.0" "mmdet>=3.0.0rc6,<3.1.0" + mim install --no-cache-dir "mmengine>=0.6.0" "mmcv>=2.0.0rc4,<2.1.0" "mmdet>=3.0.0,<4.0.0" # Install MMYOLO RUN git clone https://github.com/open-mmlab/mmyolo.git /mmyolo && \ diff --git a/docker/Dockerfile_deployment b/docker/Dockerfile_deployment index 1a0a226a8..8ea1e380b 100644 --- a/docker/Dockerfile_deployment +++ b/docker/Dockerfile_deployment @@ -30,7 +30,7 @@ RUN wget -q https://github.com/microsoft/onnxruntime/releases/download/v${ONNXRU # Install OPENMIM MMENGINE MMDET RUN pip install --no-cache-dir openmim \ - && mim install --no-cache-dir "mmengine>=0.6.0" "mmdet>=3.0.0rc6,<3.1.0" \ + && mim install --no-cache-dir "mmengine>=0.6.0" "mmdet>=3.0.0,<4.0.0" \ && mim install --no-cache-dir opencv-python==4.5.5.64 opencv-python-headless==4.5.5.64 RUN git clone https://github.com/open-mmlab/mmcv.git -b 2.x mmcv \ diff --git a/docs/en/common_usage/ms_training_testing.md b/docs/en/common_usage/ms_training_testing.md new file mode 100644 index 000000000..b7d88f632 --- /dev/null +++ b/docs/en/common_usage/ms_training_testing.md @@ -0,0 +1,39 @@ +# Multi-scale training and testing + +## Multi-scale training + +The popular YOLOv5, YOLOv6, YOLOv7, YOLOv8 and RTMDet algorithms are supported in MMYOLO currently, and their default configuration is single-scale 640x640 training. There are two implementations of multi-scale training commonly used in the MM family of open source libraries + +1. 
Each image output by `train_pipeline` has a variable scale, and the [stack_batch](https://github.com/open-mmlab/mmengine/blob/dbae83c52fa54d6dda08b6692b124217fe3b2135/mmengine/model/base_model/data_preprocessor.py#L260-L261) function in [DataPreprocessor](https://github.com/open-mmlab/mmdetection/blob/3.x/mmdet/models/data_preprocessors/data_preprocessor.py) pads input images of different scales to the same size. Most of the algorithms in MMDet are implemented using this approach. +2. Each image output by `train_pipeline` has a fixed scale, and `DataPreprocessor` directly up- or down-samples whole image batches for multi-scale training. + +Both multi-scale training approaches are supported in MMYOLO. Theoretically, the first implementation can generate richer scales, but its training efficiency is not as good as the second one because it augments each image independently. Therefore, we recommend using the second approach. + +Take the `configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py` configuration as an example: its default setting is fixed-scale 640x640 training. If you want multi-scale training over the range (480, 800) in multiples of 32, you can follow the YOLOX practice and use [YOLOXBatchSyncRandomResize](https://github.com/open-mmlab/mmyolo/blob/dc85144fab20a970341550794857a2f2f9b11564/mmyolo/models/data_preprocessors/data_preprocessor.py#L20) in the DataPreprocessor. + +Create a new configuration under the `configs/yolov5` path named `configs/yolov5/yolov5_s-v61_fast_1xb12-ms-40e_cat.py` with the following contents. + +```python +_base_ = 'yolov5_s-v61_fast_1xb12-40e_cat.py' + +model = dict( + data_preprocessor=dict( + type='YOLOv5DetDataPreprocessor', + pad_size_divisor=32, + batch_augments=[ + dict( + type='YOLOXBatchSyncRandomResize', + # multi-scale range (480, 800) + random_size_range=(480, 800), + # The output scale needs to be divisible by 32 + size_divisor=32, + interval=1) + ]) +) +``` + +The above configuration will enable multi-scale training. We have already provided this configuration under `configs/yolov5/` for convenience. The rest of the YOLO family of algorithms are configured similarly. + +## Multi-scale testing + +Multi-scale testing in MMYOLO is equivalent to Test-Time Augmentation (TTA) and is already supported; see [Test-Time Augmentation TTA](./tta.md). diff --git a/docs/en/get_started/15_minutes_instance_segmentation.md b/docs/en/get_started/15_minutes_instance_segmentation.md index c66a2f283..b42e25f64 100644 --- a/docs/en/get_started/15_minutes_instance_segmentation.md +++ b/docs/en/get_started/15_minutes_instance_segmentation.md @@ -1,3 +1,332 @@ # 15 minutes to get started with MMYOLO instance segmentation +Instance segmentation is a task in computer vision that aims to segment each object in an image and assign each object a unique identifier. + +Unlike semantic segmentation, instance segmentation not only segments out different categories in an image, but also separates different instances of the same category. + +
+Instance Segmentation +
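+
+To make the distinction concrete, the sketch below shows the kind of per-instance result an instance segmentation model produces for an image containing two balloons. The field names are purely illustrative and are not MMYOLO's actual output structure:
+
+```python
+# Hypothetical instance segmentation output: every object carries its own
+# binary mask, box and score, even when several objects share the same class.
+predictions = [
+    dict(label='balloon', score=0.92, bbox=[102, 35, 310, 280], mask='HxW binary mask of balloon #1'),
+    dict(label='balloon', score=0.88, bbox=[330, 60, 520, 295], mask='HxW binary mask of balloon #2'),
+]
+# A semantic segmentation model would instead return a single HxW class map that
+# labels all balloon pixels identically, without separating the two instances.
+```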
+ +Taking the downloadable balloon dataset as an example, I will guide you through a 15-minute easy introduction to MMYOLO instance segmentation. The entire process includes the following steps: + +- [Installation](#installation) +- [Dataset](#dataset) +- [Config](#config) +- [Training](#training) +- [Testing](#testing) +- [EasyDeploy](#easydeploy-deployment) + +In this tutorial, we will use YOLOv5-s as an example. For the demo configuration of the balloon dataset with other YOLO series algorithms, please refer to the corresponding algorithm configuration folder. + +## Installation + +Assuming you've already installed Conda in advance, then install PyTorch using the following commands. + +```{note} +Note: Since this repo uses OpenMMLab 2.0, it is better to create a new conda virtual environment to prevent conflicts with the repo installed in OpenMMLab 1.0. +``` + +```shell +conda create -n mmyolo python=3.8 -y +conda activate mmyolo +# If you have GPU +conda install pytorch torchvision -c pytorch +# If you only have CPU +# conda install pytorch torchvision cpuonly -c pytorch +``` + +Install MMYOLO and dependency libraries using the following commands. + +```shell +git clone https://github.com/open-mmlab/mmyolo.git +cd mmyolo +pip install -U openmim +mim install -r requirements/mminstall.txt +# Install albumentations +mim install -r requirements/albu.txt +# Install MMYOLO +mim install -v -e . +# "-v" means verbose, or more output +# "-e" means installing a project in editable mode, +# thus any local modifications made to the code will take effect without reinstallation. +``` + +For details about how to configure the environment, see [Installation and verification](./installation.md). + +## Dataset + +The Balloon dataset is a single-class dataset that consists of 74 images and includes annotated information required for training. Here is an example image from the dataset: + +
+balloon dataset +
+ +You can download and use it directly by the following command: + +```shell +python tools/misc/download_dataset.py --dataset-name balloon --save-dir ./data/balloon --unzip --delete +python ./tools/dataset_converters/balloon2coco.py +``` + +The data for the MMYOLO project is located in the MMYOLO project directory. The `train.json` and `val.json` files store the annotations in COCO format, while the `data/balloon/train` and `data/balloon/val` directories contain all the images for the dataset. + +## Config + +Taking YOLOv5 algorithm as an example, considering the limited GPU memory of users, we need to modify some default training parameters to make them run smoothly. The key parameters to be modified are as follows: + +- YOLOv5 is an Anchor-Based algorithm, and different datasets need to calculate suitable anchors adaptively. +- The default config uses 8 GPUs with a batch size of 16 per GPU. Now change it to a single GPU with a batch size of 12. +- In principle, the learning rate should be linearly scaled accordingly when the batch size is changed, but actual measurements have found that this is not necessary. + +To perform the specific operation, create a new configuration file named `yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py` in the `configs/yolov5/ins_seg` folder. For convenience, we have already provided this configuration file. Copy the following contents into the configuration file. + +```python +_base_ = './yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa + +data_root = 'data/balloon/' # dataset root +# Training set annotation file of json path +train_ann_file = 'train.json' +train_data_prefix = 'train/' # Dataset prefix +# Validation set annotation file of json path +val_ann_file = 'val.json' +val_data_prefix = 'val/' +metainfo = { + 'classes': ('balloon', ), # dataset category name + 'palette': [ + (220, 20, 60), + ] +} +num_classes = 1 +# Set batch size to 4 +train_batch_size_per_gpu = 4 +# dataloader num workers +train_num_workers = 2 +log_interval = 1 +##################### +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + data_root=data_root, + metainfo=metainfo, + data_prefix=dict(img=train_data_prefix), + ann_file=train_ann_file)) +val_dataloader = dict( + dataset=dict( + data_root=data_root, + metainfo=metainfo, + data_prefix=dict(img=val_data_prefix), + ann_file=val_ann_file)) +test_dataloader = val_dataloader +val_evaluator = dict(ann_file=data_root + val_ann_file) +test_evaluator = val_evaluator +default_hooks = dict(logger=dict(interval=log_interval)) +##################### + +model = dict(bbox_head=dict(head_module=dict(num_classes=num_classes))) +``` + +The above configuration inherits from `yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py` and updates configurations such as `data_root`, `metainfo`, `train_dataloader`, `val_dataloader`, `num_classes`, etc., based on the characteristics of the balloon dataset. + +## Training + +```shell +python tools/train.py configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py +``` + +After running the training command mentioned above, the folder `work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance` will be automatically generated. The weight files and the training configuration file for this session will be saved in this folder. On a lower-end GPU like the GTX 1660, the entire training process will take approximately 30 minutes. + +
+image +
+ +The performance on `val.json` is as follows: + +```text + Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.330 + Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.509 + Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.317 + Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000 + Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.103 + Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.417 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.150 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.396 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.454 + Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000 + Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.317 + Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.525 +``` + +The above performance is obtained by printing using the COCO API, where -1 indicates the absence of objects of that scale. + +### Some Notes + +The key warnings are printed during training: + +- You are using `YOLOv5Head` with num_classes == 1. The loss_cls will be 0. This is a normal phenomenon. + +The warning is because the `num_classes` currently trained is 1, the loss of the classification branch is always 0 according to the community of the YOLOv5 algorithm, which is a normal phenomenon. + +### Training is resumed after the interruption + +If you stop training, you can add `--resume` to the end of the training command and the program will automatically resume training with the latest weights file from `work_dirs`. + +```shell +python tools/train.py configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py --resume +``` + +### Save GPU memory strategy + +The above config requires about 3G RAM, so if you don't have enough, consider turning on mixed-precision training + +```shell +python tools/train.py configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py --amp +``` + +### Training visualization + +MMYOLO currently supports local, TensorBoard, WandB and other back-end visualization. The default is to use local visualization, and you can switch to WandB and other real-time visualization of various indicators in the training process. + +#### 1 WandB + +WandB visualization need registered in website, and in the https://wandb.ai/settings for wandb API Keys. + +
+image +
+ +```shell +pip install wandb +# After running wandb login, enter the API Keys obtained above, and the login is successful. +wandb login +``` + +Add the wandb config at the end of config file we just created: `configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py`. + +```python +visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')]) +``` + +Running the training command and you will see the loss, learning rate, and coco/bbox_mAP visualizations in the link. + +```shell +python tools/train.py configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py +``` + +#### 2 Tensorboard + +Install Tensorboard package using the following command: + +```shell +pip install tensorboard +``` + +Add the `tensorboard` config at the end of config file we just created: `configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py`. + +```python +visualizer = dict(vis_backends=[dict(type='LocalVisBackend'),dict(type='TensorboardVisBackend')]) +``` + +After re-running the training command, Tensorboard file will be generated in the visualization folder `work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance/{timestamp}/vis_data`. +We can use Tensorboard to view the loss, learning rate, and coco/bbox_mAP visualizations from a web link by running the following command: + +```shell +tensorboard --logdir=work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance +``` + +## Testing + +```shell +python tools/test.py configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py \ + work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance/best_coco_bbox_mAP_epoch_300.pth \ + --show-dir show_results +``` + +Run the above test command, you can not only get the AP performance printed in the **Training** section, You can also automatically save the result images to the `work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance/{timestamp}/show_results` folder. Below is one of the result images, the left image is the actual annotation, and the right image is the inference result of the model. + +
+result_img +
+ +You can also visualize model inference results in a browser window if you use `WandbVisBackend` or `TensorboardVisBackend`. + +## Feature map visualization + +MMYOLO provides visualization scripts for feature map to analyze the current model training. Please refer to [Feature Map Visualization](../recommended_topics/visualization.md) + +Due to the bias of direct visualization of `test_pipeline`, we need to modify the `test_pipeline` of `configs/yolov5/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py` + +```python +test_pipeline = [ + dict( + type='LoadImageFromFile', + backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] +``` + +to the following config: + +```python +test_pipeline = [ + dict( + type='LoadImageFromFile', + backend_args=_base_.backend_args), + dict(type='mmdet.Resize', scale=img_scale, keep_ratio=False), # modify the LetterResize to mmdet.Resize + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +``` + +Let's choose the `data/balloon/train/3927754171_9011487133_b.jpg` image as an example to visualize the output feature maps of YOLOv5 backbone and neck layers. + +**1. Visualize the three channels of YOLOv5s backbone** + +```shell +python demo/featmap_vis_demo.py data/balloon/train/3927754171_9011487133_b.jpg \ + configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py \ + work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance/best_coco_bbox_mAP_epoch_300.pth \ --target-layers backbone \ + --channel-reduction squeeze_mean +``` + +
+image +
+ +The result will be saved to the output folder in the current path. Three output feature maps plotted in the above figure correspond to small, medium and large output feature maps. + +**2. Visualize the three channels of YOLOv5 neck** + +```shell +python demo/featmap_vis_demo.py data/balloon/train/3927754171_9011487133_b.jpg \ + configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py \ + work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance/best_coco_bbox_mAP_epoch_300.pth \ + --target-layers neck \ + --channel-reduction squeeze_mean +``` + +
+image +
+**3. Grad-Based CAM visualization** + TODO + +## EasyDeploy deployment + +TODO + +The full content above can be viewed in [15_minutes_object_detection.ipynb](../../../demo/15_minutes_object_detection.ipynb). This is the end of the tutorial. If you encounter problems during training or testing, please check the [common troubleshooting steps](../recommended_topics/troubleshooting_steps.md) first and feel free to open an [issue](https://github.com/open-mmlab/mmyolo/issues/new/choose) if you still can't solve it. diff --git a/docs/en/get_started/15_minutes_object_detection.md b/docs/en/get_started/15_minutes_object_detection.md index e34a48eef..354b2e708 100644 --- a/docs/en/get_started/15_minutes_object_detection.md +++ b/docs/en/get_started/15_minutes_object_detection.md @@ -15,11 +15,15 @@ Take the small dataset of cat as an example, you can easily learn MMYOLO object - [Testing](#testing) - [EasyDeploy](#easydeploy-deployment) -In this article, we take YOLOv5-s as an example. For the rest of the YOLO series algorithms, please see the corresponding algorithm configuration folder. +In this tutorial, we take YOLOv5-s as an example. For the rest of the YOLO series algorithms, please see the corresponding algorithm configuration folder. ## Installation -Assuming you've already installed Conda in advance, install PyTorch +Assuming you've already installed Conda in advance, then install PyTorch using the following commands. + +```{note} +Note: Since this repo uses OpenMMLab 2.0, it is better to create a new conda virtual environment to prevent conflicts with the repo installed in OpenMMLab 1.0. +``` ```shell conda create -n mmyolo python=3.8 -y @@ -30,7 +34,7 @@ conda install pytorch torchvision -c pytorch # conda install pytorch torchvision cpuonly -c pytorch ``` -Install MMYOLO and dependency libraries +Install MMYOLO and dependency libraries using the following commands. ```shell git clone https://github.com/open-mmlab/mmyolo.git @@ -46,11 +50,7 @@ mim install -v -e . # thus any local modifications made to the code will take effect without reinstallation. ``` -```{note} -Note: Since this repo uses OpenMMLab 2.0, it is better to create a new conda virtual environment to prevent conflicts with the repo installed in OpenMMLab 1.0. -``` - -For details about how to configure the environment, see [Installation and verification](./installation.md) +For details about how to configure the environment, see [Installation and verification](./installation.md). ## Dataset @@ -106,7 +106,7 @@ anchors = [ ] # Max training 40 epoch max_epochs = 40 -# bs = 12 +# Set batch size to 12 train_batch_size_per_gpu = 12 # dataloader num workers train_num_workers = 4 @@ -258,7 +258,7 @@ python tools/train.py configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py #### 2 Tensorboard -Install Tensorboard environment +Install Tensorboard package: ```shell pip install tensorboard @@ -270,11 +270,11 @@ Add the `tensorboard` config at the end of config file we just created: `configs visualizer = dict(vis_backends=[dict(type='LocalVisBackend'),dict(type='TensorboardVisBackend')]) ``` -After re-running the training command, Tensorboard file will be generated in the visualization folder `work_dirs/yolov5_s-v61_fast_1xb12-40e_cat.py/{timestamp}/vis_data`. +After re-running the training command, Tensorboard file will be generated in the visualization folder `work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/{timestamp}/vis_data`. 
We can use Tensorboard to view the loss, learning rate, and coco/bbox_mAP visualizations from a web link by running the following command: ```shell -tensorboard --logdir=work_dirs/yolov5_s-v61_fast_1xb12-40e_cat.py +tensorboard --logdir=work_dirs/yolov5_s-v61_fast_1xb12-40e_cat ``` ## Testing @@ -297,13 +297,13 @@ You can also visualize model inference results in a browser window if you use 'W MMYOLO provides visualization scripts for feature map to analyze the current model training. Please refer to [Feature Map Visualization](../recommended_topics/visualization.md) -Due to the bias of direct visualization of `test_pipeline`, we need to `configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py` of `test_pipeline` +Due to the bias of direct visualization of `test_pipeline`, we need to modify the `test_pipeline` of `configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py` ```python test_pipeline = [ dict( type='LoadImageFromFile', - file_client_args=_base_.file_client_args), + backend_args=_base_.backend_args), dict(type='YOLOv5KeepRatioResize', scale=img_scale), dict( type='LetterResize', @@ -318,13 +318,13 @@ test_pipeline = [ ] ``` -modify to the following config: +to the following config: ```python test_pipeline = [ dict( type='LoadImageFromFile', - file_client_args=_base_.file_client_args), + backend_args=_base_.backend_args), dict(type='mmdet.Resize', scale=img_scale, keep_ratio=False), # modify the LetterResize to mmdet.Resize dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), dict( @@ -372,13 +372,19 @@ As can be seen from the above figure, because neck is involved in training, and Based on the above feature map visualization, we can analyze Grad CAM at the feature layer of bbox level. +Install `grad-cam` package: + +```shell +pip install "grad-cam" +``` + (a) View Grad CAM of the minimum output feature map of the neck ```shell python demo/boxam_vis_demo.py data/cat/images/IMG_20221020_112705.jpg \ - configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \ - work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \ - --target-layer neck.out_layers[2] + configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \ + work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \ + --target-layer neck.out_layers[2] ```
@@ -389,9 +395,9 @@ python demo/boxam_vis_demo.py data/cat/images/IMG_20221020_112705.jpg \ ```shell python demo/boxam_vis_demo.py data/cat/images/IMG_20221020_112705.jpg \ - configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \ - work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \ - --target-layer neck.out_layers[1] + configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \ + work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \ + --target-layer neck.out_layers[1] ```
@@ -402,9 +408,9 @@ python demo/boxam_vis_demo.py data/cat/images/IMG_20221020_112705.jpg \ ```shell python demo/boxam_vis_demo.py data/cat/images/IMG_20221020_112705.jpg \ - configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \ - work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \ - --target-layer neck.out_layers[0] + configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \ + work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \ + --target-layer neck.out_layers[0] ```
@@ -526,4 +532,4 @@ Here we choose to save the inference results under `output` instead of displayin This completes the transformation deployment of the trained model and checks the inference results. This is the end of the tutorial. -The full content above can be viewed: [15_minutes_object_detection.ipynb](<>). If you encounter problems during training or testing, please check the \[common troubleshooting steps\](... /recommended_topics/troubleshooting_steps.md) first and feel free to raise an issue if you still can't solve it. +The full content above can be viewed in [15_minutes_object_detection.ipynb](https://github.com/open-mmlab/mmyolo/blob/dev/demo/15_minutes_object_detection.ipynb). If you encounter problems during training or testing, please check the [common troubleshooting steps](../recommended_topics/troubleshooting_steps.md) first and feel free to open an [issue](https://github.com/open-mmlab/mmyolo/issues/new/choose) if you still can't solve it. diff --git a/docs/en/get_started/dependencies.md b/docs/en/get_started/dependencies.md index 06802025b..0d7fc6ad0 100644 --- a/docs/en/get_started/dependencies.md +++ b/docs/en/get_started/dependencies.md @@ -4,7 +4,8 @@ Compatible MMEngine, MMCV and MMDetection versions are shown as below. Please in | MMYOLO version | MMDetection version | MMEngine version | MMCV version | | :------------: | :----------------------: | :----------------------: | :---------------------: | -| main | mmdet>=3.0.0rc6, \<3.1.0 | mmengine>=0.6.0, \<1.0.0 | mmcv>=2.0.0rc4, \<2.1.0 | +| main | mmdet>=3.0.0, \<3.1.0 | mmengine>=0.7.1, \<1.0.0 | mmcv>=2.0.0rc4, \<2.1.0 | +| 0.6.0 | mmdet>=3.0.0, \<3.1.0 | mmengine>=0.7.1, \<1.0.0 | mmcv>=2.0.0rc4, \<2.1.0 | | 0.5.0 | mmdet>=3.0.0rc6, \<3.1.0 | mmengine>=0.6.0, \<1.0.0 | mmcv>=2.0.0rc4, \<2.1.0 | | 0.4.0 | mmdet>=3.0.0rc5, \<3.1.0 | mmengine>=0.3.1, \<1.0.0 | mmcv>=2.0.0rc0, \<2.1.0 | | 0.3.0 | mmdet>=3.0.0rc5, \<3.1.0 | mmengine>=0.3.1, \<1.0.0 | mmcv>=2.0.0rc0, \<2.1.0 | diff --git a/docs/en/get_started/installation.md b/docs/en/get_started/installation.md index 113f29041..3259acfbb 100644 --- a/docs/en/get_started/installation.md +++ b/docs/en/get_started/installation.md @@ -8,14 +8,14 @@ pip install -U openmim mim install "mmengine>=0.6.0" mim install "mmcv>=2.0.0rc4,<2.1.0" -mim install "mmdet>=3.0.0rc6,<3.1.0" +mim install "mmdet>=3.0.0,<4.0.0" ``` If you are currently in the mmyolo project directory, you can use the following simplified commands ```shell cd mmyolo -pip install -U openmom +pip install -U openmim mim install -r requirements/mminstall.txt ``` diff --git a/docs/en/index.rst b/docs/en/index.rst index a4edc8ef7..1a0ab6c3b 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -19,15 +19,16 @@ You can switch between Chinese and English documents in the top-right corner of :caption: Recommended Topics recommended_topics/contributing.md + recommended_topics/training_testing_tricks.md recommended_topics/model_design.md recommended_topics/algorithm_descriptions/index.rst + recommended_topics/application_examples/index.rst recommended_topics/replace_backbone.md recommended_topics/complexity_analysis.md recommended_topics/labeling_to_deployment_tutorials.md recommended_topics/visualization.md recommended_topics/deploy/index.rst recommended_topics/troubleshooting_steps.md - recommended_topics/industry_examples.md recommended_topics/mm_basics.md recommended_topics/dataset_preparation.md @@ -38,6 +39,7 @@ You can switch between Chinese and English documents in the top-right corner of 
common_usage/resume_training.md common_usage/syncbn.md common_usage/amp_training.md + common_usage/ms_training_testing.md common_usage/tta.md common_usage/plugins.md common_usage/freeze_layers.md diff --git a/docs/en/notes/changelog.md b/docs/en/notes/changelog.md index 310b930b0..fa3e1a776 100644 --- a/docs/en/notes/changelog.md +++ b/docs/en/notes/changelog.md @@ -1,5 +1,43 @@ # Changelog +## v0.6.0 (15/8/2023) + +### Highlights + +- Support YOLOv5 instance segmentation +- Support YOLOX-Pose based on MMPose +- Add 15 minutes instance segmentation tutorial. +- YOLOv5 supports using mask annotation to optimize bbox +- Add Multi-scale training and testing docs + +### New Features + +- Add training and testing tricks doc (#659) +- Support setting the cache_size_limit parameter and support mmdet 3.0.0 (#707) +- Support YOLOv5u and YOLOv6 3.0 inference (#624, #744) +- Support model-only inference (#733) +- Add YOLOv8 deepstream config (#633) +- Add ionogram example in MMYOLO application (#643) + +### Bug Fixes + +- Fix the browse_dataset for visualization of test and val (#641) +- Fix installation doc error (#662) +- Fix yolox-l ckpt link (#677) +- Fix typos in the YOLOv7 and YOLOv8 diagram (#621, #710) +- Adjust the order of package imports in `boxam_vis_demo.py` (#655) + +### Improvements + +- Optimize the `convert_kd_ckpt_to_student.py` file (#647) +- Add en doc of `FAQ` and `training_testing_tricks` (#691,#693) + +### Contributors + +A total of 21 developers contributed to this release. + +Thank @Lum1104,@azure-wings,@FeiGeChuanShu,@Lingrui Gu,@Nioolek,@huayuan4396,@RangeKing,@danielhonies,@yechenzhi,@JosonChan1998,@kitecats,@Qingrenn,@triple-Mu,@kikefdezl,@zhangrui-wolf,@xin-li-67,@Ben-Louis,@zgzhengSEU,@VoyagerXvoyagerx,@tang576225574,@hhaAndroid + ## v0.5.0 (2/3/2023) ### Highlights diff --git a/docs/en/recommended_topics/algorithm_descriptions/yolov8_description.md b/docs/en/recommended_topics/algorithm_descriptions/yolov8_description.md index a4923b121..70f1686b4 100644 --- a/docs/en/recommended_topics/algorithm_descriptions/yolov8_description.md +++ b/docs/en/recommended_topics/algorithm_descriptions/yolov8_description.md @@ -3,7 +3,7 @@ ## 0 Introduction
-YOLOv8-P5_structure +YOLOv8-P5_structure Figure 1:YOLOv8-P5
@@ -201,7 +201,7 @@ In particular, to ensure that the feature map and image are shown aligned, the o test_pipeline = [ dict( type='LoadImageFromFile', - file_client_args=_base_.file_client_args), + backend_args=_base_.backend_args), dict(type='mmdet.Resize', scale=img_scale, keep_ratio=False), # change dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), dict( diff --git a/docs/en/recommended_topics/application_examples/index.rst b/docs/en/recommended_topics/application_examples/index.rst new file mode 100644 index 000000000..03c091d19 --- /dev/null +++ b/docs/en/recommended_topics/application_examples/index.rst @@ -0,0 +1,7 @@ +MMYOLO application examples +******************** + +.. toctree:: + :maxdepth: 1 + + ionogram_detection.md diff --git a/docs/en/recommended_topics/application_examples/ionogram_detection.md b/docs/en/recommended_topics/application_examples/ionogram_detection.md new file mode 100644 index 000000000..a1bc7cc91 --- /dev/null +++ b/docs/en/recommended_topics/application_examples/ionogram_detection.md @@ -0,0 +1,307 @@ +# A benchmark for ionogram real-time object detection based on MMYOLO + +## Dataset + +Digital ionogram is the most important way to obtain real-time ionospheric information. +Ionospheric structure detection is of great research significance for accurate extraction of ionospheric key parameters. + +This study utilize 4311 ionograms with different seasons obtained by the Chinese Academy of Sciences in Hainan, Wuhan, and Huailai to establish a dataset. The six structures, including Layer E, Es-l, Es-c, F1, F2, and Spread F are manually annotated using [labelme](https://github.com/wkentaro/labelme). [Dataset Download](https://github.com/VoyagerXvoyagerx/Ionogram_detection/releases/download/Dataset/Iono4311.zip) + +
+ + +Preview of annotated images + +
+ +1. Dataset prepration + +After downloading the data, put it in the root directory of the MMYOLO repository, and use `unzip test.zip` (for Linux) to unzip it to the current folder. The structure of the unzipped folder is as follows: + +```shell +Iono4311/ +├── images +| ├── 20130401005200.png +| └── ... +└── labels + ├── 20130401005200.json + └── ... +``` + +The `images` directory contains input images,while the `labels` directory contains annotation files generated by labelme. + +2. Convert the dataset into COCO format + +Use the script `tools/dataset_converters/labelme2coco.py` to convert labelme labels to COCO labels. + +```shell +python tools/dataset_converters/labelme2coco.py --img-dir ./Iono4311/images \ + --labels-dir ./Iono4311/labels \ + --out ./Iono4311/annotations/annotations_all.json +``` + +3. Check the converted COCO labels + +To confirm that the conversion process went successfully, use the following command to display the COCO labels on the images. + +```shell +python tools/analysis_tools/browse_coco_json.py --img-dir ./Iono4311/images \ + --ann-file ./Iono4311/annotations/annotations_all.json +``` + +4. Divide dataset into training set, validation set and test set + +Set 70% of the images in the dataset as the training set, 15% as the validation set, and 15% as the test set. + +```shell +python tools/misc/coco_split.py --json ./Iono4311/annotations/annotations_all.json \ + --out-dir ./Iono4311/annotations \ + --ratios 0.7 0.15 0.15 \ + --shuffle \ + --seed 14 +``` + +The file tree after division is as follows: + +```shell +Iono4311/ +├── annotations +│ ├── annotations_all.json +│ ├── class_with_id.txt +│ ├── test.json +│ ├── train.json +│ └── val.json +├── classes_with_id.txt +├── images +├── labels +├── test_images +├── train_images +└── val_images +``` + +## Config files + +The configuration files are stored in the directory `/projects/misc/ionogram_detection/`. + +1. Dataset analysis + +To perform a dataset analysis, a sample of 200 images from the dataset can be analyzed using the `tools/analysis_tools/dataset_analysis.py` script. + +```shell +python tools/analysis_tools/dataset_analysis.py projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py \ + --out-dir output +``` + +Part of the output is as follows: + +```shell +The information obtained is as follows: ++------------------------------+ +| Information of dataset class | ++---------------+--------------+ +| Class name | Bbox num | ++---------------+--------------+ +| E | 98 | +| Es-l | 27 | +| Es-c | 46 | +| F1 | 100 | +| F2 | 194 | +| Spread-F | 6 | ++---------------+--------------+ +``` + +This indicates that the distribution of categories in the dataset is unbalanced. + +
+ + +Statistics of object sizes for each category + +
+ +According to the statistics, small objects are predominant in the E, Es-l, Es-c, and F1 categories, while medium-sized objects are more common in the F2 and Spread F categories. + +2. Visualization of the data processing part in the config + +Taking YOLOv5-s as an example, according to the `train_pipeline` in the config file, the data augmentation strategies used during training include: + +- Mosaic augmentation +- Random affine +- Albumentations (include various digital image processing methods) +- HSV augmentation +- Random affine + +Use the **'pipeline'** mode of the script `tools/analysis_tools/browse_dataset.py` to obtains all intermediate images in the data pipeline. + +```shell +python tools/analysis_tools/browse_dataset.py projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py \ + -m pipeline \ + --out-dir output +``` + +
+ + +Visualization for intermediate images in the data pipeline + +
+ +3. Optimize anchor size + +Use the script `tools/analysis_tools/optimize_anchors.py` to obtain prior anchor box sizes suitable for the dataset. + +```shell +python tools/analysis_tools/optimize_anchors.py projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py \ + --algorithm v5-k-means \ + --input-shape 640 640 \ + --prior-match-thr 4.0 \ + --out-dir work_dirs/dataset_analysis_5_s +``` + +4. Model complexity analysis + +With the config file, the parameters and FLOPs can be calculated by the script `tools/analysis_tools/get_flops.py`. Take yolov5-s as an example: + +```shell +python tools/analysis_tools/get_flops.py projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py +``` + +The following output indicates that the model has 7.947G FLOPs with the input shape (640, 640), and a total of 7.036M learnable parameters. + +```shell +============================== +Input shape: torch.Size([640, 640]) +Model Flops: 7.947G +Model Parameters: 7.036M +============================== +``` + +## Train and test + +1. Train + +**Training visualization**: By following the tutorial of [Annotation-to-deployment workflow for custom dataset](https://mmyolo.readthedocs.io/en/dev/recommended_topics/labeling_to_deployment_tutorials.html#id11), this example uses [wandb](https://wandb.ai/site) to visulize training. + +**Debug tricks**: During the process of debugging code, sometimes it is necessary to train for several epochs, such as debugging the validation process or checking whether the checkpoint saving meets expectations. For datasets inherited from `BaseDataset` (such as `YOLOv5CocoDataset` in this example), setting `indices` in the `dataset` field can specify the number of samples per epoch to reduce the iteration time. + +```python +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + _delete_=True, + type='RepeatDataset', + times=1, + dataset=dict( + type=_base_.dataset_type, + indices=200, # set indices=200,represent every epoch only iterator 200 samples + data_root=data_root, + metainfo=metainfo, + ann_file=train_ann_file, + data_prefix=dict(img=train_data_prefix), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=_base_.train_pipeline))) +``` + +**Start training**: + +```shell +python tools/train.py projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py +``` + +2. Test + +Specify the path of the config file and the model to start the test: + +```shell +python tools/test.py projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py \ + work_dirs/yolov5_s-v61_fast_1xb96-100e_ionogram/xxx +``` + +## Experiments and results + +### Choose a suitable batch size + +- Often, the batch size governs the training speed, and the ideal batch size will be the largest batch size supported by the available hardware. +- If the video memory is not yet fully utilized, doubling the batch size should result in a corresponding doubling (or close to doubling) of the training throughput. This is equivalent to maintaining a constant (or nearly constant) time per step as the batch size increases. +- Automatic Mixed Precision (AMP) is a technique to accelerate the training with minimal loss in accuracy. To enable AMP training, add `--amp` to the end of the training command. 
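+
+For reference, `--amp` is only a command-line switch; the same behaviour can be written into the config by changing the optimizer wrapper to MMEngine's AMP wrapper. The snippet below is a minimal sketch that assumes the optimizer itself is inherited from the base config:
+
+```python
+# Config-side equivalent of passing --amp to tools/train.py:
+# use MMEngine's AmpOptimWrapper with dynamic loss scaling.
+# The wrapped optimizer is kept from the inherited base config.
+optim_wrapper = dict(type='AmpOptimWrapper', loss_scale='dynamic')
+```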
+ +Hardware information: + +- GPU:V100 with 32GB memory +- CPU:10-core CPU with 40GB memory + +Results: + +| Model | Epoch(best) | AMP | Batchsize | Num workers | Memory Allocated | Training Time | Val mAP | +| -------- | ----------- | ----- | --------- | ----------- | ---------------- | ------------- | ------- | +| YOLOv5-s | 100(82) | False | 32 | 6 | 35.07% | 54 min | 0.575 | +| YOLOv5-s | 100(96) | True | 32 | 6 | 24.93% | 49 min | 0.578 | +| YOLOv5-s | 100(100) | False | 96 | 6 | 96.64% | 48 min | 0.571 | +| YOLOv5-s | 100(100) | True | 96 | 6 | 54.66% | **37** min | 0.575 | +| YOLOv5-s | 100(90) | True | 144 | 6 | 77.06% | 39 min | 0.573 | +| YOLOv5-s | 200(148) | True | 96 | 6 | 54.66% | 72 min | 0.575 | +| YOLOv5-s | 200(188) | True | 96 | **8** | 54.66% | 67 min | 0.576 | + +
+ + +The proportion of data loading time to the total time of each step. + +
+ +Based on the results above, we can conclude that + +- AMP has little impact on the accuracy of the model, but can significantly reduce memory usage while training. +- Increasing batch size by three times does not reduce the training time by a corresponding factor of three. According to the `data_time` recorded during training, the larger the batch size, the larger the `data_time`, indicating that data loading has become the bottleneck limiting the training speed. Increasing `num_workers`, the number of processes used to load data, can accelerate the training speed. + +### Ablation studies + +In order to obtain a training pipeline applicable to the dataset, the following ablation studies with the YOLOv5-s model as an example are performed. + +#### Data augmentation + +| Aug Method | [config](/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_aug0.py) | [config](/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb32-100e_ionogram_mosaic.py) | [config](/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_mosaic_affine.py) | [config](/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_mosaic_affine_albu_hsv.py) | [config](/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py) | +| ---------- | ------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------- | +| Mosaic | | √ | √ | √ | √ | +| Affine | | | √ | √ | √ | +| Albu | | | | √ | √ | +| HSV | | | | √ | √ | +| Flip | | | | | √ | +| Val mAP | 0.507 | 0.550 | 0.572 | 0.567 | 0.575 | + +The results indicate that mosaic augmentation and random affine transformation can significantly improve the performance on the validation set. + +#### Using pre-trained models + +If you prefer not to use pre-trained weights, you can simply set `load_from = None` in the config file. For experiments that do not use pre-trained weights, it is recommended to increase the base learning rate by a factor of four and extend the number of training epochs to 200 to ensure adequate model training. + +| Model | Epoch(best) | FLOPs(G) | Params(M) | Pretrain | Val mAP | Config | +| -------- | ----------- | -------- | --------- | -------- | ------- | ------------------------------------------------------------------------------------------------ | +| YOLOv5-s | 100(82) | 7.95 | 7.04 | Coco | 0.575 | [config](/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py) | +| YOLOv5-s | 200(145) | 7.95 | 7.04 | None | 0.565 | [config](/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-200e_ionogram_pre0.py) | +| YOLOv6-s | 100(54) | 24.2 | 18.84 | Coco | 0.584 | [config](/projects/misc/ionogram_detection/yolov6/yolov6_s_fast_1xb32-100e_ionogram.py) | +| YOLOv6-s | 200(188) | 24.2 | 18.84 | None | 0.557 | [config](/projects/misc/ionogram_detection/yolov6/yolov6_s_fast_1xb32-200e_ionogram_pre0.py) | + +
+ + +Comparison of loss reduction during training + +
+ +The loss reduction curve shows that when using pre-trained weights, the loss decreases faster. It can be seen that even using models pre-trained on natural image datasets can accelerate model convergence when fine-tuned on radar image datasets. + +### Benchmark for ionogram object detection + +| Model | epoch(best) | FLOPs(G) | Params(M) | pretrain | val mAP | test mAP | Config | Log | +| ----------- | ----------- | -------- | --------- | -------- | ------- | -------- | ------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------- | +| YOLOv5-s | 100(82) | 7.95 | 7.04 | Coco | 0.575 | 0.584 | [config](/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py) | [log](https://github.com/VoyagerXvoyagerx/Ionogram_detection/blob/main/logs/yolov5_s_20230105_213510.json) | +| YOLOv5-m | 100(70) | 24.05 | 20.89 | Coco | 0.587 | 0.586 | [config](/projects/misc/ionogram_detection/yolov5/yolov5_m-v61_fast_1xb32-100e_ionogram.py) | [log](https://github.com/VoyagerXvoyagerx/Ionogram_detection/blob/main/logs/yolov5_m_20230106_004642.json) | +| YOLOv6-s | 100(54) | 24.2 | 18.84 | Coco | 0.584 | 0.594 | [config](/projects/misc/ionogram_detection/yolov6/yolov6_s_fast_1xb32-100e_ionogram.py) | [log](https://github.com/VoyagerXvoyagerx/Ionogram_detection/blob/main/logs/yolov6_s_20230107_003207.json) | +| YOLOv6-m | 100(76) | 37.08 | 44.42 | Coco | 0.590 | 0.590 | [config](/projects/misc/ionogram_detection/yolov6/yolov6_m_fast_1xb32-100e_ionogram.py) | [log](https://github.com/VoyagerXvoyagerx/Ionogram_detection/blob/main/logs/yolov6_m_20230107_201029.json) | +| YOLOv6-l | 100(76) | 71.33 | 58.47 | Coco | 0.605 | 0.597 | [config](/projects/misc/ionogram_detection/yolov6/yolov6_l_fast_1xb32-100e_ionogram.py) | [log](https://github.com/VoyagerXvoyagerx/Ionogram_detection/blob/main/logs/yolov6_l_20230108_005634.json) | +| YOLOv7-tiny | 100(78) | 6.57 | 6.02 | Coco | 0.549 | 0.568 | [config](/projects/misc/ionogram_detection/yolov7/yolov7_tiny_fast_1xb16-100e_ionogram.py) | [log](https://github.com/VoyagerXvoyagerx/Ionogram_detection/blob/main/logs/yolov7_tiny_20230215_202837.json) | +| YOLOv7-x | 100(58) | 94.27 | 70.85 | Coco | 0.602 | 0.595 | [config](/projects/misc/ionogram_detection/yolov7/yolov7_x_fast_1xb16-100e_ionogram.py) | [log](https://github.com/VoyagerXvoyagerx/Ionogram_detection/blob/main/logs/yolov7_x_20230110_165832.json) | +| rtmdet-tiny | 100(100) | 8.03 | 4.88 | Coco | 0.582 | 0.589 | [config](/projects/misc/ionogram_detection/rtmdet/rtmdet_tiny_fast_1xb32-100e_ionogram.py) | [log](https://github.com/VoyagerXvoyagerx/Ionogram_detection/blob/main/logs/rtmdet_tiny_20230310_125440.json) | +| rtmdet-s | 100(92) | 14.76 | 8.86 | Coco | 0.588 | 0.585 | [config](/projects/misc/ionogram_detection/rtmdet/rtmdet_s_fast_1xb32-100e_ionogram.py) | [log](https://github.com/VoyagerXvoyagerx/Ionogram_detection/blob/main/logs/rtmdet_s_20230310_163853.json) | diff --git a/docs/en/recommended_topics/deploy/mmdeploy_guide.md b/docs/en/recommended_topics/deploy/mmdeploy_guide.md index 69258f540..096d39fbc 100644 --- a/docs/en/recommended_topics/deploy/mmdeploy_guide.md +++ b/docs/en/recommended_topics/deploy/mmdeploy_guide.md @@ -4,7 +4,7 @@ MMDeploy is an open-source deep learning model deployment toolset. 
It is a part of the [OpenMMLab](https://openmmlab.com/) project, and provides **a unified experience of exporting different models** to various platforms and devices of the OpenMMLab series libraries. Using MMDeploy, developers can easily export the specific compiled SDK they need from the training result, which saves a lot of effort. -More detailed introduction and guides can be found [here](https://github.com/open-mmlab/mmdeploy/blob/dev-1.x/docs/en/get_started.md) +More detailed introduction and guides can be found [here](https://mmdeploy.readthedocs.io/en/latest/get_started.html) ## Supported Algorithms @@ -19,6 +19,14 @@ Currently our deployment kit supports on the following models and backends: Note: ncnn and other inference backends support are coming soon. +## Installation + +Please install mmdeploy by following [this](https://mmdeploy.readthedocs.io/en/latest/get_started.html) guide. + +```{note} +If you install mmdeploy prebuilt package, please also clone its repository by 'git clone https://github.com/open-mmlab/mmdeploy.git --depth=1' to get the 'tools' file for deployment. +``` + ## How to Write Config for MMYOLO All config files related to the deployment are located at [`configs/deploy`](../../../configs/deploy/). @@ -45,7 +53,7 @@ codebase_config = dict( - `score_threshold`: set the score threshold to filter candidate bboxes before `nms` - `confidence_threshold`: set the confidence threshold to filter candidate bboxes before `nms` -- `iou_threshold`: set the `iou` threshold for removing duplicates in `nums` +- `iou_threshold`: set the `iou` threshold for removing duplicates in `nms` - `max_output_boxes_per_class`: set the maximum number of bboxes for each class - `pre_top_k`: set the number of fixedcandidate bboxes before `nms`, sorted by scores - `keep_top_k`: set the number of output candidate bboxs after `nms` @@ -61,7 +69,7 @@ Taking `YOLOv5` of MMYOLO as an example, here are the details: _base_ = '../../yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py' test_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict( type='LetterResize', scale=_base_.img_scale, @@ -87,7 +95,7 @@ test_dataloader = dict( #### 2. Deployment Config -Here we still use the `YOLOv5` in MMYOLO as the example. We can use [`detection_onnxruntime_static.py`](https://github.com/open-mmlab/mmyolo/blob/main/configs/deploy/detection_onnxruntime_static.py) as the config to deploy \`YOLOv5\` to \`ONNXRuntim\` with static inputs. +Here we still use the `YOLOv5` in MMYOLO as the example. We can use [`detection_onnxruntime_static.py`](https://github.com/open-mmlab/mmyolo/blob/main/configs/deploy/detection_onnxruntime_static.py) as the config to deploy `YOLOv5` to `ONNXRuntime` with static inputs. ```python _base_ = ['./base_static.py'] @@ -128,7 +136,7 @@ backend_config = dict( use_efficientnms = False ``` -`backend_config` indices the backend with `type=‘tensorrt’`. +`backend_config` indices the backend with `type='tensorrt'`. Different from `ONNXRuntime` deployment configuration, `TensorRT` needs to specify the input image size and the parameters required to build the engine file, including: @@ -206,6 +214,8 @@ Note: Int8 quantization support will soon be released. ### Usage +#### Deploy with MMDeploy Tools + Set the root directory of `MMDeploy` as an env parameter `MMDEPLOY_DIR` using `export MMDEPLOY_DIR=/the/root/path/of/MMDeploy` command. 
```shell @@ -237,6 +247,125 @@ python3 ${MMDEPLOY_DIR}/tools/deploy.py \ - `--show`: show the result on screen or not - `--dump-info`: output SDK information or not +#### Deploy with MMDeploy API + +Suppose the working directory is the root path of mmyolo. Take [YoloV5](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py) model as an example. You can download its checkpoint from [here](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth), and then convert it to onnx model as follows: + +```python +from mmdeploy.apis import torch2onnx +from mmdeploy.backend.sdk.export_info import export2SDK + +img = 'demo/demo.jpg' +work_dir = 'mmdeploy_models/mmyolo/onnx' +save_file = 'end2end.onnx' +deploy_cfg = 'configs/deploy/detection_onnxruntime_dynamic.py' +model_cfg = 'configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py' +model_checkpoint = 'checkpoints/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth' +device = 'cpu' + +# 1. convert model to onnx +torch2onnx(img, work_dir, save_file, deploy_cfg, model_cfg, + model_checkpoint, device) + +# 2. extract pipeline info for inference by MMDeploy SDK +export2SDK(deploy_cfg, model_cfg, work_dir, pth=model_checkpoint, + device=device) +``` + +## Model specification + +Before moving on to model inference chapter, let's know more about the converted result structure which is very important for model inference. It is saved in the directory specified with `--wodk_dir`. + +The converted results are saved in the working directory `mmdeploy_models/mmyolo/onnx` in the previous example. It includes: + +``` +mmdeploy_models/mmyolo/onnx +├── deploy.json +├── detail.json +├── end2end.onnx +└── pipeline.json +``` + +in which, + +- **end2end.onnx**: backend model which can be inferred by ONNX Runtime +- ***xxx*.json**: the necessary information for mmdeploy SDK + +The whole package **mmdeploy_models/mmyolo/onnx** is defined as **mmdeploy SDK model**, i.e., **mmdeploy SDK model** includes both backend model and inference meta information. + +## Model inference + +### Backend model inference + +Take the previous converted `end2end.onnx` model as an example, you can use the following code to inference the model and visualize the results. + +```python +from mmdeploy.apis.utils import build_task_processor +from mmdeploy.utils import get_input_shape, load_config +import torch + +deploy_cfg = 'configs/deploy/detection_onnxruntime_dynamic.py' +model_cfg = 'configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py' +device = 'cpu' +backend_model = ['mmdeploy_models/mmyolo/onnx/end2end.onnx'] +image = 'demo/demo.jpg' + +# read deploy_cfg and model_cfg +deploy_cfg, model_cfg = load_config(deploy_cfg, model_cfg) + +# build task and backend model +task_processor = build_task_processor(model_cfg, deploy_cfg, device) +model = task_processor.build_backend_model(backend_model) + +# process input image +input_shape = get_input_shape(deploy_cfg) +model_inputs, _ = task_processor.create_input(image, input_shape) + +# do model inference +with torch.no_grad(): + result = model.test_step(model_inputs) + +# visualize results +task_processor.visualize( + image=image, + model=model, + result=result[0], + window_name='visualize', + output_file='work_dir/output_detection.png') +``` + +With the above code, you can find the inference result `output_detection.png` in `work_dir`. 
+ +### SDK model inference + +You can also perform SDK model inference like following, + +```python +from mmdeploy_runtime import Detector +import cv2 + +img = cv2.imread('demo/demo.jpg') + +# create a detector +detector = Detector(model_path='mmdeploy_models/mmyolo/onnx', + device_name='cpu', device_id=0) +# perform inference +bboxes, labels, masks = detector(img) + +# visualize inference result +indices = [i for i in range(len(bboxes))] +for index, bbox, label_id in zip(indices, bboxes, labels): + [left, top, right, bottom], score = bbox[0:4].astype(int), bbox[4] + if score < 0.3: + continue + + cv2.rectangle(img, (left, top), (right, bottom), (0, 255, 0)) + +cv2.imwrite('work_dir/output_detection.png', img) +``` + +Besides python API, mmdeploy SDK also provides other FFI (Foreign Function Interface), such as C, C++, C#, Java and so on. You can learn their usage from [demos](https://github.com/open-mmlab/mmdeploy/tree/main/demo). + ## How to Evaluate Model ### Usage @@ -248,40 +377,38 @@ python3 ${MMDEPLOY_DIR}/tools/test.py \ ${DEPLOY_CFG} \ ${MODEL_CFG} \ --model ${BACKEND_MODEL_FILES} \ - [--out ${OUTPUT_PKL_FILE}] \ - [--format-only] \ - [--metrics ${METRICS}] \ - [--show] \ - [--show-dir ${OUTPUT_IMAGE_DIR}] \ - [--show-score-thr ${SHOW_SCORE_THR}] \ --device ${DEVICE} \ + --work-dir ${WORK_DIR} \ [--cfg-options ${CFG_OPTIONS}] \ - [--metric-options ${METRIC_OPTIONS}] + [--show] \ + [--show-dir ${OUTPUT_IMAGE_DIR}] \ + [--interval ${INTERVAL}] \ + [--wait-time ${WAIT_TIME}] \ [--log2file work_dirs/output.txt] - [--batch-size ${BATCH_SIZE}] [--speed-test] \ [--warmup ${WARM_UP}] \ - [--log-interval ${LOG_INTERVERL}] + [--log-interval ${LOG_INTERVERL}] \ + [--batch-size ${BATCH_SIZE}] \ + [--uri ${URI}] ``` ### Parameter Description -- `deploy_cfg`: set the deployment config file path -- `model_cfg`: set the MMYOLO model config file path -- `--model`: set the converted model. For example, if we exported a TensorRT model, we need to pass in the file path with the suffix ".engine" -- `--out`: save the output result in pickle format, use only when you need it -- `--format-only`: format the output without evaluating it. It is useful when you want to format the result into a specific format and submit it to a test server -- `--metrics`: use the specific metric supported in MMYOLO to evaluate, such as "proposal" in COCO format data. -- `--show`: show the evaluation result on screen or not -- `--show-dir`: save the evaluation result to this directory, valid only when specified -- `--show-score-thr`: show the threshold for the detected bboxes or not -- `--device`: indicate the device to run the model. Note that some backends limit the running devices. For example, TensorRT must run on CUDA -- `--cfg-options`: pass in additional configs, which will override the current deployment configs -- `--metric-options`: add custom options for metrics. The key-value pair format in xxx=yyy will be the kwargs of the dataset.evaluate() method -- `--log2file`: save the evaluation results (with the speed) to a file -- `--batch-size`: set the batch size for inference, which will override the `samples_per_gpu` in data config. The default value is `1`, however, not every model supports `batch_size > 1` -- `--speed-test`: test the inference speed or not -- `--warmup`: warm up before speed test or not, works only when `speed-test` is specified -- `--log-interval`: set the interval between each log, works only when `speed-test` is specified +- `deploy_cfg`: set the deployment config file path. 
+- `model_cfg`: set the MMYOLO model config file path. +- `--model`: set the converted model. For example, if we exported a TensorRT model, we need to pass in the file path with the suffix ".engine". +- `--device`: indicate the device to run the model. Note that some backends limit the running devices. For example, TensorRT must run on CUDA. +- `--work-dir`: the directory to save the file containing evaluation metrics. +- `--cfg-options`: pass in additional configs, which will override the current deployment configs. +- `--show`: show the evaluation result on screen or not. +- `--show-dir`: save the evaluation result to this directory, valid only when specified. +- `--interval`: set the display interval between each two evaluation results. +- `--wait-time`: set the display time of each window. +- `--log2file`: log evaluation results and speed to file. +- `--speed-test`: test the inference speed or not. +- `--warmup`: warm up before speed test or not, works only when `speed-test` is specified. +- `--log-interval`: the interval between each log, works only when `speed-test` is specified. +- `--batch-size`: set the batch size for inference, which will override the `samples_per_gpu` in data config. The default value is `1`, however, not every model supports `batch_size > 1`. +- `--uri`: Remote ipv4:port or ipv6:port for inference on edge device. Note: other parameters in `${MMDEPLOY_DIR}/tools/test.py` are used for speed test, they will not affect the evaluation results. diff --git a/docs/en/recommended_topics/deploy/mmdeploy_yolov5.md b/docs/en/recommended_topics/deploy/mmdeploy_yolov5.md index 7eb85b24d..321a6734f 100644 --- a/docs/en/recommended_topics/deploy/mmdeploy_yolov5.md +++ b/docs/en/recommended_topics/deploy/mmdeploy_yolov5.md @@ -28,7 +28,7 @@ Here is a example in [`yolov5_s-static.py`](https://github.com/open-mmlab/mmyolo _base_ = '../../yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py' test_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict( type='LetterResize', scale=_base_.img_scale, @@ -113,7 +113,7 @@ batch_shapes_cfg = dict( extra_pad_ratio=0.5) test_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict(type='YOLOv5KeepRatioResize', scale=img_scale), dict( type='LetterResize', @@ -252,6 +252,7 @@ python3 ${MMDEPLOY_DIR}/tools/deploy.py \ --work-dir work_dir \ --show \ --device cpu + --dump-info ``` #### TensorRT @@ -265,19 +266,20 @@ python3 ${MMDEPLOY_DIR}/tools/deploy.py \ --work-dir work_dir \ --show \ --device cuda:0 + --dump-info ``` When convert the model using the above commands, you will find the following files under the `work_dir` folder: -![image](https://user-images.githubusercontent.com/92794867/199377596-605c3493-c1e0-435d-bc97-2e46846ac87d.png) +![image](https://github.com/open-mmlab/mmdeploy/assets/110151316/760f3f7f-aa23-46cf-987c-717d3490246f) or -![image](https://user-images.githubusercontent.com/92794867/199377848-a771f9c5-6bd6-49a1-9f58-e7e7b96c800f.png) +![image](https://github.com/open-mmlab/mmdeploy/assets/110151316/732bcd9a-fca0-40ba-b5af-540a47eb9c35) -After exporting to `onnxruntime`, you will get three files as shown in Figure 1, where `end2end.onnx` represents the exported `onnxruntime` model. 
+After exporting to `onnxruntime`, you will get six files as shown in Figure 1, where `end2end.onnx` represents the exported `onnxruntime` model. The `xxx.json` are the meta info for `MMDeploy SDK` inference. -After exporting to `TensorRT`, you will get the four files as shown in Figure 2, where `end2end.onnx` represents the exported intermediate model. `MMDeploy` uses this model to automatically continue to convert the `end2end.engine` model for `TensorRT `Deployment. +After exporting to `TensorRT`, you will get the seven files as shown in Figure 2, where `end2end.onnx` represents the exported intermediate model. `MMDeploy` uses this model to automatically continue to convert the `end2end.engine` model for `TensorRT `Deployment. The `xxx.json` are the meta info for `MMDeploy SDK` inference. ## How to Evaluate Model @@ -429,4 +431,142 @@ python3 ${MMDEPLOY_DIR}/tools/profiler.py \ ## Model Inference -TODO +### Backend Model Inference + +#### ONNXRuntime + +For the converted model `end2end.onnx`,you can do the inference with the following code: + +```python +from mmdeploy.apis.utils import build_task_processor +from mmdeploy.utils import get_input_shape, load_config +import torch + +deploy_cfg = './configs/deploy/detection_onnxruntime_dynamic.py' +model_cfg = '../mmyolo/configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py' +device = 'cpu' +backend_model = ['./work_dir/end2end.onnx'] +image = '../mmyolo/demo/demo.jpg' + +# read deploy_cfg and model_cfg +deploy_cfg, model_cfg = load_config(deploy_cfg, model_cfg) + +# build task and backend model +task_processor = build_task_processor(model_cfg, deploy_cfg, device) +model = task_processor.build_backend_model(backend_model) + +# process input image +input_shape = get_input_shape(deploy_cfg) +model_inputs, _ = task_processor.create_input(image, input_shape) + +# do model inference +with torch.no_grad(): + result = model.test_step(model_inputs) + +# visualize results +task_processor.visualize( + image=image, + model=model, + result=result[0], + window_name='visualize', + output_file='work_dir/output_detection.png') +``` + +#### TensorRT + +For the converted model `end2end.engine`,you can do the inference with the following code: + +```python +from mmdeploy.apis.utils import build_task_processor +from mmdeploy.utils import get_input_shape, load_config +import torch + +deploy_cfg = './configs/deploy/detection_tensorrt_dynamic-192x192-960x960.py' +model_cfg = '../mmyolo/configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py' +device = 'cuda:0' +backend_model = ['./work_dir/end2end.engine'] +image = '../mmyolo/demo/demo.jpg' + +# read deploy_cfg and model_cfg +deploy_cfg, model_cfg = load_config(deploy_cfg, model_cfg) + +# build task and backend model +task_processor = build_task_processor(model_cfg, deploy_cfg, device) +model = task_processor.build_backend_model(backend_model) + +# process input image +input_shape = get_input_shape(deploy_cfg) +model_inputs, _ = task_processor.create_input(image, input_shape) + +# do model inference +with torch.no_grad(): + result = model.test_step(model_inputs) + +# visualize results +task_processor.visualize( + image=image, + model=model, + result=result[0], + window_name='visualize', + output_file='work_dir/output_detection.png') +``` + +### SDK Model Inference + +#### ONNXRuntime + +For the converted model `end2end.onnx`,you can do the SDK inference with the following code: + +```python +from mmdeploy_runtime import Detector +import cv2 + +img = cv2.imread('../mmyolo/demo/demo.jpg') + +# create a detector 
+detector = Detector(model_path='work_dir', + device_name='cpu', device_id=0) +# perform inference +bboxes, labels, masks = detector(img) + +# visualize inference result +indices = [i for i in range(len(bboxes))] +for index, bbox, label_id in zip(indices, bboxes, labels): + [left, top, right, bottom], score = bbox[0:4].astype(int), bbox[4] + if score < 0.3: + continue + + cv2.rectangle(img, (left, top), (right, bottom), (0, 255, 0)) + +cv2.imwrite('work_dir/output_detection.png', img) +``` + +#### TensorRT + +For the converted model `end2end.engine`,you can do the SDK inference with the following code: + +```python +from mmdeploy_runtime import Detector +import cv2 + +img = cv2.imread('../mmyolo/demo/demo.jpg') + +# create a detector +detector = Detector(model_path='work_dir', + device_name='cuda', device_id=0) +# perform inference +bboxes, labels, masks = detector(img) + +# visualize inference result +indices = [i for i in range(len(bboxes))] +for index, bbox, label_id in zip(indices, bboxes, labels): + [left, top, right, bottom], score = bbox[0:4].astype(int), bbox[4] + if score < 0.3: + continue + + cv2.rectangle(img, (left, top), (right, bottom), (0, 255, 0)) + +cv2.imwrite('work_dir/output_detection.png', img) +``` + +Besides python API, mmdeploy SDK also provides other FFI (Foreign Function Interface), such as C, C++, C#, Java and so on. You can learn their usage from [demos](https://github.com/open-mmlab/mmdeploy/tree/main/demo). diff --git a/docs/en/recommended_topics/industry_examples.md b/docs/en/recommended_topics/industry_examples.md deleted file mode 100644 index 2380143b9..000000000 --- a/docs/en/recommended_topics/industry_examples.md +++ /dev/null @@ -1 +0,0 @@ -# MMYOLO industry examples diff --git a/docs/en/recommended_topics/training_testing_tricks.md b/docs/en/recommended_topics/training_testing_tricks.md new file mode 100644 index 000000000..48ce25f8b --- /dev/null +++ b/docs/en/recommended_topics/training_testing_tricks.md @@ -0,0 +1,310 @@ +# Training testing tricks + +MMYOLO has already supported most of the YOLO series object detection related algorithms. Different algorithms may involve some practical tricks. This section will describe in detail the commonly used training and testing tricks supported by MMYOLO based on the implemented object detection algorithms. + +## Training tricks + +### Improve performance of detection + +#### 1. Multi-scale training + +In the field of object detection, multi-scale training is a very common trick. However, in YOLO, most of the models are trained with a single-scale input of 640x640. There are two reasons for this: + +1. Single-scale training is faster than multi-scale training. When the training epoch is at 300 or 500, training efficiency is a major concern for users. Multi-scale training will be slower. +2. Multi-scale augmentation is implied in the training pipeline, which is equivalent to the application of multi-scale training, such as the 'Mosaic', 'RandomAffine' and 'Resize', so there is no need to introduce the multi-scale training of model input again. + +Through experiments on the COCO dataset, it is founded that the multi-scale training is introduced directly after the output of YOLOv5's DataLoader, the actual performance improvement is very small. 
If you want to start multi-scale training for YOLO series algorithms in MMYOLO, you can refer to [ms_training_testing](../common_usage/ms_training_testing.md).
+However, this does not mean that multi-scale training cannot bring significant gains when fine-tuning on a user-defined dataset.
+
+#### 2 Use Mask annotation to optimize object detection performance
+
+When a dataset carries more complete annotations than the task needs, for example both bounding box and instance segmentation annotations are available while only detection is required, training with the complete annotations can still improve performance.
+In object detection, instance segmentation annotations can also be exploited to improve detection accuracy. The following shows the performance gains obtained by YOLOv8 when additional instance segmentation annotations are introduced:
+
+ +
+
+As shown in the figure, models of different scales benefit to different degrees.
+It is important to note that 'Mask Refine' only takes effect in the data augmentation stage: it requires no changes to other parts of the training and does not affect the training speed. The details are as follows:
+
+
+ +
+
+The 'Mask' mentioned above refers to data augmentation transforms in which the instance segmentation annotations play a key role.
+Applying this technique to other YOLO series models brings varying degrees of improvement.
+
+#### 3 Turn off strong augmentation in the later stage of training to improve detection performance
+
+This strategy was first proposed in the YOLOX algorithm and can greatly improve detection performance.
+The paper points out that although Mosaic + MixUp can greatly improve detection performance, the augmented training images are far from the real distribution of natural images, and the heavy cropping in Mosaic introduces many inaccurate ground-truth boxes.
+Therefore, YOLOX turns off the strong augmentations in the last 15 epochs and uses weaker augmentations instead, so that the detector avoids the influence of inaccurate boxes and finishes converging on a data distribution close to natural images.
+
+This strategy has been applied to most YOLO algorithms. Taking YOLOv8 as an example, its data augmentation pipeline is shown as follows:
+
+
+ +
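+In MMYOLO configs this switch is typically implemented with a pipeline-switching hook. The following is a minimal sketch of how it is usually wired up; `max_epochs`, `close_mosaic_epochs` and `train_pipeline_stage2` (the weaker, second-stage pipeline) are assumed to be defined elsewhere in the config:
+
+```python
+custom_hooks = [
+    dict(
+        # switch from the strong pipeline to train_pipeline_stage2
+        # for the last `close_mosaic_epochs` epochs
+        type='mmdet.PipelineSwitchHook',
+        switch_epoch=max_epochs - close_mosaic_epochs,
+        switch_pipeline=train_pipeline_stage2)
+]
+```
+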
+
+However, when to turn off the strong augmentation is a hyper-parameter. If it is turned off too early, the benefits of Mosaic and the other strong augmentations may not be fully realized. If it is turned off too late, there is no gain because the model has already overfit. This phenomenon can be observed in the YOLOv8 experiments:
+
+| Backbone | Mask Refine | box AP      | Epoch of best mAP |
+| :------: | :---------: | :---------: | :---------------: |
+| YOLOv8-n | No          | 37.2        | 500               |
+| YOLOv8-n | Yes         | 37.4 (+0.2) | 499               |
+| YOLOv8-s | No          | 44.2        | 430               |
+| YOLOv8-s | Yes         | 45.1 (+0.9) | 460               |
+| YOLOv8-m | No          | 49.8        | 460               |
+| YOLOv8-m | Yes         | 50.6 (+0.8) | 480               |
+| YOLOv8-l | No          | 52.1        | 460               |
+| YOLOv8-l | Yes         | 53.0 (+0.9) | 491               |
+| YOLOv8-x | No          | 52.7        | 450               |
+| YOLOv8-x | Yes         | 54.0 (+1.3) | 460               |
+
+As can be seen from the above table:
+
+- Large models trained on the COCO dataset for 500 epochs are prone to overfitting, and disabling strong augmentations such as Mosaic may not be effective in reducing overfitting in such cases.
+- Using Mask annotations can alleviate overfitting and improve performance.
+
+#### 4 Add pure background images to suppress false positives
+
+For non-open-world object detection datasets, training and testing are conducted on a fixed set of classes, and false positives may appear when the model is applied to images containing classes it was never trained on. A common mitigation strategy is to add a certain proportion of pure background images.
+In most YOLO series configs, suppressing false positives with pure background images is enabled by default. Users only need to set `train_dataloader.dataset.filter_cfg.filter_empty_gt` to False, indicating that pure background images should not be filtered out during training.
+
+#### 5 Maybe AdamW works wonders
+
+YOLOv5, YOLOv6, YOLOv7 and YOLOv8 all adopt the SGD optimizer, which is demanding in terms of parameter settings, whereas AdamW is much less sensitive to the learning rate. Users fine-tuning on a custom dataset can therefore try the AdamW optimizer. We did a simple trial in YOLOX and found that replacing the optimizer with AdamW on the tiny, s, and m scale models all brought some improvement.
+
+| Backbone   | Size | Batch Size | RTMDet-Hyp | Box AP      |
+| :--------: | :--: | :--------: | :--------: | :---------: |
+| YOLOX-tiny | 416  | 8xb8       | No         | 32.7        |
+| YOLOX-tiny | 416  | 8xb32      | Yes        | 34.3 (+1.6) |
+| YOLOX-s    | 640  | 8xb8       | No         | 40.7        |
+| YOLOX-s    | 640  | 8xb32      | Yes        | 41.9 (+1.2) |
+| YOLOX-m    | 640  | 8xb8       | No         | 46.9        |
+| YOLOX-m    | 640  | 8xb32      | Yes        | 47.5 (+0.6) |
+
+More details can be found in [configs/yolox/README.md](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolox/README.md#--results-and-models).
+
+#### 6 Consider ignore scenarios to avoid uncertain annotations
+
+Take CrowdHuman, a crowded pedestrian detection dataset, as an example. Here is a typical image:
+
+
+ +
+
+The image is sourced from a [detectron2 issue](https://github.com/facebookresearch/detectron2/issues/1909). The areas marked with a yellow cross carry the `iscrowd` label, for two reasons:
+
+- The area is not a real person, such as the person on the poster.
+- The area is too crowded to annotate.
+
+In this scenario, you cannot simply delete such annotations, because deleting them means treating those regions as background during training. However, they are different from the background: the people on the posters look very similar to real people, and there are indeed real people in the crowded areas that are difficult to annotate. If you simply train these regions as background, false negatives will result. The best approach is to treat the crowded area as an ignored region, where any output is directly ignored, no loss is calculated, and no model fitting is enforced.
+
+MMYOLO provides a quick and easy way to verify the effect of `iscrowd` annotations on YOLOv5. The performance is as follows:
+
+| Backbone | ignore_iof_thr | box AP50(CrowdHuman Metric) | MR   | JI    |
+| :------: | :------------: | :-------------------------: | :--: | :---: |
+| YOLOv5-s | -1             | 85.79                       | 48.7 | 75.33 |
+| YOLOv5-s | 0.5            | 86.17                       | 48.8 | 75.87 |
+
+`ignore_iof_thr` set to -1 means that the ignored labels are not considered. As can be seen, considering them improves performance to a certain extent; more details can be found in [CrowdHuman results](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov5/README.md#crowdhuman). If you encounter similar situations in your custom dataset, it is recommended to consider using `ignore` labels to avoid uncertain annotations.
+
+#### 7 Use knowledge distillation
+
+Knowledge distillation is a widely used technique that can transfer the performance of a large model to a smaller model, thereby improving the detection performance of the smaller model. Currently, MMYOLO and MMRazor support this feature, and initial verification has been conducted on RTMDet.
+
+| Model          | box AP      |
+| :------------: | :---------: |
+| RTMDet-tiny    | 41.0        |
+| RTMDet-tiny \* | 41.8 (+0.8) |
+| RTMDet-s       | 44.6        |
+| RTMDet-s \*    | 45.7 (+1.1) |
+| RTMDet-m       | 49.3        |
+| RTMDet-m \*    | 50.2 (+0.9) |
+| RTMDet-l       | 51.4        |
+| RTMDet-l \*    | 52.3 (+0.9) |
+
+`*` indicates the result obtained with distillation from a larger model. More details can be found in [Distill RTMDet](https://github.com/open-mmlab/mmyolo/tree/main/configs/rtmdet/distillation).
+
+#### 8 Stronger augmentation parameters are used for larger models
+
+If you have modified the model based on the default configuration or replaced the backbone network, it is recommended to scale the data augmentation parameters to the current model size. Generally, larger models require stronger augmentation parameters, otherwise they cannot fully exploit the benefits of their capacity; conversely, applying strong augmentations to small models may lead to underfitting. Taking RTMDet as an example, we can observe the data augmentation parameters used for different model sizes.
+
+
+ +
+
+`random_resize_ratio_range` represents the random scaling range of `RandomResize`, and `mosaic_max_cached_images/mixup_max_cached_images` represents the number of cached images used during `Mosaic/MixUp` augmentation; both can be used to adjust the augmentation strength. All YOLO series models follow the same parameter-setting principles.
+
+### Accelerate training speed
+
+#### 1 Enable cudnn_benchmark for single-scale training
+
+Most YOLO series algorithms are trained with a fixed input image size, i.e., single-scale training. In this case, you can turn on `cudnn_benchmark` to accelerate training. This flag is passed to PyTorch's underlying cuDNN library and allows cuDNN to automatically search for the most efficient algorithm for the current configuration. If the flag is turned on in multi-scale mode, cuDNN will keep searching for the optimal algorithm, which may slow training down instead.
+
+To enable `cudnn_benchmark` in MMYOLO, you can set `env_cfg = dict(cudnn_benchmark=True)` in the configuration.
+
+#### 2 Use Mosaic and MixUp with caching
+
+If Mosaic and MixUp are used in your data augmentation pipeline and profiling shows that random image reading is the training bottleneck, it is recommended to replace the regular Mosaic and MixUp with the cache-enabled versions proposed in RTMDet.
+
+| Data Aug | Use cache | ms/100 imgs |
+| :------: | :-------: | :---------: |
+| Mosaic   | No        | 87.1        |
+| Mosaic   | Yes       | 24.0        |
+| MixUp    | No        | 19.3        |
+| MixUp    | Yes       | 12.4        |
+
+Mosaic and MixUp mix multiple images, so their time consumption is K times that of ordinary data augmentation (K is the number of images mixed). For example, every time YOLOv5 performs Mosaic, the information of 4 images has to be reloaded from the hard disk. The cached versions of Mosaic and MixUp, in contrast, only need to load the current image, while the remaining images involved in the mixing are obtained from a cache queue, greatly improving efficiency at the cost of some memory. A minimal sketch of the cache-enabled transforms is given right below.
+
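+Below is an illustrative sketch (assumptions: `pre_transform` and `img_scale` are defined earlier in the config, and the cache sizes are only examples) of what the cache-enabled transforms look like in an MMYOLO-style training pipeline:
+
+```python
+train_pipeline = [
+    *pre_transform,
+    # cache-enabled Mosaic: the mixed images are drawn from an in-memory queue
+    dict(
+        type='Mosaic',
+        img_scale=img_scale,
+        use_cached=True,
+        max_cached_images=40,
+        pad_val=114.0),
+    # cache-enabled MixUp as used by RTMDet
+    dict(
+        type='YOLOXMixUp',
+        use_cached=True,
+        max_cached_images=20),
+    # ... remaining transforms (RandomResize, PackDetInputs, etc.) omitted
+]
+```
+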
+data cache +
+
+As shown in the figure, N preloaded images and their label data are stored in the cache queue. In each training step, only one new image and its label data need to be loaded and pushed into the cache queue. (Images in the cache queue can be duplicated, as shown in the figure with img3 appearing twice.) If the length of the cache queue exceeds the preset length, a random image is popped out. When mixed data augmentation needs to be performed, the required images are randomly selected from the cache for concatenation or other processing, without loading them all from the hard disk, thus saving image loading time.
+
+### Reduce the number of hyperparameters
+
+YOLOv5 provides some practical methods for reducing the number of hyperparameters, which are described below.
+
+#### 1 Adaptive loss weighting, reducing one hyperparameter
+
+In general, it can be challenging to set loss-weight hyperparameters for different tasks or numbers of categories. Based on practical experience, YOLOv5 scales the loss weights adaptively according to the number of classes and the number of detection output layers, as shown below:
+
+```python
+# scaled based on number of detection layers
+loss_cls=dict(
+    type='mmdet.CrossEntropyLoss',
+    use_sigmoid=True,
+    reduction='mean',
+    loss_weight=loss_cls_weight *
+    (num_classes / 80 * 3 / num_det_layers)),
+loss_bbox=dict(
+    type='IoULoss',
+    iou_mode='ciou',
+    bbox_format='xywh',
+    eps=1e-7,
+    reduction='mean',
+    loss_weight=loss_bbox_weight * (3 / num_det_layers),
+    return_iou=True),
+loss_obj=dict(
+    type='mmdet.CrossEntropyLoss',
+    use_sigmoid=True,
+    reduction='mean',
+    loss_weight=loss_obj_weight *
+    ((img_scale[0] / 640)**2 * 3 / num_det_layers)),
+```
+
+`loss_cls` adaptively scales its `loss_weight` based on the custom number of classes and the number of detection layers, `loss_bbox` scales based on the number of detection layers, and `loss_obj` scales based on the input image size and the number of detection layers. This strategy frees users from setting loss weight hyperparameters by hand.
+It should be noted that this is an empirical rule rather than necessarily the optimal combination; treat it as a reference.
+
+#### 2 Adaptive Weight Decay and loss output values based on Batch Size, reducing two hyperparameters
+
+In general, when training with a different `Batch Size`, it is necessary to follow the rule of automatic learning rate scaling. However, validation on various datasets shows that YOLOv5 can achieve good results without scaling the learning rate when the Batch Size changes, and sometimes scaling even leads to worse results. The reason lies in the adaptation of `Weight Decay` and the loss output values to the `Batch Size` in the code: in YOLOv5, `Weight Decay` and the loss output values are scaled according to the total training `Batch Size`. The corresponding code is:
+
+```python
+# https://github.com/open-mmlab/mmyolo/blob/dev/mmyolo/engine/optimizers/yolov5_optim_constructor.py#L86
+if 'batch_size_per_gpu' in optimizer_cfg:
+    batch_size_per_gpu = optimizer_cfg.pop('batch_size_per_gpu')
+    # No scaling if total_batch_size is less than
+    # base_total_batch_size, otherwise linear scaling.
+    total_batch_size = get_world_size() * batch_size_per_gpu
+    accumulate = max(
+        round(self.base_total_batch_size / total_batch_size), 1)
+    scale_factor = total_batch_size * \
+        accumulate / self.base_total_batch_size
+    if scale_factor != 1:
+        weight_decay *= scale_factor
+        print_log(f'Scaled weight_decay to {weight_decay}', 'current')
+```
+
+```python
+# https://github.com/open-mmlab/mmyolo/blob/dev/mmyolo/models/dense_heads/yolov5_head.py#L635
+    _, world_size = get_dist_info()
+    return dict(
+        loss_cls=loss_cls * batch_size * world_size,
+        loss_obj=loss_obj * batch_size * world_size,
+        loss_bbox=loss_box * batch_size * world_size)
+```
+
+The loss weight therefore varies with the Batch Size: in general, the larger the Batch Size, the larger the loss and the gradient. We speculate that this is roughly equivalent to linearly increasing the learning rate as the Batch Size grows.
+In fact, the [YOLOv5 Study: mAP vs Batch-Size](https://github.com/ultralytics/yolov5/discussions/2452) shows that the goal is for users to obtain similar performance when changing the Batch Size without modifying any other parameters. The above two strategies are very good training techniques.
+
+### Save memory on GPU
+
+How to reduce training memory usage is a frequently discussed issue, and there are many techniques involved. The training executor of MMYOLO comes from MMEngine, so you can refer to the MMEngine documentation. Currently, MMEngine supports gradient accumulation, gradient checkpointing, and large-model training techniques; details can be found in
+[SAVE MEMORY ON GPU](https://mmengine.readthedocs.io/zh_CN/latest/common_usage/save_gpu_memory.html).
+
+## Testing tricks
+
+### Balance between inference speed and testing accuracy
+
+During model performance testing, we generally require a high mAP, but in practical applications or inference we want the model to run faster while keeping false positive and false negative rates low. In other words, benchmarking focuses only on mAP and ignores post-processing and inference speed, while practical applications pursue a balance between speed and accuracy.
+In the YOLO series, such a balance can be achieved by controlling certain parameters. Below, we describe this in detail using YOLOv5 as an example.
+
+#### 1 Avoiding multiple class outputs for a single detection box during inference
+
+YOLOv5 uses BCE Loss (`use_sigmoid=True`) when training the classification branch. Assuming there are 4 object categories, the classification branch outputs 4 class scores rather than 5. Moreover, because sigmoid rather than softmax prediction is used, several class scores at the same position may pass the filtering threshold, which means one predicted bbox can correspond to multiple predicted labels. This is shown in the figure below:
+
+multi-label +
+
+Generally, when calculating mAP, the filtering threshold is set to 0.001. Due to the non-competitive nature of sigmoid prediction, one box may then correspond to multiple labels. This increases the recall when computing mAP, but it is not very convenient for practical applications.
+
+One common approach is to raise the filtering threshold. However, if you do not want many missed detections, it is recommended to set the `multi_label` parameter to False instead. It is located at `model.test_cfg.multi_label` in the configuration file and defaults to True, which allows one detection box to correspond to multiple labels.
+
+#### 2 Simplify test pipeline
+
+Note that the test pipeline for YOLOv5 is as follows:
+
+```python
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='YOLOv5KeepRatioResize', scale=img_scale),
+    dict(
+        type='LetterResize',
+        scale=img_scale,
+        allow_scale_up=False,
+        pad_val=dict(img=114)),
+    dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'),
+    dict(
+        type='mmdet.PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'pad_param'))
+]
+```
+
+It uses two resize transforms with different purposes, with the aim of improving mAP during evaluation. For actual deployment, you can simplify the pipeline as shown below:
+
+```python
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='LetterResize',
+        scale=_base_.img_scale,
+        allow_scale_up=True,
+        use_mini_pad=True),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        type='mmdet.PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'pad_param'))
+]
+```
+
+In practical applications, the YOLOv5 algorithm uses the simplified pipeline with `multi_label` set to False, `score_thr` increased to 0.25, and `iou_threshold` reduced to 0.45 (a consolidated config sketch of these settings is given at the end of this document).
+The YOLOv5 configs provide a set of parameters intended for real-world deployment; see [yolov5_s-v61_syncbn-detect_8xb16-300e_coco.py](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov5/yolov5_s-v61_syncbn-detect_8xb16-300e_coco.py) for details.
+
+#### 3 Batch Shape speeds up testing
+
+Batch Shape is a testing technique proposed in YOLOv5 that can speed up inference. The idea is to no longer require that all images be 640x640 during testing, but to test at variable scales, as long as the shapes within the current batch are the same. This reduces the extra pixel padding and speeds up inference. The specific implementation of Batch Shape can be found in this [link](https://github.com/open-mmlab/mmyolo/blob/main/mmyolo/datasets/utils.py#L55).
+Almost all algorithms in MMYOLO enable the Batch Shape strategy by default during testing. If you want to disable this feature, set `val_dataloader.dataset.batch_shapes_cfg=None`.
+
+In practical applications, dynamic shapes are not as fast and efficient as fixed shapes, so this strategy is generally not used in real-world deployments.
+
+### TTA improves test accuracy
+
+Data augmentation with TTA (Test Time Augmentation) is a versatile trick that can improve the performance of object detection models and is particularly useful in competition scenarios. MMYOLO already supports TTA, and it can be enabled simply by adding `--tta` when testing. For more details, please refer to the [TTA](https://github.com/open-mmlab/mmyolo/blob/dev/docs/zh_cn/common_usage/tta.md) notes.
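+
+To tie the speed/accuracy trade-offs above together, the following is a minimal, illustrative override sketch (not the exact content of the official detect config) showing how the test-time settings discussed in the "Balance between inference speed and testing accuracy" section can be applied on top of a standard YOLOv5 config:
+
+```python
+_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py'
+
+model = dict(
+    test_cfg=dict(
+        # one label per detection box
+        multi_label=False,
+        # stricter score filtering for deployment
+        score_thr=0.25,
+        # looser NMS threshold, usually sufficient in practice
+        nms=dict(type='nms', iou_threshold=0.45)))
+```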
diff --git a/docs/en/recommended_topics/visualization.md b/docs/en/recommended_topics/visualization.md index 30caa9e11..f986648f3 100644 --- a/docs/en/recommended_topics/visualization.md +++ b/docs/en/recommended_topics/visualization.md @@ -90,7 +90,7 @@ The original `test_pipeline` is: test_pipeline = [ dict( type='LoadImageFromFile', - file_client_args=_base_.file_client_args), + backend_args=_base_.backend_args), dict(type='YOLOv5KeepRatioResize', scale=img_scale), dict( type='LetterResize', @@ -111,7 +111,7 @@ Change to the following version: test_pipeline = [ dict( type='LoadImageFromFile', - file_client_args=_base_.file_client_args), + backend_args=_base_.backend_args), dict(type='mmdet.Resize', scale=img_scale, keep_ratio=False), # change the LetterResize to mmdet.Resize dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), dict( @@ -197,7 +197,7 @@ The original `test_pipeline` is: test_pipeline = [ dict( type='LoadImageFromFile', - file_client_args=_base_.file_client_args), + backend_args=_base_.backend_args), dict(type='YOLOv5KeepRatioResize', scale=img_scale), dict( type='LetterResize', @@ -218,7 +218,7 @@ Change to the following version: test_pipeline = [ dict( type='LoadImageFromFile', - file_client_args=_base_.file_client_args), + backend_args=_base_.backend_args), dict(type='mmdet.Resize', scale=img_scale, keep_ratio=False), # change the LetterResize to mmdet.Resize dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), dict( diff --git a/docs/en/tutorials/config.md b/docs/en/tutorials/config.md index 01937f300..448452243 100644 --- a/docs/en/tutorials/config.md +++ b/docs/en/tutorials/config.md @@ -86,12 +86,10 @@ The training and testing data flow of YOLOv5 have a certain difference. We will ```python dataset_type = 'CocoDataset' # Dataset type, this will be used to define the dataset data_root = 'data/coco/' # Root path of data -file_client_args = dict(backend='disk') # file client arguments, default backend loads from local disk pre_transform = [ # Training data loading pipeline dict( - type='LoadImageFromFile', # First pipeline to load images from file path - file_client_args=file_client_args), # file client arguments, default backend loads from local disk + type='LoadImageFromFile'), # First pipeline to load images from file path dict(type='LoadAnnotations', # Second pipeline to load annotations for current image with_bbox=True) # Whether to use bounding box, True for detection ] @@ -156,8 +154,7 @@ In the testing phase of YOLOv5, the [Letter Resize](https://github.com/open-mmla ```python test_pipeline = [ # Validation/ Testing dataloader config dict( - type='LoadImageFromFile', # First pipeline to load images from file path - file_client_args=file_client_args), # file client arguments, default backend loads from local disk + type='LoadImageFromFile'), # First pipeline to load images from file path dict(type='YOLOv5KeepRatioResize', # Second pipeline to resize images with the same aspect ratio scale=img_scale), # Pipeline that resizes the images dict( @@ -475,8 +472,7 @@ train_pipeline = [ test_pipeline = [ dict( - type='LoadImageFromFile', - file_client_args={{_base_.file_client_args}}), + type='LoadImageFromFile'), dict(type='YOLOv5KeepRatioResize', scale=img_scale), dict( type='LetterResize', @@ -517,7 +513,6 @@ E.g: ```python _base_ = '../_base_/default_runtime.py' -file_client_args = {{_base_.file_client_args}} # `file_client_args` equals to `file_client_args` that defined in the _base_ config pre_transform = _base_.pre_transform # 
`pre_transform` equals to `pre_transform` in the _base_ config ``` diff --git a/docs/en/tutorials/custom_installation.md b/docs/en/tutorials/custom_installation.md index 327de64ec..604a77a30 100644 --- a/docs/en/tutorials/custom_installation.md +++ b/docs/en/tutorials/custom_installation.md @@ -75,7 +75,7 @@ thus we only need to install MMEngine, MMCV, MMDetection, and MMYOLO with the fo !pip3 install openmim !mim install "mmengine>=0.6.0" !mim install "mmcv>=2.0.0rc4,<2.1.0" -!mim install "mmdet>=3.0.0rc6,<3.1.0" +!mim install "mmdet>=3.0.0,<4.0.0" ``` **Step 2.** Install MMYOLO from the source. diff --git a/docs/en/tutorials/faq.md b/docs/en/tutorials/faq.md index b79f27207..ca2a0b25f 100644 --- a/docs/en/tutorials/faq.md +++ b/docs/en/tutorials/faq.md @@ -8,12 +8,94 @@ Why do we need to launch MMYOLO? Why do we need to open a separate repository in **(1) Unified operation and inference platform** -At present, there are very many improved algorithms for YOLO in the field of target detection, and they are very popular, but such algorithms are based on different frameworks for different back-end implementations, and there are large differences, lacking a unified and convenient fair evaluation process from training to deployment. +At present, there are very many improved algorithms for YOLO in the field of target detection, and they are very popular, but such algorithms are based on different frameworks for different back-end implementations, and there are significant differences, lacking a unified and convenient fair evaluation process from training to deployment. **(2) Protocol limitations** -As we all know, YOLOv5 and its derived algorithms such as YOLOv6 and YOLOv7 are GPL 3.0 protocols, which are different from the Apache protocol of MMDetection. Due to the protocol issue, it is not possible to incorporate MMYOLO directly into MMDetection. +As we all know, YOLOv5 and its derived algorithms, such as YOLOv6 and YOLOv7 are GPL 3.0 protocols, which differ from the Apache protocol of MMDetection. Therefore, due to the protocol issue, it is not possible to incorporate MMYOLO directly into MMDetection. **(3) Multitasking support** -There is another far-reaching reason: **MMYOLO tasks are not limited to MMDetection**, and more tasks will be supported in the future, such as MMPose based keypoint related applications and MMTracking based tracking related applications, so it is not suitable to be directly incorporated into MMDetection. +There is another far-reaching reason: **MMYOLO tasks are not limited to MMDetection**, and more tasks will be supported in the future, such as MMPose based keypoint-related applications and MMTracking based tracking related applications, so it is not suitable to be directly incorporated into MMDetection. + +## What is the projects folder used for? + +The `projects` folder is newly introduced in OpenMMLab 2.0. There are three primary purposes: + +1. facilitate community contributors: Since OpenMMLab series codebases have a rigorous code management process, this inevitably leads to long algorithm reproduction cycles, which is not friendly to community contributions. +2. facilitate rapid support for new algorithms: A long development cycle can also lead to another problem users may not be able to experience the latest algorithms as soon as possible. +3. facilitate rapid support for new approaches and features: New approaches or new features may be incompatible with the current design of the codebases and cannot be quickly incorporated. 
+ +In summary, the `projects` folder solves the problems of slow support for new algorithms and complicated support for new features due to the long algorithm reproduction cycle. Each folder in `projects` is an entirely independent project, and community users can quickly support some algorithms in the current version through `projects`. This allows the community to quickly use new algorithms and features that are difficult to adapt in the current version. When the design is stable or the code meets the merge specification, it will be considered to merge into the main branch. + +## Why does the performance drop significantly by switching the YOLOv5 backbone to Swin? + +In [Replace the backbone network](../recommended_topics/replace_backbone.md), we provide many tutorials on replacing the backbone module. However, you may not get a desired result once you replace the module and start directly training the model. This is because different networks have very distinct hyperparameters. Take the backbones of Swin and YOLOv5 as an example. Swin belongs to the transformer family, and the YOLOv5 is a convolutional network. Their training optimizers, learning rates, and other hyperparameters are different. If we force using Swin as the backbone of YOLOv5 and try to get a moderate performance, we must modify many parameters. + +## How to use the components implemented in all MM series repositories? + +In OpenMMLab 2.0, we have enhanced the ability to use different modules across MM series libraries. Currently, users can call any module that has been registered in MM series algorithm libraries via `MM Algorithm Library A. Module Name`. We demonstrated using MMClassification backbones in the [Replace the backbone network](../recommended_topics/replace_backbone.md). Other modules can be used in the same way. + +## Can pure background pictures be added in MMYOLO for training? + +Adding pure background images to training can suppress the false positive rate in most scenarios, and this feature has already been supported for most datasets. Take `YOLOv5CocoDataset` as an example. The control parameter is `train_dataloader.dataset.filter_cfg.filter_empty_gt`. If `filter_empty_gt` is True, the pure background images will be filtered out and not used in training, and vice versa. Most of the algorithms in MMYOLO have added this feature by default. + +## Is there a script to calculate the inference FPS in MMYOLO? + +MMYOLO is based on MMDet 3.x, which provides a [benchmark script](https://github.com/open-mmlab/mmdetection/blob/3.x/tools/analysis_tools/benchmark.py) to calculate the inference FPS. We recommend using `mim` to run the script in MMDet directly across the library instead of copying them to MMYOLO. More details about `mim` usages can be found at [Use mim to run scripts from other OpenMMLab repositories](../common_usage/mim_usage.md). + +## What is the difference between MMDeploy and EasyDeploy? + +MMDeploy is developed and maintained by the OpenMMLab deployment team to provide model deployment solutions for the OpenMMLab series algorithms, which support various inference backends and customization features. EasyDeploy is an easier and more lightweight deployment project provided by the community. However, it does not support as many features as MMDeploy. Users can choose which one to use in MMYOLO according to their needs. + +## How to check the AP of every category in COCOMetric? 
+ +Just set `test_evaluator.classwise` to True or add `--cfg-options test_evaluator.classwise=True` when running the test script. + +## Why doesn't MMYOLO support the auto-learning rate scaling feature as MMDet? + +It is because the YOLO series algorithms are not very well suited for linear scaling. We have verified on several datasets that the performance is better without the auto-scaling based on batch size. + +## Why is the weight size of my trained model larger than the official one? + +The reason is that user-trained weights usually include extra data such as `optimizer`, `ema_state_dict`, and `message_hub`, which are removed when we publish the models. While on the contrary, the weight users trained by themselves are kept. You can use the [publish_model.py](https://github.com/open-mmlab/mmyolo/blob/main/tools/misc/publish_model.py) to remove these unnecessary components. + +## Why does the RTMDet cost more graphics memory during the training than YOLOv5? + +It is due to the assigner in RTMDet. YOLOv5 uses a simple and efficient shape-matching assigner, while RTMDet uses a dynamic soft label assigner for entire batch computation. Therefore, it consumes more memory in its internal cost matrix, especially when there are too many labeled bboxes in the current batch. We are considering solving this problem soon. + +## Do I need to reinstall MMYOLO after modifying some code? + +Without adding any new python code, and if you installed the MMYOLO by `mim install -v -e .`, any new modifications will take effect without reinstalling. However, if you add new python codes and are using them, you need to reinstall with `mim install -v -e .`. + +## How to use multiple versions of MMYOLO to develop? + +If users have multiple versions of the MMYOLO, such as mmyolo-v1 and mmyolo-v2. They can specify the target version of their MMYOLO by using this command in the shell: + +```shell +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH +``` + +Users can unset the `PYTHONPATH` when they want to reset to the default MMYOLO by this command: + +```shell +unset PYTHONPATH +``` + +## How to save the best checkpoints during the training? + +Users can choose what metrics to filter the best models by setting the `default_hooks.checkpoint.save_best` in the configuration. Take the COCO dataset detection task as an example. Users can customize the `default_hooks.checkpoint.save_best` with these parameters: + +1. `auto` works based on the first evaluation metric in the validation set. +2. `coco/bbox_mAP` works based on `bbox_mAP`. +3. `coco/bbox_mAP_50` works based on `bbox_mAP_50`. +4. `coco/bbox_mAP_75` works based on `bbox_mAP_75`. +5. `coco/bbox_mAP_s` works based on `bbox_mAP_s`. +6. `coco/bbox_mAP_m` works based on `bbox_mAP_m`. +7. `coco/bbox_mAP_l` works based on `bbox_mAP_l`. + +In addition, users can also choose the filtering logic by setting `default_hooks.checkpoint.rule` in the configuration. For example, `default_hooks.checkpoint.rule=greater` means that the larger the indicator is, the better it is. More details can be found at [checkpoint_hook](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py). + +## How to train and test with non-square input sizes? + +The default configurations of the YOLO series algorithms are mostly squares like 640x640 or 1280x1280. However, if users want to train with a non-square shape, they can modify the `image_scale` to the desired value in the configuration. 
A more detailed example can be found at [yolov5_s-v61_fast_1xb12-40e_608x352_cat.py](https://github.com/open-mmlab/mmyolo/tree/dev/configs/yolov5/yolov5_s-v61_fast_1xb12-40e_608x352_cat.py). diff --git a/docs/en/tutorials/warning_notes.md b/docs/en/tutorials/warning_notes.md index 54ed973da..791cd9d4b 100644 --- a/docs/en/tutorials/warning_notes.md +++ b/docs/en/tutorials/warning_notes.md @@ -1 +1,24 @@ # Common Warning Notes
+
+The purpose of this document is to collect warning messages that users often find confusing and to provide explanations that facilitate understanding.
+
+## xxx registry in mmyolo did not set import location
+
+The complete warning message is: The xxx registry in mmyolo did not set import location. Fallback to call `mmyolo.utils.register_all_modules` instead.
+
+This warning means that the module did not specify an import location when it was registered, so its location cannot be determined when it is needed. Therefore, `mmyolo.utils.register_all_modules` is automatically called as a fallback to trigger the import of the whole package.
+This is a very low-level warning from MMEngine that may be difficult for users to understand, but it has no impact on actual use and can simply be ignored.
+
+## save_param_schedulers is true but self.param_schedulers is None
+
+The message above takes the YOLOv5 algorithm as an example. It appears because YOLOv5 rewrites the parameter scheduling strategy in `YOLOv5ParamSchedulerHook`, so the ParamScheduler designed in MMEngine is not used, while `save_param_schedulers` is not set to False in the YOLOv5 configuration.
+
+This warning has no impact on performance or on resuming training. If it affects your experience, you can set `default_hooks.checkpoint.save_param_scheduler` to False in the config, or set `--cfg-options default_hooks.checkpoint.save_param_scheduler=False` when training via the command line.
+
+## The loss_cls will be 0. This is a normal phenomenon.
+
+This is related to the specific algorithm. Taking YOLOv5 as an example, its classification loss only considers positive samples. If the number of classes is 1, the classification loss and the objectness loss are functionally redundant. Therefore, by design, loss_cls is not calculated and is always 0 when the number of classes is 1. This is a normal phenomenon.
+
+## The model and loaded state dict do not match exactly
+
+Whether this warning affects performance needs to be judged with more information. If it occurs during fine-tuning, it is normal: the COCO pre-trained weights of the head module cannot be loaded because the user's custom classes differ, and this does not affect performance. diff --git a/docs/en/useful_tools/log_analysis.md b/docs/en/useful_tools/log_analysis.md index 6b3e60402..c45170aaa 100644 --- a/docs/en/useful_tools/log_analysis.md +++ b/docs/en/useful_tools/log_analysis.md @@ -2,7 +2,7 @@ 

## Curve plotting 

-`tools/analysis_tools/analyze_logs.py` plots loss/mAP curves given a training log file. Run `pip install seaborn` first to install the dependency. 
+`tools/analysis_tools/analyze_logs.py` in MMDetection plots loss/mAP curves given a training log file. Run `pip install seaborn` first to install the dependency. 
```shell mim run mmdet analyze_logs plot_curve \ diff --git a/docs/zh_cn/common_usage/ms_training_testing.md b/docs/zh_cn/common_usage/ms_training_testing.md new file mode 100644 index 000000000..1f271c54d --- /dev/null +++ b/docs/zh_cn/common_usage/ms_training_testing.md @@ -0,0 +1,41 @@ +# 多尺度训练和测试 + +## 多尺度训练 + +MMYOLO 中目前支持了主流的 YOLOv5、YOLOv6、YOLOv7、YOLOv8 和 RTMDet 等算法,其默认配置均为单尺度 640x640 训练。 在 MM 系列开源库中常用的多尺度训练有两种实现方式: + +1. 在 `train_pipeline` 中输出的每张图都是不定尺度的,然后在 [DataPreprocessor](https://github.com/open-mmlab/mmdetection/blob/3.x/mmdet/models/data_preprocessors/data_preprocessor.py) 中将不同尺度的输入图片 + 通过 [stack_batch](https://github.com/open-mmlab/mmengine/blob/dbae83c52fa54d6dda08b6692b124217fe3b2135/mmengine/model/base_model/data_preprocessor.py#L260-L261) 函数填充到同一尺度,从而组成 batch 进行训练。MMDet 中大部分算法都是采用这个实现方式。 +2. 在 `train_pipeline` 中输出的每张图都是固定尺度的,然后直接在 `DataPreprocessor` 中进行 batch 张图片的上下采样,从而实现多尺度训练功能 + +在 MMYOLO 中两种多尺度训练方式都是支持的。理论上第一种实现方式所生成的尺度会更加丰富,但是由于其对单张图进行独立增强,训练效率不如第二种方式。所以我们更推荐使用第二种方式。 + +以 `configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py` 配置为例,其默认配置采用的是 640x640 固定尺度训练,假设想实现以 32 为倍数,且多尺度范围为 (480, 800) 的训练方式,则可以参考 YOLOX 做法通过 DataPreprocessor 中的 [YOLOXBatchSyncRandomResize](https://github.com/open-mmlab/mmyolo/blob/dc85144fab20a970341550794857a2f2f9b11564/mmyolo/models/data_preprocessors/data_preprocessor.py#L20) 实现。 + +在 `configs/yolov5` 路径下新建配置,命名为 `configs/yolov5/yolov5_s-v61_fast_1xb12-ms-40e_cat.py`,其内容如下: + +```python +_base_ = 'yolov5_s-v61_fast_1xb12-40e_cat.py' + +model = dict( + data_preprocessor=dict( + type='YOLOv5DetDataPreprocessor', + pad_size_divisor=32, + batch_augments=[ + dict( + type='YOLOXBatchSyncRandomResize', + # 多尺度范围是 480~800 + random_size_range=(480, 800), + # 输出尺度需要被 32 整除 + size_divisor=32, + # 每隔 1 个迭代改变一次输出输出 + interval=1) + ]) +) +``` + +上述配置就可以实现多尺度训练了。为了方便,我们已经在 `configs/yolov5/` 下已经提供了该配置。其余 YOLO 系列算法也是类似做法。 + +## 多尺度测试 + +MMYOLO 多尺度测试功能等同于测试时增强 TTA,目前已经支持,详情请查看 [测试时增强 TTA](./tta.md) 。 diff --git a/docs/zh_cn/get_started/15_minutes_instance_segmentation.md b/docs/zh_cn/get_started/15_minutes_instance_segmentation.md index 48fe3ca90..2b9e6aab8 100644 --- a/docs/zh_cn/get_started/15_minutes_instance_segmentation.md +++ b/docs/zh_cn/get_started/15_minutes_instance_segmentation.md @@ -1,3 +1,330 @@ # 15 分钟上手 MMYOLO 实例分割 +实例分割是计算机视觉中的一个任务,旨在将图像中的每个对象都分割出来,并为每个对象分配一个唯一的标识符。与语义分割不同,实例分割不仅分割出图像中的不同类别,还将同一类别的不同实例分开。 + +
+Instance Segmentation +
+ +以可供下载的气球 balloon 小数据集为例,带大家 15 分钟轻松上手 MMYOLO 实例分割。整个流程包含如下步骤: + +- [环境安装](#环境安装) +- [数据集准备](#数据集准备) +- [配置准备](#配置准备) +- [模型训练](#模型训练) +- [模型测试](#模型测试) +- [EasyDeploy 模型部署](#easydeploy-模型部署) + +本文以 YOLOv5-s 为例,其余 YOLO 系列算法的气球 balloon 小数据集 demo 配置请查看对应的算法配置文件夹下。 + +## 环境安装 + +假设你已经提前安装好了 Conda,接下来安装 PyTorch + +```shell +conda create -n mmyolo python=3.8 -y +conda activate mmyolo +# 如果你有 GPU +conda install pytorch torchvision -c pytorch +# 如果你是 CPU +# conda install pytorch torchvision cpuonly -c pytorch +``` + +安装 MMYOLO 和依赖库 + +```shell +git clone https://github.com/open-mmlab/mmyolo.git +cd mmyolo +pip install -U openmim +mim install -r requirements/mminstall.txt +# Install albumentations +mim install -r requirements/albu.txt +# Install MMYOLO +mim install -v -e . +# "-v" 指详细说明,或更多的输出 +# "-e" 表示在可编辑模式下安装项目,因此对代码所做的任何本地修改都会生效,从而无需重新安装。 +``` + +```{note} +温馨提醒:由于本仓库采用的是 OpenMMLab 2.0,请最好新建一个 conda 虚拟环境,防止和 OpenMMLab 1.0 已经安装的仓库冲突。 +``` + +详细环境配置操作请查看 [安装和验证](./installation.md) + +## 数据集准备 + +Balloon 数据集是一个包括 74 张图片的单类别数据集, 包括了训练所需的标注信息。 样例图片如下所示: + +
+balloon dataset +
+ +你只需执行如下命令即可下载并且直接用起来 + +```shell +python tools/misc/download_dataset.py --dataset-name balloon --save-dir ./data/balloon --unzip --delete +python ./tools/dataset_converters/balloon2coco.py +``` + +data 位于 mmyolo 工程目录下, `train.json`, `val.json` 中存放的是 COCO 格式的标注,`data/balloon/train`, `data/balloon/val` 中存放的是所有图片 + +## 配置准备 + +以 YOLOv5 算法为例,考虑到用户显存和内存有限,我们需要修改一些默认训练参数来让大家愉快的跑起来,核心需要修改的参数如下 + +- YOLOv5 是 Anchor-Based 类算法,不同的数据集需要自适应计算合适的 Anchor +- 默认配置是 8 卡,每张卡 batch size 为 16,现将其改成单卡,每张卡 batch size 为 4 +- 原则上 batch size 改变后,学习率也需要进行线性缩放,但是实测发现不需要 + +具体操作为在 `configs/yolov5/ins_seg` 文件夹下新建 `yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py` 配置文件(为了方便大家直接使用,我们已经提供了该配置),并把以下内容复制配置文件中。 + +```python +_base_ = './yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa + +data_root = 'data/balloon/' +# 训练集标注路径 +train_ann_file = 'train.json' +train_data_prefix = 'train/' # 训练集图片路径 +# 测试集标注路径 +val_ann_file = 'val.json' +val_data_prefix = 'val/' # 验证集图片路径 +metainfo = { + 'classes': ('balloon', ), # 数据集类别名称 + 'palette': [ + (220, 20, 60), + ] +} +num_classes = 1 +# 批处理大小batch size设置为 4 +train_batch_size_per_gpu = 4 +# dataloader 加载进程数 +train_num_workers = 2 +log_interval = 1 +##################### +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + data_root=data_root, + metainfo=metainfo, + data_prefix=dict(img=train_data_prefix), + ann_file=train_ann_file)) +val_dataloader = dict( + dataset=dict( + data_root=data_root, + metainfo=metainfo, + data_prefix=dict(img=val_data_prefix), + ann_file=val_ann_file)) +test_dataloader = val_dataloader +val_evaluator = dict(ann_file=data_root + val_ann_file) +test_evaluator = val_evaluator +default_hooks = dict(logger=dict(interval=log_interval)) +##################### + +model = dict(bbox_head=dict(head_module=dict(num_classes=num_classes))) +``` + +以上配置从 `yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py` 中继承,并根据 balloon 数据的特点更新了 `data_root`、`metainfo`、`train_dataloader`、`val_dataloader`、`num_classes` 等配置。 + +## 模型训练 + +```shell +python tools/train.py configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py +``` + +运行以上训练命令 `work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance` 文件夹会被自动生成,权重文件以及此次的训练配置文件将会保存在此文件夹中。 在 1660 低端显卡上,整个训练过程大概需要 30 分钟。 + +
+image +
+ +在 `val.json` 上性能如下所示: + +```text + Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.330 + Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.509 + Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.317 + Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000 + Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.103 + Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.417 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.150 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.396 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.454 + Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000 + Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.317 + Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.525 +``` + +上述性能是通过 COCO API 打印,其中 -1 表示不存在对于尺度的物体。 + +### 一些注意事项 + +在训练过程中会打印如下关键警告: + +- You are using `YOLOv5Head` with num_classes == 1. The loss_cls will be 0. This is a normal phenomenon. + +这个警告都不会对性能有任何影响。第一个警告是说明由于当前训练的类别数是 1,根据 YOLOv5 算法的社区, 分类分支的 loss 始终是 0,这是正常现象。 + +### 中断后恢复训练 + +如果训练中途停止,可以在训练命令最后加上 `--resume` ,程序会自动从 `work_dirs` 中加载最新的权重文件恢复训练。 + +```shell +python tools/train.py configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py --resume +``` + +### 节省显存策略 + +上述配置大概需要 1.0G 显存,如果你的显存不够,可以考虑开启混合精度训练 + +```shell +python tools/train.py configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py --amp +``` + +### 训练可视化 + +MMYOLO 目前支持本地、TensorBoard 以及 WandB 等多种后端可视化,默认是采用本地可视化方式,你可以切换为 WandB 等实时可视化训练过程中各类指标。 + +#### 1 WandB 可视化使用 + +WandB 官网注册并在 https://wandb.ai/settings 获取到 WandB 的 API Keys。 + +
+image +
+ +```shell +pip install wandb +# 运行了 wandb login 后输入上文中获取到的 API Keys ,便登录成功。 +wandb login +``` + +在 `configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py` 配置文件最后添加 WandB 配置 + +```python +visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')]) +``` + +重新运行训练命令便可以在命令行中提示的网页链接中看到 loss、学习率和 coco/bbox_mAP 等数据可视化了。 + +```shell +python tools/train.py configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py +``` + +#### 2 Tensorboard 可视化使用 + +安装 Tensorboard 环境 + +```shell +pip install tensorboard +``` + +同上述在配置文件 `configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py`配置的最后添加 `tensorboard` 配置 + +```python +visualizer = dict(vis_backends=[dict(type='LocalVisBackend'), dict(type='TensorboardVisBackend')]) +``` + +重新运行训练命令后,Tensorboard 文件会生成在可视化文件夹 `work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance/{timestamp}/vis_data` 下, +运行下面的命令便可以在网页链接使用 Tensorboard 查看 loss、学习率和 coco/bbox_mAP 等可视化数据了: + +```shell +tensorboard --logdir=work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance +``` + +## 模型测试 + +```shell +python tools/test.py configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py \ + work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance/best_coco_bbox_mAP_epoch_300.pth \ + --show-dir show_results +``` + +运行以上测试命令, 你不仅可以得到**模型训练**部分所打印的 AP 性能,还可以将推理结果图片自动保存至 `work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance/{timestamp}/show_results` 文件夹中。下面为其中一张结果图片,左图为实际标注,右图为模型推理结果。 + +
+result_img +
+ +如果你使用了 `WandbVisBackend` 或者 `TensorboardVisBackend`,则还可以在浏览器窗口可视化模型推理结果。 + +## 特征图相关可视化 + +MMYOLO 中提供了特征图相关可视化脚本,用于分析当前模型训练效果。 详细使用流程请参考 [特征图可视化](../recommended_topics/visualization.md) + +由于 `test_pipeline` 直接可视化会存在偏差,故将需要 `configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py` 中 `test_pipeline` + +```python +test_pipeline = [ + dict( + type='LoadImageFromFile', + backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] +``` + +修改为如下配置: + +```python +test_pipeline = [ + dict( + type='LoadImageFromFile', + backend_args=_base_.backend_args), + dict(type='mmdet.Resize', scale=img_scale, keep_ratio=False), # 删除 YOLOv5KeepRatioResize, 将 LetterResize 修改成 mmdet.Resize + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) # 删除 pad_param +] +``` + +我们选择 `data/balloon/train/3927754171_9011487133_b.jpg` 图片作为例子,可视化 YOLOv5 backbone 和 neck 层的输出特征图。 + +```shell +python demo/featmap_vis_demo.py data/balloon/train/3927754171_9011487133_b.jpg \ + configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py \ + work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance/best_coco_bbox_mAP_epoch_300.pth \ --target-layers backbone \ + --channel-reduction squeeze_mean +``` + +
+image +
+ +结果会保存到当前路径的 output 文件夹下。上图中绘制的 3 个输出特征图对应大中小输出特征图。 + +**2. 可视化 YOLOv5 neck 输出的 3 个通道** + +```shell +python demo/featmap_vis_demo.py data/balloon/train/3927754171_9011487133_b.jpg \ + configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py \ + work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance/best_coco_bbox_mAP_epoch_300.pth \ --target-layers neck \ + --channel-reduction squeeze_mean +``` + +
+image +
+ +**3. Grad-Based CAM 可视化** + TODO + +## EasyDeploy 模型部署 + +TODO + +至此本教程结束。 + +以上完整内容可以查看 [15_minutes_instance_segmentation.ipynb](../../../demo/15_minutes_instance_segmentation.ipynb)。 如果你在训练或者测试过程中碰到问题,请先查看 [常见错误排除步骤](../recommended_topics/troubleshooting_steps.md), 如果依然无法解决欢迎提 issue。 diff --git a/docs/zh_cn/get_started/15_minutes_object_detection.md b/docs/zh_cn/get_started/15_minutes_object_detection.md index aeac5e59b..51022baa9 100644 --- a/docs/zh_cn/get_started/15_minutes_object_detection.md +++ b/docs/zh_cn/get_started/15_minutes_object_detection.md @@ -256,7 +256,7 @@ python tools/train.py configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py #### 2 Tensorboard 可视化使用 -安装 Tensorboard 环境 +安装 Tensorboard 依赖 ```shell pip install tensorboard @@ -268,11 +268,11 @@ pip install tensorboard visualizer = dict(vis_backends=[dict(type='LocalVisBackend'), dict(type='TensorboardVisBackend')]) ``` -重新运行训练命令后,Tensorboard 文件会生成在可视化文件夹 `work_dirs/yolov5_s-v61_fast_1xb12-40e_cat.py/{timestamp}/vis_data` 下, +重新运行训练命令后,Tensorboard 文件会生成在可视化文件夹 `work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/{timestamp}/vis_data` 下, 运行下面的命令便可以在网页链接使用 Tensorboard 查看 loss、学习率和 coco/bbox_mAP 等可视化数据了: ```shell -tensorboard --logdir=work_dirs/yolov5_s-v61_fast_1xb12-40e_cat.py +tensorboard --logdir=work_dirs/yolov5_s-v61_fast_1xb12-40e_cat ``` ## 模型测试 @@ -301,7 +301,7 @@ MMYOLO 中提供了特征图相关可视化脚本,用于分析当前模型训 test_pipeline = [ dict( type='LoadImageFromFile', - file_client_args=_base_.file_client_args), + backend_args=_base_.backend_args), dict(type='YOLOv5KeepRatioResize', scale=img_scale), dict( type='LetterResize', @@ -322,13 +322,13 @@ test_pipeline = [ test_pipeline = [ dict( type='LoadImageFromFile', - file_client_args=_base_.file_client_args), - dict(type='mmdet.Resize', scale=img_scale, keep_ratio=False), # 这里将 LetterResize 修改成 mmdet.Resize + backend_args=_base_.backend_args), + dict(type='mmdet.Resize', scale=img_scale, keep_ratio=False), # 删除 YOLOv5KeepRatioResize, 将 LetterResize 修改成 mmdet.Resize dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), dict( type='mmdet.PackDetInputs', meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', - 'scale_factor')) + 'scale_factor')) # 删除 pad_param ] ``` @@ -370,6 +370,12 @@ python demo/featmap_vis_demo.py data/cat/images/IMG_20221020_112705.jpg \ 基于上述特征图可视化效果,我们可以分析特征层 bbox 级别的 Grad CAM。 +安装 `grad-cam` 依赖: + +```shell +pip install "grad-cam" +``` + (a) 查看 neck 输出的最小输出特征图的 Grad CAM ```shell @@ -416,28 +422,28 @@ python demo/boxam_vis_demo.py data/cat/images/IMG_20221020_112705.jpg \ 首先需要在当前 MMYOLO 的虚拟环境中按照 EasyDeploy 的 [基本文档](../../../projects/easydeploy/docs/model_convert.md) 对照自己的设备安装好所需的各个库。 ```shell -pip install onnx +pip install onnx onnxruntime pip install onnx-simplifier # 如果需要使用 simplify 功能需要安装 pip install tensorrt # 如果有 GPU 环境并且需要输出 TensorRT 模型需要继续执行 ``` -完成安装后就可以用以下命令对已经训练好的针对 cat 数据集的模型一键转换部署,当前设备的 ONNX 版本为 1.13.0,TensorRT 版本为 8.5.3.1,故可保持 `--opset` 为 11,其余各项参数的具体含义和参数值需要对照使用的 config 文件进行调整。此处我们先导出 CPU 版本的 ONNX 模型,`--backend` 为 1。 +完成安装后就可以用以下命令对已经训练好的针对 cat 数据集的模型一键转换部署,当前设备的 ONNX 版本为 1.13.0,TensorRT 版本为 8.5.3.1,故可保持 `--opset` 为 11,其余各项参数的具体含义和参数值需要对照使用的 config 文件进行调整。此处我们先导出 CPU 版本的 ONNX 模型,`--backend` 为 ONNXRUNTIME。 ```shell -python projects/easydeploy/tools/export.py \ - configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \ - work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \ - --work-dir work_dirs/yolov5_s-v61_fast_1xb12-40e_cat \ +python projects/easydeploy/tools/export_onnx.py \ + configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \ + 
work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \ + --work-dir work_dirs/yolov5_s-v61_fast_1xb12-40e_cat \ --img-size 640 640 \ --batch 1 \ --device cpu \ --simplify \ - --opset 11 \ - --backend 1 \ - --pre-topk 1000 \ - --keep-topk 100 \ - --iou-threshold 0.65 \ - --score-threshold 0.25 + --opset 11 \ + --backend ONNXRUNTIME \ + --pre-topk 1000 \ + --keep-topk 100 \ + --iou-threshold 0.65 \ + --score-threshold 0.25 ``` 成功运行后就可以在 `work-dir` 下得到转换后的 ONNX 模型,默认使用 `end2end.onnx` 命名。 @@ -446,7 +452,7 @@ python projects/easydeploy/tools/export.py \ ```shell python projects/easydeploy/tools/image-demo.py \ - data/cat/images/IMG_20210728_205312.jpg \ + data/cat/images/IMG_20210728_205117.jpg \ configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \ work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/end2end.onnx \ --device cpu @@ -488,7 +494,7 @@ python projects/easydeploy/tools/build_engine.py \ 成功执行后会在 `work-dir` 下生成 `end2end.engine` 文件: -```shell +```text work_dirs/yolov5_s-v61_fast_1xb12-40e_cat ├── 202302XX_XXXXXX │ ├── 202302XX_XXXXXX.log @@ -524,4 +530,4 @@ python projects/easydeploy/tools/image-demo.py \ 这样我们就完成了将训练完成的模型进行转换部署并且检查推理结果的工作。至此本教程结束。 -以上完整内容可以查看 [15_minutes_object_detection.ipynb](<>)。 如果你在训练或者测试过程中碰到问题,请先查看 [常见错误排除步骤](../recommended_topics/troubleshooting_steps.md), 如果依然无法解决欢迎提 issue。 +以上完整内容可以查看 [15_minutes_object_detection.ipynb](../../..//demo/15_minutes_object_detection.ipynb)。 如果你在训练或者测试过程中碰到问题,请先查看 [常见错误排除步骤](../recommended_topics/troubleshooting_steps.md),如果依然无法解决欢迎提 [issue](https://github.com/open-mmlab/mmyolo/issues/new/choose)。 diff --git a/docs/zh_cn/get_started/article.md b/docs/zh_cn/get_started/article.md index 0ec160c8b..07f75e42b 100644 --- a/docs/zh_cn/get_started/article.md +++ b/docs/zh_cn/get_started/article.md @@ -48,7 +48,7 @@ | :---: | :--------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | | 第1讲 | 源码阅读和调试「必备」技巧 | [![Link](https://i2.hdslb.com/bfs/archive/790d2422c879ff20488910da1c4422b667ea6af7.jpg@112w_63h_1c.webp)](https://www.bilibili.com/video/BV1N14y1V7mB) [![bilibili](https://img.shields.io/badge/dynamic/json?label=views&style=social&logo=bilibili&query=data.stat.view&url=https%3A%2F%2Fapi.bilibili.com%2Fx%2Fweb-interface%2Fview%3Fbvid%3DBV1N14y1V7mB)](https://www.bilibili.com/video/BV1N14y1V7mB) | [源码阅读和调试「必备」技巧文档](https://zhuanlan.zhihu.com/p/580885852) | | 第2讲 | 10分钟换遍主干网络 | [![Link](https://i0.hdslb.com/bfs/archive/c51f1aef7c605856777249a7b4478f44bd69f3bd.jpg@112w_63h_1c.webp)](https://www.bilibili.com/video/BV1JG4y1d7GC) [![bilibili](https://img.shields.io/badge/dynamic/json?label=views&style=social&logo=bilibili&query=data.stat.view&url=https%3A%2F%2Fapi.bilibili.com%2Fx%2Fweb-interface%2Fview%3Fbvid%3DBV1JG4y1d7GC)](https://www.bilibili.com/video/BV1JG4y1d7GC) | [10分钟换遍主干网络文档](https://zhuanlan.zhihu.com/p/585641598)
[10分钟换遍主干网络.ipynb](https://github.com/open-mmlab/OpenMMLabCourse/blob/main/codes/MMYOLO_tutorials/[实用类第二期]10分钟换遍主干网络.ipynb) | -| 第3讲 | 自定义数据集从标注到部署保姆级教程 | [![Link](https://i2.hdslb.com/bfs/archive/13f566c89a18c9c881713b63ec14da952d4c0b14.jpg@112w_63h_1c.webp)](https://www.bilibili.com/video/BV1RG4y137i5) [![bilibili](https://img.shields.io/badge/dynamic/json?label=views&style=social&logo=bilibili&query=data.stat.view&url=https%3A%2F%2Fapi.bilibili.com%2Fx%2Fweb-interface%2Fview%3Fbvid%3DBV1RG4y137i5)](https://www.bilibili.com/video/BV1RG4y137i5) | [自定义数据集从标注到部署保姆级教程](https://github.com/open-mmlab/mmyolo/blob/dev/docs/zh_cn/user_guides/custom_dataset.md) | +| 第3讲 | 自定义数据集从标注到部署保姆级教程 | [![Link](https://i2.hdslb.com/bfs/archive/13f566c89a18c9c881713b63ec14da952d4c0b14.jpg@112w_63h_1c.webp)](https://www.bilibili.com/video/BV1RG4y137i5) [![bilibili](https://img.shields.io/badge/dynamic/json?label=views&style=social&logo=bilibili&query=data.stat.view&url=https%3A%2F%2Fapi.bilibili.com%2Fx%2Fweb-interface%2Fview%3Fbvid%3DBV1RG4y137i5)](https://www.bilibili.com/video/BV1RG4y137i5) | [自定义数据集从标注到部署保姆级教程](../recommended_topics/labeling_to_deployment_tutorials.md) | | 第4讲 | 顶会第一步 · 模块自定义 | [![Link](http://i2.hdslb.com/bfs/archive/5b23d41ac57466824eaf185ef806ef734414e93b.jpg@112w_63h_1c.webp)](https://www.bilibili.com/video/BV1yd4y1j7VD) [![bilibili](https://img.shields.io/badge/dynamic/json?label=views&style=social&logo=bilibili&query=data.stat.view&url=https%3A%2F%2Fapi.bilibili.com%2Fx%2Fweb-interface%2Fview%3Fbvid%3DBV1yd4y1j7VD)](https://www.bilibili.com/video/BV1yd4y1j7VD) | [顶会第一步·模块自定义.ipynb](https://github.com/open-mmlab/OpenMMLabCourse/blob/main/codes/MMYOLO_tutorials/[实用类第四期]顶会第一步·模块自定义.ipynb) | #### 源码解读类 diff --git a/docs/zh_cn/get_started/dependencies.md b/docs/zh_cn/get_started/dependencies.md index b950519c7..8713c1393 100644 --- a/docs/zh_cn/get_started/dependencies.md +++ b/docs/zh_cn/get_started/dependencies.md @@ -4,7 +4,8 @@ | MMYOLO version | MMDetection version | MMEngine version | MMCV version | | :------------: | :----------------------: | :----------------------: | :---------------------: | -| main | mmdet>=3.0.0rc5, \<3.1.0 | mmengine>=0.3.1, \<1.0.0 | mmcv>=2.0.0rc4, \<2.1.0 | +| main | mmdet>=3.0.0, \<3.1.0 | mmengine>=0.7.1, \<1.0.0 | mmcv>=2.0.0rc4, \<2.1.0 | +| 0.6.0 | mmdet>=3.0.0, \<3.1.0 | mmengine>=0.7.1, \<1.0.0 | mmcv>=2.0.0rc4, \<2.1.0 | | 0.5.0 | mmdet>=3.0.0rc6, \<3.1.0 | mmengine>=0.6.0, \<1.0.0 | mmcv>=2.0.0rc4, \<2.1.0 | | 0.4.0 | mmdet>=3.0.0rc5, \<3.1.0 | mmengine>=0.3.1, \<1.0.0 | mmcv>=2.0.0rc0, \<2.1.0 | | 0.3.0 | mmdet>=3.0.0rc5, \<3.1.0 | mmengine>=0.3.1, \<1.0.0 | mmcv>=2.0.0rc0, \<2.1.0 | diff --git a/docs/zh_cn/get_started/installation.md b/docs/zh_cn/get_started/installation.md index 32927b6e6..be77bccc9 100644 --- a/docs/zh_cn/get_started/installation.md +++ b/docs/zh_cn/get_started/installation.md @@ -8,7 +8,7 @@ pip install -U openmim mim install "mmengine>=0.6.0" mim install "mmcv>=2.0.0rc4,<2.1.0" -mim install "mmdet>=3.0.0rc6,<3.1.0" +mim install "mmdet>=3.0.0,<4.0.0" ``` 如果你当前已经处于 mmyolo 工程目录下,则可以采用如下简化写法 diff --git a/docs/zh_cn/index.rst b/docs/zh_cn/index.rst index e32f76d8c..9f150ac6c 100644 --- a/docs/zh_cn/index.rst +++ b/docs/zh_cn/index.rst @@ -19,15 +19,16 @@ :caption: 推荐专题 recommended_topics/contributing.md + recommended_topics/training_testing_tricks.md recommended_topics/model_design.md recommended_topics/algorithm_descriptions/index.rst + recommended_topics/application_examples/index.rst 
recommended_topics/replace_backbone.md recommended_topics/complexity_analysis.md recommended_topics/labeling_to_deployment_tutorials.md recommended_topics/visualization.md recommended_topics/deploy/index.rst recommended_topics/troubleshooting_steps.md - recommended_topics/industry_examples.md recommended_topics/mm_basics.md recommended_topics/dataset_preparation.md @@ -38,6 +39,7 @@ common_usage/resume_training.md common_usage/syncbn.md common_usage/amp_training.md + common_usage/ms_training_testing.md common_usage/tta.md common_usage/plugins.md common_usage/freeze_layers.md diff --git a/docs/zh_cn/notes/changelog.md b/docs/zh_cn/notes/changelog.md index bd5110713..90fef5958 100644 --- a/docs/zh_cn/notes/changelog.md +++ b/docs/zh_cn/notes/changelog.md @@ -1,5 +1,43 @@ # 更新日志 +## v0.6.0 (15/8/2023) + +### 亮点 + +- 支持 YOLOv5 实例分割 +- 基于 MMPose 支持 YOLOX-Pose +- 添加 15 分钟的实例分割教程 +- YOLOv5 支持使用 mask 标注来优化边界框 +- 添加多尺度训练和测试文档 + +### 新特性 + +- 添加训练和测试技巧文档 (#659) +- 支持设置 `cache_size_limit` 参数,并支持 mmdet 3.0.0 (#707) +- 支持 YOLOv5u 和 YOLOv6 3.0 推理 (#624, #744) +- 支持仅模型推断 (#733) +- 添加 YOLOv8 deepstream 配置 (#633) +- 在 MMYOLO 应用程序中添加电离图示例 (#643) + +### Bug 修复 + +- 修复 browse_dataset 以可视化测试和验证集的问题 (#641) +- 修复安装文档错误 (#662) +- 修复 yolox-l ckpt 链接 (#677) +- 修正 YOLOv7 和 YOLOv8 图表中的拼写错误 (#621, #710) +- 调整 `boxam_vis_demo.py` 中包导入的顺序 (#655) + +### 完善 + +- 优化 `convert_kd_ckpt_to_student.py` 文件 (#647) +- 添加 FAQ 和 training_testing_tricks 的英文文档 (#691, #693) + +### 贡献者 + +总共 21 位开发者参与了本次版本 + +感谢 @Lum1104,@azure-wings,@FeiGeChuanShu,@Lingrui Gu,@Nioolek,@huayuan4396,@RangeKing,@danielhonies,@yechenzhi,@JosonChan1998,@kitecats,@Qingrenn,@triple-Mu,@kikefdezl,@zhangrui-wolf,@xin-li-67,@Ben-Louis,@zgzhengSEU,@VoyagerXvoyagerx,@tang576225574,@hhaAndroid + ## v0.5.0 (2/3/2023) ### 亮点 diff --git a/docs/zh_cn/notes/code_style.md b/docs/zh_cn/notes/code_style.md index fc6120ccf..6e169b371 100644 --- a/docs/zh_cn/notes/code_style.md +++ b/docs/zh_cn/notes/code_style.md @@ -324,9 +324,6 @@ docstring 是对一个类、一个函数功能与 API 接口的详细描述, specified, the ``out_dir`` will be the concatenation of ``out_dir`` and the last level directory of ``runner.work_dir``. Defaults to None. `Changed in version 1.3.15.` - file_client_args (dict, optional): Arguments to instantiate a - FileClient. See :class:`mmcv.fileio.FileClient` for details. - Defaults to None. `New in version 1.3.15.` Warning: Before v1.3.15, the ``out_dir`` argument indicates the path where the diff --git a/docs/zh_cn/recommended_topics/algorithm_descriptions/yolov8_description.md b/docs/zh_cn/recommended_topics/algorithm_descriptions/yolov8_description.md index d28ee4423..fb5e218db 100644 --- a/docs/zh_cn/recommended_topics/algorithm_descriptions/yolov8_description.md +++ b/docs/zh_cn/recommended_topics/algorithm_descriptions/yolov8_description.md @@ -3,7 +3,7 @@ ## 0 简介
-YOLOv8-P5_structure +YOLOv8-P5_structure 图 1:YOLOv8-P5 模型结构
@@ -203,7 +203,7 @@ python demo/featmap_vis_demo.py demo/demo.jpg configs/yolov8/yolov8_s_syncbn_fas test_pipeline = [ dict( type='LoadImageFromFile', - file_client_args=_base_.file_client_args), + backend_args=_base_.backend_args), dict(type='mmdet.Resize', scale=img_scale, keep_ratio=False), # 这里将 LetterResize 修改成 mmdet.Resize dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), dict( diff --git a/docs/zh_cn/recommended_topics/application_examples/index.rst b/docs/zh_cn/recommended_topics/application_examples/index.rst new file mode 100644 index 000000000..f552dbe5a --- /dev/null +++ b/docs/zh_cn/recommended_topics/application_examples/index.rst @@ -0,0 +1,7 @@ +MMYOLO 应用范例介绍 +******************** + +.. toctree:: + :maxdepth: 1 + + ionogram_detection.md diff --git a/docs/zh_cn/recommended_topics/application_examples/ionogram_detection.md b/docs/zh_cn/recommended_topics/application_examples/ionogram_detection.md new file mode 100644 index 000000000..84e6daf00 --- /dev/null +++ b/docs/zh_cn/recommended_topics/application_examples/ionogram_detection.md @@ -0,0 +1,306 @@ +# 基于 MMYOLO 的频高图实时目标检测 benchmark + +## 数据集构建 + +数字频高图是获取电离层实时信息最重要的途径。电离层结构检测对精准提取电离层关键参数,具有非常重要的研究意义。 + +利用中国科学院在海南、武汉、怀来获取的不同季节的 4311 张频高图建立数据集,使用 [labelme](https://github.com/wkentaro/labelme) 人工标注出 E 层、Es-c 层、Es-l 层、F1 层、F2 层、Spread F 层共 6 种结构。[数据集下载](https://github.com/VoyagerXvoyagerx/Ionogram_detection/releases/download/Dataset/Iono4311.zip) + +
+ + +使用 labelme 标注的图像预览 + +
+ +1. 数据集准备 + +下载数据后,放置在 MMYOLO 仓库的根目录下,使用 `unzip test.zip` 命令(linux)解压至当前文件夹。解压后的文件夹结构为: + +```shell +Iono4311/ +├── images +| ├── 20130401005200.png +| └── ... +└── labels + ├── 20130401005200.json + └── ... +``` + +其中,`images` 目录下存放输入图片,`labels` 目录下存放使用 labelme 标注得到的 json 文件。 + +2. 数据集格式转换 + +使用MMYOLO提供的 `tools/dataset_converters/labelme2coco.py` 脚本将 labelme 格式的标注文件转换为 COCO 格式的标注文件。 + +```shell +python tools/dataset_converters/labelme2coco.py --img-dir ./Iono4311/images \ + --labels-dir ./Iono4311/labels \ + --out ./Iono4311/annotations/annotations_all.json +``` + +3. 浏览数据集 + +使用下面的命令可以将 COCO 的 label 在图片上进行显示,这一步可以验证刚刚转换是否有问题。 + +```shell +python tools/analysis_tools/browse_coco_json.py --img-dir ./Iono4311/images \ + --ann-file ./Iono4311/annotations/annotations_all.json +``` + +4. 划分训练集、验证集、测试集 + +设置 70% 的图片为训练集,15% 作为验证集,15% 为测试集。 + +```shell +python tools/misc/coco_split.py --json ./Iono4311/annotations/annotations_all.json \ + --out-dir ./Iono4311/annotations \ + --ratios 0.7 0.15 0.15 \ + --shuffle \ + --seed 14 +``` + +划分后的文件夹结构: + +```shell +Iono4311/ +├── annotations +│ ├── annotations_all.json +│ ├── class_with_id.txt +│ ├── test.json +│ ├── train.json +│ └── val.json +├── classes_with_id.txt +├── images +├── labels +├── test_images +├── train_images +└── val_images +``` + +## 配置文件 + +配置文件存放在目录 `/projects/misc/ionogram_detection/` 下。 + +1. 数据集分析 + +使用 `tools/analysis_tools/dataset_analysis.py` 从数据集中采样 200 张图片进行可视化分析: + +```shell +python tools/analysis_tools/dataset_analysis.py projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py \ + --out-dir output +``` + +得到以下输出: + +```shell +The information obtained is as follows: ++------------------------------+ +| Information of dataset class | ++---------------+--------------+ +| Class name | Bbox num | ++---------------+--------------+ +| E | 98 | +| Es-l | 27 | +| Es-c | 46 | +| F1 | 100 | +| F2 | 194 | +| Spread-F | 6 | ++---------------+--------------+ +``` + +说明本数据集存在样本不均衡的现象。 + +
+ + +各类别目标大小统计 + +
+ +根据统计结果,E、Es-l、Esc、F1 类别以小目标居多,F2、Spread F 类主要是中等大小目标。 + +2. 可视化 config 中的数据处理部分 + +以 YOLOv5-s 为例,根据配置文件中的 `train_pipeline`,训练时采用的数据增强策略包括: + +- 马赛克增强 +- 随机仿射变换 +- Albumentations 数据增强工具包(包括多种数字图像处理方法) +- HSV 随机增强图像 +- 随机水平翻转 + +使用 `tools/analysis_tools/browse_dataset.py` 脚本的 **'pipeline'** 模式,可以可视化每个 pipeline 的输出效果: + +```shell +python tools/analysis_tools/browse_dataset.py projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py \ + -m pipeline \ + --out-dir output +``` + +
+ + +pipeline 输出可视化 + +
+ +3. 优化 Anchor 尺寸 + +使用分析工具中的 `tools/analysis_tools/optimize_anchors.py` 脚本得到适用于本数据集的先验锚框尺寸。 + +```shell +python tools/analysis_tools/optimize_anchors.py projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py \ + --algorithm v5-k-means \ + --input-shape 640 640 \ + --prior-match-thr 4.0 \ + --out-dir work_dirs/dataset_analysis_5_s +``` + +4. 模型复杂度分析 + +根据配置文件,使用分析工具中的 `tools/analysis_tools/get_flops.py` 脚本可以得到模型的参数量、浮点计算量等信息。以 YOLOv5-s 为例: + +```shell +python tools/analysis_tools/get_flops.py projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py +``` + +得到如下输出,表示模型的浮点运算量为 7.947G,一共有 7.036M 个可学习参数。 + +```shell +============================== +Input shape: torch.Size([640, 640]) +Model Flops: 7.947G +Model Parameters: 7.036M +============================== +``` + +## 训练和测试 + +1. 训练 + +训练可视化:本范例按照[标注+训练+测试+部署全流程](https://mmyolo.readthedocs.io/zh_CN/dev/recommended_topics/labeling_to_deployment_tutorials.html#id11)中的步骤安装和配置 [wandb](https://wandb.ai/site)。 + +调试技巧:在调试代码的过程中,有时需要训练几个 epoch,例如调试验证过程或者权重的保存是否符合期望。对于继承自 `BaseDataset` 的数据集(如本范例中的 `YOLOv5CocoDataset`),在 `train_dataloader` 中的 `dataset` 字段增加 `indices` 参数,即可指定每个 epoch 迭代的样本数,减少迭代时间。 + +```python +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + _delete_=True, + type='RepeatDataset', + times=1, + dataset=dict( + type=_base_.dataset_type, + indices=200, # 设置 indices=200,表示每个 epoch 只迭代 200 个样本 + data_root=data_root, + metainfo=metainfo, + ann_file=train_ann_file, + data_prefix=dict(img=train_data_prefix), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=_base_.train_pipeline))) +``` + +启动训练: + +```shell +python tools/train.py projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py +``` + +2. 测试 + +指定配置文件和模型的路径以启动测试: + +```shell +python tools/test.py projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py \ + work_dirs/yolov5_s-v61_fast_1xb96-100e_ionogram/xxx +``` + +## 实验与结果分析 + +### 选择合适的 batch size + +- Batch size 主导了训练速度。通常,理想的 batch size 是是硬件能支持的最大 batch size。 +- 当显存占用没有达到饱和时,如果 batch size 翻倍,训练吞吐量也应该翻倍(或接近翻倍),训练时间应该减半或接近减半。 +- 使用**混合精度训练**可以加快训练速度、减小显存。在执行 `train.py` 脚本时添加 `--amp` 参数即可开启。 + +硬件信息: + +- GPU:V100,显存 32G +- CPU:10核,内存 40G + +实验结果: + +| Model | Epoch(best) | AMP | Batchsize | Num workers | Memory Allocated | Training Time | Val mAP | +| -------- | ----------- | ----- | --------- | ----------- | ---------------- | ------------- | ------- | +| YOLOv5-s | 100(82) | False | 32 | 6 | 35.07% | 54 min | 0.575 | +| YOLOv5-s | 100(96) | True | 32 | 6 | 24.93% | 49 min | 0.578 | +| YOLOv5-s | 100(100) | False | 96 | 6 | 96.64% | 48 min | 0.571 | +| YOLOv5-s | 100(100) | True | 96 | 6 | 54.66% | **37** min | 0.575 | +| YOLOv5-s | 100(90) | True | 144 | 6 | 77.06% | 39 min | 0.573 | +| YOLOv5-s | 200(148) | True | 96 | 6 | 54.66% | 72 min | 0.575 | +| YOLOv5-s | 200(188) | True | 96 | **8** | 54.66% | 67 min | 0.576 | + +
+ + +不同 batch size 的训练过程中,数据加载时间 `data_time` 占每步总时长的比例 + +
+ +分析结果,可以得出以下结论: + +- 混合精度训练对模型的精度几乎没有影响,并且可以明显减少显存占用。 +- Batch size 增加 3 倍,和训练时长并没有相应地减小 3 倍。根据训练过程中 `data_time` 的记录,batch size 越大,`data_time` 也越大,说明数据加载成为了限制训练速度的瓶颈。增大加载数据的进程数 `num_workers` 可以加快数据加载。 + +### 消融实验 + +为了得到适用于本数据集的训练流水线,以 YOLOv5-s 模型为例,进行以下消融实验。 + +#### 不同数据增强方法 + +| Aug Method | [config](/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_aug0.py) | [config](/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb32-100e_ionogram_mosaic.py) | [config](/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_mosaic_affine.py) | [config](/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_mosaic_affine_albu_hsv.py) | [config](/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py) | +| ---------- | ------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------- | +| Mosaic | | √ | √ | √ | √ | +| Affine | | | √ | √ | √ | +| Albu | | | | √ | √ | +| HSV | | | | √ | √ | +| Flip | | | | | √ | +| Val mAP | 0.507 | 0.550 | 0.572 | 0.567 | 0.575 | + +结果表明,马赛克增强和随机仿射变换可以对验证集表现带来明显的提升。 + +#### 是否使用预训练权重 + +在配置文件中,修改 `load_from = None` 即可不使用预训练权重。对不使用预训练权重的实验,将基础学习率增大四倍,训练轮数增加至 200 轮,使模型得到较为充分的训练。 + +| Model | Epoch(best) | FLOPs(G) | Params(M) | Pretrain | Val mAP | Config | +| -------- | ----------- | -------- | --------- | -------- | ------- | ------------------------------------------------------------------------------------------------ | +| YOLOv5-s | 100(82) | 7.95 | 7.04 | Coco | 0.575 | [config](/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py) | +| YOLOv5-s | 200(145) | 7.95 | 7.04 | None | 0.565 | [config](/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-200e_ionogram_pre0.py) | +| YOLOv6-s | 100(54) | 24.2 | 18.84 | Coco | 0.584 | [config](/projects/misc/ionogram_detection/yolov6/yolov6_s_fast_1xb32-100e_ionogram.py) | +| YOLOv6-s | 200(188) | 24.2 | 18.84 | None | 0.557 | [config](/projects/misc/ionogram_detection/yolov6/yolov6_s_fast_1xb32-200e_ionogram_pre0.py) | + +
+ + +训练过程中的损失下降对比图 + +
+ +损失下降曲线表明,使用预训练权重时,loss 下降得更快。可见即使是自然图像数据集上预训练的模型,在雷达图像数据集上微调时,也可以加快模型收敛。 + +### 频高图结构检测 benchmark + +| Model | epoch(best) | FLOPs(G) | Params(M) | pretrain | val mAP | test mAP | Config | Log | +| ----------- | ----------- | -------- | --------- | -------- | ------- | -------- | ------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------- | +| YOLOv5-s | 100(82) | 7.95 | 7.04 | Coco | 0.575 | 0.584 | [config](/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py) | [log](https://github.com/VoyagerXvoyagerx/Ionogram_detection/blob/main/logs/yolov5_s_20230105_213510.json) | +| YOLOv5-m | 100(70) | 24.05 | 20.89 | Coco | 0.587 | 0.586 | [config](/projects/misc/ionogram_detection/yolov5/yolov5_m-v61_fast_1xb32-100e_ionogram.py) | [log](https://github.com/VoyagerXvoyagerx/Ionogram_detection/blob/main/logs/yolov5_m_20230106_004642.json) | +| YOLOv6-s | 100(54) | 24.2 | 18.84 | Coco | 0.584 | 0.594 | [config](/projects/misc/ionogram_detection/yolov6/yolov6_s_fast_1xb32-100e_ionogram.py) | [log](https://github.com/VoyagerXvoyagerx/Ionogram_detection/blob/main/logs/yolov6_s_20230107_003207.json) | +| YOLOv6-m | 100(76) | 37.08 | 44.42 | Coco | 0.590 | 0.590 | [config](/projects/misc/ionogram_detection/yolov6/yolov6_m_fast_1xb32-100e_ionogram.py) | [log](https://github.com/VoyagerXvoyagerx/Ionogram_detection/blob/main/logs/yolov6_m_20230107_201029.json) | +| YOLOv6-l | 100(76) | 71.33 | 58.47 | Coco | 0.605 | 0.597 | [config](/projects/misc/ionogram_detection/yolov6/yolov6_l_fast_1xb32-100e_ionogram.py) | [log](https://github.com/VoyagerXvoyagerx/Ionogram_detection/blob/main/logs/yolov6_l_20230108_005634.json) | +| YOLOv7-tiny | 100(78) | 6.57 | 6.02 | Coco | 0.549 | 0.568 | [config](/projects/misc/ionogram_detection/yolov7/yolov7_tiny_fast_1xb16-100e_ionogram.py) | [log](https://github.com/VoyagerXvoyagerx/Ionogram_detection/blob/main/logs/yolov7_tiny_20230215_202837.json) | +| YOLOv7-x | 100(58) | 94.27 | 70.85 | Coco | 0.602 | 0.595 | [config](/projects/misc/ionogram_detection/yolov7/yolov7_x_fast_1xb16-100e_ionogram.py) | [log](https://github.com/VoyagerXvoyagerx/Ionogram_detection/blob/main/logs/yolov7_x_20230110_165832.json) | +| rtmdet-tiny | 100(100) | 8.03 | 4.88 | Coco | 0.582 | 0.589 | [config](/projects/misc/ionogram_detection/rtmdet/rtmdet_tiny_fast_1xb32-100e_ionogram.py) | [log](https://github.com/VoyagerXvoyagerx/Ionogram_detection/blob/main/logs/rtmdet_tiny_20230310_125440.json) | +| rtmdet-s | 100(92) | 14.76 | 8.86 | Coco | 0.588 | 0.585 | [config](/projects/misc/ionogram_detection/rtmdet/rtmdet_s_fast_1xb32-100e_ionogram.py) | [log](https://github.com/VoyagerXvoyagerx/Ionogram_detection/blob/main/logs/rtmdet_s_20230310_163853.json) | diff --git a/docs/zh_cn/recommended_topics/deploy/mmdeploy_guide.md b/docs/zh_cn/recommended_topics/deploy/mmdeploy_guide.md index a6a98d3d4..e935d36e9 100644 --- a/docs/zh_cn/recommended_topics/deploy/mmdeploy_guide.md +++ b/docs/zh_cn/recommended_topics/deploy/mmdeploy_guide.md @@ -4,7 +4,7 @@ MMDeploy 是 [OpenMMLab](https://openmmlab.com/) 模型部署工具箱,**为各算法库提供统一的部署体验**。基于 MMDeploy,开发者可以轻松从训练 repo 生成指定硬件所需 SDK,省去大量适配时间。 -更多介绍和使用指南见 https://github.com/open-mmlab/mmdeploy/blob/dev-1.x/docs/zh_cn/get_started.md +更多介绍和使用指南见 https://mmdeploy.readthedocs.io/zh_CN/latest/get_started.html ## 算法支持列表 @@ -19,6 +19,14 @@ MMDeploy 是 
[OpenMMLab](https://openmmlab.com/) 模型部署工具箱,**为 ncnn 和其他后端的支持会在后续支持。 +## 安装 + +按照[说明](https://mmdeploy.readthedocs.io/zh_CN/latest/get_started.html)安装 mmdeploy。 + +```{note} +如果安装的是 mmdeploy 预编译包,那么也请通过 ‘git clone https://github.com/open-mmlab/mmdeploy.git –depth=1’ 下载 mmdeploy 源码。因为它包含了部署时所需的 tools 文件夹。 +``` + ## MMYOLO 中部署相关配置说明 所有部署配置文件在 [`configs/deploy`](../../../configs/deploy/) 目录下。 @@ -61,7 +69,7 @@ codebase_config = dict( _base_ = '../../yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py' test_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict( type='LetterResize', scale=_base_.img_scale, @@ -109,7 +117,7 @@ codebase_config = dict( backend_config = dict(type='onnxruntime') ``` -`backend_config` 中指定了部署后端 `type=‘onnxruntime’`,其他信息可参考第三小节。 +`backend_config` 中指定了部署后端 `type='onnxruntime'`,其他信息可参考第三小节。 `TensorRT` 部署 `YOLOv5` 可以使用 [`detection_tensorrt_static-640x640.py`](https://github.com/open-mmlab/mmyolo/blob/main/configs/deploy/detection_tensorrt_static-640x640.py) 配置。 @@ -208,6 +216,8 @@ use_efficientnms = False ### 使用方法 +#### 从源码安装的 MMDeploy + 设置 `MMDeploy` 根目录为环境变量 `MMDEPLOY_DIR` ,例如 `export MMDEPLOY_DIR=/the/root/path/of/MMDeploy` ```shell @@ -239,6 +249,126 @@ python3 ${MMDEPLOY_DIR}/tools/deploy.py \ - `--show` : 是否显示检测的结果。 - `--dump-info` : 是否输出 SDK 信息。 +#### 通过 pip install 安装的 MMDeploy + +假设当前的工作目录为 mmyolo 的根目录, 那么以 [YoloV5](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py) 模型为例,你可以从[此处](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth)下载对应的 checkpoint,并使用以下代码将之转换为 onnx 模型: + +```python +from mmdeploy.apis import torch2onnx +from mmdeploy.backend.sdk.export_info import export2SDK + +img = 'demo/demo.jpg' +work_dir = 'mmdeploy_models/mmyolo/onnx' +save_file = 'end2end.onnx' +deploy_cfg = 'configs/deploy/detection_onnxruntime_dynamic.py' +model_cfg = 'configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py' +model_checkpoint = 'checkpoints/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth' +device = 'cpu' + +# 1. convert model to onnx +torch2onnx(img, work_dir, save_file, deploy_cfg, model_cfg, + model_checkpoint, device) + +# 2. 
extract pipeline info for inference by MMDeploy SDK +export2SDK(deploy_cfg, model_cfg, work_dir, pth=model_checkpoint, + device=device) +``` + +## 模型规范 + +在使用转换后的模型进行推理之前,有必要了解转换结果的结构。 它存放在 `--work-dir` 指定的路路径下。 + +上例中的`mmdeploy_models/mmyolo/onnx`,结构如下: + +``` +mmdeploy_models/mmyolo/onnx +├── deploy.json +├── detail.json +├── end2end.onnx +└── pipeline.json +``` + +重要的是: + +- **end2end.onnx**: 推理引擎文件。可用 ONNX Runtime 推理 +- ***xxx*.json**: mmdeploy SDK 推理所需的 meta 信息 + +整个文件夹被定义为**mmdeploy SDK model**。换言之,**mmdeploy SDK model**既包括推理引擎,也包括推理 meta 信息。 + +## 模型推理 + +### 后端模型推理 + +以上述模型转换后的 `end2end.onnx` 为例,你可以使用如下代码进行推理: + +```python +from mmdeploy.apis.utils import build_task_processor +from mmdeploy.utils import get_input_shape, load_config +import torch + +deploy_cfg = 'configs/deploy/detection_onnxruntime_dynamic.py' +model_cfg = 'configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py' +device = 'cpu' +backend_model = ['mmdeploy_models/mmyolo/onnx/end2end.onnx'] +image = 'demo/demo.jpg' + +# read deploy_cfg and model_cfg +deploy_cfg, model_cfg = load_config(deploy_cfg, model_cfg) + +# build task and backend model +task_processor = build_task_processor(model_cfg, deploy_cfg, device) +model = task_processor.build_backend_model(backend_model) + +# process input image +input_shape = get_input_shape(deploy_cfg) +model_inputs, _ = task_processor.create_input(image, input_shape) + +# do model inference +with torch.no_grad(): + result = model.test_step(model_inputs) + +# visualize results +task_processor.visualize( + image=image, + model=model, + result=result[0], + window_name='visualize', + output_file='work_dir/output_detection.png') +``` + +运行上述代码后,你可以在 `work_dir` 中看到推理的结果图片 `output_detection.png`。 + +### SDK模型推理 + +你也可以参考如下代码,对 SDK model 进行推理: + +```python +from mmdeploy_runtime import Detector +import cv2 + +img = cv2.imread('demo/demo.jpg') + +# create a detector +detector = Detector(model_path='mmdeploy_models/mmyolo/onnx', + device_name='cpu', device_id=0) +# perform inference +bboxes, labels, masks = detector(img) + +# visualize inference result +indices = [i for i in range(len(bboxes))] +for index, bbox, label_id in zip(indices, bboxes, labels): + [left, top, right, bottom], score = bbox[0:4].astype(int), bbox[4] + if score < 0.3: + continue + + cv2.rectangle(img, (left, top), (right, bottom), (0, 255, 0)) + +cv2.imwrite('work_dir/output_detection.png', img) +``` + +除了python API,mmdeploy SDK 还提供了诸如 C、C++、C#、Java等多语言接口。 +你可以参考[样例](https://github.com/open-mmlab/mmdeploy/tree/main/demo)学习其他语言接口的使用方法。 + ## 模型评测 当您将 PyTorch 模型转换为后端支持的模型后,您可能需要验证模型的精度,使用 `${MMDEPLOY_DIR}/tools/test.py` @@ -248,20 +378,19 @@ python3 ${MMDEPLOY_DIR}/tools/test.py \ ${DEPLOY_CFG} \ ${MODEL_CFG} \ --model ${BACKEND_MODEL_FILES} \ - [--out ${OUTPUT_PKL_FILE}] \ - [--format-only] \ - [--metrics ${METRICS}] \ - [--show] \ - [--show-dir ${OUTPUT_IMAGE_DIR}] \ - [--show-score-thr ${SHOW_SCORE_THR}] \ --device ${DEVICE} \ + --work-dir ${WORK_DIR} \ [--cfg-options ${CFG_OPTIONS}] \ - [--metric-options ${METRIC_OPTIONS}] + [--show] \ + [--show-dir ${OUTPUT_IMAGE_DIR}] \ + [--interval ${INTERVAL}] \ + [--wait-time ${WAIT_TIME}] \ [--log2file work_dirs/output.txt] - [--batch-size ${BATCH_SIZE}] [--speed-test] \ [--warmup ${WARM_UP}] \ - [--log-interval ${LOG_INTERVERL}] + [--log-interval ${LOG_INTERVERL}] \ + [--batch-size ${BATCH_SIZE}] \ + [--uri ${URI}] ``` ### 参数描述 @@ -269,19 +398,18 @@ python3 ${MMDEPLOY_DIR}/tools/test.py \ - `deploy_cfg`: 部署配置文件。 - `model_cfg`: MMYOLO 模型配置文件。 - `--model`: 导出的后端模型。 例如, 如果我们导出了 
TensorRT 模型,我们需要传入后缀为 ".engine" 文件路径。 -- `--out`: 保存 pickle 格式的输出结果,仅当您传入这个参数时启用。 -- `--format-only`: 是否格式化输出结果而不进行评估。当您要将结果格式化为特定格式并将其提交到测试服务器时,它很有用。 -- `--metrics`: 用于评估 MMYOLO 中定义的模型的指标,如 COCO 标注格式的 "proposal" 。 -- `--show`: 是否在屏幕上显示评估结果。 -- `--show-dir`: 保存评估结果的目录。(只有给出这个参数才会保存结果)。 -- `--show-score-thr`: 确定是否显示检测边界框的阈值。 - `--device`: 运行模型的设备。请注意,某些后端会限制设备。例如,TensorRT 必须在 cuda 上运行。 +- `--work-dir`: 模型转换、报告生成的路径。 - `--cfg-options`: 传入额外的配置,将会覆盖当前部署配置。 -- `--metric-options`: 用于评估的自定义选项。 xxx=yyy 中的键值对格式,将是 dataset.evaluate() 函数的 kwargs。 +- `--show`: 是否在屏幕上显示评估结果。 +- `--show-dir`: 保存评估结果的目录。(只有给出这个参数才会保存结果)。 +- `--interval`: 屏幕上显示评估结果的间隔。 +- `--wait-time`: 每个窗口的显示时间 - `--log2file`: 将评估结果(和速度)记录到文件中。 -- `--batch-size`: 推理的批量大小,它将覆盖数据配置中的 `samples_per_gpu`。默认为 `1`。请注意,并非所有模型都支持 `batch_size > 1`。 -- `--speed-test`: 是否开启速度测试。 +- `--speed-test`: 是否开启速度测试。 - `--warmup`: 在计算推理时间之前进行预热,需要先开启 `speed-test`。 - `--log-interval`: 每个日志之间的间隔,需要先设置 `speed-test`。 +- `--batch-size`: 推理的批量大小,它将覆盖数据配置中的 `samples_per_gpu`。默认为 `1`。请注意,并非所有模型都支持 `batch_size > 1`。 +- `--uri`: 在边缘设备上推理时的 ipv4 或 ipv6 端口号。 注意:`${MMDEPLOY_DIR}/tools/test.py` 中的其他参数用于速度测试。他们不影响评估。 diff --git a/docs/zh_cn/recommended_topics/deploy/mmdeploy_yolov5.md b/docs/zh_cn/recommended_topics/deploy/mmdeploy_yolov5.md index c48a64062..e035e1764 100644 --- a/docs/zh_cn/recommended_topics/deploy/mmdeploy_yolov5.md +++ b/docs/zh_cn/recommended_topics/deploy/mmdeploy_yolov5.md @@ -28,7 +28,7 @@ _base_ = '../../yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py' test_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict( type='LetterResize', scale=_base_.img_scale, @@ -113,7 +113,7 @@ batch_shapes_cfg = dict( extra_pad_ratio=0.5) test_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict(type='YOLOv5KeepRatioResize', scale=img_scale), dict( type='LetterResize', @@ -251,6 +251,7 @@ python3 ${MMDEPLOY_DIR}/tools/deploy.py \ --work-dir work_dir \ --show \ --device cpu + --dump-info ``` #### TensorRT @@ -264,19 +265,20 @@ python3 ${MMDEPLOY_DIR}/tools/deploy.py \ --work-dir work_dir \ --show \ --device cuda:0 + --dump-info ``` 当您使用上述命令转换模型时,您将会在 `work_dir` 文件夹下发现以下文件: -![image](https://user-images.githubusercontent.com/92794867/199377596-605c3493-c1e0-435d-bc97-2e46846ac87d.png) +![image](https://github.com/open-mmlab/mmdeploy/assets/110151316/760f3f7f-aa23-46cf-987c-717d3490246f) 或者 -![image](https://user-images.githubusercontent.com/92794867/199377848-a771f9c5-6bd6-49a1-9f58-e7e7b96c800f.png) +![image](https://github.com/open-mmlab/mmdeploy/assets/110151316/732bcd9a-fca0-40ba-b5af-540a47eb9c35) -在导出 `onnxruntime`模型后,您将得到图1的三个文件,其中 `end2end.onnx` 表示导出的`onnxruntime`模型。 +在导出 `onnxruntime`模型后,您将得到图1的六个文件,其中 `end2end.onnx` 表示导出的`onnxruntime`模型,`xxx.json` 表示 `MMDeploy SDK` 推理所需要的 meta 信息。 -在导出 `TensorRT`模型后,您将得到图2的四个文件,其中 `end2end.onnx` 表示导出的中间模型,`MMDeploy`利用该模型自动继续转换获得 `end2end.engine` 模型用于 `TensorRT `部署。 +在导出 `TensorRT`模型后,您将得到图2的七个文件,其中 `end2end.onnx` 表示导出的中间模型,`MMDeploy`利用该模型自动继续转换获得 `end2end.engine` 模型用于 `TensorRT `部署,`xxx.json` 表示 `MMDeploy SDK` 推理所需要的 meta 信息。 ## 模型评测 @@ -428,4 +430,143 @@ python3 ${MMDEPLOY_DIR}/tools/profiler.py \ ## 模型推理 -TODO +### 后端模型推理 + +#### ONNXRuntime + +以上述模型转换后的 `end2end.onnx` 为例,您可以使用如下代码进行推理: + +```python +from mmdeploy.apis.utils import build_task_processor +from mmdeploy.utils import get_input_shape, 
load_config +import torch + +deploy_cfg = './configs/deploy/detection_onnxruntime_dynamic.py' +model_cfg = '../mmyolo/configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py' +device = 'cpu' +backend_model = ['./work_dir/end2end.onnx'] +image = '../mmyolo/demo/demo.jpg' + +# read deploy_cfg and model_cfg +deploy_cfg, model_cfg = load_config(deploy_cfg, model_cfg) + +# build task and backend model +task_processor = build_task_processor(model_cfg, deploy_cfg, device) +model = task_processor.build_backend_model(backend_model) + +# process input image +input_shape = get_input_shape(deploy_cfg) +model_inputs, _ = task_processor.create_input(image, input_shape) + +# do model inference +with torch.no_grad(): + result = model.test_step(model_inputs) + +# visualize results +task_processor.visualize( + image=image, + model=model, + result=result[0], + window_name='visualize', + output_file='work_dir/output_detection.png') +``` + +#### TensorRT + +以上述模型转换后的 `end2end.engine` 为例,您可以使用如下代码进行推理: + +```python +from mmdeploy.apis.utils import build_task_processor +from mmdeploy.utils import get_input_shape, load_config +import torch + +deploy_cfg = './configs/deploy/detection_tensorrt_dynamic-192x192-960x960.py' +model_cfg = '../mmyolo/configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py' +device = 'cuda:0' +backend_model = ['./work_dir/end2end.engine'] +image = '../mmyolo/demo/demo.jpg' + +# read deploy_cfg and model_cfg +deploy_cfg, model_cfg = load_config(deploy_cfg, model_cfg) + +# build task and backend model +task_processor = build_task_processor(model_cfg, deploy_cfg, device) +model = task_processor.build_backend_model(backend_model) + +# process input image +input_shape = get_input_shape(deploy_cfg) +model_inputs, _ = task_processor.create_input(image, input_shape) + +# do model inference +with torch.no_grad(): + result = model.test_step(model_inputs) + +# visualize results +task_processor.visualize( + image=image, + model=model, + result=result[0], + window_name='visualize', + output_file='work_dir/output_detection.png') +``` + +### SDK 模型推理 + +#### ONNXRuntime + +以上述模型转换后的 `end2end.onnx` 为例,您可以使用如下代码进行 `SDK` 推理: + +```python +from mmdeploy_runtime import Detector +import cv2 + +img = cv2.imread('../mmyolo/demo/demo.jpg') + +# create a detector +detector = Detector(model_path='work_dir', + device_name='cpu', device_id=0) +# perform inference +bboxes, labels, masks = detector(img) + +# visualize inference result +indices = [i for i in range(len(bboxes))] +for index, bbox, label_id in zip(indices, bboxes, labels): + [left, top, right, bottom], score = bbox[0:4].astype(int), bbox[4] + if score < 0.3: + continue + + cv2.rectangle(img, (left, top), (right, bottom), (0, 255, 0)) + +cv2.imwrite('work_dir/output_detection.png', img) +``` + +#### TensorRT + +以上述模型转换后的 `end2end.engine` 为例,您可以使用如下代码进行 `SDK` 推理: + +```python +from mmdeploy_runtime import Detector +import cv2 + +img = cv2.imread('../mmyolo/demo/demo.jpg') + +# create a detector +detector = Detector(model_path='work_dir', + device_name='cuda', device_id=0) +# perform inference +bboxes, labels, masks = detector(img) + +# visualize inference result +indices = [i for i in range(len(bboxes))] +for index, bbox, label_id in zip(indices, bboxes, labels): + [left, top, right, bottom], score = bbox[0:4].astype(int), bbox[4] + if score < 0.3: + continue + + cv2.rectangle(img, (left, top), (right, bottom), (0, 255, 0)) + +cv2.imwrite('work_dir/output_detection.png', img) +``` + +除了python API,mmdeploy SDK 还提供了诸如 C、C++、C#、Java等多语言接口。 
+你可以参考[样例](https://github.com/open-mmlab/mmdeploy/tree/main/demo)学习其他语言接口的使用方法。 diff --git a/docs/zh_cn/recommended_topics/industry_examples.md b/docs/zh_cn/recommended_topics/industry_examples.md deleted file mode 100644 index 19960ce46..000000000 --- a/docs/zh_cn/recommended_topics/industry_examples.md +++ /dev/null @@ -1 +0,0 @@ -# MMYOLO 产业范例介绍 diff --git a/docs/zh_cn/recommended_topics/training_testing_tricks.md b/docs/zh_cn/recommended_topics/training_testing_tricks.md new file mode 100644 index 000000000..ba67063f0 --- /dev/null +++ b/docs/zh_cn/recommended_topics/training_testing_tricks.md @@ -0,0 +1,303 @@ +# 训练和测试技巧 + +MMYOLO 中已经支持了大部分 YOLO 系列目标检测相关算法。不同算法可能涉及到一些实用技巧。本章节将基于所实现的目标检测算法,详细描述 MMYOLO 中已经支持的常用的训练和测试技巧。 + +## 训练技巧 + +### 提升检测性能 + +#### 1 开启多尺度训练 + +在目标检测领域,多尺度训练是一个非常常用的技巧,但是在 YOLO 中大部分模型的训练输入都是单尺度的 640x640,原因有两个方面: + +1. 单尺度训练速度快。当训练 epoch 在 300 或者 500 的时候训练效率是用户非常关注的,多尺度训练会比较慢 +2. 训练 pipeline 中隐含了多尺度增强,等价于应用了多尺度训练,典型的如 `Mosaic`、`RandomAffine` 和 `Resize` 等,故没有必要再次引入模型输入的多尺度训练 + +在 COCO 数据集上进行了简单实验,如果直接在 YOLOv5 的 DataLoader 输出后再次引入多尺度训练增强实际性能提升非常小,但是这不代表用户自定义数据集微调模式下没有明显增益。如果想在 MMYOLO 中对 YOLO 系列算法开启多尺度训练,可以参考 [多尺度训练文档](../common_usage/ms_training_testing.md) + +#### 2 使用 Mask 标注优化目标检测性能 + +在数据集标注完备例如同时存在边界框和实例分割标注但任务只需要其中部分标注情况下,可以借助完备的数据标注训练单一任务从而提升性能。在目标检测中同样可以借鉴实例分割标注来提升目标检测性能。 以下是 YOLOv8 额外引入实例分割标注优化目标检测结果。 性能增益如下所示: + +
+ +
+ +从上述曲线图可以看出,不同尺度模型都有了不同程度性能提升。需要注意的是 `Mask Refine` 仅仅的是作用在数据增强阶段,对模型其他训练部分不需要任何改动,且不会影响训练速度。具体如下所示: + +
+ +
+ +上述的 Mask 表示实例分割标注发挥关键作用的数据增强变换,将该技巧应用到其他 YOLO 系列中均有不同程度涨点。 + +#### 3 训练后期关闭强增强提升检测性能 + +该策略是在 YOLOX 算法中第一次被提出可以极大的提升检测性能。 论文中指出虽然 Mosaic+MixUp 可以极大的提升目标检测性能,但是它生成的训练图片远远脱离自然图片的真实分布,并且 Mosaic 大量的裁剪操作会带来很多不准确的标注框,所以 YOLOX 提出在最后 15 个 epoch 关掉强增强,转而使用较弱的增强,从而为了让检测器避开不准确标注框的影响,在自然图片的数据分布下完成最终的收敛。 + +该策略已经被应用到了大部分 YOLO 算法中,以 YOLOv8 为例其数据增强 pipeline 如下所示: + +
+ +
+ +不过在何时关闭强增强是一个超参,如果关闭太早则可能没有充分发挥 Mosaic 等强增强效果,如果关闭太晚则由于之前已经过拟合,此时再关闭则没有任何增益。 在 YOLOv8 实验中可以观察到该现象 + +| Backbone | Mask Refine | box AP | Epoch of best mAP | +| :------: | :---------: | :---------: | :---------------: | +| YOLOv8-n | No | 37.2 | 500 | +| YOLOv8-n | Yes | 37.4 (+0.2) | 499 | +| YOLOv8-s | No | 44.2 | 430 | +| YOLOv8-s | Yes | 45.1 (+0.9) | 460 | +| YOLOv8-m | No | 49.8 | 460 | +| YOLOv8-m | Yes | 50.6 (+0.8) | 480 | +| YOLOv8-l | No | 52.1 | 460 | +| YOLOv8-l | Yes | 53.0 (+0.9) | 491 | +| YOLOv8-x | No | 52.7 | 450 | +| YOLOv8-x | Yes | 54.0 (+1.3) | 460 | + +从上表可以看出: + +- 大模型在 COCO 数据集训练 500 epoch 会过拟合,在过拟合情况下再关闭 Mosaic 等强增强效果没有效果 +- 使用 Mask 标注可以缓解过拟合,并且提升性能 + +#### 4 加入纯背景图片抑制误报率 + +对于非开放世界数据集目标检测而言,训练和测试都是在固定类别上进行,一旦应用到没有训练过的类别图片上有可能会产生误报,一个常见的缓解策略是加入一定比例的纯背景图片。 在大部分 YOLO 系列中都是默认开启了加入纯背景图片抑制误报率功能,用户只需要设置 `train_dataloader.dataset.filter_cfg.filter_empty_gt` 为 False 即可,表示将纯背景图片不过滤掉加入训练。 + +#### 5 试试 AdamW 也许效果显著 + +YOLOv5,YOLOv6,YOLOv7 和 YOLOv8 等都是采用了 SGD 优化器,该参数器对参数的设置比较严格,而 AdamW 则正好相反,其对学习率设置等没有那么敏感。因此如果用户在自定义数据集微调可以尝试选择 AdamW 优化器。我们在 YOLOX 中进行了简单尝试,发现在 tiny、s 和 m 尺度模型上将其优化器替换为 AdamW 均有一定程度涨点。 + +| Backbone | Size | Batch Size | RTMDet-Hyp | Box AP | +| :--------: | :--: | :--------: | :--------: | :---------: | +| YOLOX-tiny | 416 | 8xb8 | No | 32.7 | +| YOLOX-tiny | 416 | 8xb32 | Yes | 34.3 (+1.6) | +| YOLOX-s | 640 | 8xb8 | No | 40.7 | +| YOLOX-s | 640 | 8xb32 | Yes | 41.9 (+1.2) | +| YOLOX-m | 640 | 8xb8 | No | 46.9 | +| YOLOX-m | 640 | 8xb32 | Yes | 47.5 (+0.6) | + +具体见 [configs/yolox/README.md](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolox/README.md#--results-and-models)。 + +#### 6 考虑 ignore 场景避免不确定性标注 + +以 CrowdHuman 为例,其是一个拥挤行人检测数据集,下面是一张典型图片: + +
+(图:CrowdHuman 典型图片,其中黄色打叉区域为 iscrowd 标注)
+ +图片来自 [detectron2 issue](https://github.com/facebookresearch/detectron2/issues/1909)。黄色打叉的区域表示 `iscrowd` 标注。原因有两个方面: + +- 这个区域不是真的人,例如海报上的人 +- 该区域过于拥挤,很难标注 + +在该场景下,你不能简单的将这类标注删掉,因为你一旦删掉就表示当做背景区域来训练了,但是其和背景是不一样的,首先海报上的人和真人很像,并且拥挤区域确实有人只是不好标注。如果你简单的将其当做背景训练,那么会造成漏报。最合适的做法应该是把拥挤区域当做忽略区域即该区域的任何输出都直接忽略,不计算任何 Loss,不强迫模型拟合。 + +MMYOLO 在 YOLOv5 上简单快速的验证了 `iscrowd` 标注的作用,性能如下所示: + +| Backbone | ignore_iof_thr | box AP50(CrowDHuman Metric) | MR | JI | +| :------: | :------------: | :-------------------------: | :--: | :---: | +| YOLOv5-s | -1 | 85.79 | 48.7 | 75.33 | +| YOLOv5-s | 0.5 | 86.17 | 48.8 | 75.87 | + +`ignore_iof_thr`为 -1 表示不考虑忽略标签,可以看出性能有一定程度的提升,具体见 [CrowdHuman 结果](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov5/README.md#crowdhuman)。 如果你的自定义数据集上也有上述情况,则建议你考虑 ignore 场景避免不确定性标注。 + +#### 7 使用知识蒸馏 + +知识蒸馏是一个被广泛使用的技巧,可以将大模型性能转移到小模型上从而提升小模型检测性能。 目前 MMYOLO 和 MMRazor 已支持了该功能,并在 RTMDet 上进行了初步验证。 + +| Model | box AP | +| :------------: | :---------: | +| RTMDet-tiny | 41.0 | +| RTMDet-tiny \* | 41.8 (+0.8) | +| RTMDet-s | 44.6 | +| RTMDet-s \* | 45.7 (+1.1) | +| RTMDet-m | 49.3 | +| RTMDet-m \* | 50.2 (+0.9) | +| RTMDet-l | 51.4 | +| RTMDet-l \* | 52.3 (+0.9) | + +星号即为采用了大模型蒸馏的结果,详情见 [Distill RTMDet](https://github.com/open-mmlab/mmyolo/tree/main/configs/rtmdet/distillation)。 + +#### 8 更大的模型用更强的增强参数 + +如果你基于默认配置修改了模型或者替换了骨干网络,那么建议你基于此刻模型大小来缩放数据增强参数。 一般来说更大的模型需要使用更强的增强参数,否则可能无法发挥大模型的效果,反之如果小模型应用了较强的增强则可能会欠拟合。 以 RTMDet 为例,我们可以观察其不同模型大小的数据增强参数 + +
+(图:RTMDet 不同尺度模型所用的数据增强参数对比)
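下面用两组配置字段直观对比大小模型的增强强度。注意 `rtmdet_l_aug`、`rtmdet_tiny_aug` 只是演示用的变量名,其中数值也仅为示例,实际取值请以各尺度 RTMDet 官方配置为准:

```python
# 仅为示例:大模型使用更强的增强,小模型相应减弱
rtmdet_l_aug = dict(  # 较大模型,例如 RTMDet-l
    random_resize_ratio_range=(0.1, 2.0),  # 更大的随机缩放范围
    mosaic_max_cached_images=40,
    mixup_max_cached_images=20)

rtmdet_tiny_aug = dict(  # 较小模型,例如 RTMDet-tiny
    random_resize_ratio_range=(0.5, 2.0),  # 较小的随机缩放范围
    mosaic_max_cached_images=20,
    mixup_max_cached_images=10)
```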
+ +其中 `random_resize_ratio_range` 表示 `RandomResize` 的随机缩放范围,`mosaic_max_cached_images/mixup_max_cached_images`表示 `Mosaic/MixUp` 增强时候缓存的图片个数,可以用于调整增强的强度。 YOLO 系列模型都是遵循同一套参数设置原则。 + +### 加快训练速度 + +#### 1 单尺度训练开启 cudnn_benchmark + +YOLO 系列算法中大部分网络输入图片大小都是固定的即单尺度,此时可以开启 `cudnn_benchmark` 来加快训练速度。该参数主要针对 PyTorch 的 cuDNN 底层库进行设置, 设置这个标志可以让内置的 cuDNN 自动寻找最适合当前配置的高效算法来优化运行效率。如果是多尺度模式开启该标志则会不断的寻找最优算法,反而会拖慢训练速度。 + +在 MMYOLO 中开启 `cudnn_benchmark`,只需要在配置中设置 `env_cfg = dict(cudnn_benchmark=True)` + +#### 2 使用带缓存的 Mosaic 和 MixUp + +如果你的数据增强中应用了 Mosaic 和 MixUp,并且经过排查训练瓶颈来自图片的随机读取,那么建议将常规的 Mosaic 和 MixUp 替换为 RTMDet 中提出的带缓存的版本。 + +| Data Aug | Use cache | ms/100 imgs | +| :------: | :-------: | :---------: | +| Mosaic | No | 87.1 | +| Mosaic | Yes | 24.0 | +| MixUp | No | 19.3 | +| MixUp | Yes | 12.4 | + +Mosaic 和 MixUp 涉及到多张图片的混合,它们的耗时会是普通数据增强的 K 倍(K 为混入图片的数量)。 如在 YOLOv5 中每次做 Mosaic 时, 4 张图片的信息都需要从硬盘中重新加载。 而带缓存的 Mosaic 和 MixUp 只需要重新载入当前的一张图片,其余参与混合增强的图片则从缓存队列中获取,通过牺牲一定内存空间的方式大幅提升了效率。 + +
+(图:data cache 缓存队列示意)
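在配置上,只需在构建 Mosaic/MixUp 时打开缓存开关即可。下面是一个简化示意,`use_cached`、`max_cached_images` 等参数名与取值请以所用版本的 RTMDet 配置为准:

```python
# 仅为示意:开启带缓存的 Mosaic 与 MixUp
img_scale = (640, 640)

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(
        type='Mosaic',
        img_scale=img_scale,
        use_cached=True,       # 开启缓存,参与混合的图片从缓存队列中获取
        max_cached_images=40,  # 缓存队列长度,也可用于调节增强强度
        pad_val=114.0),
    dict(
        type='YOLOXMixUp',
        use_cached=True,
        max_cached_images=20,
        pad_val=114.0),
    dict(type='mmdet.PackDetInputs')
]
```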
+ +如图所示,cache 队列中预先储存了 N 张已加载的图像与标签数据,每一个训练 step 中只需加载一张新的图片及其标签数据并更新到 cache 队列中(cache 队列中的图像可重复,如图中出现两次 img3),同时如果 cache 队列长度超过预设长度,则随机 pop 一张图,当需要进行混合数据增强时,只需要从 cache 中随机选择需要的图像进行拼接等处理,而不需要全部从硬盘中加载,节省了图像加载的时间。 + +### 减少超参 + +YOLOv5 中通过实践提供了一些减少超参数的方法,下面详细说明。 + +#### 1 Loss 权重自适应,少 1 个超参 + +一般来说,对于不同的任务或者不同的类别,可能需要针对性的设置超参,而这通常比较难。YOLOv5 中根据实践提出了一些根据类别数和检测输出层个数来自适应缩放 Loss 权重的方法,如下所示: + +```python +# scaled based on number of detection layers +loss_cls=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=loss_cls_weight * + (num_classes / 80 * 3 / num_det_layers)), +loss_bbox=dict( + type='IoULoss', + iou_mode='ciou', + bbox_format='xywh', + eps=1e-7, + reduction='mean', + loss_weight=loss_bbox_weight * (3 / num_det_layer + return_iou=True), +loss_obj=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=loss_obj_weight * + ((img_scale[0] / 640)**2 * 3 / num_det_layers)), +``` + +`loss_cls` 可以根据自定义类别数和检测层数对 `loss_weight` 进行自适应缩放,`loss_bbox` 可以根据检测层数进行自适应计算,而 `loss_obj` 可以根据输入图片大小和检测层数进行自适应缩放。这种策略可以让用户不用去设置 Loss 权重超参。 +需要说明的是:这个只是经验规则,并不是说是最佳设置组合,只是作为一个参考。 + +#### 2 Weight Decay 和 Loss 输出值基于 Batch Size 自适应,少 2 个超参 + +一般来说,在不同的 `Batch Size` 上进行训练,需要遵循学习率自动缩放规则。但是在各个数据集上验证表明 YOLOv5 实际上在改变 `Batch Size` 时候不缩放学习率也可以取得不错的效果,甚至有时候你缩放了效果还更差。原因就在于代码中存在 `Weight Decay` 和 Loss 输出值基于 `Batch Size` 自适应的技巧。在 YOLOv5 中会基于当前训练的总 `Batch Size` 来缩放 `Weight Decay` 和 Loss 输出值。对应代码为: + +```python +# https://github.com/open-mmlab/mmyolo/blob/dev/mmyolo/engine/optimizers/yolov5_optim_constructor.py#L86 +if 'batch_size_per_gpu' in optimizer_cfg: + batch_size_per_gpu = optimizer_cfg.pop('batch_size_per_gpu') + # No scaling if total_batch_size is less than + # base_total_batch_size, otherwise linear scaling. + total_batch_size = get_world_size() * batch_size_per_gpu + accumulate = max( + round(self.base_total_batch_size / total_batch_size), 1) + scale_factor = total_batch_size * \ + accumulate / self.base_total_batch_size + if scale_factor != 1: + weight_decay *= scale_factor + print_log(f'Scaled weight_decay to {weight_decay}', 'current') +``` + +```python +# https://github.com/open-mmlab/mmyolo/blob/dev/mmyolo/models/dense_heads/yolov5_head.py#L635 + _, world_size = get_dist_info() + return dict( + loss_cls=loss_cls * batch_size * world_size, + loss_obj=loss_obj * batch_size * world_size, + loss_bbox=loss_box * batch_size * world_size) +``` + +在不同的 Batch Size 下 Loss 的权重是不一样大的,Batch Size 越大,Loss 就越大,梯度就越大,我个人猜测这可以等价于 Batch Size 增大时候,学习率线性增加的场合。 +实际上从 YOLOv5 的 [YOLOv5 Study: mAP vs Batch-Size](https://github.com/ultralytics/yolov5/discussions/2452) 中可以发现确实是希望用户在修改 Batch Size 时不需要修改其他参数也可以相近的性能。上述两个策略是一个非常不错的训练技巧。 + +### 减少训练显存 + +如何减少训练显存是一个经常谈论的问题,所涉及的技术也非常多。 MMYOLO 的训练执行器来自 MMEngine,因此如何减少训练显存可以查阅 MMEngine 的文档。 MMEngine 目前支持梯度累加、梯度检查点和大模型训练技术,详情见 +[节省显存](https://mmengine.readthedocs.io/zh_CN/latest/common_usage/save_gpu_memory.html)。 + +## 测试技巧 + +### 推理速度和测试精度的平衡 + +在模型性能测试时候,我们一般是要求 mAP 越高越好,但是在实际应用或者推理时候我们希望在保证低误报率和漏报率情况下模型推理越快越好,或者说测试只关注 mAP 而忽略了后处理和评估速度,而实际落地应用时候会追求速度和精度的平衡。 +在 YOLO 系列中可以通过控制某些参数实现速度和精度平衡,下面以 YOLOv5 为例对其进行详细描述。 + +#### 1 推理时避免一个检测框输出多个类别 + +YOLOv5 在训练分类分支时候采用的是 BCE Loss 即 `use_sigmoid=True`。假设物体类别数是 4,那么分类分支输出的类别数是 4 而不是 5,并且由于使用的是 sigmoid 而非 softmax 预测模式,很可能在某个位置预测出多个满足过滤阈值的检测框,也就是会出现一个预测 bbox 对应多个预测 label 的情况。如下图所示 + +
+(图:multi-label 示例,一个检测框对应多个满足阈值的预测类别)
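用一个简单的数值例子可以直观看到这一现象(其中 logits 为假设的示例值):

```python
# 仅为示意:sigmoid 非互斥预测下,一个框可能有多个类别分数超过过滤阈值
import torch

cls_logits = torch.tensor([2.2, 1.9, -4.0, -5.0])  # 某个预测框 4 个类别的 logits(示例值)
scores = cls_logits.sigmoid()                      # 约为 [0.90, 0.87, 0.018, 0.007]

print((scores > 0.001).sum())  # 4:在 mAP 评测常用的 0.001 阈值下,4 个类别全部保留
print((scores > 0.25).sum())   # 2:即便把阈值提高到 0.25,该框仍对应 2 个 label
print(scores.argmax())         # 若只保留分数最高的类别,则一框只对应一个 label
```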
+ +一般在计算 mAP 时候过滤阈值为 0.001,由于 sigmoid 非竞争性预测模式会导致一个框对应多个 label。这种计算方式可以提高 mAP 计算时候的召回率,但是实际落地应用会不方便。 + +一个常用的办法就是提高过滤阈值,但是如果你不需要出现较多漏报,此时推荐你修改 `multi_label` 参数为 False,其位于配置的 `mode.test_cfg.multi_label` 中,默认值是 True 表示允许一个检测框对应多个 label。 + +#### 2 简化 test pipeline + +注意到 YOLOv5 的 test pipeline 为如下: + +```python +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] +``` + +其使用了两个不同功能的 Resize,目的依然是提高评估时候的 mAP 值。在实际落地应用时候你可以简化该 pipeline,如下所示: + +```python +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='LetterResize', + scale=_base_.img_scale, + allow_scale_up=True, + use_mini_pad=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] +``` + +实际上 YOLOv5 算法在实际应用时候是采用简化的 pipeline,并将 multi_label 设为 False, score_thr 提高为 0.25, iou_threshold 降低为 0.45。 +在 YOLOv5 配置中我们提供了一套 detect 落地时候的配置参数,具体见 [yolov5_s-v61_syncbn-detect_8xb16-300e_coco.py](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov5/yolov5_s-v61_syncbn-detect_8xb16-300e_coco.py)。 + +#### 3 Batch Shape 策略加快测试速度 + +Batch Shape 是 YOLOv5 中提出的可以加快推理的一个测试技巧,其思路是不再强制要求整个测试过程图片都是 640x640,而是可以变尺度测试,只需要保证当前 batch 内的 shape 是一样的就行。这种方式可以减少额外的图片像素填充,从而实现加速推理过程。 +Batch Shape 的具体实现可以参考 [链接](https://github.com/open-mmlab/mmyolo/blob/main/mmyolo/datasets/utils.py#L55)。MMYOLO 中几乎所有算法在测试时候都是默认开启了 Batch Shape 策略。 如果用户想关闭该功能,可以设置 `val_dataloader.dataset.batch_shapes_cfg=None`。 + +在实际落地场景下,因为动态 shape 没有固定 shape 快且高效,所以一般会不采用这个策略。 + +### TTA 提升测试精度 + +TTA 测试时增强是一个万能的涨点技巧,在打比赛时候非常有用。MMYOLO 已经支持了 TTA,只需要在测试时候输入 `--tta` 即可开启。详情见 [TTA 说明](https://github.com/open-mmlab/mmyolo/blob/dev/docs/zh_cn/common_usage/tta.md)。 diff --git a/docs/zh_cn/recommended_topics/visualization.md b/docs/zh_cn/recommended_topics/visualization.md index 8a1b8c6fb..ed4bbf94d 100644 --- a/docs/zh_cn/recommended_topics/visualization.md +++ b/docs/zh_cn/recommended_topics/visualization.md @@ -90,8 +90,7 @@ python demo/featmap_vis_demo.py demo/dog.jpg \ ```python test_pipeline = [ dict( - type='LoadImageFromFile', - file_client_args={{_base_.file_client_args}}), + type='LoadImageFromFile'), dict(type='YOLOv5KeepRatioResize', scale=img_scale), dict( type='LetterResize', @@ -112,7 +111,7 @@ test_pipeline = [ test_pipeline = [ dict( type='LoadImageFromFile', - file_client_args=_base_.file_client_args), + backend_args=_base_.backend_args), dict(type='mmdet.Resize', scale=img_scale, keep_ratio=False), # 这里将 LetterResize 修改成 mmdet.Resize dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), dict( @@ -197,7 +196,7 @@ python demo/featmap_vis_demo.py demo/dog.jpg \ test_pipeline = [ dict( type='LoadImageFromFile', - file_client_args=_base_.file_client_args), + backend_args=_base_.backend_args), dict(type='YOLOv5KeepRatioResize', scale=img_scale), dict( type='LetterResize', @@ -218,7 +217,7 @@ test_pipeline = [ test_pipeline = [ dict( type='LoadImageFromFile', - file_client_args=_base_.file_client_args), + backend_args=_base_.backend_args), dict(type='mmdet.Resize', scale=img_scale, keep_ratio=False), # 这里将 LetterResize 修改成 mmdet.Resize dict(type='LoadAnnotations', with_bbox=True, 
_scope_='mmdet'), dict( diff --git a/docs/zh_cn/tutorials/config.md b/docs/zh_cn/tutorials/config.md index 12c7aafe2..d43a4fceb 100644 --- a/docs/zh_cn/tutorials/config.md +++ b/docs/zh_cn/tutorials/config.md @@ -86,12 +86,10 @@ YOLOv5 的训练与测试的数据流存在一定差异,这里我们分别进 ```python dataset_type = 'CocoDataset' # 数据集类型,这将被用来定义数据集 data_root = 'data/coco/' # 数据的根路径 -file_client_args = dict(backend='disk') # 文件读取后端的配置,默认从硬盘读取 pre_transform = [ # 训练数据读取流程 dict( - type='LoadImageFromFile', # 第 1 个流程,从文件路径里加载图像 - file_client_args=file_client_args), # 文件读取后端的配置,默认从硬盘读取 + type='LoadImageFromFile'), # 第 1 个流程,从文件路径里加载图像 dict(type='LoadAnnotations', # 第 2 个流程,对于当前图像,加载它的注释信息 with_bbox=True) # 是否使用标注框(bounding box),目标检测需要设置为 True ] @@ -156,8 +154,7 @@ YOLOv5 测试阶段采用 [Letter Resize](https://github.com/open-mmlab/mmyolo/b ```python test_pipeline = [ # 测试数据处理流程 dict( - type='LoadImageFromFile', # 第 1 个流程,从文件路径里加载图像 - file_client_args=file_client_args), # 文件读取后端的配置,默认从硬盘读取 + type='LoadImageFromFile'), # 第 1 个流程,从文件路径里加载图像 dict(type='YOLOv5KeepRatioResize', # 第 2 个流程,保持长宽比的图像大小缩放 scale=img_scale), # 图像缩放的目标尺寸 dict( @@ -475,8 +472,7 @@ train_pipeline = [ test_pipeline = [ dict( - type='LoadImageFromFile', - file_client_args={{_base_.file_client_args}}), + type='LoadImageFromFile'), dict(type='YOLOv5KeepRatioResize', scale=img_scale), dict( type='LetterResize', @@ -515,7 +511,6 @@ model = dict( ```python _base_ = '../_base_/default_runtime.py' -file_client_args = {{_base_.file_client_args}} # 变量 file_client_args 等于 _base_ 中定义的 file_client_args pre_transform = _base_.pre_transform # 变量 pre_transform 等于 _base_ 中定义的 pre_transform ``` diff --git a/docs/zh_cn/tutorials/custom_installation.md b/docs/zh_cn/tutorials/custom_installation.md index cdec9ed35..d20d659f6 100644 --- a/docs/zh_cn/tutorials/custom_installation.md +++ b/docs/zh_cn/tutorials/custom_installation.md @@ -77,7 +77,7 @@ pip install "mmcv>=2.0.0rc4" -f https://download.openmmlab.com/mmcv/dist/cu116/t !pip3 install openmim !mim install "mmengine>=0.6.0" !mim install "mmcv>=2.0.0rc4,<2.1.0" -!mim install "mmdet>=3.0.0rc6,<3.1.0" +!mim install "mmdet>=3.0.0,<4.0.0" ``` **步骤 2.** 使用源码安装 MMYOLO: diff --git a/docs/zh_cn/tutorials/faq.md b/docs/zh_cn/tutorials/faq.md index 053cbb32c..71ee01d47 100644 --- a/docs/zh_cn/tutorials/faq.md +++ b/docs/zh_cn/tutorials/faq.md @@ -79,7 +79,7 @@ EasyDeploy 支持的功能目前没有 MMDeploy 多,但是使用上更加简 ## 如何使用多个 MMYOLO 版本进行开发 -推荐你拥有多个 MMYOLO 工程文件夹,例如 mmyolo-v1, mmyolo-v2。 在使用不同版本 MMYOLO 时候,你可以在终端运行前设置 +若你拥有多个 MMYOLO 工程文件夹,例如 mmyolo-v1, mmyolo-v2。 在使用不同版本 MMYOLO 时候,你可以在终端运行前设置 ```shell PYTHONPATH="$(dirname $0)/..":$PYTHONPATH @@ -94,7 +94,7 @@ unset PYTHONPATH ## 训练中保存最好模型 用户可以通过在配置中设置 `default_hooks.checkpoint.save_best` 参数来选择根据什么指标来筛选最优模型。以 `COCO` 数据集检测任务为例, -则 `default_hooks.checkpoint.save_best` 可以选择输入的参数有: +`default_hooks.checkpoint.save_best` 可以选择输入的参数有: 1. `auto` 将会根据验证集中的第一个评价指标作为筛选条件。 2. `coco/bbox_mAP` 将会根据 `bbox_mAP` 作为筛选条件。 @@ -106,6 +106,6 @@ unset PYTHONPATH 此外用户还可以选择筛选的逻辑,通过设置配置中的 `default_hooks.checkpoint.rule` 来选择判断逻辑,如:`default_hooks.checkpoint.rule=greater` 表示指标越大越好。更详细的使用可以参考 [checkpoint_hook](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py) 来修改 -## 如何进行非正方形输入尺寸训练和测试 ? +## 如何进行非正方形输入尺寸训练和测试? 
在 YOLO 系列算法中默认配置基本上都是 640x640 或者 1280x1280 正方形尺度输入训练的。用户如果想进行非正方形尺度训练,你可以修改配置中 `image_scale` 参数,并将其他对应位置进行修改即可。用户可以参考我们提供的 [yolov5_s-v61_fast_1xb12-40e_608x352_cat.py](https://github.com/open-mmlab/mmyolo/tree/dev/configs/yolov5/yolov5_s-v61_fast_1xb12-40e_608x352_cat.py) 配置。 diff --git a/docs/zh_cn/tutorials/rotated_detection.md b/docs/zh_cn/tutorials/rotated_detection.md index b06df9b91..1ee974b10 100644 --- a/docs/zh_cn/tutorials/rotated_detection.md +++ b/docs/zh_cn/tutorials/rotated_detection.md @@ -64,15 +64,13 @@ mmyolo ```python dataset_type = 'YOLOv5DOTADataset' # 数据集类型,这将被用来定义数据集 data_root = 'data/split_ss_dota/' # 数据的根路径 -file_client_args = dict(backend='disk') # 文件读取后端的配置,默认从硬盘读取 angle_version = 'le90' # 角度范围的定义,目前支持 oc, le90 和 le135 train_pipeline = [ # 训练数据读取流程 dict( - type='LoadImageFromFile', # 第 1 个流程,从文件路径里加载图像 - file_client_args=file_client_args), # 文件读取后端的配置,默认从硬盘读取 + type='LoadImageFromFile'), # 第 1 个流程,从文件路径里加载图像 dict(type='LoadAnnotations', # 第 2 个流程,对于当前图像,加载它的注释信息 with_bbox=True, # 是否使用标注框 (bounding box),目标检测需要设置为 True box_type='qbox'), # 指定读取的标注格式,旋转框数据集默认的数据格式为四边形 @@ -122,7 +120,7 @@ RTMDet-R 测试阶段仅采用 Resize 和 Pad,在验证和评测时,都采 ```python val_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=_base_.file_client_args), + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), dict(type='mmdet.Resize', scale=(1024, 1024), keep_ratio=True), dict( type='mmdet.Pad', size=(1024, 1024), @@ -205,8 +203,7 @@ dataset_type='YOLOv5CocoDataset' train_pipeline = [ # 训练数据读取流程 dict( - type='LoadImageFromFile', # 第 1 个流程,从文件路径里加载图像 - file_client_args=file_client_args), # 文件读取后端的配置,默认从硬盘读取 + type='LoadImageFromFile'), # 第 1 个流程,从文件路径里加载图像 dict(type='LoadAnnotations', # 第 2 个流程,对于当前图像,加载它的注释信息 with_bbox=True, # 是否使用标注框 (bounding box),目标检测需要设置为 True with_mask=True, # 读取储存在 segmentation 标注中的多边形标注 diff --git a/docs/zh_cn/tutorials/warning_notes.md b/docs/zh_cn/tutorials/warning_notes.md index d1051ba14..38b65c983 100644 --- a/docs/zh_cn/tutorials/warning_notes.md +++ b/docs/zh_cn/tutorials/warning_notes.md @@ -19,4 +19,4 @@ ## The model and loaded state dict do not match exactly -这个警告是否会影响性能要根据进一步的打印信息来确定。如果是在微调模式下,由于用户自定义类别不一样无法加载 Head 模块的 COCO 预训练,这是一个正常现象,不会影响性能。 +这个警告是否会影响性能要根据进一步的打印信息来确定。如果是在微调模式下,由于用户自定义类别不一样无法加载 Head 模块的 COCO 预训练权重,这是一个正常现象,不会影响性能。 diff --git a/mmyolo/__init__.py b/mmyolo/__init__.py index a7a2f3333..6a0bd5d30 100644 --- a/mmyolo/__init__.py +++ b/mmyolo/__init__.py @@ -10,12 +10,12 @@ mmcv_maximum_version = '2.1.0' mmcv_version = digit_version(mmcv.__version__) -mmengine_minimum_version = '0.6.0' +mmengine_minimum_version = '0.7.1' mmengine_maximum_version = '1.0.0' mmengine_version = digit_version(mmengine.__version__) -mmdet_minimum_version = '3.0.0rc6' -mmdet_maximum_version = '3.1.0' +mmdet_minimum_version = '3.0.0' +mmdet_maximum_version = '4.0.0' mmdet_version = digit_version(mmdet.__version__) diff --git a/mmyolo/datasets/__init__.py b/mmyolo/datasets/__init__.py index b3b6b9719..9db439045 100644 --- a/mmyolo/datasets/__init__.py +++ b/mmyolo/datasets/__init__.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+from .pose_coco import PoseCocoDataset from .transforms import * # noqa: F401,F403 from .utils import BatchShapePolicy, yolov5_collate from .yolov5_coco import YOLOv5CocoDataset @@ -8,5 +9,6 @@ __all__ = [ 'YOLOv5CocoDataset', 'YOLOv5VOCDataset', 'BatchShapePolicy', - 'yolov5_collate', 'YOLOv5CrowdHumanDataset', 'YOLOv5DOTADataset' + 'yolov5_collate', 'YOLOv5CrowdHumanDataset', 'YOLOv5DOTADataset', + 'PoseCocoDataset' ] diff --git a/mmyolo/datasets/pose_coco.py b/mmyolo/datasets/pose_coco.py new file mode 100644 index 000000000..b17f9836a --- /dev/null +++ b/mmyolo/datasets/pose_coco.py @@ -0,0 +1,30 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Any + +from mmengine.dataset import force_full_init + +try: + from mmpose.datasets import CocoDataset as MMPoseCocoDataset +except ImportError: + MMPoseCocoDataset = object + +from ..registry import DATASETS + + +@DATASETS.register_module() +class PoseCocoDataset(MMPoseCocoDataset): + + METAINFO: dict = dict(from_file='configs/_base_/pose/coco.py') + + def __init__(self, *args, **kwargs): + if MMPoseCocoDataset is object: + raise ImportError( + 'Please run "mim install -r requirements/mmpose.txt" ' + 'to install mmpose first for PoseCocoDataset.') + super().__init__(*args, **kwargs) + + @force_full_init + def prepare_data(self, idx) -> Any: + data_info = self.get_data_info(idx) + data_info['dataset'] = self + return self.pipeline(data_info) diff --git a/mmyolo/datasets/transforms/__init__.py b/mmyolo/datasets/transforms/__init__.py index 58f4e6fdb..7cdcf8625 100644 --- a/mmyolo/datasets/transforms/__init__.py +++ b/mmyolo/datasets/transforms/__init__.py @@ -1,8 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. +from .formatting import PackDetInputs from .mix_img_transforms import Mosaic, Mosaic9, YOLOv5MixUp, YOLOXMixUp -from .transforms import (LetterResize, LoadAnnotations, PPYOLOERandomCrop, - PPYOLOERandomDistort, RegularizeRotatedBox, - RemoveDataElement, YOLOv5CopyPaste, +from .transforms import (FilterAnnotations, LetterResize, LoadAnnotations, + Polygon2Mask, PPYOLOERandomCrop, PPYOLOERandomDistort, + RandomAffine, RandomFlip, RegularizeRotatedBox, + RemoveDataElement, Resize, YOLOv5CopyPaste, YOLOv5HSVRandomAug, YOLOv5KeepRatioResize, YOLOv5RandomAffine) @@ -10,5 +12,7 @@ 'YOLOv5KeepRatioResize', 'LetterResize', 'Mosaic', 'YOLOXMixUp', 'YOLOv5MixUp', 'YOLOv5HSVRandomAug', 'LoadAnnotations', 'YOLOv5RandomAffine', 'PPYOLOERandomDistort', 'PPYOLOERandomCrop', - 'Mosaic9', 'YOLOv5CopyPaste', 'RemoveDataElement', 'RegularizeRotatedBox' + 'Mosaic9', 'YOLOv5CopyPaste', 'RemoveDataElement', 'RegularizeRotatedBox', + 'Polygon2Mask', 'PackDetInputs', 'RandomAffine', 'RandomFlip', 'Resize', + 'FilterAnnotations' ] diff --git a/mmyolo/datasets/transforms/formatting.py b/mmyolo/datasets/transforms/formatting.py new file mode 100644 index 000000000..07eb0121e --- /dev/null +++ b/mmyolo/datasets/transforms/formatting.py @@ -0,0 +1,113 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +from mmcv.transforms import to_tensor +from mmdet.datasets.transforms import PackDetInputs as MMDET_PackDetInputs +from mmdet.structures import DetDataSample +from mmdet.structures.bbox import BaseBoxes +from mmengine.structures import InstanceData, PixelData + +from mmyolo.registry import TRANSFORMS + + +@TRANSFORMS.register_module() +class PackDetInputs(MMDET_PackDetInputs): + """Pack the inputs data for the detection / semantic segmentation / + panoptic segmentation. 
+ + Compared to mmdet, we just add the `gt_panoptic_seg` field and logic. + """ + mapping_table = { + 'gt_bboxes': 'bboxes', + 'gt_bboxes_labels': 'labels', + 'gt_masks': 'masks', + 'gt_keypoints': 'keypoints', + 'gt_keypoints_visible': 'keypoints_visible' + } + + def transform(self, results: dict) -> dict: + """Method to pack the input data. + Args: + results (dict): Result dict from the data pipeline. + Returns: + dict: + - 'inputs' (obj:`torch.Tensor`): The forward data of models. + - 'data_sample' (obj:`DetDataSample`): The annotation info of the + sample. + """ + packed_results = dict() + if 'img' in results: + img = results['img'] + if len(img.shape) < 3: + img = np.expand_dims(img, -1) + # To improve the computational speed by by 3-5 times, apply: + # If image is not contiguous, use + # `numpy.transpose()` followed by `numpy.ascontiguousarray()` + # If image is already contiguous, use + # `torch.permute()` followed by `torch.contiguous()` + # Refer to https://github.com/open-mmlab/mmdetection/pull/9533 + # for more details + if not img.flags.c_contiguous: + img = np.ascontiguousarray(img.transpose(2, 0, 1)) + img = to_tensor(img) + else: + img = to_tensor(img).permute(2, 0, 1).contiguous() + + packed_results['inputs'] = img + + if 'gt_ignore_flags' in results: + valid_idx = np.where(results['gt_ignore_flags'] == 0)[0] + ignore_idx = np.where(results['gt_ignore_flags'] == 1)[0] + if 'gt_keypoints' in results: + results['gt_keypoints_visible'] = results[ + 'gt_keypoints'].keypoints_visible + results['gt_keypoints'] = results['gt_keypoints'].keypoints + + data_sample = DetDataSample() + instance_data = InstanceData() + ignore_instance_data = InstanceData() + + for key in self.mapping_table.keys(): + if key not in results: + continue + if key == 'gt_masks' or isinstance(results[key], BaseBoxes): + if 'gt_ignore_flags' in results: + instance_data[ + self.mapping_table[key]] = results[key][valid_idx] + ignore_instance_data[ + self.mapping_table[key]] = results[key][ignore_idx] + else: + instance_data[self.mapping_table[key]] = results[key] + else: + if 'gt_ignore_flags' in results: + instance_data[self.mapping_table[key]] = to_tensor( + results[key][valid_idx]) + ignore_instance_data[self.mapping_table[key]] = to_tensor( + results[key][ignore_idx]) + else: + instance_data[self.mapping_table[key]] = to_tensor( + results[key]) + data_sample.gt_instances = instance_data + data_sample.ignored_instances = ignore_instance_data + + if 'gt_seg_map' in results: + gt_sem_seg_data = dict( + sem_seg=to_tensor(results['gt_seg_map'][None, ...].copy())) + data_sample.gt_sem_seg = PixelData(**gt_sem_seg_data) + + # In order to unify the support for the overlap mask annotations + # i.e. mask overlap annotations in (h,w) format, + # we use the gt_panoptic_seg field to unify the modeling + if 'gt_panoptic_seg' in results: + data_sample.gt_panoptic_seg = PixelData( + pan_seg=results['gt_panoptic_seg']) + + img_meta = {} + for key in self.meta_keys: + assert key in results, f'`{key}` is not found in `results`, ' \ + f'the valid keys are {list(results)}.' + img_meta[key] = results[key] + + data_sample.set_metainfo(img_meta) + packed_results['data_samples'] = data_sample + + return packed_results diff --git a/mmyolo/datasets/transforms/keypoint_structure.py b/mmyolo/datasets/transforms/keypoint_structure.py new file mode 100644 index 000000000..7b8402be9 --- /dev/null +++ b/mmyolo/datasets/transforms/keypoint_structure.py @@ -0,0 +1,248 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from abc import ABCMeta +from copy import deepcopy +from typing import List, Optional, Sequence, Tuple, Type, TypeVar, Union + +import numpy as np +import torch +from torch import Tensor + +DeviceType = Union[str, torch.device] +T = TypeVar('T') +IndexType = Union[slice, int, list, torch.LongTensor, torch.cuda.LongTensor, + torch.BoolTensor, torch.cuda.BoolTensor, np.ndarray] + + +class Keypoints(metaclass=ABCMeta): + """The Keypoints class is for keypoints representation. + + Args: + keypoints (Tensor or np.ndarray): The keypoint data with shape of + (N, K, 2). + keypoints_visible (Tensor or np.ndarray): The visibility of keypoints + with shape of (N, K). + device (str or torch.device, Optional): device of keypoints. + Default to None. + clone (bool): Whether clone ``keypoints`` or not. Defaults to True. + flip_indices (list, Optional): The indices of keypoints when the + images is flipped. Defaults to None. + + Notes: + N: the number of instances. + K: the number of keypoints. + """ + + def __init__(self, + keypoints: Union[Tensor, np.ndarray], + keypoints_visible: Union[Tensor, np.ndarray], + device: Optional[DeviceType] = None, + clone: bool = True, + flip_indices: Optional[List] = None) -> None: + + assert len(keypoints_visible) == len(keypoints) + assert keypoints.ndim == 3 + assert keypoints_visible.ndim == 2 + + keypoints = torch.as_tensor(keypoints) + keypoints_visible = torch.as_tensor(keypoints_visible) + + if device is not None: + keypoints = keypoints.to(device=device) + keypoints_visible = keypoints_visible.to(device=device) + + if clone: + keypoints = keypoints.clone() + keypoints_visible = keypoints_visible.clone() + + self.keypoints = keypoints + self.keypoints_visible = keypoints_visible + self.flip_indices = flip_indices + + def flip_(self, + img_shape: Tuple[int, int], + direction: str = 'horizontal') -> None: + """Flip boxes & kpts horizontally in-place. + + Args: + img_shape (Tuple[int, int]): A tuple of image height and width. + direction (str): Flip direction, options are "horizontal", + "vertical" and "diagonal". Defaults to "horizontal" + """ + assert direction == 'horizontal' + self.keypoints[..., 0] = img_shape[1] - self.keypoints[..., 0] + self.keypoints = self.keypoints[:, self.flip_indices] + self.keypoints_visible = self.keypoints_visible[:, self.flip_indices] + + def translate_(self, distances: Tuple[float, float]) -> None: + """Translate boxes and keypoints in-place. + + Args: + distances (Tuple[float, float]): translate distances. The first + is horizontal distance and the second is vertical distance. + """ + assert len(distances) == 2 + distances = self.keypoints.new_tensor(distances).reshape(1, 1, 2) + self.keypoints = self.keypoints + distances + + def rescale_(self, scale_factor: Tuple[float, float]) -> None: + """Rescale boxes & keypoints w.r.t. rescale_factor in-place. + + Note: + Both ``rescale_`` and ``resize_`` will enlarge or shrink boxes + w.r.t ``scale_facotr``. The difference is that ``resize_`` only + changes the width and the height of boxes, but ``rescale_`` also + rescales the box centers simultaneously. + + Args: + scale_factor (Tuple[float, float]): factors for scaling boxes. + The length should be 2. + """ + assert len(scale_factor) == 2 + + scale_factor = self.keypoints.new_tensor(scale_factor).reshape(1, 1, 2) + self.keypoints = self.keypoints * scale_factor + + def clip_(self, img_shape: Tuple[int, int]) -> None: + """Clip bounding boxes and set invisible keypoints outside the image + boundary in-place. 
+ + Args: + img_shape (Tuple[int, int]): A tuple of image height and width. + """ + + kpt_outside = torch.logical_or( + torch.logical_or(self.keypoints[..., 0] < 0, + self.keypoints[..., 1] < 0), + torch.logical_or(self.keypoints[..., 0] > img_shape[1], + self.keypoints[..., 1] > img_shape[0])) + self.keypoints_visible[kpt_outside] *= 0 + + def project_(self, homography_matrix: Union[Tensor, np.ndarray]) -> None: + """Geometrically transform bounding boxes and keypoints in-place using + a homography matrix. + + Args: + homography_matrix (Tensor or np.ndarray): A 3x3 tensor or ndarray + representing the homography matrix for the transformation. + """ + keypoints = self.keypoints + if isinstance(homography_matrix, np.ndarray): + homography_matrix = keypoints.new_tensor(homography_matrix) + + # Convert keypoints to homogeneous coordinates + keypoints = torch.cat([ + self.keypoints, + self.keypoints.new_ones(*self.keypoints.shape[:-1], 1) + ], + dim=-1) + + # Transpose keypoints for matrix multiplication + keypoints_T = torch.transpose(keypoints, -1, 0).contiguous().flatten(1) + + # Apply homography matrix to corners and keypoints + keypoints_T = torch.matmul(homography_matrix, keypoints_T) + + # Transpose back to original shape + keypoints_T = keypoints_T.reshape(3, self.keypoints.shape[1], -1) + keypoints = torch.transpose(keypoints_T, -1, 0).contiguous() + + # Convert corners and keypoints back to non-homogeneous coordinates + keypoints = keypoints[..., :2] / keypoints[..., 2:3] + + # Convert corners back to bounding boxes and update object attributes + self.keypoints = keypoints + + @classmethod + def cat(cls: Type[T], kps_list: Sequence[T], dim: int = 0) -> T: + """Cancatenates an instance list into one single instance. Similar to + ``torch.cat``. + + Args: + box_list (Sequence[T]): A sequence of instances. + dim (int): The dimension over which the box and keypoint are + concatenated. Defaults to 0. + + Returns: + T: Concatenated instance. + """ + assert isinstance(kps_list, Sequence) + if len(kps_list) == 0: + raise ValueError('kps_list should not be a empty list.') + + assert dim == 0 + assert all(isinstance(keypoints, cls) for keypoints in kps_list) + + th_kpt_list = torch.cat( + [keypoints.keypoints for keypoints in kps_list], dim=dim) + th_kpt_vis_list = torch.cat( + [keypoints.keypoints_visible for keypoints in kps_list], dim=dim) + flip_indices = kps_list[0].flip_indices + return cls( + th_kpt_list, + th_kpt_vis_list, + clone=False, + flip_indices=flip_indices) + + def __getitem__(self: T, index: IndexType) -> T: + """Rewrite getitem to protect the last dimension shape.""" + if isinstance(index, np.ndarray): + index = torch.as_tensor(index, device=self.device) + if isinstance(index, Tensor) and index.dtype == torch.bool: + assert index.dim() < self.keypoints.dim() - 1 + elif isinstance(index, tuple): + assert len(index) < self.keypoints.dim() - 1 + # `Ellipsis`(...) is commonly used in index like [None, ...]. + # When `Ellipsis` is in index, it must be the last item. 
+ if Ellipsis in index: + assert index[-1] is Ellipsis + + keypoints = self.keypoints[index] + keypoints_visible = self.keypoints_visible[index] + if self.keypoints.dim() == 2: + keypoints = keypoints.reshape(1, -1, 2) + keypoints_visible = keypoints_visible.reshape(1, -1) + return type(self)( + keypoints, + keypoints_visible, + flip_indices=self.flip_indices, + clone=False) + + def __repr__(self) -> str: + """Return a strings that describes the object.""" + return self.__class__.__name__ + '(\n' + str(self.keypoints) + ')' + + @property + def num_keypoints(self) -> Tensor: + """Compute the number of visible keypoints for each object.""" + return self.keypoints_visible.sum(dim=1).int() + + def __deepcopy__(self, memo): + """Only clone the tensors when applying deepcopy.""" + cls = self.__class__ + other = cls.__new__(cls) + memo[id(self)] = other + other.keypoints = self.keypoints.clone() + other.keypoints_visible = self.keypoints_visible.clone() + other.flip_indices = deepcopy(self.flip_indices) + return other + + def clone(self: T) -> T: + """Reload ``clone`` for tensors.""" + return type(self)( + self.keypoints, + self.keypoints_visible, + flip_indices=self.flip_indices, + clone=True) + + def to(self: T, *args, **kwargs) -> T: + """Reload ``to`` for tensors.""" + return type(self)( + self.keypoints.to(*args, **kwargs), + self.keypoints_visible.to(*args, **kwargs), + flip_indices=self.flip_indices, + clone=False) + + @property + def device(self) -> torch.device: + """Reload ``device`` from self.tensor.""" + return self.keypoints.device diff --git a/mmyolo/datasets/transforms/mix_img_transforms.py b/mmyolo/datasets/transforms/mix_img_transforms.py index 4a25f6f7e..29e4a4057 100644 --- a/mmyolo/datasets/transforms/mix_img_transforms.py +++ b/mmyolo/datasets/transforms/mix_img_transforms.py @@ -318,7 +318,9 @@ def mix_img_transform(self, results: dict) -> dict: mosaic_bboxes_labels = [] mosaic_ignore_flags = [] mosaic_masks = [] + mosaic_kps = [] with_mask = True if 'gt_masks' in results else False + with_kps = True if 'gt_keypoints' in results else False # self.img_scale is wh format img_scale_w, img_scale_h = self.img_scale @@ -374,7 +376,7 @@ def mix_img_transform(self, results: dict) -> dict: mosaic_ignore_flags.append(gt_ignore_flags_i) if with_mask and results_patch.get('gt_masks', None) is not None: gt_masks_i = results_patch['gt_masks'] - gt_masks_i = gt_masks_i.rescale(float(scale_ratio_i)) + gt_masks_i = gt_masks_i.resize(img_i.shape[:2]) gt_masks_i = gt_masks_i.translate( out_shape=(int(self.img_scale[0] * 2), int(self.img_scale[1] * 2)), @@ -386,6 +388,12 @@ def mix_img_transform(self, results: dict) -> dict: offset=padh, direction='vertical') mosaic_masks.append(gt_masks_i) + if with_kps and results_patch.get('gt_keypoints', + None) is not None: + gt_kps_i = results_patch['gt_keypoints'] + gt_kps_i.rescale_([scale_ratio_i, scale_ratio_i]) + gt_kps_i.translate_([padw, padh]) + mosaic_kps.append(gt_kps_i) mosaic_bboxes = mosaic_bboxes[0].cat(mosaic_bboxes, 0) mosaic_bboxes_labels = np.concatenate(mosaic_bboxes_labels, 0) @@ -396,6 +404,10 @@ def mix_img_transform(self, results: dict) -> dict: if with_mask: mosaic_masks = mosaic_masks[0].cat(mosaic_masks) results['gt_masks'] = mosaic_masks + if with_kps: + mosaic_kps = mosaic_kps[0].cat(mosaic_kps, 0) + mosaic_kps.clip_([2 * img_scale_h, 2 * img_scale_w]) + results['gt_keypoints'] = mosaic_kps else: # remove outside bboxes inside_inds = mosaic_bboxes.is_inside( @@ -406,6 +418,10 @@ def mix_img_transform(self, results: dict) -> 
dict: if with_mask: mosaic_masks = mosaic_masks[0].cat(mosaic_masks)[inside_inds] results['gt_masks'] = mosaic_masks + if with_kps: + mosaic_kps = mosaic_kps[0].cat(mosaic_kps, 0) + mosaic_kps = mosaic_kps[inside_inds] + results['gt_keypoints'] = mosaic_kps results['img'] = mosaic_img results['img_shape'] = mosaic_img.shape @@ -1131,6 +1147,31 @@ def mix_img_transform(self, results: dict) -> dict: mixup_gt_bboxes_labels = mixup_gt_bboxes_labels[inside_inds] mixup_gt_ignore_flags = mixup_gt_ignore_flags[inside_inds] + if 'gt_keypoints' in results: + # adjust kps + retrieve_gt_keypoints = retrieve_results['gt_keypoints'] + retrieve_gt_keypoints.rescale_([scale_ratio, scale_ratio]) + if self.bbox_clip_border: + retrieve_gt_keypoints.clip_([origin_h, origin_w]) + + if is_filp: + retrieve_gt_keypoints.flip_([origin_h, origin_w], + direction='horizontal') + + # filter + cp_retrieve_gt_keypoints = retrieve_gt_keypoints.clone() + cp_retrieve_gt_keypoints.translate_([-x_offset, -y_offset]) + if self.bbox_clip_border: + cp_retrieve_gt_keypoints.clip_([target_h, target_w]) + + # mixup + mixup_gt_keypoints = cp_retrieve_gt_keypoints.cat( + (results['gt_keypoints'], cp_retrieve_gt_keypoints), dim=0) + if not self.bbox_clip_border: + # remove outside bbox + mixup_gt_keypoints = mixup_gt_keypoints[inside_inds] + results['gt_keypoints'] = mixup_gt_keypoints + results['img'] = mixup_img.astype(np.uint8) results['img_shape'] = mixup_img.shape results['gt_bboxes'] = mixup_gt_bboxes diff --git a/mmyolo/datasets/transforms/transforms.py b/mmyolo/datasets/transforms/transforms.py index d5179fba3..8060e9c72 100644 --- a/mmyolo/datasets/transforms/transforms.py +++ b/mmyolo/datasets/transforms/transforms.py @@ -7,16 +7,21 @@ import mmcv import numpy as np import torch +from mmcv.image.geometric import _scale_size from mmcv.transforms import BaseTransform, Compose from mmcv.transforms.utils import cache_randomness +from mmdet.datasets.transforms import FilterAnnotations as FilterDetAnnotations from mmdet.datasets.transforms import LoadAnnotations as MMDET_LoadAnnotations +from mmdet.datasets.transforms import RandomAffine as MMDET_RandomAffine +from mmdet.datasets.transforms import RandomFlip as MMDET_RandomFlip from mmdet.datasets.transforms import Resize as MMDET_Resize from mmdet.structures.bbox import (HorizontalBoxes, autocast_box_type, get_box_type) -from mmdet.structures.mask import PolygonMasks +from mmdet.structures.mask import PolygonMasks, polygon_to_bitmap from numpy import random from mmyolo.registry import TRANSFORMS +from .keypoint_structure import Keypoints # TODO: Waiting for MMCV support TRANSFORMS.register_module(module=Compose, force=True) @@ -99,17 +104,21 @@ def _resize_img(self, results: dict): self.scale) if ratio != 1: - # resize image according to the ratio - image = mmcv.imrescale( + # resize image according to the shape + # NOTE: We are currently testing on COCO that modifying + # this code will not affect the results. + # If you find that it has an effect on your results, + # please feel free to contact us. 
+ image = mmcv.imresize( img=image, - scale=ratio, + size=(int(original_w * ratio), int(original_h * ratio)), interpolation='area' if ratio < 1 else 'bilinear', backend=self.backend) resized_h, resized_w = image.shape[:2] - scale_ratio = resized_h / original_h - - scale_factor = (scale_ratio, scale_ratio) + scale_ratio_h = resized_h / original_h + scale_ratio_w = resized_w / original_w + scale_factor = (scale_ratio_w, scale_ratio_h) results['img'] = image results['img_shape'] = image.shape[:2] @@ -142,6 +151,11 @@ class LetterResize(MMDET_Resize): stretch_only (bool): Whether stretch to the specified size directly. Defaults to False allow_scale_up (bool): Allow scale up when ratio > 1. Defaults to True + half_pad_param (bool): If set to True, left and right pad_param will + be given by dividing padding_h by 2. If set to False, pad_param is + in int format. We recommend setting this to False for object + detection tasks, and True for instance segmentation tasks. + Default to False. """ def __init__(self, @@ -150,6 +164,7 @@ def __init__(self, use_mini_pad: bool = False, stretch_only: bool = False, allow_scale_up: bool = True, + half_pad_param: bool = False, **kwargs): super().__init__(scale=scale, keep_ratio=True, **kwargs) @@ -162,6 +177,7 @@ def __init__(self, self.use_mini_pad = use_mini_pad self.stretch_only = stretch_only self.allow_scale_up = allow_scale_up + self.half_pad_param = half_pad_param def _resize_img(self, results: dict): """Resize images with ``results['scale']``.""" @@ -212,7 +228,8 @@ def _resize_img(self, results: dict): interpolation=self.interpolation, backend=self.backend) - scale_factor = (ratio[1], ratio[0]) # mmcv scale factor is (w, h) + scale_factor = (no_pad_shape[1] / image_shape[1], + no_pad_shape[0] / image_shape[0]) if 'scale_factor' in results: results['scale_factor_origin'] = results['scale_factor'] @@ -246,7 +263,15 @@ def _resize_img(self, results: dict): if 'pad_param' in results: results['pad_param_origin'] = results['pad_param'] * \ np.repeat(ratio, 2) - results['pad_param'] = np.array(padding_list, dtype=np.float32) + + if self.half_pad_param: + results['pad_param'] = np.array( + [padding_h / 2, padding_h / 2, padding_w / 2, padding_w / 2], + dtype=np.float32) + else: + # We found in object detection, using padding list with + # int type can get higher mAP. + results['pad_param'] = np.array(padding_list, dtype=np.float32) def _resize_masks(self, results: dict): """Resize masks with ``results['scale']``""" @@ -370,13 +395,26 @@ def __repr__(self) -> str: class LoadAnnotations(MMDET_LoadAnnotations): """Because the yolo series does not need to consider ignore bboxes for the time being, in order to speed up the pipeline, it can be excluded in - advance.""" + advance. + + Args: + mask2bbox (bool): Whether to use mask annotation to get bbox. + Defaults to False. + poly2mask (bool): Whether to transform the polygons to bitmaps. + Defaults to False. + merge_polygons (bool): Whether to merge polygons into one polygon. + If merged, the storage structure is simpler and training is more + effcient, especially if the mask inside a bbox is divided into + multiple polygons. Defaults to True. + """ def __init__(self, mask2bbox: bool = False, poly2mask: bool = False, - **kwargs) -> None: + merge_polygons: bool = True, + **kwargs): self.mask2bbox = mask2bbox + self.merge_polygons = merge_polygons assert not poly2mask, 'Does not support BitmapMasks considering ' \ 'that bitmap consumes more memory.' 
super().__init__(poly2mask=poly2mask, **kwargs) @@ -402,6 +440,11 @@ def transform(self, results: dict) -> dict: self._update_mask_ignore_data(results) gt_bboxes = results['gt_masks'].get_bboxes(dst_type='hbox') results['gt_bboxes'] = gt_bboxes + elif self.with_keypoints: + self._load_kps(results) + _, box_type_cls = get_box_type(self.box_type) + results['gt_bboxes'] = box_type_cls( + results.get('bbox', []), dtype=torch.float32) else: results = super().transform(results) self._update_mask_ignore_data(results) @@ -485,6 +528,8 @@ def _load_masks(self, results: dict) -> None: # ignore self._mask_ignore_flag.append(0) else: + if len(gt_mask) > 1 and self.merge_polygons: + gt_mask = self.merge_multi_segment(gt_mask) gt_masks.append(gt_mask) gt_ignore_flags.append(instance['ignore_flag']) self._mask_ignore_flag.append(1) @@ -503,6 +548,109 @@ def _load_masks(self, results: dict) -> None: gt_masks = PolygonMasks([mask for mask in gt_masks], h, w) results['gt_masks'] = gt_masks + def merge_multi_segment(self, + gt_masks: List[np.ndarray]) -> List[np.ndarray]: + """Merge multi segments to one list. + + Find the coordinates with min distance between each segment, + then connect these coordinates with one thin line to merge all + segments into one. + Args: + gt_masks(List(np.array)): + original segmentations in coco's json file. + like [segmentation1, segmentation2,...], + each segmentation is a list of coordinates. + Return: + gt_masks(List(np.array)): merged gt_masks + """ + s = [] + segments = [np.array(i).reshape(-1, 2) for i in gt_masks] + idx_list = [[] for _ in range(len(gt_masks))] + + # record the indexes with min distance between each segment + for i in range(1, len(segments)): + idx1, idx2 = self.min_index(segments[i - 1], segments[i]) + idx_list[i - 1].append(idx1) + idx_list[i].append(idx2) + + # use two round to connect all the segments + # first round: first to end, i.e. A->B(partial)->C + # second round: end to first, i.e. C->B(remaining)-A + for k in range(2): + # forward first round + if k == 0: + for i, idx in enumerate(idx_list): + # middle segments have two indexes + # reverse the index of middle segments + if len(idx) == 2 and idx[0] > idx[1]: + idx = idx[::-1] + segments[i] = segments[i][::-1, :] + # add the idx[0] point for connect next segment + segments[i] = np.roll(segments[i], -idx[0], axis=0) + segments[i] = np.concatenate( + [segments[i], segments[i][:1]]) + # deal with the first segment and the last one + if i in [0, len(idx_list) - 1]: + s.append(segments[i]) + # deal with the middle segment + # Note that in the first round, only partial segment + # are appended. + else: + idx = [0, idx[1] - idx[0]] + s.append(segments[i][idx[0]:idx[1] + 1]) + # forward second round + else: + for i in range(len(idx_list) - 1, -1, -1): + # deal with the middle segment + # append the remaining points + if i not in [0, len(idx_list) - 1]: + idx = idx_list[i] + nidx = abs(idx[1] - idx[0]) + s.append(segments[i][nidx:]) + return [np.concatenate(s).reshape(-1, )] + + def min_index(self, arr1: np.ndarray, arr2: np.ndarray) -> Tuple[int, int]: + """Find a pair of indexes with the shortest distance. + + Args: + arr1: (N, 2). + arr2: (M, 2). + Return: + tuple: a pair of indexes. + """ + dis = ((arr1[:, None, :] - arr2[None, :, :])**2).sum(-1) + return np.unravel_index(np.argmin(dis, axis=None), dis.shape) + + def _load_kps(self, results: dict) -> None: + """Private function to load keypoints annotations. + + Args: + results (dict): Result dict from + :class:`mmengine.dataset.BaseDataset`. 
+ + Returns: + dict: The dict contains loaded keypoints annotations. + """ + results['height'] = results['img_shape'][0] + results['width'] = results['img_shape'][1] + num_instances = len(results.get('bbox', [])) + + if num_instances == 0: + results['keypoints'] = np.empty( + (0, len(results['flip_indices']), 2), dtype=np.float32) + results['keypoints_visible'] = np.empty( + (0, len(results['flip_indices'])), dtype=np.int32) + results['category_id'] = [] + + results['gt_keypoints'] = Keypoints( + keypoints=results['keypoints'], + keypoints_visible=results['keypoints_visible'], + flip_indices=results['flip_indices'], + ) + + results['gt_ignore_flags'] = np.array([False] * num_instances) + results['gt_bboxes_labels'] = np.array(results['category_id']) - 1 + def __repr__(self) -> str: repr_str = self.__class__.__name__ repr_str += f'(with_bbox={self.with_bbox}, ' @@ -512,7 +660,7 @@ def __repr__(self) -> str: repr_str += f'mask2bbox={self.mask2bbox}, ' repr_str += f'poly2mask={self.poly2mask}, ' repr_str += f"imdecode_backend='{self.imdecode_backend}', " - repr_str += f'file_client_args={self.file_client_args})' + repr_str += f'backend_args={self.backend_args})' return repr_str @@ -571,7 +719,7 @@ class YOLOv5RandomAffine(BaseTransform): min_area_ratio (float): Threshold of area ratio between original bboxes and wrapped bboxes. If smaller than this value, the box will be removed. Defaults to 0.1. - use_mask_refine (bool): Whether to refine bbox by mask. + use_mask_refine (bool): Whether to refine bbox by mask. Deprecated. max_aspect_ratio (float): Aspect ratio of width and height threshold to filter bboxes. If max(h/w, w/h) larger than this value, the box will be removed. Defaults to 20. @@ -603,6 +751,7 @@ def __init__(self, self.bbox_clip_border = bbox_clip_border self.min_bbox_size = min_bbox_size self.min_area_ratio = min_area_ratio + # The use_mask_refine parameter has been deprecated. self.use_mask_refine = use_mask_refine self.max_aspect_ratio = max_aspect_ratio self.resample_num = resample_num @@ -644,7 +793,8 @@ def transform(self, results: dict) -> dict: num_bboxes = len(bboxes) if num_bboxes: orig_bboxes = bboxes.clone() - if self.use_mask_refine and 'gt_masks' in results: + orig_bboxes.rescale_([scaling_ratio, scaling_ratio]) + if 'gt_masks' in results: # If the dataset has annotations of mask, # the mask will be used to refine bbox. 
gt_masks = results['gt_masks'] @@ -654,10 +804,13 @@ def transform(self, results: dict) -> dict: img_h, img_w) # refine bboxes by masks - bboxes = gt_masks.get_bboxes(dst_type='hbox') + bboxes = self.segment2box(gt_masks, height, width) # filter bboxes outside image valid_index = self.filter_gt_bboxes(orig_bboxes, bboxes).numpy() + if self.bbox_clip_border: + bboxes.clip_([height - 1e-3, width - 1e-3]) + gt_masks = self.clip_polygons(gt_masks, height, width) results['gt_masks'] = gt_masks[valid_index] else: bboxes.project_(warp_matrix) @@ -665,24 +818,88 @@ def transform(self, results: dict) -> dict: bboxes.clip_([height, width]) # filter bboxes - orig_bboxes.rescale_([scaling_ratio, scaling_ratio]) - # Be careful: valid_index must convert to numpy, # otherwise it will raise out of bounds when len(valid_index)=1 valid_index = self.filter_gt_bboxes(orig_bboxes, bboxes).numpy() - if 'gt_masks' in results: - results['gt_masks'] = PolygonMasks( - results['gt_masks'].masks, img_h, img_w) results['gt_bboxes'] = bboxes[valid_index] results['gt_bboxes_labels'] = results['gt_bboxes_labels'][ valid_index] results['gt_ignore_flags'] = results['gt_ignore_flags'][ valid_index] + else: + if 'gt_masks' in results: + results['gt_masks'] = PolygonMasks([], img_h, img_w) return results + def segment2box(self, gt_masks: PolygonMasks, height: int, + width: int) -> HorizontalBoxes: + """ + Convert 1 segment label to 1 box label, applying inside-image + constraint i.e. (xy1, xy2, ...) to (xyxy) + Args: + gt_masks (torch.Tensor): the segment label + width (int): the width of the image. Defaults to 640 + height (int): The height of the image. Defaults to 640 + Returns: + HorizontalBoxes: the clip bboxes from gt_masks. + """ + bboxes = [] + for _, poly_per_obj in enumerate(gt_masks): + # simply use a number that is big enough for comparison with + # coordinates + xy_min = np.array([width * 2, height * 2], dtype=np.float32) + xy_max = np.zeros(2, dtype=np.float32) - 1 + + for p in poly_per_obj: + xy = np.array(p).reshape(-1, 2).astype(np.float32) + x, y = xy.T + inside = (x >= 0) & (y >= 0) & (x <= width) & (y <= height) + x, y = x[inside], y[inside] + if not any(x): + continue + xy = np.stack([x, y], axis=0).T + + xy_min = np.minimum(xy_min, np.min(xy, axis=0)) + xy_max = np.maximum(xy_max, np.max(xy, axis=0)) + if xy_max[0] == -1: + bbox = np.zeros(4, dtype=np.float32) + else: + bbox = np.concatenate([xy_min, xy_max], axis=0) + bboxes.append(bbox) + + return HorizontalBoxes(np.stack(bboxes, axis=0)) + + # TODO: Move to mmdet + def clip_polygons(self, gt_masks: PolygonMasks, height: int, + width: int) -> PolygonMasks: + """Function to clip points of polygons with height and width. + + Args: + gt_masks (PolygonMasks): Annotations of instance segmentation. + height (int): height of clip border. + width (int): width of clip border. + Return: + clipped_masks (PolygonMasks): + Clip annotations of instance segmentation. 
+ """ + if len(gt_masks) == 0: + clipped_masks = PolygonMasks([], height, width) + else: + clipped_masks = [] + for poly_per_obj in gt_masks: + clipped_poly_per_obj = [] + for p in poly_per_obj: + p = p.copy() + p[0::2] = p[0::2].clip(0, width) + p[1::2] = p[1::2].clip(0, height) + clipped_poly_per_obj.append(p) + clipped_masks.append(clipped_poly_per_obj) + clipped_masks = PolygonMasks(clipped_masks, height, width) + return clipped_masks + @staticmethod def warp_poly(poly: np.ndarray, warp_matrix: np.ndarray, img_w: int, img_h: int) -> np.ndarray: @@ -707,10 +924,7 @@ def warp_poly(poly: np.ndarray, warp_matrix: np.ndarray, img_w: int, poly = poly @ warp_matrix.T poly = poly[:, :2] / poly[:, 2:3] - # filter point outside image - x, y = poly.T - valid_ind_point = (x >= 0) & (y >= 0) & (x <= img_w) & (y <= img_h) - return poly[valid_ind_point].reshape(-1) + return poly.reshape(-1) def warp_mask(self, gt_masks: PolygonMasks, warp_matrix: np.ndarray, img_w: int, img_h: int) -> PolygonMasks: @@ -1284,8 +1498,8 @@ def _iou_matrix(self, overlap = np.prod( rightbottom - lefttop, axis=2) * (lefttop < rightbottom).all(axis=2) - area_gt_bbox = np.prod(gt_bbox[:, 2:] - crop_bbox[:, :2], axis=1) - area_crop_bbox = np.prod(gt_bbox[:, 2:] - crop_bbox[:, :2], axis=1) + area_gt_bbox = np.prod(gt_bbox[:, 2:] - gt_bbox[:, :2], axis=1) + area_crop_bbox = np.prod(crop_bbox[:, 2:] - crop_bbox[:, :2], axis=1) area_o = (area_gt_bbox[:, np.newaxis] + area_crop_bbox - overlap) return overlap / (area_o + eps) @@ -1374,7 +1588,7 @@ def transform(self, results: dict) -> Union[dict, None]: if len(results.get('gt_masks', [])) == 0: return results gt_masks = results['gt_masks'] - assert isinstance(gt_masks, PolygonMasks),\ + assert isinstance(gt_masks, PolygonMasks), \ 'only support type of PolygonMasks,' \ ' but get type: %s' % type(gt_masks) gt_bboxes = results['gt_bboxes'] @@ -1555,3 +1769,334 @@ def transform(self, results: dict) -> dict: results['gt_bboxes'] = self.box_type( results['gt_bboxes'].regularize_boxes(self.angle_version)) return results + + +@TRANSFORMS.register_module() +class Polygon2Mask(BaseTransform): + """Polygons to bitmaps in YOLOv5. + + Args: + downsample_ratio (int): Downsample ratio of mask. + mask_overlap (bool): Whether to use maskoverlap in mask process. + When set to True, the implementation here is the same as the + official, with higher training speed. If set to True, all gt masks + will compress into one overlap mask, the value of mask indicates + the index of gt masks. If set to False, one mask is a binary mask. + Default to True. + coco_style (bool): Whether to use coco_style to convert the polygons to + bitmaps. Note that this option is only used to test if there is an + improvement in training speed and we recommend setting it to False. + """ + + def __init__(self, + downsample_ratio: int = 4, + mask_overlap: bool = True, + coco_style: bool = False): + self.downsample_ratio = downsample_ratio + self.mask_overlap = mask_overlap + self.coco_style = coco_style + + def polygon2mask(self, + img_shape: Tuple[int, int], + polygons: np.ndarray, + color: int = 1) -> np.ndarray: + """ + Args: + img_shape (tuple): The image size. + polygons (np.ndarray): [N, M], N is the number of polygons, + M is the number of points(Be divided by 2). + color (int): color in fillPoly. + Return: + np.ndarray: the overlap mask. 
+ """ + nh, nw = (img_shape[0] // self.downsample_ratio, + img_shape[1] // self.downsample_ratio) + if self.coco_style: + # This practice can lead to the loss of small objects + # polygons = polygons.resize((nh, nw)).masks + # polygons = np.asarray(polygons).reshape(-1) + # mask = polygon_to_bitmap([polygons], nh, nw) + + polygons = np.asarray(polygons).reshape(-1) + mask = polygon_to_bitmap([polygons], img_shape[0], + img_shape[1]).astype(np.uint8) + mask = mmcv.imresize(mask, (nw, nh)) + else: + mask = np.zeros(img_shape, dtype=np.uint8) + polygons = np.asarray(polygons) + polygons = polygons.astype(np.int32) + shape = polygons.shape + polygons = polygons.reshape(shape[0], -1, 2) + cv2.fillPoly(mask, polygons, color=color) + # NOTE: fillPoly firstly then resize is trying the keep the same + # way of loss calculation when mask-ratio=1. + mask = mmcv.imresize(mask, (nw, nh)) + return mask + + def polygons2masks(self, + img_shape: Tuple[int, int], + polygons: PolygonMasks, + color: int = 1) -> np.ndarray: + """Return a list of bitmap masks. + + Args: + img_shape (tuple): The image size. + polygons (PolygonMasks): The mask annotations. + color (int): color in fillPoly. + Return: + List[np.ndarray]: the list of masks in bitmaps. + """ + if self.coco_style: + nh, nw = (img_shape[0] // self.downsample_ratio, + img_shape[1] // self.downsample_ratio) + masks = polygons.resize((nh, nw)).to_ndarray() + return masks + else: + masks = [] + for si in range(len(polygons)): + mask = self.polygon2mask(img_shape, polygons[si], color) + masks.append(mask) + return np.array(masks) + + def polygons2masks_overlap( + self, img_shape: Tuple[int, int], + polygons: PolygonMasks) -> Tuple[np.ndarray, np.ndarray]: + """Return a overlap mask and the sorted idx of area. + + Args: + img_shape (tuple): The image size. + polygons (PolygonMasks): The mask annotations. + color (int): color in fillPoly. + Return: + Tuple[np.ndarray, np.ndarray]: + the overlap mask and the sorted idx of area. + """ + masks = np.zeros((img_shape[0] // self.downsample_ratio, + img_shape[1] // self.downsample_ratio), + dtype=np.int32 if len(polygons) > 255 else np.uint8) + areas = [] + ms = [] + for si in range(len(polygons)): + mask = self.polygon2mask(img_shape, polygons[si], color=1) + ms.append(mask) + areas.append(mask.sum()) + areas = np.asarray(areas) + index = np.argsort(-areas) + ms = np.array(ms)[index] + for i in range(len(polygons)): + mask = ms[i] * (i + 1) + masks = masks + mask + masks = np.clip(masks, a_min=0, a_max=i + 1) + return masks, index + + def transform(self, results: dict) -> dict: + gt_masks = results['gt_masks'] + assert isinstance(gt_masks, PolygonMasks) + + if self.mask_overlap: + masks, sorted_idx = self.polygons2masks_overlap( + (gt_masks.height, gt_masks.width), gt_masks) + results['gt_bboxes'] = results['gt_bboxes'][sorted_idx] + results['gt_bboxes_labels'] = results['gt_bboxes_labels'][ + sorted_idx] + + # In this case we put gt_masks in gt_panoptic_seg + results.pop('gt_masks') + results['gt_panoptic_seg'] = torch.from_numpy(masks[None]) + else: + masks = self.polygons2masks((gt_masks.height, gt_masks.width), + gt_masks, + color=1) + masks = torch.from_numpy(masks) + # Consistent logic with mmdet + results['gt_masks'] = masks + return results + + +@TRANSFORMS.register_module() +class FilterAnnotations(FilterDetAnnotations): + """Filter invalid annotations. 
+ + In addition to the conditions checked by ``FilterDetAnnotations``, this + filter adds a new condition requiring instances to have at least one + visible keypoints. + """ + + def __init__(self, by_keypoints: bool = False, **kwargs) -> None: + # TODO: add more filter options + super().__init__(**kwargs) + self.by_keypoints = by_keypoints + + @autocast_box_type() + def transform(self, results: dict) -> Union[dict, None]: + """Transform function to filter annotations. + + Args: + results (dict): Result dict. + Returns: + dict: Updated result dict. + """ + assert 'gt_bboxes' in results + gt_bboxes = results['gt_bboxes'] + if gt_bboxes.shape[0] == 0: + return results + + tests = [] + if self.by_box: + tests.append( + ((gt_bboxes.widths > self.min_gt_bbox_wh[0]) & + (gt_bboxes.heights > self.min_gt_bbox_wh[1])).numpy()) + + if self.by_mask: + assert 'gt_masks' in results + gt_masks = results['gt_masks'] + tests.append(gt_masks.areas >= self.min_gt_mask_area) + + if self.by_keypoints: + assert 'gt_keypoints' in results + num_keypoints = results['gt_keypoints'].num_keypoints + tests.append((num_keypoints > 0).numpy()) + + keep = tests[0] + for t in tests[1:]: + keep = keep & t + + if not keep.any(): + if self.keep_empty: + return None + + keys = ('gt_bboxes', 'gt_bboxes_labels', 'gt_masks', 'gt_ignore_flags', + 'gt_keypoints') + for key in keys: + if key in results: + results[key] = results[key][keep] + + return results + + +# TODO: Check if it can be merged with mmdet.YOLOXHSVRandomAug +@TRANSFORMS.register_module() +class RandomAffine(MMDET_RandomAffine): + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + + @autocast_box_type() + def transform(self, results: dict) -> dict: + img = results['img'] + height = img.shape[0] + self.border[1] * 2 + width = img.shape[1] + self.border[0] * 2 + + warp_matrix = self._get_random_homography_matrix(height, width) + + img = cv2.warpPerspective( + img, + warp_matrix, + dsize=(width, height), + borderValue=self.border_val) + results['img'] = img + results['img_shape'] = img.shape + + bboxes = results['gt_bboxes'] + num_bboxes = len(bboxes) + if num_bboxes: + bboxes.project_(warp_matrix) + if self.bbox_clip_border: + bboxes.clip_([height, width]) + # remove outside bbox + valid_index = bboxes.is_inside([height, width]).numpy() + results['gt_bboxes'] = bboxes[valid_index] + results['gt_bboxes_labels'] = results['gt_bboxes_labels'][ + valid_index] + results['gt_ignore_flags'] = results['gt_ignore_flags'][ + valid_index] + + if 'gt_masks' in results: + raise NotImplementedError('RandomAffine only supports bbox.') + + if 'gt_keypoints' in results: + keypoints = results['gt_keypoints'] + keypoints.project_(warp_matrix) + if self.bbox_clip_border: + keypoints.clip_([height, width]) + results['gt_keypoints'] = keypoints[valid_index] + + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(hue_delta={self.hue_delta}, ' + repr_str += f'saturation_delta={self.saturation_delta}, ' + repr_str += f'value_delta={self.value_delta})' + return repr_str + + +# TODO: Check if it can be merged with mmdet.YOLOXHSVRandomAug +@TRANSFORMS.register_module() +class RandomFlip(MMDET_RandomFlip): + + @autocast_box_type() + def _flip(self, results: dict) -> None: + """Flip images, bounding boxes, and semantic segmentation map.""" + # flip image + results['img'] = mmcv.imflip( + results['img'], direction=results['flip_direction']) + + img_shape = results['img'].shape[:2] + + # flip bboxes + if results.get('gt_bboxes', 
None) is not None: + results['gt_bboxes'].flip_(img_shape, results['flip_direction']) + + # flip keypoints + if results.get('gt_keypoints', None) is not None: + results['gt_keypoints'].flip_(img_shape, results['flip_direction']) + + # flip masks + if results.get('gt_masks', None) is not None: + results['gt_masks'] = results['gt_masks'].flip( + results['flip_direction']) + + # flip segs + if results.get('gt_seg_map', None) is not None: + results['gt_seg_map'] = mmcv.imflip( + results['gt_seg_map'], direction=results['flip_direction']) + + # record homography matrix for flip + self._record_homography_matrix(results) + + +@TRANSFORMS.register_module() +class Resize(MMDET_Resize): + + def _resize_keypoints(self, results: dict) -> None: + """Resize bounding boxes with ``results['scale_factor']``.""" + if results.get('gt_keypoints', None) is not None: + results['gt_keypoints'].rescale_(results['scale_factor']) + if self.clip_object_border: + results['gt_keypoints'].clip_(results['img_shape']) + + @autocast_box_type() + def transform(self, results: dict) -> dict: + """Transform function to resize images, bounding boxes and semantic + segmentation map. + + Args: + results (dict): Result dict from loading pipeline. + Returns: + dict: Resized results, 'img', 'gt_bboxes', 'gt_seg_map', + 'scale', 'scale_factor', 'height', 'width', and 'keep_ratio' keys + are updated in result dict. + """ + if self.scale: + results['scale'] = self.scale + else: + img_shape = results['img'].shape[:2] + results['scale'] = _scale_size(img_shape[::-1], self.scale_factor) + self._resize_img(results) + self._resize_bboxes(results) + self._resize_keypoints(results) + self._resize_masks(results) + self._resize_seg(results) + self._record_homography_matrix(results) + return results diff --git a/mmyolo/datasets/utils.py b/mmyolo/datasets/utils.py index 62fe5484b..efa2ff5ef 100644 --- a/mmyolo/datasets/utils.py +++ b/mmyolo/datasets/utils.py @@ -4,6 +4,7 @@ import numpy as np import torch from mmengine.dataset import COLLATE_FUNCTIONS +from mmengine.dist import get_dist_info from ..registry import TASK_UTILS @@ -20,6 +21,8 @@ def yolov5_collate(data_batch: Sequence, batch_imgs = [] batch_bboxes_labels = [] batch_masks = [] + batch_keyponits = [] + batch_keypoints_visible = [] for i in range(len(data_batch)): datasamples = data_batch[i]['data_samples'] inputs = data_batch[i]['inputs'] @@ -28,14 +31,20 @@ def yolov5_collate(data_batch: Sequence, gt_bboxes = datasamples.gt_instances.bboxes.tensor gt_labels = datasamples.gt_instances.labels if 'masks' in datasamples.gt_instances: - masks = datasamples.gt_instances.masks.to_tensor( - dtype=torch.bool, device=gt_bboxes.device) + masks = datasamples.gt_instances.masks batch_masks.append(masks) + if 'gt_panoptic_seg' in datasamples: + batch_masks.append(datasamples.gt_panoptic_seg.pan_seg) + if 'keypoints' in datasamples.gt_instances: + keypoints = datasamples.gt_instances.keypoints + keypoints_visible = datasamples.gt_instances.keypoints_visible + batch_keyponits.append(keypoints) + batch_keypoints_visible.append(keypoints_visible) + batch_idx = gt_labels.new_full((len(gt_labels), 1), i) bboxes_labels = torch.cat((batch_idx, gt_labels[:, None], gt_bboxes), dim=1) batch_bboxes_labels.append(bboxes_labels) - collated_results = { 'data_samples': { 'bboxes_labels': torch.cat(batch_bboxes_labels, 0) @@ -44,6 +53,12 @@ def yolov5_collate(data_batch: Sequence, if len(batch_masks) > 0: collated_results['data_samples']['masks'] = torch.cat(batch_masks, 0) + if len(batch_keyponits) > 0: + 
collated_results['data_samples']['keypoints'] = torch.cat( + batch_keyponits, 0) + collated_results['data_samples']['keypoints_visible'] = torch.cat( + batch_keypoints_visible, 0) + if use_ms_training: collated_results['inputs'] = batch_imgs else: @@ -70,10 +85,14 @@ def __init__(self, img_size: int = 640, size_divisor: int = 32, extra_pad_ratio: float = 0.5): - self.batch_size = batch_size self.img_size = img_size self.size_divisor = size_divisor self.extra_pad_ratio = extra_pad_ratio + _, world_size = get_dist_info() + # During multi-gpu testing, the batchsize should be multiplied by + # worldsize, so that the number of batches can be calculated correctly. + # The index of batches will affect the calculation of batch shape. + self.batch_size = batch_size * world_size def __call__(self, data_list: List[dict]) -> List[dict]: image_shapes = [] diff --git a/mmyolo/models/backbones/efficient_rep.py b/mmyolo/models/backbones/efficient_rep.py index 691c5b846..32e455f06 100644 --- a/mmyolo/models/backbones/efficient_rep.py +++ b/mmyolo/models/backbones/efficient_rep.py @@ -6,7 +6,7 @@ import torch.nn as nn from mmdet.utils import ConfigType, OptMultiConfig -from mmyolo.models.layers.yolo_bricks import SPPFBottleneck +from mmyolo.models.layers.yolo_bricks import CSPSPPFBottleneck, SPPFBottleneck from mmyolo.registry import MODELS from ..layers import BepC3StageBlock, RepStageBlock from ..utils import make_round @@ -72,6 +72,7 @@ def __init__(self, input_channels: int = 3, out_indices: Tuple[int] = (2, 3, 4), frozen_stages: int = -1, + use_cspsppf: bool = False, norm_cfg: ConfigType = dict( type='BN', momentum=0.03, eps=0.001), act_cfg: ConfigType = dict(type='ReLU', inplace=True), @@ -79,6 +80,7 @@ def __init__(self, block_cfg: ConfigType = dict(type='RepVGGBlock'), init_cfg: OptMultiConfig = None): self.block_cfg = block_cfg + self.use_cspsppf = use_cspsppf super().__init__( self.arch_settings[arch], deepen_factor, @@ -145,6 +147,13 @@ def build_stage_layer(self, stage_idx: int, setting: list) -> list: kernel_sizes=5, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) + if self.use_cspsppf: + spp = CSPSPPFBottleneck( + in_channels=out_channels, + out_channels=out_channels, + kernel_sizes=5, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) stage.append(spp) return stage @@ -222,6 +231,7 @@ def __init__(self, hidden_ratio: float = 0.5, out_indices: Tuple[int] = (2, 3, 4), frozen_stages: int = -1, + use_cspsppf: bool = False, norm_cfg: ConfigType = dict( type='BN', momentum=0.03, eps=0.001), act_cfg: ConfigType = dict(type='SiLU', inplace=True), @@ -229,6 +239,7 @@ def __init__(self, block_cfg: ConfigType = dict(type='ConvWrapper'), init_cfg: OptMultiConfig = None): self.hidden_ratio = hidden_ratio + self.use_cspsppf = use_cspsppf super().__init__( arch=arch, deepen_factor=deepen_factor, @@ -283,5 +294,12 @@ def build_stage_layer(self, stage_idx: int, setting: list) -> list: kernel_sizes=5, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) + if self.use_cspsppf: + spp = CSPSPPFBottleneck( + in_channels=out_channels, + out_channels=out_channels, + kernel_sizes=5, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) stage.append(spp) return stage diff --git a/mmyolo/models/data_preprocessors/data_preprocessor.py b/mmyolo/models/data_preprocessors/data_preprocessor.py index f09fd8e74..a29b90844 100644 --- a/mmyolo/models/data_preprocessors/data_preprocessor.py +++ b/mmyolo/models/data_preprocessors/data_preprocessor.py @@ -49,6 +49,10 @@ def forward(self, inputs: Tensor, data_samples: dict) -> Tensor and dict: 
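For orientation, the fast-training collate above flattens per-image annotations into batch-level tensors: each row of bboxes_labels is laid out as (batch_idx, label, bbox), and keypoints, when present, are concatenated along the instance dimension. A rough sketch of the resulting shapes with toy tensors (17 keypoints assumed):

import torch

# Image 0 has two instances, image 1 has one (toy values).
samples = [
    dict(bboxes=torch.rand(2, 4), labels=torch.tensor([3., 7.]), kpts=torch.rand(2, 17, 2)),
    dict(bboxes=torch.rand(1, 4), labels=torch.tensor([0.]), kpts=torch.rand(1, 17, 2)),
]

rows, kpts = [], []
for i, s in enumerate(samples):
    batch_idx = s['labels'].new_full((len(s['labels']), 1), i)
    rows.append(torch.cat((batch_idx, s['labels'][:, None], s['bboxes']), dim=1))
    kpts.append(s['kpts'])

bboxes_labels = torch.cat(rows, 0)    # (3, 6): batch_idx, label, x1, y1, x2, y2
keypoints = torch.cat(kpts, 0)        # (3, 17, 2)
print(bboxes_labels.shape, keypoints.shape)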
data_samples['bboxes_labels'][:, 2::2] *= scale_x data_samples['bboxes_labels'][:, 3::2] *= scale_y + if 'keypoints' in data_samples: + data_samples['keypoints'][..., 0] *= scale_x + data_samples['keypoints'][..., 1] *= scale_y + message_hub = MessageHub.get_current_instance() if (message_hub.get_info('iter') + 1) % self._interval == 0: self._input_size = self._get_random_size( @@ -102,6 +106,10 @@ def forward(self, data: dict, training: bool = False) -> dict: } if 'masks' in data_samples: data_samples_output['masks'] = data_samples['masks'] + if 'keypoints' in data_samples: + data_samples_output['keypoints'] = data_samples['keypoints'] + data_samples_output['keypoints_visible'] = data_samples[ + 'keypoints_visible'] return {'inputs': inputs, 'data_samples': data_samples_output} diff --git a/mmyolo/models/dense_heads/__init__.py b/mmyolo/models/dense_heads/__init__.py index a95abd611..90587c3fb 100644 --- a/mmyolo/models/dense_heads/__init__.py +++ b/mmyolo/models/dense_heads/__init__.py @@ -5,10 +5,12 @@ from .rtmdet_rotated_head import (RTMDetRotatedHead, RTMDetRotatedSepBNHeadModule) from .yolov5_head import YOLOv5Head, YOLOv5HeadModule +from .yolov5_ins_head import YOLOv5InsHead, YOLOv5InsHeadModule from .yolov6_head import YOLOv6Head, YOLOv6HeadModule from .yolov7_head import YOLOv7Head, YOLOv7HeadModule, YOLOv7p6HeadModule from .yolov8_head import YOLOv8Head, YOLOv8HeadModule from .yolox_head import YOLOXHead, YOLOXHeadModule +from .yolox_pose_head import YOLOXPoseHead, YOLOXPoseHeadModule __all__ = [ 'YOLOv5Head', 'YOLOv6Head', 'YOLOXHead', 'YOLOv5HeadModule', @@ -16,5 +18,6 @@ 'RTMDetSepBNHeadModule', 'YOLOv7Head', 'PPYOLOEHead', 'PPYOLOEHeadModule', 'YOLOv7HeadModule', 'YOLOv7p6HeadModule', 'YOLOv8Head', 'YOLOv8HeadModule', 'RTMDetRotatedHead', 'RTMDetRotatedSepBNHeadModule', 'RTMDetInsSepBNHead', - 'RTMDetInsSepBNHeadModule' + 'RTMDetInsSepBNHeadModule', 'YOLOv5InsHead', 'YOLOv5InsHeadModule', + 'YOLOXPoseHead', 'YOLOXPoseHeadModule' ] diff --git a/mmyolo/models/dense_heads/ppyoloe_head.py b/mmyolo/models/dense_heads/ppyoloe_head.py index 72d820041..f46898767 100644 --- a/mmyolo/models/dense_heads/ppyoloe_head.py +++ b/mmyolo/models/dense_heads/ppyoloe_head.py @@ -106,8 +106,7 @@ def _init_layers(self): nn.Conv2d(in_channel, 4 * (self.reg_max + 1), 3, padding=1)) # init proj - proj = torch.linspace(0, self.reg_max, self.reg_max + 1).view( - [1, self.reg_max + 1, 1, 1]) + proj = torch.arange(self.reg_max + 1, dtype=torch.float) self.register_buffer('proj', proj, persistent=False) def forward(self, x: Tuple[Tensor]) -> Tensor: @@ -130,16 +129,17 @@ def forward_single(self, x: Tensor, cls_stem: nn.ModuleList, reg_pred: nn.ModuleList) -> Tensor: """Forward feature of a single scale level.""" b, _, h, w = x.shape - hw = h * w avg_feat = F.adaptive_avg_pool2d(x, (1, 1)) cls_logit = cls_pred(cls_stem(x, avg_feat) + x) bbox_dist_preds = reg_pred(reg_stem(x, avg_feat)) - # TODO: Test whether use matmul instead of conv can speed up training. 
- bbox_dist_preds = bbox_dist_preds.reshape( - [-1, 4, self.reg_max + 1, hw]).permute(0, 2, 3, 1) - - bbox_preds = F.conv2d(F.softmax(bbox_dist_preds, dim=1), self.proj) - + if self.reg_max > 1: + bbox_dist_preds = bbox_dist_preds.reshape( + [-1, 4, self.reg_max + 1, h * w]).permute(0, 3, 1, 2) + bbox_preds = bbox_dist_preds.softmax(3).matmul( + self.proj.view([-1, 1])).squeeze(-1) + bbox_preds = bbox_preds.transpose(1, 2).reshape(b, -1, h, w) + else: + bbox_preds = bbox_dist_preds if self.training: return cls_logit, bbox_preds, bbox_dist_preds else: diff --git a/mmyolo/models/dense_heads/yolov5_head.py b/mmyolo/models/dense_heads/yolov5_head.py index c49d08518..fb24617fc 100644 --- a/mmyolo/models/dense_heads/yolov5_head.py +++ b/mmyolo/models/dense_heads/yolov5_head.py @@ -95,7 +95,12 @@ def init_weights(self): b = mi.bias.data.view(self.num_base_priors, -1) # obj (8 objects per 640 image) b.data[:, 4] += math.log(8 / (640 / s)**2) - b.data[:, 5:] += math.log(0.6 / (self.num_classes - 0.999999)) + # NOTE: The following initialization can only be performed on the + # bias of the category, if the following initialization is + # performed on the bias of mask coefficient, + # there will be a significant decrease in mask AP. + b.data[:, 5:5 + self.num_classes] += math.log( + 0.6 / (self.num_classes - 0.999999)) mi.bias.data = b.view(-1) diff --git a/mmyolo/models/dense_heads/yolov5_ins_head.py b/mmyolo/models/dense_heads/yolov5_ins_head.py new file mode 100644 index 000000000..df94f422e --- /dev/null +++ b/mmyolo/models/dense_heads/yolov5_ins_head.py @@ -0,0 +1,740 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from typing import List, Optional, Sequence, Tuple, Union + +import mmcv +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmdet.models.utils import filter_scores_and_topk, multi_apply +from mmdet.structures.bbox import bbox_cxcywh_to_xyxy +from mmdet.utils import ConfigType, OptInstanceList +from mmengine.config import ConfigDict +from mmengine.dist import get_dist_info +from mmengine.model import BaseModule +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS +from ..utils import make_divisible +from .yolov5_head import YOLOv5Head, YOLOv5HeadModule + + +class ProtoModule(BaseModule): + """Mask Proto module for segmentation models of YOLOv5. + + Args: + in_channels (int): Number of channels in the input feature map. + middle_channels (int): Number of channels in the middle feature map. + mask_channels (int): Number of channels in the output mask feature + map. This is the channel count of the mask. + norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization + layer. Defaults to ``dict(type='BN', momentum=0.03, eps=0.001)``. + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Default: dict(type='SiLU', inplace=True). 
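With the new proj buffer, the head decodes each box side as the expectation of a discrete distribution over reg_max + 1 offsets instead of running a 1x1 conv over the softmax. A minimal sketch of that decoding outside the head, assuming reg_max = 16 for illustration:

import torch
import torch.nn.functional as F

b, h, w, reg_max = 2, 20, 20, 16
proj = torch.arange(reg_max + 1, dtype=torch.float)          # bin values 0..reg_max

bbox_dist_preds = torch.randn(b, 4 * (reg_max + 1), h, w)    # raw regression output
dist = bbox_dist_preds.reshape(b, 4, reg_max + 1, h * w).permute(0, 3, 1, 2)
# expectation over each per-side distribution -> one distance per box side
bbox_preds = F.softmax(dist, dim=3).matmul(proj.view(-1, 1)).squeeze(-1)
bbox_preds = bbox_preds.transpose(1, 2).reshape(b, 4, h, w)
print(bbox_preds.shape)   # torch.Size([2, 4, 20, 20])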
+ """ + + def __init__(self, + *args, + in_channels: int = 32, + middle_channels: int = 256, + mask_channels: int = 32, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + **kwargs): + super().__init__(*args, **kwargs) + self.conv1 = ConvModule( + in_channels, + middle_channels, + kernel_size=3, + padding=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.upsample = nn.Upsample(scale_factor=2, mode='nearest') + self.conv2 = ConvModule( + middle_channels, + middle_channels, + kernel_size=3, + padding=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv3 = ConvModule( + middle_channels, + mask_channels, + kernel_size=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x: Tensor) -> Tensor: + return self.conv3(self.conv2(self.upsample(self.conv1(x)))) + + +@MODELS.register_module() +class YOLOv5InsHeadModule(YOLOv5HeadModule): + """Detection and Instance Segmentation Head of YOLOv5. + + Args: + num_classes (int): Number of categories excluding the background + category. + mask_channels (int): Number of channels in the mask feature map. + This is the channel count of the mask. + proto_channels (int): Number of channels in the proto feature map. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization + layer. Defaults to ``dict(type='BN', momentum=0.03, eps=0.001)``. + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Default: dict(type='SiLU', inplace=True). + """ + + def __init__(self, + *args, + num_classes: int, + mask_channels: int = 32, + proto_channels: int = 256, + widen_factor: float = 1.0, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + **kwargs): + self.mask_channels = mask_channels + self.num_out_attrib_with_proto = 5 + num_classes + mask_channels + self.proto_channels = make_divisible(proto_channels, widen_factor) + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + super().__init__( + *args, + num_classes=num_classes, + widen_factor=widen_factor, + **kwargs) + + def _init_layers(self): + """initialize conv layers in YOLOv5 Ins head.""" + self.convs_pred = nn.ModuleList() + for i in range(self.num_levels): + conv_pred = nn.Conv2d( + self.in_channels[i], + self.num_base_priors * self.num_out_attrib_with_proto, 1) + self.convs_pred.append(conv_pred) + + self.proto_pred = ProtoModule( + in_channels=self.in_channels[0], + middle_channels=self.proto_channels, + mask_channels=self.mask_channels, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + Returns: + Tuple[List]: A tuple of multi-level classification scores, bbox + predictions, objectnesses, and mask predictions. 
+ """ + assert len(x) == self.num_levels + cls_scores, bbox_preds, objectnesses, coeff_preds = multi_apply( + self.forward_single, x, self.convs_pred) + mask_protos = self.proto_pred(x[0]) + return cls_scores, bbox_preds, objectnesses, coeff_preds, mask_protos + + def forward_single( + self, x: Tensor, + convs_pred: nn.Module) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + """Forward feature of a single scale level.""" + + pred_map = convs_pred(x) + bs, _, ny, nx = pred_map.shape + pred_map = pred_map.view(bs, self.num_base_priors, + self.num_out_attrib_with_proto, ny, nx) + + cls_score = pred_map[:, :, 5:self.num_classes + 5, + ...].reshape(bs, -1, ny, nx) + bbox_pred = pred_map[:, :, :4, ...].reshape(bs, -1, ny, nx) + objectness = pred_map[:, :, 4:5, ...].reshape(bs, -1, ny, nx) + coeff_pred = pred_map[:, :, self.num_classes + 5:, + ...].reshape(bs, -1, ny, nx) + + return cls_score, bbox_pred, objectness, coeff_pred + + +@MODELS.register_module() +class YOLOv5InsHead(YOLOv5Head): + """YOLOv5 Instance Segmentation and Detection head. + + Args: + mask_overlap(bool): Defaults to True. + loss_mask (:obj:`ConfigDict` or dict): Config of mask loss. + loss_mask_weight (float): The weight of mask loss. + """ + + def __init__(self, + *args, + mask_overlap: bool = True, + loss_mask: ConfigType = dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='none'), + loss_mask_weight=0.05, + **kwargs): + super().__init__(*args, **kwargs) + self.mask_overlap = mask_overlap + self.loss_mask: nn.Module = MODELS.build(loss_mask) + self.loss_mask_weight = loss_mask_weight + + def loss(self, x: Tuple[Tensor], batch_data_samples: Union[list, + dict]) -> dict: + """Perform forward propagation and loss calculation of the detection + head on the features of the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + batch_data_samples (List[:obj:`DetDataSample`], dict): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + + Returns: + dict: A dictionary of loss components. + """ + + if isinstance(batch_data_samples, list): + # TODO: support non-fast version ins segmention + raise NotImplementedError + else: + outs = self(x) + # Fast version + loss_inputs = outs + (batch_data_samples['bboxes_labels'], + batch_data_samples['masks'], + batch_data_samples['img_metas']) + losses = self.loss_by_feat(*loss_inputs) + + return losses + + def loss_by_feat( + self, + cls_scores: Sequence[Tensor], + bbox_preds: Sequence[Tensor], + objectnesses: Sequence[Tensor], + coeff_preds: Sequence[Tensor], + proto_preds: Tensor, + batch_gt_instances: Sequence[InstanceData], + batch_gt_masks: Sequence[Tensor], + batch_img_metas: Sequence[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + objectnesses (Sequence[Tensor]): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + coeff_preds (Sequence[Tensor]): Mask coefficient for each scale + level, each is a 4D-tensor, the channel number is + num_priors * mask_channels. 
+ proto_preds (Tensor): Mask prototype features extracted from the + mask head, has shape (batch_size, mask_channels, H, W). + batch_gt_instances (Sequence[InstanceData]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_gt_masks (Sequence[Tensor]): Batch of gt_mask. + batch_img_metas (Sequence[dict]): Meta information of each image, + e.g., image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + Returns: + dict[str, Tensor]: A dictionary of losses. + """ + # 1. Convert gt to norm format + batch_targets_normed = self._convert_gt_to_norm_format( + batch_gt_instances, batch_img_metas) + + device = cls_scores[0].device + loss_cls = torch.zeros(1, device=device) + loss_box = torch.zeros(1, device=device) + loss_obj = torch.zeros(1, device=device) + loss_mask = torch.zeros(1, device=device) + scaled_factor = torch.ones(8, device=device) + + for i in range(self.num_levels): + batch_size, _, h, w = bbox_preds[i].shape + target_obj = torch.zeros_like(objectnesses[i]) + + # empty gt bboxes + if batch_targets_normed.shape[1] == 0: + loss_box += bbox_preds[i].sum() * 0 + loss_cls += cls_scores[i].sum() * 0 + loss_obj += self.loss_obj( + objectnesses[i], target_obj) * self.obj_level_weights[i] + loss_mask += coeff_preds[i].sum() * 0 + continue + + priors_base_sizes_i = self.priors_base_sizes[i] + # feature map scale whwh + scaled_factor[2:6] = torch.tensor( + bbox_preds[i].shape)[[3, 2, 3, 2]] + # Scale batch_targets from range 0-1 to range 0-features_maps size. + # (num_base_priors, num_bboxes, 8) + batch_targets_scaled = batch_targets_normed * scaled_factor + + # 2. Shape match + wh_ratio = batch_targets_scaled[..., + 4:6] / priors_base_sizes_i[:, None] + match_inds = torch.max( + wh_ratio, 1 / wh_ratio).max(2)[0] < self.prior_match_thr + batch_targets_scaled = batch_targets_scaled[match_inds] + + # no gt bbox matches anchor + if batch_targets_scaled.shape[0] == 0: + loss_box += bbox_preds[i].sum() * 0 + loss_cls += cls_scores[i].sum() * 0 + loss_obj += self.loss_obj( + objectnesses[i], target_obj) * self.obj_level_weights[i] + loss_mask += coeff_preds[i].sum() * 0 + continue + + # 3. Positive samples with additional neighbors + + # check the left, up, right, bottom sides of the + # targets grid, and determine whether assigned + # them as positive samples as well. 
+ batch_targets_cxcy = batch_targets_scaled[:, 2:4] + grid_xy = scaled_factor[[2, 3]] - batch_targets_cxcy + left, up = ((batch_targets_cxcy % 1 < self.near_neighbor_thr) & + (batch_targets_cxcy > 1)).T + right, bottom = ((grid_xy % 1 < self.near_neighbor_thr) & + (grid_xy > 1)).T + offset_inds = torch.stack( + (torch.ones_like(left), left, up, right, bottom)) + + batch_targets_scaled = batch_targets_scaled.repeat( + (5, 1, 1))[offset_inds] + retained_offsets = self.grid_offset.repeat(1, offset_inds.shape[1], + 1)[offset_inds] + + # prepare pred results and positive sample indexes to + # calculate class loss and bbox lo + _chunk_targets = batch_targets_scaled.chunk(4, 1) + img_class_inds, grid_xy, grid_wh,\ + priors_targets_inds = _chunk_targets + (priors_inds, targets_inds) = priors_targets_inds.long().T + (img_inds, class_inds) = img_class_inds.long().T + + grid_xy_long = (grid_xy - + retained_offsets * self.near_neighbor_thr).long() + grid_x_inds, grid_y_inds = grid_xy_long.T + bboxes_targets = torch.cat((grid_xy - grid_xy_long, grid_wh), 1) + + # 4. Calculate loss + # bbox loss + retained_bbox_pred = bbox_preds[i].reshape( + batch_size, self.num_base_priors, -1, h, + w)[img_inds, priors_inds, :, grid_y_inds, grid_x_inds] + priors_base_sizes_i = priors_base_sizes_i[priors_inds] + decoded_bbox_pred = self._decode_bbox_to_xywh( + retained_bbox_pred, priors_base_sizes_i) + loss_box_i, iou = self.loss_bbox(decoded_bbox_pred, bboxes_targets) + loss_box += loss_box_i + + # obj loss + iou = iou.detach().clamp(0) + target_obj[img_inds, priors_inds, grid_y_inds, + grid_x_inds] = iou.type(target_obj.dtype) + loss_obj += self.loss_obj(objectnesses[i], + target_obj) * self.obj_level_weights[i] + + # cls loss + if self.num_classes > 1: + pred_cls_scores = cls_scores[i].reshape( + batch_size, self.num_base_priors, -1, h, + w)[img_inds, priors_inds, :, grid_y_inds, grid_x_inds] + + target_class = torch.full_like(pred_cls_scores, 0.) + target_class[range(batch_targets_scaled.shape[0]), + class_inds] = 1. 
+ loss_cls += self.loss_cls(pred_cls_scores, target_class) + else: + loss_cls += cls_scores[i].sum() * 0 + + # mask regression + retained_coeff_preds = coeff_preds[i].reshape( + batch_size, self.num_base_priors, -1, h, + w)[img_inds, priors_inds, :, grid_y_inds, grid_x_inds] + + _, c, mask_h, mask_w = proto_preds.shape + if batch_gt_masks.shape[-2:] != (mask_h, mask_w): + batch_gt_masks = F.interpolate( + batch_gt_masks[None], (mask_h, mask_w), mode='nearest')[0] + + xywh_normed = batch_targets_scaled[:, 2:6] / scaled_factor[2:6] + area_normed = xywh_normed[:, 2:].prod(1) + xywh_scaled = xywh_normed * torch.tensor( + proto_preds.shape, device=device)[[3, 2, 3, 2]] + xyxy_scaled = bbox_cxcywh_to_xyxy(xywh_scaled) + + for bs in range(batch_size): + match_inds = (img_inds == bs) # matching index + if not match_inds.any(): + continue + + if self.mask_overlap: + mask_gti = torch.where( + batch_gt_masks[bs][None] == + targets_inds[match_inds].view(-1, 1, 1), 1.0, 0.0) + else: + mask_gti = batch_gt_masks[targets_inds][match_inds] + + mask_preds = (retained_coeff_preds[match_inds] + @ proto_preds[bs].view(c, -1)).view( + -1, mask_h, mask_w) + loss_mask_full = self.loss_mask(mask_preds, mask_gti) + loss_mask += ( + self.crop_mask(loss_mask_full[None], + xyxy_scaled[match_inds]).mean(dim=(2, 3)) / + area_normed[match_inds]).mean() + + _, world_size = get_dist_info() + return dict( + loss_cls=loss_cls * batch_size * world_size, + loss_obj=loss_obj * batch_size * world_size, + loss_bbox=loss_box * batch_size * world_size, + loss_mask=loss_mask * self.loss_mask_weight * world_size) + + def _convert_gt_to_norm_format(self, + batch_gt_instances: Sequence[InstanceData], + batch_img_metas: Sequence[dict]) -> Tensor: + """Add target_inds for instance segmentation.""" + batch_targets_normed = super()._convert_gt_to_norm_format( + batch_gt_instances, batch_img_metas) + + if self.mask_overlap: + batch_size = len(batch_img_metas) + target_inds = [] + for i in range(batch_size): + # find number of targets of each image + num_gts = (batch_gt_instances[:, 0] == i).sum() + # (num_anchor, num_gts) + target_inds.append( + torch.arange(num_gts, device=batch_gt_instances.device). + float().view(1, num_gts).repeat(self.num_base_priors, 1) + + 1) + target_inds = torch.cat(target_inds, 1) + else: + num_gts = batch_gt_instances.shape[0] + target_inds = torch.arange( + num_gts, device=batch_gt_instances.device).float().view( + 1, num_gts).repeat(self.num_base_priors, 1) + batch_targets_normed = torch.cat( + [batch_targets_normed, target_inds[..., None]], 2) + return batch_targets_normed + + def predict_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + objectnesses: Optional[List[Tensor]] = None, + coeff_preds: Optional[List[Tensor]] = None, + proto_preds: Optional[Tensor] = None, + batch_img_metas: Optional[List[dict]] = None, + cfg: Optional[ConfigDict] = None, + rescale: bool = True, + with_nms: bool = True) -> List[InstanceData]: + """Transform a batch of output features extracted from the head into + bbox results. + Note: When score_factors is not None, the cls_scores are + usually multiplied by it then obtain the real score used in NMS. + Args: + cls_scores (list[Tensor]): Classification scores for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * 4, H, W). 
+ objectnesses (list[Tensor], Optional): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + coeff_preds (list[Tensor]): Mask coefficients predictions + for all scale levels, each is a 4D-tensor, has shape + (batch_size, mask_channels, H, W). + proto_preds (Tensor): Mask prototype features extracted from the + mask head, has shape (batch_size, mask_channels, H, W). + batch_img_metas (list[dict], Optional): Batch image meta info. + Defaults to None. + cfg (ConfigDict, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + Returns: + list[:obj:`InstanceData`]: Object detection and instance + segmentation results of each image after the post process. + Each item usually contains following keys. + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, h, w). + """ + assert len(cls_scores) == len(bbox_preds) == len(coeff_preds) + if objectnesses is None: + with_objectnesses = False + else: + with_objectnesses = True + assert len(cls_scores) == len(objectnesses) + + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + + multi_label = cfg.multi_label + multi_label &= self.num_classes > 1 + cfg.multi_label = multi_label + + num_imgs = len(batch_img_metas) + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + + # If the shape does not change, use the previous mlvl_priors + if featmap_sizes != self.featmap_sizes: + self.mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device) + self.featmap_sizes = featmap_sizes + flatten_priors = torch.cat(self.mlvl_priors) + + mlvl_strides = [ + flatten_priors.new_full( + (featmap_size.numel() * self.num_base_priors, ), stride) for + featmap_size, stride in zip(featmap_sizes, self.featmap_strides) + ] + flatten_stride = torch.cat(mlvl_strides) + + # flatten cls_scores, bbox_preds and objectness + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_classes) + for cls_score in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + flatten_coeff_preds = [ + coeff_pred.permute(0, 2, 3, + 1).reshape(num_imgs, -1, + self.head_module.mask_channels) + for coeff_pred in coeff_preds + ] + + flatten_cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid() + flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1) + flatten_decoded_bboxes = self.bbox_coder.decode( + flatten_priors.unsqueeze(0), flatten_bbox_preds, flatten_stride) + + flatten_coeff_preds = torch.cat(flatten_coeff_preds, dim=1) + + if with_objectnesses: + flatten_objectness = [ + objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1) + for objectness in objectnesses + ] + flatten_objectness = torch.cat(flatten_objectness, dim=1).sigmoid() + else: + flatten_objectness = [None for _ in range(len(featmap_sizes))] + + results_list = [] + for (bboxes, scores, objectness, coeffs, mask_proto, + img_meta) in zip(flatten_decoded_bboxes, flatten_cls_scores, + flatten_objectness, flatten_coeff_preds, + proto_preds, 
batch_img_metas): + ori_shape = img_meta['ori_shape'] + batch_input_shape = img_meta['batch_input_shape'] + input_shape_h, input_shape_w = batch_input_shape + if 'pad_param' in img_meta: + pad_param = img_meta['pad_param'] + input_shape_withoutpad = (input_shape_h - pad_param[0] - + pad_param[1], input_shape_w - + pad_param[2] - pad_param[3]) + else: + pad_param = None + input_shape_withoutpad = batch_input_shape + scale_factor = (input_shape_withoutpad[1] / ori_shape[1], + input_shape_withoutpad[0] / ori_shape[0]) + + score_thr = cfg.get('score_thr', -1) + # yolox_style does not require the following operations + if objectness is not None and score_thr > 0 and not cfg.get( + 'yolox_style', False): + conf_inds = objectness > score_thr + bboxes = bboxes[conf_inds, :] + scores = scores[conf_inds, :] + objectness = objectness[conf_inds] + coeffs = coeffs[conf_inds] + + if objectness is not None: + # conf = obj_conf * cls_conf + scores *= objectness[:, None] + # NOTE: Important + coeffs *= objectness[:, None] + + if scores.shape[0] == 0: + empty_results = InstanceData() + empty_results.bboxes = bboxes + empty_results.scores = scores[:, 0] + empty_results.labels = scores[:, 0].int() + h, w = ori_shape[:2] if rescale else img_meta['img_shape'][:2] + empty_results.masks = torch.zeros( + size=(0, h, w), dtype=torch.bool, device=bboxes.device) + results_list.append(empty_results) + continue + + nms_pre = cfg.get('nms_pre', 100000) + if cfg.multi_label is False: + scores, labels = scores.max(1, keepdim=True) + scores, _, keep_idxs, results = filter_scores_and_topk( + scores, + score_thr, + nms_pre, + results=dict(labels=labels[:, 0], coeffs=coeffs)) + labels = results['labels'] + coeffs = results['coeffs'] + else: + out = filter_scores_and_topk( + scores, score_thr, nms_pre, results=dict(coeffs=coeffs)) + scores, labels, keep_idxs, filtered_results = out + coeffs = filtered_results['coeffs'] + + results = InstanceData( + scores=scores, + labels=labels, + bboxes=bboxes[keep_idxs], + coeffs=coeffs) + + if cfg.get('yolox_style', False): + # do not need max_per_img + cfg.max_per_img = len(results) + + results = self._bbox_post_process( + results=results, + cfg=cfg, + rescale=False, + with_nms=with_nms, + img_meta=img_meta) + + if len(results.bboxes): + masks = self.process_mask(mask_proto, results.coeffs, + results.bboxes, + (input_shape_h, input_shape_w), True) + if rescale: + if pad_param is not None: + # bbox minus pad param + top_pad, _, left_pad, _ = pad_param + results.bboxes -= results.bboxes.new_tensor( + [left_pad, top_pad, left_pad, top_pad]) + # mask crop pad param + top, left = int(top_pad), int(left_pad) + bottom, right = int(input_shape_h - + top_pad), int(input_shape_w - + left_pad) + masks = masks[:, :, top:bottom, left:right] + results.bboxes /= results.bboxes.new_tensor( + scale_factor).repeat((1, 2)) + + fast_test = cfg.get('fast_test', False) + if fast_test: + masks = F.interpolate( + masks, + size=ori_shape, + mode='bilinear', + align_corners=False) + masks = masks.squeeze(0) + masks = masks > cfg.mask_thr_binary + else: + masks.gt_(cfg.mask_thr_binary) + masks = torch.as_tensor(masks, dtype=torch.uint8) + masks = masks[0].permute(1, 2, + 0).contiguous().cpu().numpy() + masks = mmcv.imresize(masks, + (ori_shape[1], ori_shape[0])) + + if len(masks.shape) == 2: + masks = masks[:, :, None] + masks = torch.from_numpy(masks).permute(2, 0, 1) + + results.bboxes[:, 0::2].clamp_(0, ori_shape[1]) + results.bboxes[:, 1::2].clamp_(0, ori_shape[0]) + + results.masks = masks.bool() + 
results_list.append(results) + else: + h, w = ori_shape[:2] if rescale else img_meta['img_shape'][:2] + results.masks = torch.zeros( + size=(0, h, w), dtype=torch.bool, device=bboxes.device) + results_list.append(results) + return results_list + + def process_mask(self, + mask_proto: Tensor, + mask_coeff_pred: Tensor, + bboxes: Tensor, + shape: Tuple[int, int], + upsample: bool = False) -> Tensor: + """Generate mask logits results. + + Args: + mask_proto (Tensor): Mask prototype features. + Has shape (num_instance, mask_channels). + mask_coeff_pred (Tensor): Mask coefficients prediction for + single image. Has shape (mask_channels, H, W) + bboxes (Tensor): Tensor of the bbox. Has shape (num_instance, 4). + shape (Tuple): Batch input shape of image. + upsample (bool): Whether upsample masks results to batch input + shape. Default to False. + Return: + Tensor: Instance segmentation masks for each instance. + Has shape (num_instance, H, W). + """ + c, mh, mw = mask_proto.shape # CHW + masks = ( + mask_coeff_pred @ mask_proto.float().view(c, -1)).sigmoid().view( + -1, mh, mw)[None] + if upsample: + masks = F.interpolate( + masks, shape, mode='bilinear', align_corners=False) # 1CHW + masks = self.crop_mask(masks, bboxes) + return masks + + def crop_mask(self, masks: Tensor, boxes: Tensor) -> Tensor: + """Crop mask by the bounding box. + + Args: + masks (Tensor): Predicted mask results. Has shape + (1, num_instance, H, W). + boxes (Tensor): Tensor of the bbox. Has shape (num_instance, 4). + Returns: + (torch.Tensor): The masks are being cropped to the bounding box. + """ + _, n, h, w = masks.shape + x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1) + r = torch.arange( + w, device=masks.device, + dtype=x1.dtype)[None, None, None, :] # rows shape(1, 1, w, 1) + c = torch.arange( + h, device=masks.device, + dtype=x1.dtype)[None, None, :, None] # cols shape(1, h, 1, 1) + + return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2)) diff --git a/mmyolo/models/dense_heads/yolov6_head.py b/mmyolo/models/dense_heads/yolov6_head.py index 4b492d121..3b01133f0 100644 --- a/mmyolo/models/dense_heads/yolov6_head.py +++ b/mmyolo/models/dense_heads/yolov6_head.py @@ -50,6 +50,7 @@ def __init__(self, in_channels: Union[int, Sequence], widen_factor: float = 1.0, num_base_priors: int = 1, + reg_max=0, featmap_strides: Sequence[int] = (8, 16, 32), norm_cfg: ConfigType = dict( type='BN', momentum=0.03, eps=0.001), @@ -61,6 +62,7 @@ def __init__(self, self.featmap_strides = featmap_strides self.num_levels = len(self.featmap_strides) self.num_base_priors = num_base_priors + self.reg_max = reg_max self.norm_cfg = norm_cfg self.act_cfg = act_cfg @@ -80,6 +82,12 @@ def _init_layers(self): self.cls_preds = nn.ModuleList() self.reg_preds = nn.ModuleList() self.stems = nn.ModuleList() + + if self.reg_max > 1: + proj = torch.arange( + self.reg_max + self.num_base_priors, dtype=torch.float) + self.register_buffer('proj', proj, persistent=False) + for i in range(self.num_levels): self.stems.append( ConvModule( @@ -116,7 +124,7 @@ def _init_layers(self): self.reg_preds.append( nn.Conv2d( in_channels=self.in_channels[i], - out_channels=self.num_base_priors * 4, + out_channels=(self.num_base_priors + self.reg_max) * 4, kernel_size=1)) def init_weights(self): @@ -148,6 +156,7 @@ def forward_single(self, x: Tensor, stem: nn.Module, cls_conv: nn.Module, cls_pred: nn.Module, reg_conv: nn.Module, reg_pred: nn.Module) -> Tuple[Tensor, Tensor]: """Forward feature of a single scale level.""" + b, _, h, w = x.shape y = stem(x) cls_x 
= y reg_x = y @@ -155,9 +164,26 @@ def forward_single(self, x: Tensor, stem: nn.Module, cls_conv: nn.Module, reg_feat = reg_conv(reg_x) cls_score = cls_pred(cls_feat) - bbox_pred = reg_pred(reg_feat) + bbox_dist_preds = reg_pred(reg_feat) + + if self.reg_max > 1: + bbox_dist_preds = bbox_dist_preds.reshape( + [-1, 4, self.reg_max + self.num_base_priors, + h * w]).permute(0, 3, 1, 2) + + # TODO: The get_flops script cannot handle the situation of + # matmul, and needs to be fixed later + # bbox_preds = bbox_dist_preds.softmax(3).matmul(self.proj) + bbox_preds = bbox_dist_preds.softmax(3).matmul( + self.proj.view([-1, 1])).squeeze(-1) + bbox_preds = bbox_preds.transpose(1, 2).reshape(b, -1, h, w) + else: + bbox_preds = bbox_dist_preds - return cls_score, bbox_pred + if self.training: + return cls_score, bbox_preds, bbox_dist_preds + else: + return cls_score, bbox_preds @MODELS.register_module() @@ -238,6 +264,7 @@ def loss_by_feat( self, cls_scores: Sequence[Tensor], bbox_preds: Sequence[Tensor], + bbox_dist_preds: Sequence[Tensor], batch_gt_instances: Sequence[InstanceData], batch_img_metas: Sequence[dict], batch_gt_instances_ignore: OptInstanceList = None) -> dict: diff --git a/mmyolo/models/dense_heads/yolov7_head.py b/mmyolo/models/dense_heads/yolov7_head.py index 80e6aadd2..124883cf4 100644 --- a/mmyolo/models/dense_heads/yolov7_head.py +++ b/mmyolo/models/dense_heads/yolov7_head.py @@ -39,7 +39,7 @@ def init_weights(self): for mi, s in zip(self.convs_pred, self.featmap_strides): # from mi = mi[1] # nn.Conv2d - b = mi.bias.data.view(3, -1) + b = mi.bias.data.view(self.num_base_priors, -1) # obj (8 objects per 640 image) b.data[:, 4] += math.log(8 / (640 / s)**2) b.data[:, 5:] += math.log(0.6 / (self.num_classes - 0.99)) diff --git a/mmyolo/models/dense_heads/yolox_pose_head.py b/mmyolo/models/dense_heads/yolox_pose_head.py new file mode 100644 index 000000000..96264e552 --- /dev/null +++ b/mmyolo/models/dense_heads/yolox_pose_head.py @@ -0,0 +1,409 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from collections import defaultdict +from typing import List, Optional, Sequence, Tuple, Union + +import torch +import torch.nn as nn +from mmcv.ops import batched_nms +from mmdet.models.utils import filter_scores_and_topk +from mmdet.utils import ConfigType, OptInstanceList +from mmengine.config import ConfigDict +from mmengine.model import ModuleList, bias_init_with_prob +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS +from ..utils import OutputSaveFunctionWrapper, OutputSaveObjectWrapper +from .yolox_head import YOLOXHead, YOLOXHeadModule + + +@MODELS.register_module() +class YOLOXPoseHeadModule(YOLOXHeadModule): + """YOLOXPoseHeadModule serves as a head module for `YOLOX-Pose`. + + In comparison to `YOLOXHeadModule`, this module introduces branches for + keypoint prediction. 
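At test time the instance masks of the YOLOv5 instance head are assembled exactly as in process_mask/crop_mask further above: the coefficients kept after NMS are multiplied against the prototype maps, passed through a sigmoid, optionally upsampled, and cropped to their boxes. A stripped-down sketch with toy shapes:

import torch
import torch.nn.functional as F

num_inst, mask_ch, mh, mw = 3, 32, 40, 40
proto = torch.randn(mask_ch, mh, mw)                  # prototypes for one image
coeffs = torch.randn(num_inst, mask_ch)               # per-instance coefficients after NMS
bboxes = torch.tensor([[0., 0., 80., 80.]]).repeat(num_inst, 1)

masks = (coeffs @ proto.view(mask_ch, -1)).sigmoid().view(-1, mh, mw)[None]
masks = F.interpolate(masks, (160, 160), mode='bilinear', align_corners=False)

# crop each mask to its (x1, y1, x2, y2) box, as crop_mask does
x1, y1, x2, y2 = torch.chunk(bboxes[:, :, None], 4, 1)
r = torch.arange(160, dtype=x1.dtype)[None, None, None, :]
c = torch.arange(160, dtype=x1.dtype)[None, None, :, None]
masks = masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2))
print(masks.shape)   # torch.Size([1, 3, 160, 160])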
+ """ + + def __init__(self, num_keypoints: int, *args, **kwargs): + self.num_keypoints = num_keypoints + super().__init__(*args, **kwargs) + + def _init_layers(self): + """Initializes the layers in the head module.""" + super()._init_layers() + + # The pose branch requires additional layers for precise regression + self.stacked_convs *= 2 + + # Create separate layers for each level of feature maps + pose_convs, offsets_preds, vis_preds = [], [], [] + for _ in self.featmap_strides: + pose_convs.append(self._build_stacked_convs()) + offsets_preds.append( + nn.Conv2d(self.feat_channels, self.num_keypoints * 2, 1)) + vis_preds.append( + nn.Conv2d(self.feat_channels, self.num_keypoints, 1)) + + self.multi_level_pose_convs = ModuleList(pose_convs) + self.multi_level_conv_offsets = ModuleList(offsets_preds) + self.multi_level_conv_vis = ModuleList(vis_preds) + + def init_weights(self): + """Initialize weights of the head.""" + super().init_weights() + + # Use prior in model initialization to improve stability + bias_init = bias_init_with_prob(0.01) + for conv_vis in self.multi_level_conv_vis: + conv_vis.bias.data.fill_(bias_init) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + """Forward features from the upstream network.""" + offsets_pred, vis_pred = [], [] + for i in range(len(x)): + pose_feat = self.multi_level_pose_convs[i](x[i]) + offsets_pred.append(self.multi_level_conv_offsets[i](pose_feat)) + vis_pred.append(self.multi_level_conv_vis[i](pose_feat)) + return (*super().forward(x), offsets_pred, vis_pred) + + +@MODELS.register_module() +class YOLOXPoseHead(YOLOXHead): + """YOLOXPoseHead head used in `YOLO-Pose. + + `_. + Args: + loss_pose (ConfigDict, optional): Config of keypoint OKS loss. + """ + + def __init__( + self, + loss_pose: Optional[ConfigType] = None, + *args, + **kwargs, + ): + super().__init__(*args, **kwargs) + self.loss_pose = MODELS.build(loss_pose) + self.num_keypoints = self.head_module.num_keypoints + + # set up buffers to save variables generated in methods of + # the class's base class. + self._log = defaultdict(list) + self.sampler = OutputSaveObjectWrapper(self.sampler) + + # ensure that the `sigmas` in self.assigner.oks_calculator + # is on the same device as the model + if hasattr(self.assigner, 'oks_calculator'): + self.add_module('assigner_oks_calculator', + self.assigner.oks_calculator) + + def _clear(self): + """Clear variable buffers.""" + self.sampler.clear() + self._log.clear() + + def loss(self, x: Tuple[Tensor], batch_data_samples: Union[list, + dict]) -> dict: + + if isinstance(batch_data_samples, list): + losses = super().loss(x, batch_data_samples) + else: + outs = self(x) + # Fast version + loss_inputs = outs + (batch_data_samples['bboxes_labels'], + batch_data_samples['keypoints'], + batch_data_samples['keypoints_visible'], + batch_data_samples['img_metas']) + losses = self.loss_by_feat(*loss_inputs) + + return losses + + def loss_by_feat( + self, + cls_scores: Sequence[Tensor], + bbox_preds: Sequence[Tensor], + objectnesses: Sequence[Tensor], + kpt_preds: Sequence[Tensor], + vis_preds: Sequence[Tensor], + batch_gt_instances: Tensor, + batch_gt_keypoints: Tensor, + batch_gt_keypoints_visible: Tensor, + batch_img_metas: Sequence[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + In addition to the base class method, keypoint losses are also + calculated in this method. 
+ """ + + self._clear() + batch_gt_instances = self.gt_kps_instances_preprocess( + batch_gt_instances, batch_gt_keypoints, batch_gt_keypoints_visible, + len(batch_img_metas)) + + # collect keypoints coordinates and visibility from model predictions + kpt_preds = torch.cat([ + kpt_pred.flatten(2).permute(0, 2, 1).contiguous() + for kpt_pred in kpt_preds + ], + dim=1) + + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device, + with_stride=True) + grid_priors = torch.cat(mlvl_priors) + + flatten_kpts = self.decode_pose(grid_priors[..., :2], kpt_preds, + grid_priors[..., 2]) + + vis_preds = torch.cat([ + vis_pred.flatten(2).permute(0, 2, 1).contiguous() + for vis_pred in vis_preds + ], + dim=1) + + # compute detection losses and collect targets for keypoints + # predictions simultaneously + self._log['pred_keypoints'] = list(flatten_kpts.detach().split( + 1, dim=0)) + self._log['pred_keypoints_vis'] = list(vis_preds.detach().split( + 1, dim=0)) + + losses = super().loss_by_feat(cls_scores, bbox_preds, objectnesses, + batch_gt_instances, batch_img_metas, + batch_gt_instances_ignore) + + kpt_targets, vis_targets = [], [] + sampling_results = self.sampler.log['sample'] + sampling_result_idx = 0 + for gt_instances in batch_gt_instances: + if len(gt_instances) > 0: + sampling_result = sampling_results[sampling_result_idx] + kpt_target = gt_instances['keypoints'][ + sampling_result.pos_assigned_gt_inds] + vis_target = gt_instances['keypoints_visible'][ + sampling_result.pos_assigned_gt_inds] + sampling_result_idx += 1 + kpt_targets.append(kpt_target) + vis_targets.append(vis_target) + + if len(kpt_targets) > 0: + kpt_targets = torch.cat(kpt_targets, 0) + vis_targets = torch.cat(vis_targets, 0) + + # compute keypoint losses + if len(kpt_targets) > 0: + vis_targets = (vis_targets > 0).float() + pos_masks = torch.cat(self._log['foreground_mask'], 0) + bbox_targets = torch.cat(self._log['bbox_target'], 0) + loss_kpt = self.loss_pose( + flatten_kpts.view(-1, self.num_keypoints, 2)[pos_masks], + kpt_targets, vis_targets, bbox_targets) + loss_vis = self.loss_cls( + vis_preds.view(-1, self.num_keypoints)[pos_masks], + vis_targets) / vis_targets.sum() + else: + loss_kpt = kpt_preds.sum() * 0 + loss_vis = vis_preds.sum() * 0 + + losses.update(dict(loss_kpt=loss_kpt, loss_vis=loss_vis)) + + self._clear() + return losses + + @torch.no_grad() + def _get_targets_single( + self, + priors: Tensor, + cls_preds: Tensor, + decoded_bboxes: Tensor, + objectness: Tensor, + gt_instances: InstanceData, + img_meta: dict, + gt_instances_ignore: Optional[InstanceData] = None) -> tuple: + """Calculates targets for a single image, and saves them to the log. + + This method is similar to the _get_targets_single method in the base + class, but additionally saves the foreground mask and bbox targets to + the log. 
+ """ + + # Construct a combined representation of bboxes and keypoints to + # ensure keypoints are also involved in the positive sample + # assignment process + kpt = self._log['pred_keypoints'].pop(0).squeeze(0) + kpt_vis = self._log['pred_keypoints_vis'].pop(0).squeeze(0) + kpt = torch.cat((kpt, kpt_vis.unsqueeze(-1)), dim=-1) + decoded_bboxes = torch.cat((decoded_bboxes, kpt.flatten(1)), dim=1) + + targets = super()._get_targets_single(priors, cls_preds, + decoded_bboxes, objectness, + gt_instances, img_meta, + gt_instances_ignore) + self._log['foreground_mask'].append(targets[0]) + self._log['bbox_target'].append(targets[3]) + return targets + + def predict_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + objectnesses: Optional[List[Tensor]] = None, + kpt_preds: Optional[List[Tensor]] = None, + vis_preds: Optional[List[Tensor]] = None, + batch_img_metas: Optional[List[dict]] = None, + cfg: Optional[ConfigDict] = None, + rescale: bool = True, + with_nms: bool = True) -> List[InstanceData]: + """Transform a batch of output features extracted by the head into bbox + and keypoint results. + + In addition to the base class method, keypoint predictions are also + calculated in this method. + """ + """calculate predicted bboxes and get the kept instances indices. + + use OutputSaveFunctionWrapper as context manager to obtain + intermediate output from a parent class without copying a + arge block of code + """ + with OutputSaveFunctionWrapper( + filter_scores_and_topk, + super().predict_by_feat.__globals__) as outputs_1: + with OutputSaveFunctionWrapper( + batched_nms, + super()._bbox_post_process.__globals__) as outputs_2: + results_list = super().predict_by_feat(cls_scores, bbox_preds, + objectnesses, + batch_img_metas, cfg, + rescale, with_nms) + keep_indices_topk = [ + out[2][:cfg.max_per_img] for out in outputs_1 + ] + keep_indices_nms = [ + out[1][:cfg.max_per_img] for out in outputs_2 + ] + + num_imgs = len(batch_img_metas) + + # recover keypoints coordinates from model predictions + featmap_sizes = [vis_pred.shape[2:] for vis_pred in vis_preds] + priors = torch.cat(self.mlvl_priors) + strides = [ + priors.new_full((featmap_size.numel() * self.num_base_priors, ), + stride) for featmap_size, stride in zip( + featmap_sizes, self.featmap_strides) + ] + strides = torch.cat(strides) + kpt_preds = torch.cat([ + kpt_pred.permute(0, 2, 3, 1).reshape( + num_imgs, -1, self.num_keypoints * 2) for kpt_pred in kpt_preds + ], + dim=1) + flatten_decoded_kpts = self.decode_pose(priors, kpt_preds, strides) + + vis_preds = torch.cat([ + vis_pred.permute(0, 2, 3, 1).reshape( + num_imgs, -1, self.num_keypoints) for vis_pred in vis_preds + ], + dim=1).sigmoid() + + # select keypoints predictions according to bbox scores and nms result + keep_indices_nms_idx = 0 + for pred_instances, kpts, kpts_vis, img_meta, keep_idxs \ + in zip( + results_list, flatten_decoded_kpts, vis_preds, + batch_img_metas, keep_indices_topk): + + pred_instances.bbox_scores = pred_instances.scores + + if len(pred_instances) == 0: + pred_instances.keypoints = kpts[:0] + pred_instances.keypoint_scores = kpts_vis[:0] + continue + + kpts = kpts[keep_idxs] + kpts_vis = kpts_vis[keep_idxs] + + if rescale: + pad_param = img_meta.get('img_meta', None) + scale_factor = img_meta['scale_factor'] + if pad_param is not None: + kpts -= kpts.new_tensor([pad_param[2], pad_param[0]]) + kpts /= kpts.new_tensor(scale_factor).repeat( + (1, self.num_keypoints, 1)) + + keep_idxs_nms = keep_indices_nms[keep_indices_nms_idx] + kpts = 
kpts[keep_idxs_nms] + kpts_vis = kpts_vis[keep_idxs_nms] + keep_indices_nms_idx += 1 + + pred_instances.keypoints = kpts + pred_instances.keypoint_scores = kpts_vis + + results_list = [r.numpy() for r in results_list] + return results_list + + def decode_pose(self, grids: torch.Tensor, offsets: torch.Tensor, + strides: Union[torch.Tensor, int]) -> torch.Tensor: + """Decode regression offsets to keypoints. + + Args: + grids (torch.Tensor): The coordinates of the feature map grids. + offsets (torch.Tensor): The predicted offset of each keypoint + relative to its corresponding grid. + strides (torch.Tensor | int): The stride of the feature map for + each instance. + Returns: + torch.Tensor: The decoded keypoints coordinates. + """ + + if isinstance(strides, int): + strides = torch.tensor([strides]).to(offsets) + + strides = strides.reshape(1, -1, 1, 1) + offsets = offsets.reshape(*offsets.shape[:2], -1, 2) + xy_coordinates = (offsets[..., :2] * strides) + grids.unsqueeze(1) + return xy_coordinates + + @staticmethod + def gt_kps_instances_preprocess(batch_gt_instances: Tensor, + batch_gt_keypoints, + batch_gt_keypoints_visible, + batch_size: int) -> List[InstanceData]: + """Split batch_gt_instances with batch size. + + Args: + batch_gt_instances (Tensor): Ground truth + a 2D-Tensor for whole batch, shape [all_gt_bboxes, 6] + batch_size (int): Batch size. + + Returns: + List: batch gt instances data, shape [batch_size, InstanceData] + """ + # faster version + batch_instance_list = [] + for i in range(batch_size): + batch_gt_instance_ = InstanceData() + single_batch_instance = \ + batch_gt_instances[batch_gt_instances[:, 0] == i, :] + keypoints = \ + batch_gt_keypoints[batch_gt_instances[:, 0] == i, :] + keypoints_visible = \ + batch_gt_keypoints_visible[batch_gt_instances[:, 0] == i, :] + batch_gt_instance_.bboxes = single_batch_instance[:, 2:] + batch_gt_instance_.labels = single_batch_instance[:, 1] + batch_gt_instance_.keypoints = keypoints + batch_gt_instance_.keypoints_visible = keypoints_visible + batch_instance_list.append(batch_gt_instance_) + + return batch_instance_list + + @staticmethod + def gt_instances_preprocess(batch_gt_instances: List[InstanceData], *args, + **kwargs) -> List[InstanceData]: + return batch_gt_instances diff --git a/mmyolo/models/layers/__init__.py b/mmyolo/models/layers/__init__.py index f709dbb7e..02753057f 100644 --- a/mmyolo/models/layers/__init__.py +++ b/mmyolo/models/layers/__init__.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from .ema import ExpMomentumEMA -from .yolo_bricks import (BepC3StageBlock, CSPLayerWithTwoConv, +from .yolo_bricks import (BepC3StageBlock, BiFusion, CSPLayerWithTwoConv, DarknetBottleneck, EELANBlock, EffectiveSELayer, ELANBlock, ImplicitA, ImplicitM, MaxPoolAndStrideConvBlock, PPYOLOEBasicBlock, @@ -12,5 +12,5 @@ 'ELANBlock', 'MaxPoolAndStrideConvBlock', 'SPPFCSPBlock', 'PPYOLOEBasicBlock', 'EffectiveSELayer', 'TinyDownSampleBlock', 'EELANBlock', 'ImplicitA', 'ImplicitM', 'BepC3StageBlock', - 'CSPLayerWithTwoConv', 'DarknetBottleneck' + 'CSPLayerWithTwoConv', 'DarknetBottleneck', 'BiFusion' ] diff --git a/mmyolo/models/layers/yolo_bricks.py b/mmyolo/models/layers/yolo_bricks.py index 2e69d528b..19175be1a 100644 --- a/mmyolo/models/layers/yolo_bricks.py +++ b/mmyolo/models/layers/yolo_bricks.py @@ -1,5 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. 
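The trick in _get_targets_single above is to let the base YOLOX assigner see pose quality: each prior's decoded box is extended with its predicted keypoints and visibilities before assignment, giving a (num_priors, 4 + 3 * num_keypoints) tensor. A toy shape check:

import torch

num_priors, num_kpts = 8400, 17
decoded_bboxes = torch.rand(num_priors, 4)
kpt = torch.rand(num_priors, num_kpts, 2)
kpt_vis = torch.rand(num_priors, num_kpts)

kpt = torch.cat((kpt, kpt_vis.unsqueeze(-1)), dim=-1)            # (8400, 17, 3)
bboxes_with_kpts = torch.cat((decoded_bboxes, kpt.flatten(1)), dim=1)
print(bboxes_with_kpts.shape)   # torch.Size([8400, 55]) = 4 + 17 * 3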
-from typing import Optional, Sequence, Tuple, Union +from typing import List, Optional, Sequence, Tuple, Union import numpy as np import torch @@ -1508,3 +1508,221 @@ def forward(self, x: Tensor) -> Tensor: x_main = list(x_main.split((self.mid_channels, self.mid_channels), 1)) x_main.extend(blocks(x_main[-1]) for blocks in self.blocks) return self.final_conv(torch.cat(x_main, 1)) + + +class BiFusion(nn.Module): + """BiFusion Block in YOLOv6. + + BiFusion fuses current-, high- and low-level features. + Compared with concatenation in PAN, it fuses an extra low-level feature. + + Args: + in_channels0 (int): The channels of current-level feature. + in_channels1 (int): The input channels of lower-level feature. + out_channels (int): The out channels of the BiFusion module. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + in_channels0: int, + in_channels1: int, + out_channels: int, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='ReLU', inplace=True)): + super().__init__() + self.conv1 = ConvModule( + in_channels0, + out_channels, + kernel_size=1, + stride=1, + padding=0, + bias=False, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv2 = ConvModule( + in_channels1, + out_channels, + kernel_size=1, + stride=1, + padding=0, + bias=False, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv3 = ConvModule( + out_channels * 3, + out_channels, + kernel_size=1, + stride=1, + padding=0, + bias=False, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.upsample = nn.ConvTranspose2d( + out_channels, out_channels, kernel_size=2, stride=2, bias=True) + self.downsample = ConvModule( + out_channels, + out_channels, + kernel_size=3, + stride=2, + padding=1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x: List[torch.Tensor]) -> Tensor: + """Forward process + Args: + x (List[torch.Tensor]): The tensor list of length 3. + x[0]: The high-level feature. + x[1]: The current-level feature. + x[2]: The low-level feature. + """ + x0 = self.upsample(x[0]) + x1 = self.conv1(x[1]) + x2 = self.downsample(self.conv2(x[2])) + return self.conv3(torch.cat((x0, x1, x2), dim=1)) + + +class CSPSPPFBottleneck(BaseModule): + """The SPPF block having a CSP-like version in YOLOv6 3.0. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + kernel_sizes (int, tuple[int]): Sequential or number of kernel + sizes of pooling layers. Defaults to 5. + use_conv_first (bool): Whether to use conv before pooling layer. + In YOLOv5 and YOLOX, the para set to True. + In PPYOLOE, the para set to False. + Defaults to True. + mid_channels_scale (float): Channel multiplier, multiply in_channels + by this amount to get mid_channels. This parameter is valid only + when use_conv_fist=True.Defaults to 0.5. + conv_cfg (dict): Config dict for convolution layer. Defaults to None. + which means using conv2d. Defaults to None. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
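Note that BiFusion feeds the high-level input straight into the transposed conv, so that input is expected to already have out_channels channels, while the current- and low-level inputs are first projected by 1x1 convs; spatially the high-level map is upsampled 2x and the low-level map downsampled 2x before the three are concatenated. A shape sketch with illustrative channel counts:

import torch
from mmyolo.models.layers import BiFusion   # exported by this PR's layers/__init__.py

fuse = BiFusion(in_channels0=256, in_channels1=128, out_channels=128)
x_high = torch.randn(1, 128, 20, 20)   # high-level feature, already out_channels wide
x_cur = torch.randn(1, 256, 40, 40)    # current-level feature (in_channels0)
x_low = torch.randn(1, 128, 80, 80)    # low-level feature (in_channels1)
print(fuse([x_high, x_cur, x_low]).shape)   # torch.Size([1, 128, 40, 40])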
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_sizes: Union[int, Sequence[int]] = 5, + use_conv_first: bool = True, + mid_channels_scale: float = 0.5, + conv_cfg: ConfigType = None, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg) + + if use_conv_first: + mid_channels = int(in_channels * mid_channels_scale) + self.conv1 = ConvModule( + in_channels, + mid_channels, + 1, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv3 = ConvModule( + mid_channels, + mid_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv4 = ConvModule( + mid_channels, + mid_channels, + 1, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + else: + mid_channels = in_channels + self.conv1 = None + self.conv3 = None + self.conv4 = None + + self.conv2 = ConvModule( + in_channels, + mid_channels, + 1, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.kernel_sizes = kernel_sizes + + if isinstance(kernel_sizes, int): + self.poolings = nn.MaxPool2d( + kernel_size=kernel_sizes, stride=1, padding=kernel_sizes // 2) + conv2_in_channels = mid_channels * 4 + else: + self.poolings = nn.ModuleList([ + nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) + for ks in kernel_sizes + ]) + conv2_in_channels = mid_channels * (len(kernel_sizes) + 1) + + self.conv5 = ConvModule( + conv2_in_channels, + mid_channels, + 1, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv6 = ConvModule( + mid_channels, + mid_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv7 = ConvModule( + mid_channels * 2, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x: Tensor) -> Tensor: + """Forward process + Args: + x (Tensor): The input tensor. + """ + x0 = self.conv4(self.conv3(self.conv1(x))) if self.conv1 else x + y = self.conv2(x) + + if isinstance(self.kernel_sizes, int): + x1 = self.poolings(x0) + x2 = self.poolings(x1) + x3 = torch.cat([x0, x1, x2, self.poolings(x2)], dim=1) + else: + x3 = torch.cat( + [x0] + [pooling(x0) for pooling in self.poolings], dim=1) + + x3 = self.conv6(self.conv5(x3)) + x = self.conv7(torch.cat([y, x3], dim=1)) + return x diff --git a/mmyolo/models/losses/__init__.py b/mmyolo/models/losses/__init__.py index ee192921b..c89fe4dc4 100644 --- a/mmyolo/models/losses/__init__.py +++ b/mmyolo/models/losses/__init__.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. from .iou_loss import IoULoss, bbox_overlaps +from .oks_loss import OksLoss -__all__ = ['IoULoss', 'bbox_overlaps'] +__all__ = ['IoULoss', 'bbox_overlaps', 'OksLoss'] diff --git a/mmyolo/models/losses/oks_loss.py b/mmyolo/models/losses/oks_loss.py new file mode 100644 index 000000000..62c63422b --- /dev/null +++ b/mmyolo/models/losses/oks_loss.py @@ -0,0 +1,91 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Optional
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+from mmyolo.registry import MODELS
+
+try:
+    from mmpose.datasets.datasets.utils import parse_pose_metainfo
+except ImportError:
+    parse_pose_metainfo = None
+
+
+@MODELS.register_module()
+class OksLoss(nn.Module):
+    """A PyTorch implementation of the Object Keypoint Similarity (OKS) loss
+    as described in the paper "YOLO-Pose: Enhancing YOLO for Multi Person
+    Pose Estimation Using Object Keypoint Similarity Loss" by Debapriya et
+    al. (2022).
+
+    The OKS loss is used for keypoint-based object recognition and consists
+    of a measure of the similarity between predicted and ground truth
+    keypoint locations, adjusted by the size of the object in the image.
+    The loss function takes as input the predicted keypoint locations, the
+    ground truth keypoint locations, a mask indicating which keypoints are
+    valid, and bounding boxes for the objects.
+
+    Args:
+        metainfo (Optional[str]): Path to a JSON file containing information
+            about the dataset's annotations.
+        loss_weight (float): Weight for the loss.
+    """
+
+    def __init__(self,
+                 metainfo: Optional[str] = None,
+                 loss_weight: float = 1.0):
+        super().__init__()
+
+        if metainfo is not None:
+            if parse_pose_metainfo is None:
+                raise ImportError(
+                    'Please run "mim install -r requirements/mmpose.txt" '
+                    'to install mmpose first for OksLoss.')
+            metainfo = parse_pose_metainfo(dict(from_file=metainfo))
+            sigmas = metainfo.get('sigmas', None)
+            if sigmas is not None:
+                self.register_buffer('sigmas', torch.as_tensor(sigmas))
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                output: Tensor,
+                target: Tensor,
+                target_weights: Tensor,
+                bboxes: Optional[Tensor] = None) -> Tensor:
+        oks = self.compute_oks(output, target, target_weights, bboxes)
+        loss = 1 - oks
+        return loss * self.loss_weight
+
+    def compute_oks(self,
+                    output: Tensor,
+                    target: Tensor,
+                    target_weights: Tensor,
+                    bboxes: Optional[Tensor] = None) -> Tensor:
+        """Calculates the OKS similarity between predicted and ground truth
+        keypoints.
+
+        Args:
+            output (Tensor): Predicted keypoints in shape N x k x 2, where N
+                is batch size, k is the number of keypoints, and 2 are the
+                xy coordinates.
+            target (Tensor): Ground truth keypoints in the same shape as
+                output.
+            target_weights (Tensor): Mask of valid keypoints in shape N x k,
+                with 1 for valid and 0 for invalid.
+            bboxes (Optional[Tensor]): Bounding boxes in shape N x 4,
+                where 4 are the xyxy coordinates.
+        Returns:
+            Tensor: The calculated OKS similarity.
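+
+        Note:
+            As an illustrative summary of the computation below: with
+            :math:`d_i` the Euclidean distance between the predicted and
+            ground truth locations of keypoint :math:`i`, :math:`k_i` its
+            sigma (when ``metainfo`` provides one), :math:`s` the length of
+            the bounding box diagonal (when ``bboxes`` is given) and
+            :math:`w_i` the keypoint weight, the returned similarity is
+
+            .. math::
+
+                OKS = \frac{\sum_i w_i \exp(-d_i^2 / (2 s^2 k_i^2))}
+                      {\sum_i w_i}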
+ """ + + dist = torch.norm(output - target, dim=-1) + + if hasattr(self, 'sigmas'): + sigmas = self.sigmas.reshape(*((1, ) * (dist.ndim - 1)), -1) + dist = dist / sigmas + if bboxes is not None: + area = torch.norm(bboxes[..., 2:] - bboxes[..., :2], dim=-1) + dist = dist / area.clip(min=1e-8).unsqueeze(-1) + + return (torch.exp(-dist.pow(2) / 2) * target_weights).sum( + dim=-1) / target_weights.sum(dim=-1).clip(min=1e-8) diff --git a/mmyolo/models/necks/__init__.py b/mmyolo/models/necks/__init__.py index 6da9641ce..159fae8d6 100644 --- a/mmyolo/models/necks/__init__.py +++ b/mmyolo/models/necks/__init__.py @@ -3,7 +3,8 @@ from .cspnext_pafpn import CSPNeXtPAFPN from .ppyoloe_csppan import PPYOLOECSPPAFPN from .yolov5_pafpn import YOLOv5PAFPN -from .yolov6_pafpn import YOLOv6CSPRepPAFPN, YOLOv6RepPAFPN +from .yolov6_pafpn import (YOLOv6CSPRepBiPAFPN, YOLOv6CSPRepPAFPN, + YOLOv6RepBiPAFPN, YOLOv6RepPAFPN) from .yolov7_pafpn import YOLOv7PAFPN from .yolov8_pafpn import YOLOv8PAFPN from .yolox_pafpn import YOLOXPAFPN @@ -11,5 +12,5 @@ __all__ = [ 'YOLOv5PAFPN', 'BaseYOLONeck', 'YOLOv6RepPAFPN', 'YOLOXPAFPN', 'CSPNeXtPAFPN', 'YOLOv7PAFPN', 'PPYOLOECSPPAFPN', 'YOLOv6CSPRepPAFPN', - 'YOLOv8PAFPN' + 'YOLOv8PAFPN', 'YOLOv6RepBiPAFPN', 'YOLOv6CSPRepBiPAFPN' ] diff --git a/mmyolo/models/necks/yolov6_pafpn.py b/mmyolo/models/necks/yolov6_pafpn.py index 74b7ce932..877827123 100644 --- a/mmyolo/models/necks/yolov6_pafpn.py +++ b/mmyolo/models/necks/yolov6_pafpn.py @@ -7,7 +7,7 @@ from mmdet.utils import ConfigType, OptMultiConfig from mmyolo.registry import MODELS -from ..layers import BepC3StageBlock, RepStageBlock +from ..layers import BepC3StageBlock, BiFusion, RepStageBlock from ..utils import make_round from .base_yolo_neck import BaseYOLONeck @@ -283,3 +283,245 @@ def build_bottom_up_layer(self, idx: int) -> nn.Module: hidden_ratio=self.hidden_ratio, norm_cfg=self.norm_cfg, act_cfg=self.block_act_cfg) + + +@MODELS.register_module() +class YOLOv6RepBiPAFPN(YOLOv6RepPAFPN): + """Path Aggregation Network used in YOLOv6 3.0. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_csp_blocks (int): Number of bottlenecks in CSPLayer. Defaults to 1. + freeze_all(bool): Whether to freeze the model. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='ReLU', inplace=True). + block_cfg (dict): Config dict for the block used to build each + layer. Defaults to dict(type='RepVGGBlock'). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + in_channels: List[int], + out_channels: int, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + num_csp_blocks: int = 12, + freeze_all: bool = False, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='ReLU', inplace=True), + block_cfg: ConfigType = dict(type='RepVGGBlock'), + init_cfg: OptMultiConfig = None): + self.extra_in_channel = in_channels[0] + super().__init__( + in_channels=in_channels[1:], + out_channels=out_channels, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + num_csp_blocks=num_csp_blocks, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + block_cfg=block_cfg, + init_cfg=init_cfg) + + def build_top_down_layer(self, idx: int) -> nn.Module: + """build top down layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The top down layer. + """ + block_cfg = self.block_cfg.copy() + + layer0 = RepStageBlock( + in_channels=int(self.out_channels[idx - 1] * self.widen_factor), + out_channels=int(self.out_channels[idx - 1] * self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + block_cfg=block_cfg) + + if idx == 1: + return layer0 + elif idx == 2: + layer1 = ConvModule( + in_channels=int(self.out_channels[idx - 1] * + self.widen_factor), + out_channels=int(self.out_channels[idx - 2] * + self.widen_factor), + kernel_size=1, + stride=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + return nn.Sequential(layer0, layer1) + + def build_upsample_layer(self, idx: int) -> nn.Module: + """build upsample layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The upsample layer. + """ + in_channels1 = self.in_channels[ + idx - 2] if idx > 1 else self.extra_in_channel + return BiFusion( + in_channels0=int(self.in_channels[idx - 1] * self.widen_factor), + in_channels1=int(in_channels1 * self.widen_factor), + out_channels=int(self.out_channels[idx - 1] * self.widen_factor), + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def forward(self, inputs: List[torch.Tensor]) -> tuple: + """Forward function.""" + assert len(inputs) == len(self.in_channels) + 1 + # reduce layers + reduce_outs = [inputs[0]] + for idx in range(len(self.in_channels)): + reduce_outs.append(self.reduce_layers[idx](inputs[idx + 1])) + + # top-down path + inner_outs = [reduce_outs[-1]] + for idx in range(len(self.in_channels) - 1, 0, -1): + feat_high = inner_outs[0] + feat_cur = reduce_outs[idx] + feat_low = reduce_outs[idx - 1] + top_down_layer_inputs = self.upsample_layers[len(self.in_channels) + - 1 - idx]([ + feat_high, + feat_cur, feat_low + ]) + inner_out = self.top_down_layers[len(self.in_channels) - 1 - idx]( + top_down_layer_inputs) + inner_outs.insert(0, inner_out) + + # bottom-up path + outs = [inner_outs[0]] + for idx in range(len(self.in_channels) - 1): + feat_low = outs[-1] + feat_high = inner_outs[idx + 1] + downsample_feat = self.downsample_layers[idx](feat_low) + out = self.bottom_up_layers[idx]( + torch.cat([downsample_feat, feat_high], 1)) + outs.append(out) + + # out_layers + results = [] + for idx in range(len(self.in_channels)): + results.append(self.out_layers[idx](outs[idx])) + + return tuple(results) + + +@MODELS.register_module() +class YOLOv6CSPRepBiPAFPN(YOLOv6RepBiPAFPN): + """Path Aggregation Network used in YOLOv6 3.0. + + Args: + in_channels (List[int]): Number of input channels per scale. 
+ out_channels (int): Number of output channels (used at each scale) + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_csp_blocks (int): Number of bottlenecks in CSPLayer. Defaults to 1. + freeze_all(bool): Whether to freeze the model. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='ReLU', inplace=True). + block_cfg (dict): Config dict for the block used to build each + layer. Defaults to dict(type='RepVGGBlock'). + block_act_cfg (dict): Config dict for activation layer used in each + stage. Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + in_channels: List[int], + out_channels: int, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + hidden_ratio: float = 0.5, + num_csp_blocks: int = 12, + freeze_all: bool = False, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='ReLU', inplace=True), + block_act_cfg: ConfigType = dict(type='SiLU', inplace=True), + block_cfg: ConfigType = dict(type='RepVGGBlock'), + init_cfg: OptMultiConfig = None): + self.hidden_ratio = hidden_ratio + self.block_act_cfg = block_act_cfg + super().__init__( + in_channels=in_channels, + out_channels=out_channels, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + num_csp_blocks=num_csp_blocks, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + block_cfg=block_cfg, + init_cfg=init_cfg) + + def build_top_down_layer(self, idx: int) -> nn.Module: + """build top down layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The top down layer. + """ + block_cfg = self.block_cfg.copy() + + layer0 = BepC3StageBlock( + in_channels=int(self.out_channels[idx - 1] * self.widen_factor), + out_channels=int(self.out_channels[idx - 1] * self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + block_cfg=block_cfg, + hidden_ratio=self.hidden_ratio, + norm_cfg=self.norm_cfg, + act_cfg=self.block_act_cfg) + + if idx == 1: + return layer0 + elif idx == 2: + layer1 = ConvModule( + in_channels=int(self.out_channels[idx - 1] * + self.widen_factor), + out_channels=int(self.out_channels[idx - 2] * + self.widen_factor), + kernel_size=1, + stride=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + return nn.Sequential(layer0, layer1) + + def build_bottom_up_layer(self, idx: int) -> nn.Module: + """build bottom up layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The bottom up layer. 
+ """ + block_cfg = self.block_cfg.copy() + + return BepC3StageBlock( + in_channels=int(self.out_channels[idx] * 2 * self.widen_factor), + out_channels=int(self.out_channels[idx + 1] * self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + block_cfg=block_cfg, + hidden_ratio=self.hidden_ratio, + norm_cfg=self.norm_cfg, + act_cfg=self.block_act_cfg) diff --git a/mmyolo/models/task_modules/assigners/__init__.py b/mmyolo/models/task_modules/assigners/__init__.py index e74ab728b..7b2e2e69c 100644 --- a/mmyolo/models/task_modules/assigners/__init__.py +++ b/mmyolo/models/task_modules/assigners/__init__.py @@ -2,11 +2,13 @@ from .batch_atss_assigner import BatchATSSAssigner from .batch_dsl_assigner import BatchDynamicSoftLabelAssigner from .batch_task_aligned_assigner import BatchTaskAlignedAssigner +from .pose_sim_ota_assigner import PoseSimOTAAssigner from .utils import (select_candidates_in_gts, select_highest_overlaps, yolov6_iou_calculator) __all__ = [ 'BatchATSSAssigner', 'BatchTaskAlignedAssigner', 'select_candidates_in_gts', 'select_highest_overlaps', - 'yolov6_iou_calculator', 'BatchDynamicSoftLabelAssigner' + 'yolov6_iou_calculator', 'BatchDynamicSoftLabelAssigner', + 'PoseSimOTAAssigner' ] diff --git a/mmyolo/models/task_modules/assigners/pose_sim_ota_assigner.py b/mmyolo/models/task_modules/assigners/pose_sim_ota_assigner.py new file mode 100644 index 000000000..e66a9bf15 --- /dev/null +++ b/mmyolo/models/task_modules/assigners/pose_sim_ota_assigner.py @@ -0,0 +1,210 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple + +import torch +import torch.nn.functional as F +from mmdet.models.task_modules.assigners import AssignResult, SimOTAAssigner +from mmdet.utils import ConfigType +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS, TASK_UTILS + +INF = 100000.0 +EPS = 1.0e-7 + + +@TASK_UTILS.register_module() +class PoseSimOTAAssigner(SimOTAAssigner): + + def __init__(self, + center_radius: float = 2.5, + candidate_topk: int = 10, + iou_weight: float = 3.0, + cls_weight: float = 1.0, + oks_weight: float = 0.0, + vis_weight: float = 0.0, + iou_calculator: ConfigType = dict(type='BboxOverlaps2D'), + oks_calculator: ConfigType = dict(type='OksLoss')): + + self.center_radius = center_radius + self.candidate_topk = candidate_topk + self.iou_weight = iou_weight + self.cls_weight = cls_weight + self.oks_weight = oks_weight + self.vis_weight = vis_weight + + self.iou_calculator = TASK_UTILS.build(iou_calculator) + self.oks_calculator = MODELS.build(oks_calculator) + + def assign(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + gt_instances_ignore: Optional[InstanceData] = None, + **kwargs) -> AssignResult: + """Assign gt to priors using SimOTA. + + Args: + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors or points, or the bboxes predicted by the + previous stage, has shape (n, 4). The bboxes predicted by + the current model or stage will be named ``bboxes``, + ``labels``, and ``scores``, the same as the ``InstanceData`` + in other places. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes``, with shape (k, 4), + and ``labels``, with shape (k, ). + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. 
It includes ``bboxes`` + attribute data that is ignored during training and testing. + Defaults to None. + Returns: + obj:`AssignResult`: The assigned result. + """ + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + gt_keypoints = gt_instances.keypoints + gt_keypoints_visible = gt_instances.keypoints_visible + num_gt = gt_bboxes.size(0) + + decoded_bboxes = pred_instances.bboxes[..., :4] + pred_kpts = pred_instances.bboxes[..., 4:] + pred_kpts = pred_kpts.reshape(*pred_kpts.shape[:-1], -1, 3) + pred_kpts_vis = pred_kpts[..., -1] + pred_kpts = pred_kpts[..., :2] + pred_scores = pred_instances.scores + priors = pred_instances.priors + num_bboxes = decoded_bboxes.size(0) + + # assign 0 by default + assigned_gt_inds = decoded_bboxes.new_full((num_bboxes, ), + 0, + dtype=torch.long) + if num_gt == 0 or num_bboxes == 0: + # No ground truth or boxes, return empty assignment + max_overlaps = decoded_bboxes.new_zeros((num_bboxes, )) + assigned_labels = decoded_bboxes.new_full((num_bboxes, ), + -1, + dtype=torch.long) + return AssignResult( + num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels) + + valid_mask, is_in_boxes_and_center = self.get_in_gt_and_in_center_info( + priors, gt_bboxes) + valid_decoded_bbox = decoded_bboxes[valid_mask] + valid_pred_scores = pred_scores[valid_mask] + valid_pred_kpts = pred_kpts[valid_mask] + valid_pred_kpts_vis = pred_kpts_vis[valid_mask] + num_valid = valid_decoded_bbox.size(0) + if num_valid == 0: + # No valid bboxes, return empty assignment + max_overlaps = decoded_bboxes.new_zeros((num_bboxes, )) + assigned_labels = decoded_bboxes.new_full((num_bboxes, ), + -1, + dtype=torch.long) + return AssignResult( + num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels) + + cost_matrix = (~is_in_boxes_and_center) * INF + + # calculate iou + pairwise_ious = self.iou_calculator(valid_decoded_bbox, gt_bboxes) + if self.iou_weight > 0: + iou_cost = -torch.log(pairwise_ious + EPS) + cost_matrix = cost_matrix + iou_cost * self.iou_weight + + # calculate oks + pairwise_oks = self.oks_calculator.compute_oks( + valid_pred_kpts.unsqueeze(1), # [num_valid, -1, k, 2] + gt_keypoints.unsqueeze(0), # [1, num_gt, k, 2] + gt_keypoints_visible.unsqueeze(0), # [1, num_gt, k] + bboxes=gt_bboxes.unsqueeze(0), # [1, num_gt, 4] + ) # -> [num_valid, num_gt] + if self.oks_weight > 0: + oks_cost = -torch.log(pairwise_oks + EPS) + cost_matrix = cost_matrix + oks_cost * self.oks_weight + + # calculate cls + if self.cls_weight > 0: + gt_onehot_label = ( + F.one_hot(gt_labels.to(torch.int64), + pred_scores.shape[-1]).float().unsqueeze(0).repeat( + num_valid, 1, 1)) + + valid_pred_scores = valid_pred_scores.unsqueeze(1).repeat( + 1, num_gt, 1) + # disable AMP autocast to avoid overflow + with torch.cuda.amp.autocast(enabled=False): + cls_cost = ( + F.binary_cross_entropy( + valid_pred_scores.to(dtype=torch.float32), + gt_onehot_label, + reduction='none', + ).sum(-1).to(dtype=valid_pred_scores.dtype)) + cost_matrix = cost_matrix + cls_cost * self.cls_weight + + # calculate vis + if self.vis_weight > 0: + valid_pred_kpts_vis = valid_pred_kpts_vis.sigmoid().unsqueeze( + 1).repeat(1, num_gt, 1) # [num_valid, 1, k] + gt_kpt_vis = gt_keypoints_visible.unsqueeze( + 0).float() # [1, num_gt, k] + with torch.cuda.amp.autocast(enabled=False): + vis_cost = ( + F.binary_cross_entropy( + valid_pred_kpts_vis.to(dtype=torch.float32), + gt_kpt_vis.repeat(num_valid, 1, 1), + reduction='none', + ).sum(-1).to(dtype=valid_pred_kpts_vis.dtype)) + cost_matrix = cost_matrix + vis_cost * 
self.vis_weight + + # mixed metric + pairwise_oks = pairwise_oks.pow(0.5) + matched_pred_oks, matched_gt_inds = \ + self.dynamic_k_matching( + cost_matrix, pairwise_ious, pairwise_oks, num_gt, valid_mask) + + # convert to AssignResult format + assigned_gt_inds[valid_mask] = matched_gt_inds + 1 + assigned_labels = assigned_gt_inds.new_full((num_bboxes, ), -1) + assigned_labels[valid_mask] = gt_labels[matched_gt_inds].long() + max_overlaps = assigned_gt_inds.new_full((num_bboxes, ), + -INF, + dtype=torch.float32) + max_overlaps[valid_mask] = matched_pred_oks + return AssignResult( + num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels) + + def dynamic_k_matching(self, cost: Tensor, pairwise_ious: Tensor, + pairwise_oks: Tensor, num_gt: int, + valid_mask: Tensor) -> Tuple[Tensor, Tensor]: + """Use IoU and matching cost to calculate the dynamic top-k positive + targets.""" + matching_matrix = torch.zeros_like(cost, dtype=torch.uint8) + # select candidate topk ious for dynamic-k calculation + candidate_topk = min(self.candidate_topk, pairwise_ious.size(0)) + topk_ious, _ = torch.topk(pairwise_ious, candidate_topk, dim=0) + # calculate dynamic k for each gt + dynamic_ks = torch.clamp(topk_ious.sum(0).int(), min=1) + for gt_idx in range(num_gt): + _, pos_idx = torch.topk( + cost[:, gt_idx], k=dynamic_ks[gt_idx], largest=False) + matching_matrix[:, gt_idx][pos_idx] = 1 + + del topk_ious, dynamic_ks, pos_idx + + prior_match_gt_mask = matching_matrix.sum(1) > 1 + if prior_match_gt_mask.sum() > 0: + cost_min, cost_argmin = torch.min( + cost[prior_match_gt_mask, :], dim=1) + matching_matrix[prior_match_gt_mask, :] *= 0 + matching_matrix[prior_match_gt_mask, cost_argmin] = 1 + # get foreground mask inside box and center prior + fg_mask_inboxes = matching_matrix.sum(1) > 0 + valid_mask[valid_mask.clone()] = fg_mask_inboxes + + matched_gt_inds = matching_matrix[fg_mask_inboxes, :].argmax(1) + matched_pred_oks = (matching_matrix * + pairwise_oks).sum(1)[fg_mask_inboxes] + return matched_pred_oks, matched_gt_inds diff --git a/mmyolo/models/utils/__init__.py b/mmyolo/models/utils/__init__.py index cdfeaaf0f..d62ff80e2 100644 --- a/mmyolo/models/utils/__init__.py +++ b/mmyolo/models/utils/__init__.py @@ -1,4 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. -from .misc import gt_instances_preprocess, make_divisible, make_round +from .misc import (OutputSaveFunctionWrapper, OutputSaveObjectWrapper, + gt_instances_preprocess, make_divisible, make_round) -__all__ = ['make_divisible', 'make_round', 'gt_instances_preprocess'] +__all__ = [ + 'make_divisible', 'make_round', 'gt_instances_preprocess', + 'OutputSaveFunctionWrapper', 'OutputSaveObjectWrapper' +] diff --git a/mmyolo/models/utils/misc.py b/mmyolo/models/utils/misc.py index 531558b69..96cd1195a 100644 --- a/mmyolo/models/utils/misc.py +++ b/mmyolo/models/utils/misc.py @@ -1,6 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import math -from typing import Sequence, Union +from collections import defaultdict +from copy import deepcopy +from typing import Any, Callable, Dict, Optional, Sequence, Tuple, Union import torch from mmdet.structures.bbox.transforms import get_box_tensor @@ -95,3 +97,90 @@ def gt_instances_preprocess(batch_gt_instances: Union[Tensor, Sequence], device=batch_gt_instances.device) return batch_instance + + +class OutputSaveObjectWrapper: + """A wrapper class that saves the output of function calls on an object.""" + + def __init__(self, obj: Any) -> None: + self.obj = obj + self.log = defaultdict(list) + + def __getattr__(self, attr: str) -> Any: + """Overrides the default behavior when an attribute is accessed. + + - If the attribute is callable, hooks the attribute and saves the + returned value of the function call to the log. + - If the attribute is not callable, saves the attribute's value to the + log and returns the value. + """ + orig_attr = getattr(self.obj, attr) + + if not callable(orig_attr): + self.log[attr].append(orig_attr) + return orig_attr + + def hooked(*args: Tuple, **kwargs: Dict) -> Any: + """The hooked function that logs the return value of the original + function.""" + result = orig_attr(*args, **kwargs) + self.log[attr].append(result) + return result + + return hooked + + def clear(self): + """Clears the log of function call outputs.""" + self.log.clear() + + def __deepcopy__(self, memo): + """Only copy the object when applying deepcopy.""" + other = type(self)(deepcopy(self.obj)) + memo[id(self)] = other + return other + + +class OutputSaveFunctionWrapper: + """A class that wraps a function and saves its outputs. + + This class can be used to decorate a function to save its outputs. It wraps + the function with a `__call__` method that calls the original function and + saves the results in a log attribute. + Args: + func (Callable): A function to wrap. + spec (Optional[Dict]): A dictionary of global variables to use as the + namespace for the wrapper. If `None`, the global namespace of the + original function is used. 
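+
+    Example (illustrative sketch; ``some_module`` and ``compute`` are
+    placeholder names for a module and one of its module-level functions)::
+
+        import some_module
+
+        with OutputSaveFunctionWrapper(some_module.compute,
+                                       vars(some_module)) as log:
+            some_module.compute(1, 2)
+        # ``log`` now holds the return values recorded while the function
+        # was patched into the module's namespace.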
+ """ + + def __init__(self, func: Callable, spec: Optional[Dict]) -> None: + """Initializes the OutputSaveFunctionWrapper instance.""" + assert callable(func) + self.log = [] + self.func = func + self.func_name = func.__name__ + + if isinstance(spec, dict): + self.spec = spec + elif hasattr(func, '__globals__'): + self.spec = func.__globals__ + else: + raise ValueError + + def __call__(self, *args, **kwargs) -> Any: + """Calls the wrapped function with the given arguments and saves the + results in the `log` attribute.""" + results = self.func(*args, **kwargs) + self.log.append(results) + return results + + def __enter__(self) -> None: + """Enters the context and sets the wrapped function to be a global + variable in the specified namespace.""" + self.spec[self.func_name] = self + return self.log + + def __exit__(self, exc_type, exc_val, exc_tb) -> None: + """Exits the context and resets the wrapped function to its original + value in the specified namespace.""" + self.spec[self.func_name] = self.func diff --git a/mmyolo/utils/boxam_utils.py b/mmyolo/utils/boxam_utils.py index 4a46f21c1..50d6c09ec 100644 --- a/mmyolo/utils/boxam_utils.py +++ b/mmyolo/utils/boxam_utils.py @@ -202,8 +202,10 @@ def __call__(self, *args, **kwargs): if self.is_need_loss: # Maybe this is a direction that can be optimized # self.detector.init_weights() - - self.detector.bbox_head.head_module.training = True + if hasattr(self.detector.bbox_head, 'head_module'): + self.detector.bbox_head.head_module.training = True + else: + self.detector.bbox_head.training = True if hasattr(self.detector.bbox_head, 'featmap_sizes'): # Prevent the model algorithm error when calculating loss self.detector.bbox_head.featmap_sizes = None @@ -219,7 +221,10 @@ def __call__(self, *args, **kwargs): return [loss] else: - self.detector.bbox_head.head_module.training = False + if hasattr(self.detector.bbox_head, 'head_module'): + self.detector.bbox_head.head_module.training = False + else: + self.detector.bbox_head.training = False with torch.no_grad(): results = self.detector.test_step(self.input_data) return results diff --git a/mmyolo/utils/misc.py b/mmyolo/utils/misc.py index c90f52b94..f5d366d75 100644 --- a/mmyolo/utils/misc.py +++ b/mmyolo/utils/misc.py @@ -72,7 +72,9 @@ def get_file_list(source_root: str) -> [list, dict]: source_file_path_list = [] if is_dir: # when input source is dir - for file in scandir(source_root, IMG_EXTENSIONS, recursive=True): + for file in scandir( + source_root, IMG_EXTENSIONS, recursive=True, + case_sensitive=False): source_file_path_list.append(os.path.join(source_root, file)) elif is_url: # when input source is url diff --git a/mmyolo/version.py b/mmyolo/version.py index 75c44c7b2..6e4f0e8e3 100644 --- a/mmyolo/version.py +++ b/mmyolo/version.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-__version__ = '0.5.0'
+__version__ = '0.6.0'

 from typing import Tuple
diff --git a/projects/easydeploy/deepstream/configs/config_infer_yolov8.txt b/projects/easydeploy/deepstream/configs/config_infer_yolov8.txt
new file mode 100644
index 000000000..6ad7d6429
--- /dev/null
+++ b/projects/easydeploy/deepstream/configs/config_infer_yolov8.txt
@@ -0,0 +1,21 @@
+[property]
+gpu-id=0
+net-scale-factor=0.0039215697906911373
+model-color-format=0
+model-engine-file=../end2end.engine
+labelfile-path=../coco_labels.txt
+batch-size=1
+network-mode=0
+num-detected-classes=80
+interval=0
+gie-unique-id=1
+process-mode=1
+network-type=0
+cluster-mode=2
+maintain-aspect-ratio=1
+parse-bbox-func-name=NvDsInferParseCustomMMYOLO
+custom-lib-path=../build/libnvdsparsebbox_mmyolo.so
+
+[class-attrs-all]
+pre-cluster-threshold=0.45
+topk=100
diff --git a/projects/easydeploy/docs/model_convert.md b/projects/easydeploy/docs/model_convert.md
index 062247fc4..9af62599d 100644
--- a/projects/easydeploy/docs/model_convert.md
+++ b/projects/easydeploy/docs/model_convert.md
@@ -1,5 +1,7 @@
 # MMYOLO Model ONNX Conversion
 
+## 1. Export ONNX Supported by the Backend
+
 ## Environment Dependencies
 
 - [onnx](https://github.com/onnx/onnx)
@@ -14,9 +16,11 @@
 pip install onnx-simplifier
 ```
 
+**Please make sure you run the scripts below from the `MMYOLO` root directory, otherwise the required packages may not be found.**
+
 ## Usage
 
-The [model export script](./projects/easydeploy/tools/export.py) is used to convert an `MMYOLO` model to `onnx`.
+The [model export script](./projects/easydeploy/tools/export_onnx.py) is used to convert an `MMYOLO` model to `onnx`.
 
 ### Parameter description:
 
@@ -28,11 +32,12 @@
 - `--device`: device used for the conversion, defaults to `cuda:0`.
 - `--simplify`: whether to simplify the exported `onnx` model; requires [onnx-simplifier](https://github.com/daquexian/onnx-simplifier). Disabled by default.
 - `--opset`: `opset` version of the exported `onnx`, defaults to `11`.
-- `--backend`: id of the backend the `onnx` is exported for, `ONNXRuntime`: `1`, `TensorRT8`: `2`, `TensorRT7`: `3`. Defaults to `1`, i.e. `ONNXRuntime`.
+- `--backend`: name of the backend the `onnx` is exported for, `ONNXRuntime`: `onnxruntime`, `TensorRT8`: `tensorrt8`, `TensorRT7`: `tensorrt7`. Defaults to `onnxruntime`, i.e. `ONNXRuntime`.
 - `--pre-topk`: number of candidate boxes kept before NMS in the exported `onnx` post-processing, defaults to `1000`.
 - `--keep-topk`: number of candidate boxes output by the NMS of the exported `onnx`, defaults to `100`.
 - `--iou-threshold`: `iou` threshold used by NMS to filter duplicate candidate boxes, defaults to `0.65`.
 - `--score-threshold`: score threshold used by NMS to filter candidate boxes, defaults to `0.25`.
+- `--model-only`: export only the model backbone + neck without post-processing. Disabled by default.
 
 Example:
 
@@ -53,4 +58,99 @@ python ./projects/easydeploy/tools/export.py \
     --score-threshold 0.25
 ```
 
-Then use a backend tool such as `TensorRT` to read the `onnx` and convert it again into the model format supported by the backend, e.g. `.engine/.plan`
+Then use a backend tool such as `TensorRT` to read the `onnx` and convert it again into the model format supported by the backend, e.g. `.engine/.plan`.
+
+`MMYOLO` currently supports end-to-end model conversion for the `TensorRT8`, `TensorRT7` and `ONNXRuntime` backends. For now only static-shape models can be exported and converted; end-to-end conversion of models with dynamic batch size or dynamic height/width will be supported in the future.
+
+The inputs and outputs of the end-to-end converted `onnx` model are shown below:
+
+*(figure: input and output bindings of the end-to-end exported `onnx` model)*
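+
+In place of the original screenshot, the bindings listed below can also be inspected and consumed with `onnxruntime`. The snippet is only an illustrative sketch; it assumes the exported file is `work_dir/end2end.onnx` and that the model was exported with the default 640x640 input:
+
+```python
+import numpy as np
+import onnxruntime as ort
+
+session = ort.InferenceSession('work_dir/end2end.onnx',
+                               providers=['CPUExecutionProvider'])
+for node in session.get_inputs():
+    print('input :', node.name, node.shape)
+for node in session.get_outputs():
+    print('output:', node.name, node.shape)
+
+# Dummy forward pass: `num_dets` tells how many of the `--keep-topk` slots
+# hold valid detections.
+img = np.zeros((1, 3, 640, 640), dtype=np.float32)
+num_dets, boxes, scores, labels = session.run(None, {'images': img})
+valid = int(num_dets[0, 0])
+boxes, scores, labels = boxes[0, :valid], scores[0, :valid], labels[0, :valid]
+```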
+
+Input name: `images`, size 640x640
+
+Output name: `num_dets`, size 1x1, the number of detected objects.
+
+Output name: `boxes`, size 1x100x4, the coordinates of the detection boxes in `x1y1x2y2` format.
+
+Output name: `scores`, size 1x100, the scores of the detection boxes.
+
+Output name: `labels`, size 1x100, the class ids of the detection boxes.
+
+You can use the count in `num_dets` to truncate `boxes`, `scores` and `labels`, keeping the first `num_dets` of the 100 detection slots as the final results.
+
+## 2. Export the Model Backbone + Neck Only
+
+When you need to deploy on platforms that do not support end-to-end deployment (i.e. other than `TensorRT`, `ONNXRuntime`, etc.), you can use the `--model-only` option and omit the `--backend` option. This exports a model that contains only the `Backbone` + `neck`; part of its output is shown below:
+
+*(figure: partial outputs of a model exported with `--model-only`)*
+
+An `ONNX` model exported this way has the following advantages:
+
+- The operators are simple. In general it only contains `Conv`, activation functions and other simple operators, there is almost no case where the export fails, and it is friendlier to embedded deployment.
+- It makes speed comparisons between different algorithms fairer: since post-processing differs from algorithm to algorithm, comparing only the `backbone` + `Neck` speed is more even-handed.
+
+It also has the following disadvantages:
+
+- The post-processing logic has to be implemented separately, so extra `decode` + `nms` operations are required.
+- Compared with `TensorRT`, which can run post-processing in parallel across multiple cores, a model exported with `--model-only` performs much worse.
+
+### Usage
+
+```shell
+python ./projects/easydeploy/tools/export_onnx.py \
+    configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \
+    yolov5s.pth \
+    --work-dir work_dir \
+    --img-size 640 640 \
+    --batch 1 \
+    --device cpu \
+    --simplify \
+    --opset 11 \
+    --model-only
+```
+
+## Inference with the `model-only` exported ONNX
+
+The [model inference script](./projects/easydeploy/examples/main_onnxruntime.py) is used to run inference with the exported `ONNX` model. It needs the following basic dependencies:
+
+[`onnxruntime`](https://github.com/microsoft/onnxruntime) and [`opencv-python`](https://github.com/opencv/opencv-python)
+
+```shell
+pip install onnxruntime
+pip install opencv-python==4.7.0.72 # the latest opencv is recommended
+```
+
+### Parameter description:
+
+- `img` : path of the image file or image folder to detect.
+- `onnx` : the exported `model-only` ONNX model.
+- `--type` : model name; currently supports `yolov5`, `yolox`, `yolov6`, `ppyoloe`, `ppyoloep`, `yolov7`, `rtmdet`, `yolov8`.
+- `--img-size`: input size used when converting the model, e.g. `640 640`.
+- `--out-dir`: path to save the detection results.
+- `--show`: whether to visualize the detection results.
+- `--score-thr`: confidence score threshold of the detection post-processing.
+- `--iou-thr`: IOU threshold of the detection post-processing.
+
+## Usage
+
+```shell
+cd ./projects/easydeploy/examples
+python main_onnxruntime.py \
+    "image_path_to_detect" \
+    yolov5_s_model-only.onnx \
+    --type yolov5 \
+    --out-dir work_dir \
+    --img-size 640 640 \
+    --show \
+    --score-thr 0.3 \
+    --iou-thr 0.7
+```
+
+*Note!!!*
+
+When you use a model trained on a custom dataset, please modify `CLASS_NAMES` and `CLASS_COLORS` in [`config.py`](./projects/easydeploy/examples/config.py); for the anchor-based `yolov5` or `yolov7` models, please also modify `YOLOv5_ANCHORS` and `YOLOv7_ANCHORS`.
+
+[`numpy_coder.py`](./projects/easydeploy/examples/numpy_coder.py) is the `decoder` for all the algorithms, currently implemented with `numpy` only; if you have higher performance requirements, you can rewrite it in `c/c++` with reference to the relevant code.
diff --git a/projects/easydeploy/examples/config.py b/projects/easydeploy/examples/config.py
new file mode 100644
index 000000000..4a85ff342
--- /dev/null
+++ b/projects/easydeploy/examples/config.py
@@ -0,0 +1,64 @@
+from enum import Enum
+
+
+class TASK_TYPE(Enum):
+    DET = 'det'
+    SEG = 'seg'
+    POSE = 'pose'
+
+
+class ModelType(Enum):
+    YOLOV5 = 'yolov5'
+    YOLOX = 'yolox'
+    PPYOLOE = 'ppyoloe'
+    PPYOLOEP = 'ppyoloep'
+    YOLOV6 = 'yolov6'
+    YOLOV7 = 'yolov7'
+    RTMDET = 'rtmdet'
+    YOLOV8 = 'yolov8'
+
+
+CLASS_NAMES = ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
+               'train', 'truck', 'boat', 'traffic light', 'fire hydrant',
+               'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog',
+               'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe',
+               'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
+               'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',
+               'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
+               'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
+               'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
+               'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
+               'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop',
+               'mouse', 'remote', 'keyboard', 'cell phone', 'microwave',
+               'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock',
+               'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush')
+
+CLASS_COLORS = [(220, 20, 60), (119, 11, 32), (0, 0, 142), (0, 0, 230),
+                (106, 0, 228), (0, 60, 100), (0, 80, 100), (0, 0, 70),
+                (0, 0, 192), (250, 170, 30), (100, 170, 30), (220, 220, 0),
+                (175, 116, 175), (250, 0, 30), (165, 42, 42), (255, 77, 255),
+                (0, 226, 252),
(182, 182, 255), (0, 82, 0), (120, 166, 157), + (110, 76, 0), (174, 57, 255), (199, 100, 0), (72, 0, 118), + (255, 179, 240), (0, 125, 92), (209, 0, 151), (188, 208, 182), + (0, 220, 176), (255, 99, 164), (92, 0, 73), (133, 129, 255), + (78, 180, 255), (0, 228, 0), (174, 255, 243), (45, 89, 255), + (134, 134, 103), (145, 148, 174), (255, 208, 186), + (197, 226, 255), (171, 134, 1), (109, 63, 54), (207, 138, 255), + (151, 0, 95), (9, 80, 61), (84, 105, 51), (74, 65, 105), + (166, 196, 102), (208, 195, 210), (255, 109, 65), + (0, 143, 149), (179, 0, 194), (209, 99, 106), (5, 121, 0), + (227, 255, 205), (147, 186, 208), (153, 69, 1), (3, 95, 161), + (163, 255, 0), (119, 0, 170), (0, 182, 199), (0, 165, 120), + (183, 130, 88), (95, 32, 0), (130, 114, 135), (110, 129, 133), + (166, 74, 118), (219, 142, 185), (79, 210, 114), (178, 90, 62), + (65, 70, 15), (127, 167, 115), (59, 105, 106), (142, 108, 45), + (196, 172, 0), (95, 54, 80), (128, 76, 255), (201, 57, 1), + (246, 0, 122), (191, 162, 208)] + +YOLOv5_ANCHORS = [[(10, 13), (16, 30), (33, 23)], + [(30, 61), (62, 45), (59, 119)], + [(116, 90), (156, 198), (373, 326)]] + +YOLOv7_ANCHORS = [[(12, 16), (19, 36), (40, 28)], + [(36, 75), (76, 55), (72, 146)], + [(142, 110), (192, 243), (459, 401)]] diff --git a/projects/easydeploy/examples/cv2_nms.py b/projects/easydeploy/examples/cv2_nms.py new file mode 100644 index 000000000..79e376356 --- /dev/null +++ b/projects/easydeploy/examples/cv2_nms.py @@ -0,0 +1,36 @@ +from typing import List, Tuple, Union + +import cv2 +from numpy import ndarray + +MAJOR, MINOR = map(int, cv2.__version__.split('.')[:2]) +assert MAJOR == 4 + + +def non_max_suppression(boxes: Union[List[ndarray], Tuple[ndarray]], + scores: Union[List[float], Tuple[float]], + labels: Union[List[int], Tuple[int]], + conf_thres: float = 0.25, + iou_thres: float = 0.65) -> Tuple[List, List, List]: + if MINOR >= 7: + indices = cv2.dnn.NMSBoxesBatched(boxes, scores, labels, conf_thres, + iou_thres) + elif MINOR == 6: + indices = cv2.dnn.NMSBoxes(boxes, scores, conf_thres, iou_thres) + else: + indices = cv2.dnn.NMSBoxes(boxes, scores, conf_thres, + iou_thres).flatten() + + nmsd_boxes = [] + nmsd_scores = [] + nmsd_labels = [] + for idx in indices: + box = boxes[idx] + # x0y0wh -> x0y0x1y1 + box[2:] = box[:2] + box[2:] + score = scores[idx] + label = labels[idx] + nmsd_boxes.append(box) + nmsd_scores.append(score) + nmsd_labels.append(label) + return nmsd_boxes, nmsd_scores, nmsd_labels diff --git a/projects/easydeploy/examples/main_onnxruntime.py b/projects/easydeploy/examples/main_onnxruntime.py new file mode 100644 index 000000000..bc0ad1b0f --- /dev/null +++ b/projects/easydeploy/examples/main_onnxruntime.py @@ -0,0 +1,110 @@ +import math +import sys +from argparse import ArgumentParser +from pathlib import Path + +import cv2 +import onnxruntime +from config import (CLASS_COLORS, CLASS_NAMES, ModelType, YOLOv5_ANCHORS, + YOLOv7_ANCHORS) +from cv2_nms import non_max_suppression +from numpy_coder import Decoder +from preprocess import Preprocess +from tqdm import tqdm + +# Add __FILE__ to sys.path +sys.path.append(str(Path(__file__).resolve().parents[0])) + +IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', + '.tiff', '.webp') + + +def path_to_list(path: str): + path = Path(path) + if path.is_file() and path.suffix in IMG_EXTENSIONS: + res_list = [str(path.absolute())] + elif path.is_dir(): + res_list = [ + str(p.absolute()) for p in path.iterdir() + if p.suffix in IMG_EXTENSIONS + ] + else: + raise RuntimeError + 
return res_list + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument( + 'img', help='Image path, include image file, dir and URL.') + parser.add_argument('onnx', type=str, help='Onnx file') + parser.add_argument('--type', type=str, help='Model type') + parser.add_argument( + '--img-size', + nargs='+', + type=int, + default=[640, 640], + help='Image size of height and width') + parser.add_argument( + '--out-dir', default='./output', type=str, help='Path to output file') + parser.add_argument( + '--show', action='store_true', help='Show the detection results') + parser.add_argument( + '--score-thr', type=float, default=0.3, help='Bbox score threshold') + parser.add_argument( + '--iou-thr', type=float, default=0.7, help='Bbox iou threshold') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + out_dir = Path(args.out_dir) + model_type = ModelType(args.type.lower()) + + if not args.show: + out_dir.mkdir(parents=True, exist_ok=True) + + files = path_to_list(args.img) + session = onnxruntime.InferenceSession( + args.onnx, providers=['CPUExecutionProvider']) + preprocessor = Preprocess(model_type) + decoder = Decoder(model_type, model_only=True) + if model_type == ModelType.YOLOV5: + anchors = YOLOv5_ANCHORS + elif model_type == ModelType.YOLOV7: + anchors = YOLOv7_ANCHORS + else: + anchors = None + + for file in tqdm(files): + image = cv2.imread(file) + image_h, image_w = image.shape[:2] + img, (ratio_w, ratio_h) = preprocessor(image, args.img_size) + features = session.run(None, {'images': img}) + decoder_outputs = decoder( + features, + args.score_thr, + num_labels=len(CLASS_NAMES), + anchors=anchors) + nmsd_boxes, nmsd_scores, nmsd_labels = non_max_suppression( + *decoder_outputs, args.score_thr, args.iou_thr) + for box, score, label in zip(nmsd_boxes, nmsd_scores, nmsd_labels): + x0, y0, x1, y1 = box + x0 = math.floor(min(max(x0 / ratio_w, 1), image_w - 1)) + y0 = math.floor(min(max(y0 / ratio_h, 1), image_h - 1)) + x1 = math.ceil(min(max(x1 / ratio_w, 1), image_w - 1)) + y1 = math.ceil(min(max(y1 / ratio_h, 1), image_h - 1)) + cv2.rectangle(image, (x0, y0), (x1, y1), CLASS_COLORS[label], 2) + cv2.putText(image, f'{CLASS_NAMES[label]}: {score:.2f}', + (x0, y0 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, + (0, 255, 255), 2) + if args.show: + cv2.imshow('result', image) + cv2.waitKey(0) + else: + cv2.imwrite(f'{out_dir / Path(file).name}', image) + + +if __name__ == '__main__': + main() diff --git a/projects/easydeploy/examples/numpy_coder.py b/projects/easydeploy/examples/numpy_coder.py new file mode 100644 index 000000000..ccd3687f8 --- /dev/null +++ b/projects/easydeploy/examples/numpy_coder.py @@ -0,0 +1,310 @@ +from typing import List, Tuple, Union + +import numpy as np +from config import ModelType +from numpy import ndarray + + +def softmax(x: ndarray, axis: int = -1) -> ndarray: + e_x = np.exp(x - np.max(x, axis=axis, keepdims=True)) + y = e_x / e_x.sum(axis=axis, keepdims=True) + return y + + +def sigmoid(x: ndarray) -> ndarray: + return 1. / (1. 
+ np.exp(-x)) + + +class Decoder: + + def __init__(self, model_type: ModelType, model_only: bool = False): + self.model_type = model_type + self.model_only = model_only + self.boxes_pro = [] + self.scores_pro = [] + self.labels_pro = [] + self.is_logging = False + + def __call__(self, + feats: Union[List, Tuple], + conf_thres: float, + num_labels: int = 80, + **kwargs) -> Tuple: + if not self.is_logging: + print('Only support decode in batch==1') + self.is_logging = True + self.boxes_pro.clear() + self.scores_pro.clear() + self.labels_pro.clear() + + if self.model_only: + # transpose channel to last dim for easy decoding + feats = [ + np.ascontiguousarray(feat[0].transpose(1, 2, 0)) + for feat in feats + ] + else: + # ax620a horizonX3 transpose channel to last dim by default + feats = [np.ascontiguousarray(feat) for feat in feats] + if self.model_type == ModelType.YOLOV5: + self.__yolov5_decode(feats, conf_thres, num_labels, **kwargs) + elif self.model_type == ModelType.YOLOX: + self.__yolox_decode(feats, conf_thres, num_labels, **kwargs) + elif self.model_type in (ModelType.PPYOLOE, ModelType.PPYOLOEP): + self.__ppyoloe_decode(feats, conf_thres, num_labels, **kwargs) + elif self.model_type == ModelType.YOLOV6: + self.__yolov6_decode(feats, conf_thres, num_labels, **kwargs) + elif self.model_type == ModelType.YOLOV7: + self.__yolov7_decode(feats, conf_thres, num_labels, **kwargs) + elif self.model_type == ModelType.RTMDET: + self.__rtmdet_decode(feats, conf_thres, num_labels, **kwargs) + elif self.model_type == ModelType.YOLOV8: + self.__yolov8_decode(feats, conf_thres, num_labels, **kwargs) + else: + raise NotImplementedError + return self.boxes_pro, self.scores_pro, self.labels_pro + + def __yolov5_decode(self, + feats: List[ndarray], + conf_thres: float, + num_labels: int = 80, + **kwargs): + anchors: Union[List, Tuple] = kwargs.get( + 'anchors', + [[(10, 13), (16, 30), + (33, 23)], [(30, 61), (62, 45), + (59, 119)], [(116, 90), (156, 198), (373, 326)]]) + for i, feat in enumerate(feats): + stride = 8 << i + feat_h, feat_w, _ = feat.shape + anchor = anchors[i] + feat = sigmoid(feat) + feat = feat.reshape((feat_h, feat_w, len(anchor), -1)) + box_feat, conf_feat, score_feat = np.split(feat, [4, 5], -1) + + hIdx, wIdx, aIdx, _ = np.where(conf_feat > conf_thres) + + num_proposal = hIdx.size + if not num_proposal: + continue + + score_feat = score_feat[hIdx, wIdx, aIdx] * conf_feat[hIdx, wIdx, + aIdx] + boxes = box_feat[hIdx, wIdx, aIdx] + labels = score_feat.argmax(-1) + scores = score_feat.max(-1) + + indices = np.where(scores > conf_thres)[0] + if len(indices) == 0: + continue + + for idx in indices: + a_w, a_h = anchor[aIdx[idx]] + x, y, w, h = boxes[idx] + x = (x * 2.0 - 0.5 + wIdx[idx]) * stride + y = (y * 2.0 - 0.5 + hIdx[idx]) * stride + w = (w * 2.0)**2 * a_w + h = (h * 2.0)**2 * a_h + + x0 = x - w / 2 + y0 = y - h / 2 + + self.scores_pro.append(float(scores[idx])) + self.boxes_pro.append( + np.array([x0, y0, w, h], dtype=np.float32)) + self.labels_pro.append(int(labels[idx])) + + def __yolox_decode(self, + feats: List[ndarray], + conf_thres: float, + num_labels: int = 80, + **kwargs): + for i, feat in enumerate(feats): + stride = 8 << i + score_feat, box_feat, conf_feat = np.split( + feat, [num_labels, num_labels + 4], -1) + conf_feat = sigmoid(conf_feat) + + hIdx, wIdx, _ = np.where(conf_feat > conf_thres) + + num_proposal = hIdx.size + if not num_proposal: + continue + + score_feat = sigmoid(score_feat[hIdx, wIdx]) * conf_feat[hIdx, + wIdx] + boxes = box_feat[hIdx, wIdx] + labels 
= score_feat.argmax(-1) + scores = score_feat.max(-1) + indices = np.where(scores > conf_thres)[0] + + if len(indices) == 0: + continue + + for idx in indices: + score = scores[idx] + label = labels[idx] + + x, y, w, h = boxes[idx] + + x = (x + wIdx[idx]) * stride + y = (y + hIdx[idx]) * stride + w = np.exp(w) * stride + h = np.exp(h) * stride + + x0 = x - w / 2 + y0 = y - h / 2 + + self.scores_pro.append(float(score)) + self.boxes_pro.append( + np.array([x0, y0, w, h], dtype=np.float32)) + self.labels_pro.append(int(label)) + + def __ppyoloe_decode(self, + feats: List[ndarray], + conf_thres: float, + num_labels: int = 80, + **kwargs): + reg_max: int = kwargs.get('reg_max', 17) + dfl = np.arange(0, reg_max, dtype=np.float32) + for i, feat in enumerate(feats): + stride = 8 << i + score_feat, box_feat = np.split(feat, [ + num_labels, + ], -1) + score_feat = sigmoid(score_feat) + _argmax = score_feat.argmax(-1) + _max = score_feat.max(-1) + indices = np.where(_max > conf_thres) + hIdx, wIdx = indices + num_proposal = hIdx.size + if not num_proposal: + continue + + scores = _max[hIdx, wIdx] + boxes = box_feat[hIdx, wIdx].reshape(num_proposal, 4, reg_max) + boxes = softmax(boxes, -1) @ dfl + labels = _argmax[hIdx, wIdx] + + for k in range(num_proposal): + score = scores[k] + label = labels[k] + + x0, y0, x1, y1 = boxes[k] + + x0 = (wIdx[k] + 0.5 - x0) * stride + y0 = (hIdx[k] + 0.5 - y0) * stride + x1 = (wIdx[k] + 0.5 + x1) * stride + y1 = (hIdx[k] + 0.5 + y1) * stride + + w = x1 - x0 + h = y1 - y0 + + self.scores_pro.append(float(score)) + self.boxes_pro.append( + np.array([x0, y0, w, h], dtype=np.float32)) + self.labels_pro.append(int(label)) + + def __yolov6_decode(self, + feats: List[ndarray], + conf_thres: float, + num_labels: int = 80, + **kwargs): + for i, feat in enumerate(feats): + stride = 8 << i + score_feat, box_feat = np.split(feat, [ + num_labels, + ], -1) + score_feat = sigmoid(score_feat) + _argmax = score_feat.argmax(-1) + _max = score_feat.max(-1) + indices = np.where(_max > conf_thres) + hIdx, wIdx = indices + num_proposal = hIdx.size + if not num_proposal: + continue + + scores = _max[hIdx, wIdx] + boxes = box_feat[hIdx, wIdx] + labels = _argmax[hIdx, wIdx] + + for k in range(num_proposal): + score = scores[k] + label = labels[k] + + x0, y0, x1, y1 = boxes[k] + + x0 = (wIdx[k] + 0.5 - x0) * stride + y0 = (hIdx[k] + 0.5 - y0) * stride + x1 = (wIdx[k] + 0.5 + x1) * stride + y1 = (hIdx[k] + 0.5 + y1) * stride + + w = x1 - x0 + h = y1 - y0 + + self.scores_pro.append(float(score)) + self.boxes_pro.append( + np.array([x0, y0, w, h], dtype=np.float32)) + self.labels_pro.append(int(label)) + + def __yolov7_decode(self, + feats: List[ndarray], + conf_thres: float, + num_labels: int = 80, + **kwargs): + anchors: Union[List, Tuple] = kwargs.get( + 'anchors', + [[(12, 16), (19, 36), + (40, 28)], [(36, 75), (76, 55), + (72, 146)], [(142, 110), (192, 243), (459, 401)]]) + self.__yolov5_decode(feats, conf_thres, num_labels, anchors=anchors) + + def __rtmdet_decode(self, + feats: List[ndarray], + conf_thres: float, + num_labels: int = 80, + **kwargs): + for i, feat in enumerate(feats): + stride = 8 << i + score_feat, box_feat = np.split(feat, [ + num_labels, + ], -1) + score_feat = sigmoid(score_feat) + _argmax = score_feat.argmax(-1) + _max = score_feat.max(-1) + indices = np.where(_max > conf_thres) + hIdx, wIdx = indices + num_proposal = hIdx.size + if not num_proposal: + continue + + scores = _max[hIdx, wIdx] + boxes = box_feat[hIdx, wIdx] + labels = _argmax[hIdx, wIdx] + + for k in 
range(num_proposal): + score = scores[k] + label = labels[k] + + x0, y0, x1, y1 = boxes[k] + + x0 = (wIdx[k] - x0) * stride + y0 = (hIdx[k] - y0) * stride + x1 = (wIdx[k] + x1) * stride + y1 = (hIdx[k] + y1) * stride + + w = x1 - x0 + h = y1 - y0 + + self.scores_pro.append(float(score)) + self.boxes_pro.append( + np.array([x0, y0, w, h], dtype=np.float32)) + self.labels_pro.append(int(label)) + + def __yolov8_decode(self, + feats: List[ndarray], + conf_thres: float, + num_labels: int = 80, + **kwargs): + reg_max: int = kwargs.get('reg_max', 16) + self.__ppyoloe_decode(feats, conf_thres, num_labels, reg_max=reg_max) diff --git a/projects/easydeploy/examples/preprocess.py b/projects/easydeploy/examples/preprocess.py new file mode 100644 index 000000000..6b6fb563a --- /dev/null +++ b/projects/easydeploy/examples/preprocess.py @@ -0,0 +1,57 @@ +from typing import List, Tuple, Union + +import cv2 +import numpy as np +from config import ModelType +from numpy import ndarray + + +class Preprocess: + + def __init__(self, model_type: ModelType): + if model_type in (ModelType.YOLOV5, ModelType.YOLOV6, ModelType.YOLOV7, + ModelType.YOLOV8): + mean = np.array([0, 0, 0], dtype=np.float32) + std = np.array([255, 255, 255], dtype=np.float32) + is_rgb = True + elif model_type == ModelType.YOLOX: + mean = np.array([0, 0, 0], dtype=np.float32) + std = np.array([1, 1, 1], dtype=np.float32) + is_rgb = False + elif model_type == ModelType.PPYOLOE: + mean = np.array([123.675, 116.28, 103.53], dtype=np.float32) + std = np.array([58.395, 57.12, 57.375], dtype=np.float32) + is_rgb = True + + elif model_type == ModelType.PPYOLOEP: + mean = np.array([0, 0, 0], dtype=np.float32) + std = np.array([255, 255, 255], dtype=np.float32) + is_rgb = True + elif model_type == ModelType.RTMDET: + mean = np.array([103.53, 116.28, 123.675], dtype=np.float32) + std = np.array([57.375, 57.12, 58.3955], dtype=np.float32) + is_rgb = False + else: + raise NotImplementedError + + self.mean = mean.reshape((3, 1, 1)) + self.std = std.reshape((3, 1, 1)) + self.is_rgb = is_rgb + + def __call__(self, + image: ndarray, + new_size: Union[List[int], Tuple[int]] = (640, 640), + **kwargs) -> Tuple[ndarray, Tuple[float, float]]: + # new_size: (height, width) + height, width = image.shape[:2] + ratio_h, ratio_w = new_size[0] / height, new_size[1] / width + image = cv2.resize( + image, (0, 0), + fx=ratio_w, + fy=ratio_h, + interpolation=cv2.INTER_LINEAR) + image = np.ascontiguousarray(image.transpose(2, 0, 1)) + image = image.astype(np.float32) + image -= self.mean + image /= self.std + return image[np.newaxis], (ratio_w, ratio_h) diff --git a/projects/easydeploy/examples/requirements.txt b/projects/easydeploy/examples/requirements.txt new file mode 100644 index 000000000..0b761189b --- /dev/null +++ b/projects/easydeploy/examples/requirements.txt @@ -0,0 +1,2 @@ +onnxruntime +opencv-python==4.7.0.72 diff --git a/projects/easydeploy/model/__init__.py b/projects/easydeploy/model/__init__.py index 52d6043e1..38af8bc32 100644 --- a/projects/easydeploy/model/__init__.py +++ b/projects/easydeploy/model/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+from .backend import MMYOLOBackend from .backendwrapper import ORTWrapper, TRTWrapper from .model import DeployModel -__all__ = ['DeployModel', 'TRTWrapper', 'ORTWrapper'] +__all__ = ['DeployModel', 'TRTWrapper', 'ORTWrapper', 'MMYOLOBackend'] diff --git a/projects/easydeploy/model/backend.py b/projects/easydeploy/model/backend.py new file mode 100644 index 000000000..64d6e3f02 --- /dev/null +++ b/projects/easydeploy/model/backend.py @@ -0,0 +1,23 @@ +from enum import Enum + +import torch +import torch.nn.functional as F + + +class MMYOLOBackend(Enum): + AX620A = 'ax620a' + COREML = 'coreml' + HORIZONX3 = 'horizonx3' + NCNN = 'ncnn' + ONNXRUNTIME = 'onnxruntime' + OPENVINO = 'openvino' + PPLNN = 'pplnn' + RKNN = 'rknn' + TENSORRT8 = 'tensorrt8' + TENSORRT7 = 'tensorrt7' + TORCHSCRIPT = 'torchscript' + TVM = 'tvm' + + +def HSigmoid__forward(self, x: torch.Tensor) -> torch.Tensor: + return F.hardsigmoid(x, inplace=True) diff --git a/projects/easydeploy/model/model.py b/projects/easydeploy/model/model.py index 0adcbbd22..c67ed2872 100644 --- a/projects/easydeploy/model/model.py +++ b/projects/easydeploy/model/model.py @@ -1,42 +1,47 @@ # Copyright (c) OpenMMLab. All rights reserved. +from copy import deepcopy from functools import partial -from typing import List, Optional +from typing import List, Optional, Tuple import torch import torch.nn as nn from mmdet.models.backbones.csp_darknet import Focus +from mmdet.models.layers import ChannelAttention from mmengine.config import ConfigDict from torch import Tensor from mmyolo.models import RepVGGBlock -from mmyolo.models.dense_heads import (RTMDetHead, YOLOv5Head, YOLOv7Head, - YOLOXHead) -from mmyolo.models.layers import CSPLayerWithTwoConv -from ..backbone import DeployC2f, DeployFocus, GConvFocus, NcnnFocus +from mmyolo.models.dense_heads import (PPYOLOEHead, RTMDetHead, YOLOv5Head, + YOLOv7Head, YOLOv8Head, YOLOXHead) +from mmyolo.models.layers import ImplicitA, ImplicitM +from ..backbone import DeployFocus, GConvFocus, NcnnFocus from ..bbox_code import (rtmdet_bbox_decoder, yolov5_bbox_decoder, yolox_bbox_decoder) from ..nms import batched_nms, efficient_nms, onnx_nms +from .backend import MMYOLOBackend class DeployModel(nn.Module): + transpose = False def __init__(self, baseModel: nn.Module, + backend: MMYOLOBackend, postprocess_cfg: Optional[ConfigDict] = None): super().__init__() self.baseModel = baseModel + self.baseHead = baseModel.bbox_head + self.backend = backend if postprocess_cfg is None: self.with_postprocess = False else: self.with_postprocess = True - self.baseHead = baseModel.bbox_head self.__init_sub_attributes() self.detector_type = type(self.baseHead) self.pre_top_k = postprocess_cfg.get('pre_top_k', 1000) self.keep_top_k = postprocess_cfg.get('keep_top_k', 100) self.iou_threshold = postprocess_cfg.get('iou_threshold', 0.65) self.score_threshold = postprocess_cfg.get('score_threshold', 0.25) - self.backend = postprocess_cfg.get('backend', 1) self.__switch_deploy() def __init_sub_attributes(self): @@ -47,21 +52,34 @@ def __init_sub_attributes(self): self.num_classes = self.baseHead.num_classes def __switch_deploy(self): + headType = type(self.baseHead) + if not self.with_postprocess: + if headType in (YOLOv5Head, YOLOv7Head): + self.baseHead.head_module.forward_single = self.forward_single + elif headType in (PPYOLOEHead, YOLOv8Head): + self.baseHead.head_module.reg_max = 0 + + if self.backend in (MMYOLOBackend.HORIZONX3, MMYOLOBackend.NCNN, + MMYOLOBackend.TORCHSCRIPT): + self.transpose = True for layer in 
self.baseModel.modules(): if isinstance(layer, RepVGGBlock): layer.switch_to_deploy() + elif isinstance(layer, ChannelAttention): + layer.global_avgpool.forward = self.forward_gvp elif isinstance(layer, Focus): - # onnxruntime tensorrt8 tensorrt7 - if self.backend in (1, 2, 3): + # onnxruntime openvino tensorrt8 tensorrt7 + if self.backend in (MMYOLOBackend.ONNXRUNTIME, + MMYOLOBackend.OPENVINO, + MMYOLOBackend.TENSORRT8, + MMYOLOBackend.TENSORRT7): self.baseModel.backbone.stem = DeployFocus(layer) # ncnn - elif self.backend == 4: + elif self.backend == MMYOLOBackend.NCNN: self.baseModel.backbone.stem = NcnnFocus(layer) # switch focus to group conv else: self.baseModel.backbone.stem = GConvFocus(layer) - elif isinstance(layer, CSPLayerWithTwoConv): - setattr(layer, '__class__', DeployC2f) def pred_by_feat(self, cls_scores: List[Tensor], @@ -129,11 +147,11 @@ def pred_by_feat(self, self.score_threshold, self.pre_top_k, self.keep_top_k) def select_nms(self): - if self.backend == 1: + if self.backend in (MMYOLOBackend.ONNXRUNTIME, MMYOLOBackend.OPENVINO): nms_func = onnx_nms - elif self.backend == 2: + elif self.backend == MMYOLOBackend.TENSORRT8: nms_func = efficient_nms - elif self.backend == 3: + elif self.backend == MMYOLOBackend.TENSORRT7: nms_func = batched_nms else: raise NotImplementedError @@ -147,4 +165,41 @@ def forward(self, inputs: Tensor): if self.with_postprocess: return self.pred_by_feat(*neck_outputs) else: - return neck_outputs + outputs = [] + if self.transpose: + for feats in zip(*neck_outputs): + if self.backend in (MMYOLOBackend.NCNN, + MMYOLOBackend.TORCHSCRIPT): + outputs.append( + torch.cat( + [feat.permute(0, 2, 3, 1) for feat in feats], + -1)) + else: + outputs.append(torch.cat(feats, 1).permute(0, 2, 3, 1)) + else: + for feats in zip(*neck_outputs): + outputs.append(torch.cat(feats, 1)) + return tuple(outputs) + + @staticmethod + def forward_single(x: Tensor, convs: nn.Module) -> Tuple[Tensor]: + if isinstance(convs, nn.Sequential) and any( + type(m) in (ImplicitA, ImplicitM) for m in convs): + a, c, m = convs + aw = a.implicit.clone() + mw = m.implicit.clone() + c = deepcopy(c) + nw, cw, _, _ = c.weight.shape + na, ca, _, _ = aw.shape + nm, cm, _, _ = mw.shape + c.bias = nn.Parameter(c.bias + ( + c.weight.reshape(nw, cw) @ aw.reshape(ca, na)).squeeze(1)) + c.bias = nn.Parameter(c.bias * mw.reshape(cm)) + c.weight = nn.Parameter(c.weight * mw.transpose(0, 1)) + convs = c + feat = convs(x) + return (feat, ) + + @staticmethod + def forward_gvp(x: Tensor) -> Tensor: + return torch.mean(x, [2, 3], keepdim=True) diff --git a/projects/easydeploy/tools/export.py b/projects/easydeploy/tools/export_onnx.py similarity index 75% rename from projects/easydeploy/tools/export.py rename to projects/easydeploy/tools/export_onnx.py index fb7419e10..b937cc8a7 100644 --- a/projects/easydeploy/tools/export.py +++ b/projects/easydeploy/tools/export_onnx.py @@ -1,16 +1,20 @@ import argparse import os +import sys import warnings from io import BytesIO +from pathlib import Path import onnx import torch from mmdet.apis import init_detector from mmengine.config import ConfigDict +from mmengine.logging import print_log from mmengine.utils.path import mkdir_or_exist -from mmyolo.utils import register_all_modules -from projects.easydeploy.model import DeployModel +# Add MMYOLO ROOT to sys.path +sys.path.append(str(Path(__file__).resolve().parents[3])) +from projects.easydeploy.model import DeployModel, MMYOLOBackend # noqa E402 warnings.filterwarnings(action='ignore', 
category=torch.jit.TracerWarning) warnings.filterwarnings(action='ignore', category=torch.jit.ScriptWarning) @@ -43,7 +47,10 @@ def parse_args(): parser.add_argument( '--opset', type=int, default=11, help='ONNX opset version') parser.add_argument( - '--backend', type=int, default=1, help='Backend for export onnx') + '--backend', + type=str, + default='onnxruntime', + help='Backend for export onnx') parser.add_argument( '--pre-topk', type=int, @@ -77,10 +84,16 @@ def build_model_from_cfg(config_path, checkpoint_path, device): def main(): args = parse_args() - register_all_modules() - mkdir_or_exist(args.work_dir) - + backend = MMYOLOBackend(args.backend.lower()) + if backend in (MMYOLOBackend.ONNXRUNTIME, MMYOLOBackend.OPENVINO, + MMYOLOBackend.TENSORRT8, MMYOLOBackend.TENSORRT7): + if not args.model_only: + print_log('Export ONNX with bbox decoder and NMS ...') + else: + args.model_only = True + print_log(f'Can not export postprocess for {args.backend.lower()}.\n' + f'Set "args.model_only=True" default.') if args.model_only: postprocess_cfg = None output_names = None @@ -89,13 +102,12 @@ def main(): pre_top_k=args.pre_topk, keep_top_k=args.keep_topk, iou_threshold=args.iou_threshold, - score_threshold=args.score_threshold, - backend=args.backend) + score_threshold=args.score_threshold) output_names = ['num_dets', 'boxes', 'scores', 'labels'] baseModel = build_model_from_cfg(args.config, args.checkpoint, args.device) deploy_model = DeployModel( - baseModel=baseModel, postprocess_cfg=postprocess_cfg) + baseModel=baseModel, backend=backend, postprocess_cfg=postprocess_cfg) deploy_model.eval() fake_input = torch.randn(args.batch_size, 3, @@ -103,7 +115,9 @@ def main(): # dry run deploy_model(fake_input) - save_onnx_path = os.path.join(args.work_dir, 'end2end.onnx') + save_onnx_path = os.path.join( + args.work_dir, + os.path.basename(args.checkpoint).replace('pth', 'onnx')) # export onnx with BytesIO() as f: torch.onnx.export( @@ -118,7 +132,8 @@ def main(): onnx.checker.check_model(onnx_model) # Fix tensorrt onnx output shape, just for view - if args.backend in (2, 3): + if not args.model_only and backend in (MMYOLOBackend.TENSORRT8, + MMYOLOBackend.TENSORRT7): shapes = [ args.batch_size, 1, args.batch_size, args.keep_topk, 4, args.batch_size, args.keep_topk, args.batch_size, @@ -133,9 +148,9 @@ def main(): onnx_model, check = onnxsim.simplify(onnx_model) assert check, 'assert check failed' except Exception as e: - print(f'Simplify failure: {e}') + print_log(f'Simplify failure: {e}') onnx.save(onnx_model, save_onnx_path) - print(f'ONNX export success, save into {save_onnx_path}') + print_log(f'ONNX export success, save into {save_onnx_path}') if __name__ == '__main__': diff --git a/projects/misc/custom_dataset/README.md b/projects/misc/custom_dataset/README.md index 76658ed72..e98fa7302 100644 --- a/projects/misc/custom_dataset/README.md +++ b/projects/misc/custom_dataset/README.md @@ -1,3 +1,3 @@ -Tips: 这个是自定义数据集的 config 文件,请结合 [自定义数据集教程](https://github.com/open-mmlab/mmyolo/blob/dev/docs/zh_cn/user_guides/custom_dataset.md) 来使用。 +Tips: 这个是自定义数据集的 config 文件,请结合 [标注+训练+测试+部署全流程](https://github.com/open-mmlab/mmyolo/blob/main/docs/zh_cn/recommended_topics/labeling_to_deployment_tutorials.md) 来使用。 -Tips: This is the config file of the custom dataset. Please use it in combination with [custom dataset](https://github.com/open-mmlab/mmyolo/blob/dev/docs/en/user_guides/custom_dataset.md). +Tips: This is the config file of the custom dataset. 
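# A condensed sketch of the programmatic path that the reworked
# projects/easydeploy/tools/export_onnx.py follows for an ONNX-capable
# backend. The config and checkpoint paths are placeholders, and the
# 640 x 640 input size is an assumption; opset 11 and the postprocess values
# mirror the script defaults shown above.
from io import BytesIO

import onnx
import torch
from mmdet.apis import init_detector
from mmengine.config import ConfigDict

from projects.easydeploy.model import DeployModel, MMYOLOBackend

base_model = init_detector('path/to/config.py', 'path/to/checkpoint.pth',
                           device='cpu')
deploy_model = DeployModel(
    baseModel=base_model,
    backend=MMYOLOBackend.ONNXRUNTIME,
    postprocess_cfg=ConfigDict(
        pre_top_k=1000,
        keep_top_k=100,
        iou_threshold=0.65,
        score_threshold=0.25))
deploy_model.eval()

fake_input = torch.randn(1, 3, 640, 640)
deploy_model(fake_input)  # dry run before tracing
with BytesIO() as f:
    torch.onnx.export(
        deploy_model, fake_input, f, opset_version=11,
        output_names=['num_dets', 'boxes', 'scores', 'labels'])
    f.seek(0)
    onnx.checker.check_model(onnx.load(f))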
Please use it in combination with [Annotation-to-deployment workflow for custom dataset](https://github.com/open-mmlab/mmyolo/blob/main/docs/en/recommended_topics/labeling_to_deployment_tutorials.md). diff --git a/projects/misc/ionogram_detection/README.md b/projects/misc/ionogram_detection/README.md new file mode 100644 index 000000000..eb7ddd580 --- /dev/null +++ b/projects/misc/ionogram_detection/README.md @@ -0,0 +1,3 @@ +Tips: 这是 MMYOLO 应用范例的配置文件,请结合 [基于 MMYOLO 的频高图实时目标检测 benchmark](/docs/zh_cn/recommended_topics/application_examples/ionogram_detection.md) 来使用。 + +Tips: This is the config file of the MMYOLO application examples. Please use it in combination with [A Benchmark for Ionogram Detection Based on MMYOLO](/docs/en/recommended_topics/application_examples/ionogram_detection.md). diff --git a/projects/misc/ionogram_detection/rtmdet/rtmdet_l_fast_1xb32-100e_ionogram.py b/projects/misc/ionogram_detection/rtmdet/rtmdet_l_fast_1xb32-100e_ionogram.py new file mode 100644 index 000000000..f1829eebf --- /dev/null +++ b/projects/misc/ionogram_detection/rtmdet/rtmdet_l_fast_1xb32-100e_ionogram.py @@ -0,0 +1,107 @@ +_base_ = 'mmyolo::rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco.py' + +# ======================== Modified parameters ====================== +# -----data related----- +data_root = './Iono4311/' +train_ann_file = 'annotations/train.json' +train_data_prefix = 'train_images/' +val_ann_file = 'annotations/val.json' +val_data_prefix = 'val_images/' +test_ann_file = 'annotations/test.json' +test_data_prefix = 'test_images/' + +class_name = ('E', 'Es-l', 'Es-c', 'F1', 'F2', 'Spread-F') +num_classes = len(class_name) +metainfo = dict( + classes=class_name, + palette=[(250, 165, 30), (120, 69, 125), (53, 125, 34), (0, 11, 123), + (130, 20, 12), (120, 121, 80)]) + +train_batch_size_per_gpu = 32 +train_num_workers = 8 +val_batch_size_per_gpu = train_batch_size_per_gpu + +# Config of batch shapes. Only on val. 
+batch_shapes_cfg = dict(batch_size=val_batch_size_per_gpu) + +# -----train val related----- +load_from = 'https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco/rtmdet_l_syncbn_fast_8xb32-300e_coco_20230102_135928-ee3abdc4.pth' # noqa + +# default hooks +save_epoch_intervals = 10 +max_epochs = 100 +max_keep_ckpts = 1 + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=1.0e-5, by_epoch=False, begin=0, + end=300), + dict( + # use cosine lr from 20 to 100 epoch + type='CosineAnnealingLR', + eta_min=_base_.base_lr * 0.05, + begin=max_epochs // 5, + end=max_epochs, + T_max=max_epochs * 4 // 5, + by_epoch=True, + convert_to_iter_based=True), +] + +# train_cfg +val_interval = 2 +val_begin = 20 + +tta_model = None +tta_pipeline = None + +visualizer = dict( + vis_backends=[dict(type='LocalVisBackend'), + dict(type='WandbVisBackend')]) + +# ===================== Unmodified in most cases ================== +model = dict( + bbox_head=dict(head_module=dict(num_classes=num_classes)), + train_cfg=dict(assigner=dict(num_classes=num_classes))) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file=train_ann_file, + data_prefix=dict(img=train_data_prefix))) + +val_dataloader = dict( + batch_size=val_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + metainfo=metainfo, + data_root=data_root, + data_prefix=dict(img=val_data_prefix), + ann_file=val_ann_file)) + +test_dataloader = dict( + batch_size=val_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + metainfo=metainfo, + data_root=data_root, + data_prefix=dict(img=test_data_prefix), + ann_file=test_ann_file)) + +default_hooks = dict( + checkpoint=dict( + interval=save_epoch_intervals, + max_keep_ckpts=max_keep_ckpts, + save_best='auto')) + +val_evaluator = dict(ann_file=data_root + val_ann_file) +test_evaluator = dict(ann_file=data_root + test_ann_file) + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + val_begin=val_begin, + val_interval=val_interval) diff --git a/projects/misc/ionogram_detection/rtmdet/rtmdet_s_fast_1xb32-100e_ionogram.py b/projects/misc/ionogram_detection/rtmdet/rtmdet_s_fast_1xb32-100e_ionogram.py new file mode 100644 index 000000000..49b284b09 --- /dev/null +++ b/projects/misc/ionogram_detection/rtmdet/rtmdet_s_fast_1xb32-100e_ionogram.py @@ -0,0 +1,83 @@ +_base_ = './rtmdet_l_fast_1xb32-100e_ionogram.py' + +load_from = 'https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco/rtmdet_s_syncbn_fast_8xb32-300e_coco_20221230_182329-0a8c901a.pth' # noqa + +# ======================= Modified parameters ===================== +deepen_factor = 0.33 +widen_factor = 0.5 +img_scale = _base_.img_scale + +# ratio range for random resize +random_resize_ratio_range = (0.5, 2.0) +# Number of cached images in mosaic +mosaic_max_cached_images = 40 +# Number of cached images in mixup +mixup_max_cached_images = 20 + +# ===================== Unmodified in most cases ================== +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Mosaic', + img_scale=img_scale, 
+ use_cached=True, + max_cached_images=mosaic_max_cached_images, + pad_val=114.0), + dict( + type='mmdet.RandomResize', + # img_scale is (width, height) + scale=(img_scale[0] * 2, img_scale[1] * 2), + ratio_range=random_resize_ratio_range, # note + resize_type='mmdet.Resize', + keep_ratio=True), + dict(type='mmdet.RandomCrop', crop_size=img_scale), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), + dict( + type='YOLOv5MixUp', + use_cached=True, + max_cached_images=mixup_max_cached_images), + dict(type='mmdet.PackDetInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='mmdet.RandomResize', + scale=img_scale, + ratio_range=random_resize_ratio_range, # note + resize_type='mmdet.Resize', + keep_ratio=True), + dict(type='mmdet.RandomCrop', crop_size=img_scale), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), + dict(type='mmdet.PackDetInputs') +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=_base_.max_epochs - _base_.num_epochs_stage2, + switch_pipeline=train_pipeline_stage2) +] diff --git a/projects/misc/ionogram_detection/rtmdet/rtmdet_tiny_fast_1xb32-100e_ionogram.py b/projects/misc/ionogram_detection/rtmdet/rtmdet_tiny_fast_1xb32-100e_ionogram.py new file mode 100644 index 000000000..acdaa0756 --- /dev/null +++ b/projects/misc/ionogram_detection/rtmdet/rtmdet_tiny_fast_1xb32-100e_ionogram.py @@ -0,0 +1,62 @@ +_base_ = './rtmdet_s_fast_1xb32-100e_ionogram.py' + +# ======================= Modified parameters ====================== +deepen_factor = 0.167 +widen_factor = 0.375 +img_scale = _base_.img_scale + +load_from = 'https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco/rtmdet_tiny_syncbn_fast_8xb32-300e_coco_20230102_140117-dbb1dc83.pth' # noqa + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=1.0e-5, by_epoch=False, begin=0, + end=300), + dict( + # use cosine lr from 50 to 100 epoch + type='CosineAnnealingLR', + eta_min=_base_.base_lr * 0.05, + begin=_base_.max_epochs // 2, + end=_base_.max_epochs, + T_max=_base_.max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Mosaic', + img_scale=img_scale, + use_cached=True, + max_cached_images=20, # note + random_pop=False, # note + pad_val=114.0), + dict( + type='mmdet.RandomResize', + # img_scale is (width, height) + scale=(img_scale[0] * 2, img_scale[1] * 2), + ratio_range=(0.5, 2.0), + resize_type='mmdet.Resize', + keep_ratio=True), + dict(type='mmdet.RandomCrop', crop_size=img_scale), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='mmdet.RandomFlip', 
prob=0.5), + dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), + dict( + type='YOLOv5MixUp', + use_cached=True, + random_pop=False, + max_cached_images=10, + prob=0.5), + dict(type='mmdet.PackDetInputs') +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/projects/misc/ionogram_detection/yolov5/yolov5_m-v61_fast_1xb32-100e_ionogram.py b/projects/misc/ionogram_detection/yolov5/yolov5_m-v61_fast_1xb32-100e_ionogram.py new file mode 100644 index 000000000..737aeae9a --- /dev/null +++ b/projects/misc/ionogram_detection/yolov5/yolov5_m-v61_fast_1xb32-100e_ionogram.py @@ -0,0 +1,95 @@ +_base_ = './yolov5_s-v61_fast_1xb96-100e_ionogram.py' + +# ======================= Modified parameters ===================== +# Copied from '../../yolov5/yolov5_m-v61_syncbn_fast_8xb16-300e_coco.py' +deepen_factor = 0.67 +widen_factor = 0.75 +lr_factor = 0.1 +affine_scale = 0.9 +loss_cls_weight = 0.3 +loss_obj_weight = 0.7 +mixup_prob = 0.1 + +# -----data related----- +train_batch_size_per_gpu = 32 + +# -----train val related----- +# Scale lr for SGD +base_lr = _base_.base_lr * train_batch_size_per_gpu \ + / _base_.train_batch_size_per_gpu +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-v61_syncbn_fast_8xb16-300e_coco/yolov5_m-v61_syncbn_fast_8xb16-300e_coco_20220917_204944-516a710f.pth' # noqa + +# ===================== Unmodified in most cases ================== +num_classes = _base_.num_classes +num_det_layers = _base_.num_det_layers +img_scale = _base_.img_scale + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict( + head_module=dict(widen_factor=widen_factor), + loss_cls=dict(loss_weight=loss_cls_weight * + (num_classes / 80 * 3 / num_det_layers)), + loss_obj=dict(loss_weight=loss_obj_weight * + ((img_scale[0] / 640)**2 * 3 / num_det_layers)))) + +pre_transform = _base_.pre_transform +albu_train_transforms = _base_.albu_train_transforms + +mosaic_affine_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)) +] + +# enable mixup +train_pipeline = [ + *pre_transform, *mosaic_affine_pipeline, + dict( + type='YOLOv5MixUp', + prob=mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_pipeline]), + dict( + type='mmdet.Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + dataset=dict(dataset=dict(pipeline=train_pipeline))) + +val_dataloader = dict(batch_size=train_batch_size_per_gpu) +test_dataloader = dict(batch_size=train_batch_size_per_gpu) +optim_wrapper = dict(optimizer=dict(lr=base_lr)) +default_hooks = dict(param_scheduler=dict(lr_factor=lr_factor)) diff --git 
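# A worked example of the loss-weight and learning-rate scaling used in the
# ionogram YOLOv5 configs above. num_classes = 6, num_det_layers = 3 and
# img_scale = (640, 640) come from the inherited base configs; the upstream
# COCO base_lr of 0.01 is an assumption quoted here only to make the
# arithmetic concrete.
num_classes, num_det_layers = 6, 3
img_scale = (640, 640)
loss_cls_weight, loss_obj_weight = 0.3, 0.7

# the class loss is rescaled from the 80-class COCO default
cls_weight = loss_cls_weight * (num_classes / 80 * 3 / num_det_layers)          # 0.0225
obj_weight = loss_obj_weight * ((img_scale[0] / 640) ** 2 * 3 / num_det_layers)  # 0.7

# linear LR scaling: the 1xb96 base already holds 0.01 * 96 / (8 * 16), and
# this 1xb32 config scales it again by 32 / 96, i.e. 0.01 * 32 / 128
base_lr = 0.01 * 96 / (8 * 16) * 32 / 96  # 0.0025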
a/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb32-100e_ionogram_mosaic.py b/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb32-100e_ionogram_mosaic.py new file mode 100644 index 000000000..1252ebfca --- /dev/null +++ b/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb32-100e_ionogram_mosaic.py @@ -0,0 +1,35 @@ +_base_ = './yolov5_s-v61_fast_1xb96-100e_ionogram.py' + +# ======================= Modified parameters ===================== +# -----data related----- +train_batch_size_per_gpu = 32 + +# -----train val related----- +base_lr = _base_.base_lr * train_batch_size_per_gpu \ + / _base_.train_batch_size_per_gpu / 2 +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Mosaic', + img_scale=(640, 640), + pad_val=114.0, + pre_transform=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True) + ]), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape')) +] + +# ===================== Unmodified in most cases ================== +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + dataset=dict(dataset=dict(pipeline=train_pipeline))) + +val_dataloader = dict(batch_size=train_batch_size_per_gpu) + +test_dataloader = dict(batch_size=train_batch_size_per_gpu) + +optim_wrapper = dict(optimizer=dict(lr=base_lr)) diff --git a/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py b/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py new file mode 100644 index 000000000..dbe1305d8 --- /dev/null +++ b/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py @@ -0,0 +1,108 @@ +_base_ = 'mmyolo::yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' + +# ======================= Modified parameters ===================== +# -----data related----- +data_root = './Iono4311/' +train_ann_file = 'annotations/train.json' +train_data_prefix = 'train_images/' +val_ann_file = 'annotations/val.json' +val_data_prefix = 'val_images/' +test_ann_file = 'annotations/test.json' +test_data_prefix = 'test_images/' +class_name = ('E', 'Es-l', 'Es-c', 'F1', 'F2', 'Spread-F') +num_classes = len(class_name) +metainfo = dict( + classes=class_name, + palette=[(250, 165, 30), (120, 69, 125), (53, 125, 34), (0, 11, 123), + (130, 20, 12), (120, 121, 80)]) +# Batch size of a single GPU during training +train_batch_size_per_gpu = 96 +# Worker to pre-fetch data for each single GPU during training +train_num_workers = 8 + +# -----model related----- +# Basic size of multi-scale prior box +anchors = [[[8, 6], [24, 4], [19, 9]], [[22, 19], [17, 49], [29, 45]], + [[44, 66], [96, 76], [126, 59]]] + +# -----train val related----- +# base_lr_default * (your_bs / default_bs (8x16)) for SGD +base_lr = _base_.base_lr * train_batch_size_per_gpu / (8 * 16) +max_epochs = 100 +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth' # noqa + +# default_hooks +save_epoch_intervals = 10 +logger_interval = 20 +max_keep_ckpts = 1 + +# train_cfg +val_interval = 2 +val_begin = 20 + +tta_model = None +tta_pipeline = None + +visualizer = dict( + vis_backends=[dict(type='LocalVisBackend'), + dict(type='WandbVisBackend')]) + +# ===================== Unmodified in most cases ================== +model = dict( + bbox_head=dict( + head_module=dict(num_classes=num_classes), + 
prior_generator=dict(base_sizes=anchors), + loss_cls=dict(loss_weight=0.5 * + (num_classes / 80 * 3 / _base_.num_det_layers)))) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + _delete_=True, + type='RepeatDataset', + times=1, + dataset=dict( + type=_base_.dataset_type, + data_root=data_root, + metainfo=metainfo, + ann_file=train_ann_file, + data_prefix=dict(img=train_data_prefix), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=_base_.train_pipeline))) + +val_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file=val_ann_file, + data_prefix=dict(img=val_data_prefix))) + +test_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file=test_ann_file, + data_prefix=dict(img=test_data_prefix))) + +optim_wrapper = dict(optimizer=dict(lr=base_lr)) + +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + save_param_scheduler=None, # for yolov5 + interval=save_epoch_intervals, + max_keep_ckpts=max_keep_ckpts, + save_best='auto'), + param_scheduler=dict(max_epochs=max_epochs), + logger=dict(type='LoggerHook', interval=logger_interval)) + +val_evaluator = dict(ann_file=data_root + val_ann_file) +test_evaluator = dict(ann_file=data_root + test_ann_file) + +train_cfg = dict( + max_epochs=max_epochs, val_begin=val_begin, val_interval=val_interval) diff --git a/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_aug0.py b/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_aug0.py new file mode 100644 index 000000000..39ffb6ba1 --- /dev/null +++ b/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_aug0.py @@ -0,0 +1,21 @@ +_base_ = './yolov5_s-v61_fast_1xb96-100e_ionogram.py' + +# ======================= Modified parameters ===================== +# -----train val related----- +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='YOLOv5KeepRatioResize', scale=(640, 640)), + dict( + type='LetterResize', + scale=(640, 640), + allow_scale_up=False, + pad_val=dict(img=114)), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +# ===================== Unmodified in most cases ================== +train_dataloader = dict(dataset=dict(dataset=dict(pipeline=train_pipeline))) diff --git a/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_mosaic_affine.py b/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_mosaic_affine.py new file mode 100644 index 000000000..10c114cbc --- /dev/null +++ b/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_mosaic_affine.py @@ -0,0 +1,29 @@ +_base_ = './yolov5_s-v61_fast_1xb96-100e_ionogram.py' + +# ======================= Modified parameters ===================== +# -----train val related----- +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Mosaic', + img_scale=(640, 640), + pad_val=114.0, + pre_transform=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True) + ]), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(0.5, 
1.5), + border=(-320, -320), + border_val=(114, 114, 114)), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape')) +] + +# ===================== Unmodified in most cases ================== +train_dataloader = dict(dataset=dict(dataset=dict(pipeline=train_pipeline))) diff --git a/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_mosaic_affine_albu_hsv.py b/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_mosaic_affine_albu_hsv.py new file mode 100644 index 000000000..df8f6a2c5 --- /dev/null +++ b/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_mosaic_affine_albu_hsv.py @@ -0,0 +1,44 @@ +_base_ = './yolov5_s-v61_fast_1xb96-100e_ionogram.py' + +# ======================= Modified parameters ===================== +# -----train val related----- +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Mosaic', + img_scale=(640, 640), + pad_val=114.0, + pre_transform=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True) + ]), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(0.5, 1.5), + border=(-320, -320), + border_val=(114, 114, 114)), + dict( + type='mmdet.Albu', + transforms=[ + dict(type='Blur', p=0.01), + dict(type='MedianBlur', p=0.01), + dict(type='ToGray', p=0.01), + dict(type='CLAHE', p=0.01) + ], + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap=dict(img='image', gt_bboxes='bboxes')), + dict(type='YOLOv5HSVRandomAug'), + # dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape')) +] + +# ===================== Unmodified in most cases ================== +train_dataloader = dict(dataset=dict(dataset=dict(pipeline=train_pipeline))) diff --git a/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-200e_ionogram_pre0.py b/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-200e_ionogram_pre0.py new file mode 100644 index 000000000..9f62fac92 --- /dev/null +++ b/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-200e_ionogram_pre0.py @@ -0,0 +1,17 @@ +_base_ = './yolov5_s-v61_fast_1xb96-100e_ionogram.py' + +# ======================= Modified parameters ===================== +# -----train val related----- +base_lr = _base_.base_lr * 4 +max_epochs = 200 +load_from = None +logger_interval = 50 + +train_cfg = dict(max_epochs=max_epochs, ) + +# ===================== Unmodified in most cases ================== +optim_wrapper = dict(optimizer=dict(lr=base_lr)) + +default_hooks = dict( + param_scheduler=dict(max_epochs=max_epochs), + logger=dict(type='LoggerHook', interval=logger_interval)) diff --git a/projects/misc/ionogram_detection/yolov6/yolov6_l_fast_1xb32-100e_ionogram.py b/projects/misc/ionogram_detection/yolov6/yolov6_l_fast_1xb32-100e_ionogram.py new file mode 100644 index 000000000..dc5918d82 --- /dev/null +++ b/projects/misc/ionogram_detection/yolov6/yolov6_l_fast_1xb32-100e_ionogram.py @@ -0,0 +1,29 @@ +_base_ = './yolov6_m_fast_1xb32-100e_ionogram.py' + +# ======================= Modified parameters ======================= +# -----model related----- +deepen_factor = 1 +widen_factor = 1 + +# -----train val related----- +load_from = 
'https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_l_syncbn_fast_8xb32-300e_coco/yolov6_l_syncbn_fast_8xb32-300e_coco_20221109_183156-91e3c447.pth' # noqa + +# ====================== Unmodified in most cases =================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + hidden_ratio=1. / 2, + block_cfg=dict( + type='ConvWrapper', + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001)), + act_cfg=dict(type='SiLU', inplace=True)), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + hidden_ratio=1. / 2, + block_cfg=dict( + type='ConvWrapper', + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001)), + block_act_cfg=dict(type='SiLU', inplace=True)), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/projects/misc/ionogram_detection/yolov6/yolov6_m_fast_1xb32-100e_ionogram.py b/projects/misc/ionogram_detection/yolov6/yolov6_m_fast_1xb32-100e_ionogram.py new file mode 100644 index 000000000..00ea8ff05 --- /dev/null +++ b/projects/misc/ionogram_detection/yolov6/yolov6_m_fast_1xb32-100e_ionogram.py @@ -0,0 +1,63 @@ +_base_ = './yolov6_s_fast_1xb32-100e_ionogram.py' + +# ======================= Modified parameters ======================= +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 0.6 +# The scaling factor that controls the width of the network structure +widen_factor = 0.75 + +# -----train val related----- +affine_scale = 0.9 # YOLOv5RandomAffine scaling ratio +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_m_syncbn_fast_8xb32-300e_coco/yolov6_m_syncbn_fast_8xb32-300e_coco_20221109_182658-85bda3f4.pth' # noqa + +# ====================== Unmodified in most cases =================== +model = dict( + backbone=dict( + type='YOLOv6CSPBep', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + hidden_ratio=2. / 3, + block_cfg=dict(type='RepVGGBlock'), + act_cfg=dict(type='ReLU', inplace=True)), + neck=dict( + type='YOLOv6CSPRepPAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + block_cfg=dict(type='RepVGGBlock'), + hidden_ratio=2. 
/ 3, + block_act_cfg=dict(type='ReLU', inplace=True)), + bbox_head=dict( + type='YOLOv6Head', head_module=dict(widen_factor=widen_factor))) + +mosaic_affine_pipeline = [ + dict( + type='Mosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114)) +] + +train_pipeline = [ + *_base_.pre_transform, *mosaic_affine_pipeline, + dict( + type='YOLOv5MixUp', + prob=0.1, + pre_transform=[*_base_.pre_transform, *mosaic_affine_pipeline]), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict(dataset=dict(dataset=dict(pipeline=train_pipeline))) diff --git a/projects/misc/ionogram_detection/yolov6/yolov6_s_fast_1xb32-100e_ionogram.py b/projects/misc/ionogram_detection/yolov6/yolov6_s_fast_1xb32-100e_ionogram.py new file mode 100644 index 000000000..c9748b408 --- /dev/null +++ b/projects/misc/ionogram_detection/yolov6/yolov6_s_fast_1xb32-100e_ionogram.py @@ -0,0 +1,108 @@ +_base_ = 'mmyolo::yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py' + +# ======================= Modified parameters ===================== +# -----data related----- +data_root = './Iono4311/' +train_ann_file = 'annotations/train.json' +train_data_prefix = 'train_images/' +val_ann_file = 'annotations/val.json' +val_data_prefix = 'val_images/' +test_ann_file = 'annotations/test.json' +test_data_prefix = 'test_images/' + +class_name = ('E', 'Es-l', 'Es-c', 'F1', 'F2', 'Spread-F') +num_classes = len(class_name) +metainfo = dict( + classes=class_name, + palette=[(250, 165, 30), (120, 69, 125), (53, 125, 34), (0, 11, 123), + (130, 20, 12), (120, 121, 80)]) + +train_batch_size_per_gpu = 32 +train_num_workers = 8 + +tta_model = None +tta_pipeline = None + +# -----train val related----- +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco/yolov6_s_syncbn_fast_8xb32-400e_coco_20221102_203035-932e1d91.pth' # noqa +# base_lr_default * (your_bs 32 / default_bs (8 x 32)) +base_lr = _base_.base_lr * train_batch_size_per_gpu / (8 * 32) +max_epochs = 100 +save_epoch_intervals = 10 +val_begin = 20 +max_keep_ckpts = 1 +log_interval = 50 +visualizer = dict( + vis_backends=[dict(type='LocalVisBackend'), + dict(type='WandbVisBackend')]) + +# ==================== Unmodified in most cases =================== +train_cfg = dict( + max_epochs=max_epochs, + val_begin=val_begin, + val_interval=save_epoch_intervals, + dynamic_intervals=None) + +model = dict( + bbox_head=dict(head_module=dict(num_classes=num_classes)), + train_cfg=dict( + initial_assigner=dict(num_classes=num_classes), + assigner=dict(num_classes=num_classes))) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + _delete_=True, + type='RepeatDataset', + times=1, + dataset=dict( + type=_base_.dataset_type, + data_root=data_root, + metainfo=metainfo, + ann_file=train_ann_file, + data_prefix=dict(img=train_data_prefix), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=_base_.train_pipeline))) + +val_dataloader = dict( + dataset=dict( + metainfo=metainfo, + data_root=data_root, + 
ann_file=val_ann_file, + data_prefix=dict(img=val_data_prefix))) + +test_dataloader = dict( + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file=test_ann_file, + data_prefix=dict(img=test_data_prefix))) + +val_evaluator = dict(ann_file=data_root + val_data_prefix) +test_evaluator = dict(ann_file=data_root + test_data_prefix) + +optim_wrapper = dict(optimizer=dict(lr=base_lr)) + +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + interval=save_epoch_intervals, + max_keep_ckpts=max_keep_ckpts, + save_best='auto'), + param_scheduler=dict(max_epochs=max_epochs), + logger=dict(type='LoggerHook', interval=log_interval)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - _base_.num_last_epochs, + switch_pipeline=_base_.train_pipeline_stage2) +] diff --git a/projects/misc/ionogram_detection/yolov6/yolov6_s_fast_1xb32-200e_ionogram_pre0.py b/projects/misc/ionogram_detection/yolov6/yolov6_s_fast_1xb32-200e_ionogram_pre0.py new file mode 100644 index 000000000..cc38730f9 --- /dev/null +++ b/projects/misc/ionogram_detection/yolov6/yolov6_s_fast_1xb32-200e_ionogram_pre0.py @@ -0,0 +1,17 @@ +_base_ = './yolov6_s_fast_1xb32-100e_ionogram.py' + +# ======================= Modified parameters ===================== +base_lr = _base_.base_lr * 4 +optim_wrapper = dict(optimizer=dict(lr=base_lr)) +max_epochs = 200 +load_from = None + +# ==================== Unmodified in most cases =================== +train_cfg = dict( + max_epochs=max_epochs, + val_begin=20, +) + +default_hooks = dict( + param_scheduler=dict(max_epochs=max_epochs), + logger=dict(type='LoggerHook', interval=50)) diff --git a/projects/misc/ionogram_detection/yolov7/yolov7_l_fast_1xb16-100e_ionogram.py b/projects/misc/ionogram_detection/yolov7/yolov7_l_fast_1xb16-100e_ionogram.py new file mode 100644 index 000000000..44d58c1f3 --- /dev/null +++ b/projects/misc/ionogram_detection/yolov7/yolov7_l_fast_1xb16-100e_ionogram.py @@ -0,0 +1,98 @@ +_base_ = 'mmyolo::yolov7/yolov7_l_syncbn_fast_8x16b-300e_coco.py' + +# ======================== Modified parameters ====================== +# -----data related----- +data_root = './Iono4311/' +train_ann_file = 'annotations/train.json' +train_data_prefix = 'train_images/' +val_ann_file = 'annotations/val.json' +val_data_prefix = 'val_images/' +test_ann_file = 'annotations/test.json' +test_data_prefix = 'test_images/' + +class_name = ('E', 'Es-l', 'Es-c', 'F1', 'F2', 'Spread-F') +num_classes = len(class_name) +metainfo = dict( + classes=class_name, + palette=[(250, 165, 30), (120, 69, 125), (53, 125, 34), (0, 11, 123), + (130, 20, 12), (120, 121, 80)]) + +train_batch_size_per_gpu = 16 +train_num_workers = 8 + +# -----model related----- +anchors = [[[14, 14], [35, 6], [32, 18]], [[32, 45], [28, 97], [52, 80]], + [[71, 122], [185, 94], [164, 134]]] + +# -----train val related----- +# base_lr_default * (your_bs 32 / default_bs (8 x 16)) +base_lr = _base_.base_lr * train_batch_size_per_gpu / (8 * 16) +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_l_syncbn_fast_8x16b-300e_coco/yolov7_l_syncbn_fast_8x16b-300e_coco_20221123_023601-8113c0eb.pth' # noqa + +# default hooks +save_epoch_intervals = 10 +max_epochs = 100 +max_keep_ckpts = 1 + +# train_cfg +val_interval = 2 +val_begin = 20 + +tta_model = None +tta_pipeline = None + +visualizer = dict( + vis_backends=[dict(type='LocalVisBackend'), + 
dict(type='WandbVisBackend')]) + +# ===================== Unmodified in most cases ================== +model = dict( + bbox_head=dict( + head_module=dict(num_classes=num_classes), + prior_generator=dict(base_sizes=anchors), + loss_cls=dict(loss_weight=_base_.loss_cls_weight * + (num_classes / 80 * 3 / _base_.num_det_layers)))) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file=train_ann_file, + data_prefix=dict(img=train_data_prefix))) + +val_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + metainfo=metainfo, + data_root=data_root, + data_prefix=dict(img=val_data_prefix), + ann_file=val_ann_file)) + +test_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + metainfo=metainfo, + data_root=data_root, + data_prefix=dict(img=test_data_prefix), + ann_file=test_ann_file)) + +optim_wrapper = dict( + optimizer=dict(lr=base_lr, batch_size_per_gpu=train_batch_size_per_gpu)) + +default_hooks = dict( + param_scheduler=dict(max_epochs=max_epochs), + checkpoint=dict( + interval=save_epoch_intervals, max_keep_ckpts=max_keep_ckpts)) + +val_evaluator = dict(ann_file=data_root + val_ann_file) +test_evaluator = dict(ann_file=data_root + test_ann_file) + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + val_begin=val_begin, + val_interval=val_interval) diff --git a/projects/misc/ionogram_detection/yolov7/yolov7_tiny_fast_1xb16-100e_ionogram.py b/projects/misc/ionogram_detection/yolov7/yolov7_tiny_fast_1xb16-100e_ionogram.py new file mode 100644 index 000000000..9c2d63dde --- /dev/null +++ b/projects/misc/ionogram_detection/yolov7/yolov7_tiny_fast_1xb16-100e_ionogram.py @@ -0,0 +1,101 @@ +_base_ = './yolov7_l_fast_1xb16-100e_ionogram.py' + +# ======================== Modified parameters ======================= +# pre-train +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco/yolov7_tiny_syncbn_fast_8x16b-300e_coco_20221126_102719-0ee5bbdf.pth' # noqa + +# -----model related----- +# Data augmentation +max_translate_ratio = 0.1 # YOLOv5RandomAffine +scaling_ratio_range = (0.5, 1.6) # YOLOv5RandomAffine +mixup_prob = 0.05 # YOLOv5MixUp +randchoice_mosaic_prob = [0.8, 0.2] +mixup_alpha = 8.0 # YOLOv5MixUp +mixup_beta = 8.0 # YOLOv5MixUp + +# -----train val related----- +loss_cls_weight = 0.5 +loss_obj_weight = 1.0 + +lr_factor = 0.01 # Learning rate scaling factor + +# ====================== Unmodified in most cases ==================== +num_classes = _base_.num_classes +num_det_layers = _base_.num_det_layers +img_scale = _base_.img_scale +pre_transform = _base_.pre_transform +model = dict( + backbone=dict( + arch='Tiny', act_cfg=dict(type='LeakyReLU', negative_slope=0.1)), + neck=dict( + is_tiny_version=True, + in_channels=[128, 256, 512], + out_channels=[64, 128, 256], + block_cfg=dict( + _delete_=True, type='TinyDownSampleBlock', middle_ratio=0.25), + act_cfg=dict(type='LeakyReLU', negative_slope=0.1), + use_repconv_outs=False), + bbox_head=dict( + head_module=dict(in_channels=[128, 256, 512]), + loss_cls=dict(loss_weight=loss_cls_weight * + (num_classes / 80 * 3 / num_det_layers)), + loss_obj=dict(loss_weight=loss_obj_weight * + ((img_scale[0] / 640)**2 * 3 / num_det_layers)))) + +mosiac4_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( 
+ type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_translate_ratio=max_translate_ratio, # change + scaling_ratio_range=scaling_ratio_range, # change + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), +] + +mosiac9_pipeline = [ + dict( + type='Mosaic9', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_translate_ratio=max_translate_ratio, # change + scaling_ratio_range=scaling_ratio_range, # change + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), +] + +randchoice_mosaic_pipeline = dict( + type='RandomChoice', + transforms=[mosiac4_pipeline, mosiac9_pipeline], + prob=randchoice_mosaic_prob) + +train_pipeline = [ + *pre_transform, + randchoice_mosaic_pipeline, + dict( + type='YOLOv5MixUp', + alpha=mixup_alpha, + beta=mixup_beta, + prob=mixup_prob, # change + pre_transform=[*pre_transform, randchoice_mosaic_pipeline]), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +default_hooks = dict(param_scheduler=dict(lr_factor=lr_factor)) diff --git a/projects/misc/ionogram_detection/yolov7/yolov7_x_fast_1xb16-100e_ionogram.py b/projects/misc/ionogram_detection/yolov7/yolov7_x_fast_1xb16-100e_ionogram.py new file mode 100644 index 000000000..606232a66 --- /dev/null +++ b/projects/misc/ionogram_detection/yolov7/yolov7_x_fast_1xb16-100e_ionogram.py @@ -0,0 +1,19 @@ +_base_ = './yolov7_l_fast_1xb16-100e_ionogram.py' + +# ======================== Modified parameters ======================= +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_x_syncbn_fast_8x16b-300e_coco/yolov7_x_syncbn_fast_8x16b-300e_coco_20221124_215331-ef949a68.pth' # noqa + +# ===================== Unmodified in most cases ================== +model = dict( + backbone=dict(arch='X'), + neck=dict( + in_channels=[640, 1280, 1280], + out_channels=[160, 320, 640], + block_cfg=dict( + type='ELANBlock', + middle_ratio=0.4, + block_ratio=0.4, + num_blocks=3, + num_convs_in_block=2), + use_repconv_outs=False), + bbox_head=dict(head_module=dict(in_channels=[320, 640, 1280]))) diff --git a/requirements/docs.txt b/requirements/docs.txt index 2eacbcde0..4933cc9e2 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -1,7 +1,7 @@ docutils==0.16.0 mmcv>=2.0.0rc4,<2.1.0 -mmdet>=3.0.0rc6 -mmengine>=0.6.0 +mmdet>=3.0.0 +mmengine>=0.7.1 myst-parser -e git+https://github.com/open-mmlab/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme sphinx==4.0.2 @@ -10,3 +10,4 @@ sphinx_markdown_tables sphinx_rtd_theme==0.5.2 torch torchvision +urllib3<2.0.0 diff --git a/requirements/mminstall.txt b/requirements/mminstall.txt index 91727bafb..f078af142 100644 --- a/requirements/mminstall.txt +++ b/requirements/mminstall.txt @@ -1,3 +1,3 @@ mmcv>=2.0.0rc4,<2.1.0 -mmdet>=3.0.0rc6 -mmengine>=0.6.0 +mmdet>=3.0.0 +mmengine>=0.7.1 diff --git a/requirements/mmpose.txt b/requirements/mmpose.txt new file mode 100644 index 000000000..8e4726e68 --- /dev/null +++ b/requirements/mmpose.txt @@ -0,0 +1 @@ +mmpose>=1.0.0 diff --git a/requirements/tests.txt b/requirements/tests.txt index be276ebe5..285b3f396 100644 --- a/requirements/tests.txt +++ b/requirements/tests.txt @@ -1,4 +1,3 @@ 
-codecov flake8 interrogate isort==4.3.21 @@ -6,6 +5,7 @@ isort==4.3.21 kwarray memory_profiler mmcls>=1.0.0rc4 +mmpose>=1.0.0 mmrazor>=1.0.0rc2 mmrotate>=1.0.0rc1 parameterized diff --git a/tests/test_datasets/test_transforms/test_formatting.py b/tests/test_datasets/test_transforms/test_formatting.py new file mode 100644 index 000000000..c75475dfc --- /dev/null +++ b/tests/test_datasets/test_transforms/test_formatting.py @@ -0,0 +1,119 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import os.path as osp +import unittest + +import numpy as np +from mmdet.structures import DetDataSample +from mmdet.structures.mask import BitmapMasks +from mmengine.structures import InstanceData, PixelData + +from mmyolo.datasets.transforms import PackDetInputs + + +class TestPackDetInputs(unittest.TestCase): + + def setUp(self): + """Setup the model and optimizer which are used in every test method. + + TestCase calls functions in this order: setUp() -> testMethod() -> + tearDown() -> cleanUp() + """ + data_prefix = osp.join(osp.dirname(__file__), '../../data') + img_path = osp.join(data_prefix, 'color.jpg') + rng = np.random.RandomState(0) + self.results1 = { + 'img_id': 1, + 'img_path': img_path, + 'ori_shape': (300, 400), + 'img_shape': (600, 800), + 'scale_factor': 2.0, + 'flip': False, + 'img': rng.rand(300, 400), + 'gt_seg_map': rng.rand(300, 400), + 'gt_masks': + BitmapMasks(rng.rand(3, 300, 400), height=300, width=400), + 'gt_bboxes_labels': rng.rand(3, ), + 'gt_ignore_flags': np.array([0, 0, 1], dtype=bool), + 'proposals': rng.rand(2, 4), + 'proposals_scores': rng.rand(2, ) + } + self.results2 = { + 'img_id': 1, + 'img_path': img_path, + 'ori_shape': (300, 400), + 'img_shape': (600, 800), + 'scale_factor': 2.0, + 'flip': False, + 'img': rng.rand(300, 400), + 'gt_seg_map': rng.rand(300, 400), + 'gt_masks': + BitmapMasks(rng.rand(3, 300, 400), height=300, width=400), + 'gt_bboxes_labels': rng.rand(3, ), + 'proposals': rng.rand(2, 4), + 'proposals_scores': rng.rand(2, ) + } + self.results3 = { + 'img_id': 1, + 'img_path': img_path, + 'ori_shape': (300, 400), + 'img_shape': (600, 800), + 'scale_factor': 2.0, + 'flip': False, + 'img': rng.rand(300, 400), + 'gt_seg_map': rng.rand(300, 400), + 'gt_masks': + BitmapMasks(rng.rand(3, 300, 400), height=300, width=400), + 'gt_panoptic_seg': rng.rand(1, 300, 400), + 'gt_bboxes_labels': rng.rand(3, ), + 'proposals': rng.rand(2, 4), + 'proposals_scores': rng.rand(2, ) + } + self.meta_keys = ('img_id', 'img_path', 'ori_shape', 'scale_factor', + 'flip') + + def test_transform(self): + transform = PackDetInputs(meta_keys=self.meta_keys) + results = transform(copy.deepcopy(self.results1)) + self.assertIn('data_samples', results) + self.assertIsInstance(results['data_samples'], DetDataSample) + self.assertIsInstance(results['data_samples'].gt_instances, + InstanceData) + self.assertIsInstance(results['data_samples'].ignored_instances, + InstanceData) + self.assertEqual(len(results['data_samples'].gt_instances), 2) + self.assertEqual(len(results['data_samples'].ignored_instances), 1) + self.assertIsInstance(results['data_samples'].gt_sem_seg, PixelData) + + def test_transform_without_ignore(self): + transform = PackDetInputs(meta_keys=self.meta_keys) + results = transform(copy.deepcopy(self.results2)) + self.assertIn('data_samples', results) + self.assertIsInstance(results['data_samples'], DetDataSample) + self.assertIsInstance(results['data_samples'].gt_instances, + InstanceData) + self.assertIsInstance(results['data_samples'].ignored_instances, + 
InstanceData) + self.assertEqual(len(results['data_samples'].gt_instances), 3) + self.assertEqual(len(results['data_samples'].ignored_instances), 0) + self.assertIsInstance(results['data_samples'].gt_sem_seg, PixelData) + + def test_transform_with_panoptic_seg(self): + transform = PackDetInputs(meta_keys=self.meta_keys) + results = transform(copy.deepcopy(self.results3)) + self.assertIn('data_samples', results) + self.assertIsInstance(results['data_samples'], DetDataSample) + self.assertIsInstance(results['data_samples'].gt_instances, + InstanceData) + self.assertIsInstance(results['data_samples'].ignored_instances, + InstanceData) + self.assertEqual(len(results['data_samples'].gt_instances), 3) + self.assertEqual(len(results['data_samples'].ignored_instances), 0) + self.assertIsInstance(results['data_samples'].gt_sem_seg, PixelData) + self.assertIsInstance(results['data_samples'].gt_panoptic_seg, + PixelData) + + def test_repr(self): + transform = PackDetInputs(meta_keys=self.meta_keys) + self.assertEqual( + repr(transform), f'PackDetInputs(meta_keys={self.meta_keys})') diff --git a/tests/test_datasets/test_transforms/test_mix_img_transforms.py b/tests/test_datasets/test_transforms/test_mix_img_transforms.py index 1d4eff0b4..2e9bf20e3 100644 --- a/tests/test_datasets/test_transforms/test_mix_img_transforms.py +++ b/tests/test_datasets/test_transforms/test_mix_img_transforms.py @@ -24,9 +24,7 @@ def setUp(self): tearDown() -> cleanUp() """ self.pre_transform = [ - dict( - type='LoadImageFromFile', - file_client_args=dict(backend='disk')), + dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', with_bbox=True) ] @@ -107,9 +105,7 @@ def test_transform_with_box_list(self): def test_transform_with_mask(self): rng = np.random.RandomState(0) pre_transform = [ - dict( - type='LoadImageFromFile', - file_client_args=dict(backend='disk')), + dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', with_bbox=True, with_mask=True) ] @@ -157,9 +153,7 @@ def setUp(self): """ rng = np.random.RandomState(0) self.pre_transform = [ - dict( - type='LoadImageFromFile', - file_client_args=dict(backend='disk')), + dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', with_bbox=True) ] @@ -249,9 +243,7 @@ def setUp(self): tearDown() -> cleanUp() """ self.pre_transform = [ - dict( - type='LoadImageFromFile', - file_client_args=dict(backend='disk')), + dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', with_bbox=True) ] self.dataset = YOLOv5CocoDataset( @@ -307,9 +299,7 @@ def test_transform_with_box_list(self): def test_transform_with_mask(self): rng = np.random.RandomState(0) pre_transform = [ - dict( - type='LoadImageFromFile', - file_client_args=dict(backend='disk')), + dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', with_bbox=True, with_mask=True) ] dataset = YOLOv5CocoDataset( @@ -357,9 +347,7 @@ def setUp(self): """ rng = np.random.RandomState(0) self.pre_transform = [ - dict( - type='LoadImageFromFile', - file_client_args=dict(backend='disk')), + dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', with_bbox=True) ] self.dataset = YOLOv5CocoDataset( diff --git a/tests/test_datasets/test_transforms/test_transforms.py b/tests/test_datasets/test_transforms/test_transforms.py index fc46151d4..a8b7ea49f 100644 --- a/tests/test_datasets/test_transforms/test_transforms.py +++ b/tests/test_datasets/test_transforms/test_transforms.py @@ -148,18 +148,21 @@ def test_letter_resize(self): self.assertIn('pad_param', data_info) pad_param = 
data_info['pad_param'].reshape(-1, 2).sum( 1) # (top, b, l, r) -> (h, w) - scale_factor = np.asarray( - data_info['scale_factor'])[::-1] # (w, h) -> (h, w) - scale_factor_keepratio = np.min( - np.asarray((32, 32)) / (input_h, input_w)) - validate_shape = np.floor( - np.asarray((input_h, input_w)) * scale_factor_keepratio + 0.5) - scale_factor_keepratio = np.floor(scale_factor_keepratio * - input_h + 0.5) / input_h - scale_factor_letter = (output_h, output_w) / validate_shape - scale_factor_letter = ( - scale_factor_letter - - (pad_param / validate_shape))[np.argmin(scale_factor_letter)] + scale_factor = np.asarray(data_info['scale_factor']) # (w, h) + + max_long_edge = max((32, 32)) + max_short_edge = min((32, 32)) + scale_factor_keepratio = min( + max_long_edge / max(input_h, input_w), + max_short_edge / min(input_h, input_w)) + validate_shape = np.asarray( + (int(input_h * scale_factor_keepratio), + int(input_w * scale_factor_keepratio))) + scale_factor_keepratio = np.asarray( + (validate_shape[1] / input_w, validate_shape[0] / input_h)) + + scale_factor_letter = ((np.asarray( + (output_h, output_w)) - pad_param) / validate_shape)[::-1] self.assertTrue(data_info['img_shape'][:2] == (output_h, output_w)) self.assertTrue((scale_factor == (scale_factor_keepratio * scale_factor_letter)).all()) diff --git a/tests/test_deploy/data/model.py b/tests/test_deploy/data/model.py index cf13167eb..817f8e22c 100644 --- a/tests/test_deploy/data/model.py +++ b/tests/test_deploy/data/model.py @@ -27,8 +27,6 @@ load_from = None resume = False -file_client_args = dict(backend='disk') - # dataset settings data_root = 'data/coco/' dataset_type = 'YOLOv5CocoDataset' @@ -133,7 +131,7 @@ ] pre_transform = [ - dict(type='LoadImageFromFile', file_client_args=file_client_args), + dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', with_bbox=True) ] @@ -185,7 +183,7 @@ pipeline=train_pipeline)) test_pipeline = [ - dict(type='LoadImageFromFile', file_client_args=file_client_args), + dict(type='LoadImageFromFile'), dict(type='YOLOv5KeepRatioResize', scale=img_scale), dict( type='LetterResize', diff --git a/tests/test_models/test_dense_heads/test_yolov5_head.py b/tests/test_models/test_dense_heads/test_yolov5_head.py index 31b399bf4..974b9a986 100644 --- a/tests/test_models/test_dense_heads/test_yolov5_head.py +++ b/tests/test_models/test_dense_heads/test_yolov5_head.py @@ -1,11 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. 
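# A worked instance of the reworked scale_factor expectation in
# test_letter_resize above. The 64 x 128 input and the (16, 0) height/width
# padding are illustrative numbers chosen to keep the arithmetic easy to
# follow; they are not taken from the test data itself.
import numpy as np

input_h, input_w = 64, 128
output_h, output_w = 32, 32
pad_param = np.array([16, 0])  # total (pad_h, pad_w) added by LetterResize

max_long_edge = max((output_h, output_w))
max_short_edge = min((output_h, output_w))
ratio = min(max_long_edge / max(input_h, input_w),
            max_short_edge / min(input_h, input_w))               # 0.25
validate_shape = np.array(
    (int(input_h * ratio), int(input_w * ratio)))                 # (16, 32)
scale_factor_keepratio = np.array(
    (validate_shape[1] / input_w, validate_shape[0] / input_h))   # (0.25, 0.25)
scale_factor_letter = ((np.array((output_h, output_w)) - pad_param) /
                       validate_shape)[::-1]                      # (1.0, 1.0)
# the transform is expected to report the product of the two stages
assert np.allclose(scale_factor_keepratio * scale_factor_letter, (0.25, 0.25))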
from unittest import TestCase +import numpy as np import torch from mmengine.config import Config from mmengine.structures import InstanceData -from mmyolo.models.dense_heads import YOLOv5Head +from mmyolo.models.dense_heads import YOLOv5Head, YOLOv5InsHead from mmyolo.utils import register_all_modules register_all_modules() @@ -234,3 +235,177 @@ def test_loss_by_feat_with_ignore(self): 'box loss should be non-zero') self.assertGreater(onegt_obj_loss.item(), 0, 'obj loss should be non-zero') + + +class TestYOLOv5InsHead(TestCase): + + def setUp(self): + self.head_module = dict( + type='YOLOv5InsHeadModule', + num_classes=4, + in_channels=[32, 64, 128], + featmap_strides=[8, 16, 32], + mask_channels=32, + proto_channels=32, + widen_factor=1.0) + + def test_init_weights(self): + head = YOLOv5InsHead(head_module=self.head_module) + head.head_module.init_weights() + + def test_predict_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'ori_shape': (s, s, 3), + 'batch_input_shape': (s, s), + 'scale_factor': (1.0, 1.0), + }] + test_cfg = Config( + dict( + multi_label=True, + nms_pre=30000, + min_bbox_size=0, + score_thr=0.001, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=300, + mask_thr_binary=0.5)) + + head = YOLOv5InsHead(head_module=self.head_module, test_cfg=test_cfg) + head.eval() + + feat = [] + for i in range(len(self.head_module['in_channels'])): + in_channel = self.head_module['in_channels'][i] + feat_size = self.head_module['featmap_strides'][i] + feat.append( + torch.rand(1, in_channel, s // feat_size, s // feat_size)) + + with torch.no_grad(): + res = head.forward(feat) + cls_scores, bbox_preds, objectnesses,\ + coeff_preds, proto_preds = res + head.predict_by_feat( + cls_scores, + bbox_preds, + objectnesses, + coeff_preds, + proto_preds, + img_metas, + cfg=test_cfg, + rescale=True, + with_nms=True) + + with self.assertRaises(AssertionError): + head.predict_by_feat( + cls_scores, + bbox_preds, + coeff_preds, + proto_preds, + img_metas, + cfg=test_cfg, + rescale=True, + with_nms=False) + + def test_loss_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'batch_input_shape': (s, s), + 'scale_factor': 1, + }] + + head = YOLOv5InsHead(head_module=self.head_module) + rng = np.random.RandomState(0) + + feat = [] + for i in range(len(self.head_module['in_channels'])): + in_channel = self.head_module['in_channels'][i] + feat_size = self.head_module['featmap_strides'][i] + feat.append( + torch.rand(1, in_channel, s // feat_size, s // feat_size)) + + cls_scores, bbox_preds, objectnesses,\ + coeff_preds, proto_preds = head.forward(feat) + + # Test that empty ground truth encourages the network to predict + # background + gt_bboxes_labels = torch.empty((0, 6)) + gt_masks = rng.rand(0, s // 4, s // 4) + + empty_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, + objectnesses, coeff_preds, + proto_preds, gt_bboxes_labels, + gt_masks, img_metas) + # When there is no truth, the cls loss should be nonzero but there + # should be no box loss. 
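# For reference, a sketch of the ground-truth layout used throughout these
# instance segmentation tests: each row of gt_bboxes_labels is
# [batch_idx, class_label, x1, y1, x2, y2], so the empty target above is a
# (0, 6) tensor, and gt_masks holds one mask per instance at one quarter of
# the input resolution (s // 4 = 64 for s = 256). Since this head computes
# classification, box and mask losses only on positive matches, all three are
# expected to be exactly zero for an empty target and only the objectness
# loss stays non-zero, which is what the assertions below check.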
+        empty_cls_loss = empty_gt_losses['loss_cls'].sum()
+        empty_box_loss = empty_gt_losses['loss_bbox'].sum()
+        empty_obj_loss = empty_gt_losses['loss_obj'].sum()
+        empty_mask_loss = empty_gt_losses['loss_mask'].sum()
+        self.assertEqual(
+            empty_cls_loss.item(), 0,
+            'there should be no cls loss when there are no true boxes')
+        self.assertEqual(
+            empty_box_loss.item(), 0,
+            'there should be no box loss when there are no true boxes')
+        self.assertGreater(empty_obj_loss.item(), 0,
+                           'objectness loss should be non-zero')
+        self.assertEqual(
+            empty_mask_loss.item(), 0,
+            'there should be no mask loss when there are no true masks')
+
+        # When truth is non-empty then both cls and box loss should be nonzero
+        # for random inputs
+        head = YOLOv5InsHead(head_module=self.head_module)
+
+        bboxes = torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]])
+        labels = torch.Tensor([1.])
+        batch_id = torch.LongTensor([0])
+        gt_bboxes_labels = torch.cat([batch_id[None], labels[None], bboxes],
+                                     dim=1)
+        gt_masks = torch.from_numpy(rng.rand(1, s // 4, s // 4)).int()
+
+        one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, objectnesses,
+                                          coeff_preds, proto_preds,
+                                          gt_bboxes_labels, gt_masks,
+                                          img_metas)
+        onegt_cls_loss = one_gt_losses['loss_cls'].sum()
+        onegt_box_loss = one_gt_losses['loss_bbox'].sum()
+        onegt_obj_loss = one_gt_losses['loss_obj'].sum()
+        onegt_mask_loss = one_gt_losses['loss_mask'].sum()
+        self.assertGreater(onegt_cls_loss.item(), 0,
+                           'cls loss should be non-zero')
+        self.assertGreater(onegt_box_loss.item(), 0,
+                           'box loss should be non-zero')
+        self.assertGreater(onegt_obj_loss.item(), 0,
+                           'obj loss should be non-zero')
+        self.assertGreater(onegt_mask_loss.item(), 0,
+                           'mask loss should be non-zero')
+
+        # test num_classes = 1
+        self.head_module['num_classes'] = 1
+        head = YOLOv5InsHead(head_module=self.head_module)
+        bboxes = torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]])
+        labels = torch.Tensor([1.])
+        batch_id = torch.LongTensor([0])
+        gt_bboxes_labels = torch.cat([batch_id[None], labels[None], bboxes],
+                                     dim=1)
+        gt_masks = torch.from_numpy(rng.rand(1, s // 4, s // 4)).int()
+
+        one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, objectnesses,
+                                          coeff_preds, proto_preds,
+                                          gt_bboxes_labels, gt_masks,
+                                          img_metas)
+        onegt_cls_loss = one_gt_losses['loss_cls'].sum()
+        onegt_box_loss = one_gt_losses['loss_bbox'].sum()
+        onegt_obj_loss = one_gt_losses['loss_obj'].sum()
+        onegt_mask_loss = one_gt_losses['loss_mask'].sum()
+        self.assertEqual(onegt_cls_loss.item(), 0,
+                         'there should be no cls loss when num_classes == 1')
+        self.assertGreater(onegt_box_loss.item(), 0,
+                           'box loss should be non-zero')
+        self.assertGreater(onegt_obj_loss.item(), 0,
+                           'obj loss should be non-zero')
+        self.assertGreater(onegt_mask_loss.item(), 0,
+                           'mask loss should be non-zero')
diff --git a/tests/test_models/test_dense_heads/test_yolov6_head.py b/tests/test_models/test_dense_heads/test_yolov6_head.py
index 47401d023..5bb951d12 100644
--- a/tests/test_models/test_dense_heads/test_yolov6_head.py
+++ b/tests/test_models/test_dense_heads/test_yolov6_head.py
@@ -34,6 +34,7 @@ def test_predict_by_feat(self):
                 nms=dict(type='nms', iou_threshold=0.65)))
 
         head = YOLOv6Head(head_module=self.head_module, test_cfg=test_cfg)
+        head.eval()
 
         feat = []
         for i in range(len(self.head_module['in_channels'])):
diff --git a/tests/test_models/test_dense_heads/test_yolox_head.py b/tests/test_models/test_dense_heads/test_yolox_head.py
index 60e0abe9d..390994417 100644
--- a/tests/test_models/test_dense_heads/test_yolox_head.py
+++ 
b/tests/test_models/test_dense_heads/test_yolox_head.py @@ -6,7 +6,7 @@ from mmengine.model import bias_init_with_prob from mmengine.testing import assert_allclose -from mmyolo.models.dense_heads import YOLOXHead +from mmyolo.models.dense_heads import YOLOXHead, YOLOXPoseHead from mmyolo.utils import register_all_modules register_all_modules() @@ -157,3 +157,223 @@ def test_loss_by_feat(self): 'there should be no box loss when gt_bboxes out of bound') self.assertGreater(empty_obj_loss.item(), 0, 'objectness loss should be non-zero') + + +class TestYOLOXPoseHead(TestCase): + + def setUp(self): + self.head_module = dict( + type='YOLOXPoseHeadModule', + num_classes=1, + num_keypoints=17, + in_channels=1, + stacked_convs=1, + ) + self.train_cfg = Config( + dict( + assigner=dict( + type='PoseSimOTAAssigner', + center_radius=2.5, + oks_weight=3.0, + iou_calculator=dict(type='mmdet.BboxOverlaps2D'), + oks_calculator=dict( + type='OksLoss', + metainfo='configs/_base_/pose/coco.py')))) + self.loss_pose = Config( + dict( + type='OksLoss', + metainfo='configs/_base_/pose/coco.py', + loss_weight=30.0)) + + def test_init_weights(self): + head = YOLOXPoseHead( + head_module=self.head_module, + loss_pose=self.loss_pose, + train_cfg=self.train_cfg) + head.head_module.init_weights() + bias_init = bias_init_with_prob(0.01) + for conv_cls, conv_obj, conv_vis in zip( + head.head_module.multi_level_conv_cls, + head.head_module.multi_level_conv_obj, + head.head_module.multi_level_conv_vis): + assert_allclose(conv_cls.bias.data, + torch.ones_like(conv_cls.bias.data) * bias_init) + assert_allclose(conv_obj.bias.data, + torch.ones_like(conv_obj.bias.data) * bias_init) + assert_allclose(conv_vis.bias.data, + torch.ones_like(conv_vis.bias.data) * bias_init) + + def test_predict_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'ori_shape': (s, s, 3), + 'scale_factor': (1.0, 1.0), + }] + test_cfg = Config( + dict( + multi_label=True, + max_per_img=300, + score_thr=0.01, + nms=dict(type='nms', iou_threshold=0.65))) + + head = YOLOXPoseHead( + head_module=self.head_module, + loss_pose=self.loss_pose, + train_cfg=self.train_cfg, + test_cfg=test_cfg) + feat = [ + torch.rand(1, 1, s // feat_size, s // feat_size) + for feat_size in [4, 8, 16] + ] + cls_scores, bbox_preds, objectnesses, \ + offsets_preds, vis_preds = head.forward(feat) + head.predict_by_feat( + cls_scores, + bbox_preds, + objectnesses, + offsets_preds, + vis_preds, + img_metas, + cfg=test_cfg, + rescale=True, + with_nms=True) + + def test_loss_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'scale_factor': 1, + }] + + head = YOLOXPoseHead( + head_module=self.head_module, + loss_pose=self.loss_pose, + train_cfg=self.train_cfg) + assert not head.use_bbox_aux + + feat = [ + torch.rand(1, 1, s // feat_size, s // feat_size) + for feat_size in [4, 8, 16] + ] + cls_scores, bbox_preds, objectnesses, \ + offsets_preds, vis_preds = head.forward(feat) + + # Test that empty ground truth encourages the network to predict + # background + gt_instances = torch.empty((0, 6)) + gt_keypoints = torch.empty((0, 17, 2)) + gt_keypoints_visible = torch.empty((0, 17)) + + empty_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, + objectnesses, offsets_preds, + vis_preds, gt_instances, + gt_keypoints, gt_keypoints_visible, + img_metas) + # When there is no truth, the cls loss should be nonzero but there + # should be no box loss. 
+ empty_cls_loss = empty_gt_losses['loss_cls'].sum() + empty_box_loss = empty_gt_losses['loss_bbox'].sum() + empty_obj_loss = empty_gt_losses['loss_obj'].sum() + empty_loss_kpt = empty_gt_losses['loss_kpt'].sum() + empty_loss_vis = empty_gt_losses['loss_vis'].sum() + self.assertEqual( + empty_cls_loss.item(), 0, + 'there should be no cls loss when there are no true boxes') + self.assertEqual( + empty_box_loss.item(), 0, + 'there should be no box loss when there are no true boxes') + self.assertGreater(empty_obj_loss.item(), 0, + 'objectness loss should be non-zero') + self.assertEqual( + empty_loss_kpt.item(), 0, + 'there should be no kpt loss when there are no true keypoints') + self.assertEqual( + empty_loss_vis.item(), 0, + 'there should be no vis loss when there are no true keypoints') + # When truth is non-empty then both cls and box loss should be nonzero + # for random inputs + head = YOLOXPoseHead( + head_module=self.head_module, + loss_pose=self.loss_pose, + train_cfg=self.train_cfg) + gt_instances = torch.Tensor( + [[0, 0, 23.6667, 23.8757, 238.6326, 151.8874]]) + gt_keypoints = torch.Tensor([[[317.1519, + 429.8433], [338.3080, 416.9187], + [298.9951, + 403.8911], [102.7025, 273.1329], + [255.4321, + 404.8712], [400.0422, 554.4373], + [167.7857, + 516.7591], [397.4943, 737.4575], + [116.3247, + 674.5684], [102.7025, 273.1329], + [66.0319, + 808.6383], [102.7025, 273.1329], + [157.6150, + 819.1249], [102.7025, 273.1329], + [102.7025, + 273.1329], [102.7025, 273.1329], + [102.7025, 273.1329]]]) + gt_keypoints_visible = torch.Tensor([[ + 1., 1., 1., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0. + ]]) + + one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, objectnesses, + offsets_preds, vis_preds, + gt_instances, gt_keypoints, + gt_keypoints_visible, img_metas) + onegt_cls_loss = one_gt_losses['loss_cls'].sum() + onegt_box_loss = one_gt_losses['loss_bbox'].sum() + onegt_obj_loss = one_gt_losses['loss_obj'].sum() + onegt_loss_kpt = one_gt_losses['loss_kpt'].sum() + onegt_loss_vis = one_gt_losses['loss_vis'].sum() + + self.assertGreater(onegt_cls_loss.item(), 0, + 'cls loss should be non-zero') + self.assertGreater(onegt_box_loss.item(), 0, + 'box loss should be non-zero') + self.assertGreater(onegt_obj_loss.item(), 0, + 'obj loss should be non-zero') + self.assertGreater(onegt_loss_kpt.item(), 0, + 'kpt loss should be non-zero') + self.assertGreater(onegt_loss_vis.item(), 0, + 'vis loss should be non-zero') + + # Test groud truth out of bound + gt_instances = torch.Tensor( + [[0, 2, s * 4, s * 4, s * 4 + 10, s * 4 + 10]]) + gt_keypoints = torch.Tensor([[[s * 4, s * 4 + 10], [s * 4, s * 4 + 10], + [s * 4, s * 4 + 10], [s * 4, s * 4 + 10], + [s * 4, s * 4 + 10], [s * 4, s * 4 + 10], + [s * 4, s * 4 + 10], [s * 4, s * 4 + 10], + [s * 4, s * 4 + 10], [s * 4, s * 4 + 10], + [s * 4, s * 4 + 10], [s * 4, s * 4 + 10], + [s * 4, s * 4 + 10], [s * 4, s * 4 + 10], + [s * 4, s * 4 + 10], [s * 4, s * 4 + 10], + [s * 4, s * 4 + 10]]]) + empty_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, + objectnesses, offsets_preds, + vis_preds, gt_instances, + gt_keypoints, gt_keypoints_visible, + img_metas) + # When gt_bboxes out of bound, the assign results should be empty, + # so the cls and bbox loss should be zero. 
+        empty_cls_loss = empty_gt_losses['loss_cls'].sum()
+        empty_box_loss = empty_gt_losses['loss_bbox'].sum()
+        empty_obj_loss = empty_gt_losses['loss_obj'].sum()
+        empty_kpt_loss = empty_gt_losses['loss_kpt'].sum()
+        empty_vis_loss = empty_gt_losses['loss_vis'].sum()
+        self.assertEqual(
+            empty_cls_loss.item(), 0,
+            'there should be no cls loss when gt_bboxes out of bound')
+        self.assertEqual(
+            empty_box_loss.item(), 0,
+            'there should be no box loss when gt_bboxes out of bound')
+        self.assertGreater(empty_obj_loss.item(), 0,
+                           'objectness loss should be non-zero')
+        self.assertEqual(
+            empty_kpt_loss.item(), 0,
+            'there should be no kpt loss when gt_bboxes out of bound')
+        self.assertEqual(
+            empty_vis_loss.item(), 0,
+            'there should be no vis loss when gt_bboxes out of bound')
diff --git a/tests/test_models/test_necks/test_yolov6_pafpn.py b/tests/test_models/test_necks/test_yolov6_pafpn.py
index bea49febe..e766aa870 100644
--- a/tests/test_models/test_necks/test_yolov6_pafpn.py
+++ b/tests/test_models/test_necks/test_yolov6_pafpn.py
@@ -3,7 +3,8 @@
 
 import torch
 
-from mmyolo.models.necks import YOLOv6CSPRepPAFPN, YOLOv6RepPAFPN
+from mmyolo.models.necks import (YOLOv6CSPRepBiPAFPN, YOLOv6CSPRepPAFPN,
+                                 YOLOv6RepBiPAFPN, YOLOv6RepPAFPN)
 from mmyolo.utils import register_all_modules
 
 register_all_modules()
@@ -44,3 +45,37 @@ def test_YOLOv6CSPRepPAFPN_forward(self):
         for i in range(len(feats)):
             assert outs[i].shape[1] == out_channels[i]
             assert outs[i].shape[2] == outs[i].shape[3] == s // (2**i)
+
+    def test_YOLOv6CSPRepBiPAFPN_forward(self):
+        s = 64
+        in_channels = [4, 8, 16, 32]  # includes an extra input for BiFusion
+        feat_sizes = [s // 2**i for i in range(4)]  # [64, 32, 16, 8]
+        out_channels = [8, 16, 32]
+        feats = [
+            torch.rand(1, in_channels[i], feat_sizes[i], feat_sizes[i])
+            for i in range(len(in_channels))
+        ]
+        neck = YOLOv6CSPRepBiPAFPN(
+            in_channels=in_channels, out_channels=out_channels)
+        outs = neck(feats)
+        assert len(outs) == len(feats) - 1
+        for i in range(len(feats) - 1):
+            assert outs[i].shape[1] == out_channels[i]
+            assert outs[i].shape[2] == outs[i].shape[3] == feat_sizes[i + 1]
+
+    def test_YOLOv6RepBiPAFPN_forward(self):
+        s = 64
+        in_channels = [4, 8, 16, 32]  # includes an extra input for BiFusion
+        feat_sizes = [s // 2**i for i in range(4)]  # [64, 32, 16, 8]
+        out_channels = [8, 16, 32]
+        feats = [
+            torch.rand(1, in_channels[i], feat_sizes[i], feat_sizes[i])
+            for i in range(len(in_channels))
+        ]
+        neck = YOLOv6RepBiPAFPN(
+            in_channels=in_channels, out_channels=out_channels)
+        outs = neck(feats)
+        assert len(outs) == len(feats) - 1
+        for i in range(len(feats) - 1):
+            assert outs[i].shape[1] == out_channels[i]
+            assert outs[i].shape[2] == outs[i].shape[3] == feat_sizes[i + 1]
diff --git a/tests/test_models/test_task_modules/test_assigners/test_pose_sim_ota_assigner.py b/tests/test_models/test_task_modules/test_assigners/test_pose_sim_ota_assigner.py
new file mode 100644
index 000000000..fb4793f7e
--- /dev/null
+++ b/tests/test_models/test_task_modules/test_assigners/test_pose_sim_ota_assigner.py
@@ -0,0 +1,85 @@
+# Copyright (c) OpenMMLab. All rights reserved.
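+# Unit tests for PoseSimOTAAssigner: the three cases below cover a normal
+# assignment, predictions with no valid bboxes, and an empty ground truth,
+# each checking the resulting gt_inds.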
+from unittest import TestCase + +import torch +from mmengine.structures import InstanceData +from mmengine.testing import assert_allclose + +from mmyolo.models.task_modules.assigners import PoseSimOTAAssigner + + +class TestPoseSimOTAAssigner(TestCase): + + def test_assign(self): + assigner = PoseSimOTAAssigner( + center_radius=2.5, + candidate_topk=1, + iou_weight=3.0, + cls_weight=1.0, + iou_calculator=dict(type='mmdet.BboxOverlaps2D')) + pred_instances = InstanceData( + bboxes=torch.Tensor([[23, 23, 43, 43] + [1] * 51, + [4, 5, 6, 7] + [1] * 51]), + scores=torch.FloatTensor([[0.2], [0.8]]), + priors=torch.Tensor([[30, 30, 8, 8], [4, 5, 6, 7]])) + gt_instances = InstanceData( + bboxes=torch.Tensor([[23, 23, 43, 43]]), + labels=torch.LongTensor([0]), + keypoints_visible=torch.Tensor([[ + 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 0., 0., 0., 0., + 0. + ]]), + keypoints=torch.Tensor([[[30, 30], [30, 30], [30, 30], [30, 30], + [30, 30], [30, 30], [30, 30], [30, 30], + [30, 30], [30, 30], [30, 30], [30, 30], + [30, 30], [30, 30], [30, 30], [30, 30], + [30, 30]]])) + assign_result = assigner.assign( + pred_instances=pred_instances, gt_instances=gt_instances) + + expected_gt_inds = torch.LongTensor([1, 0]) + assert_allclose(assign_result.gt_inds, expected_gt_inds) + + def test_assign_with_no_valid_bboxes(self): + assigner = PoseSimOTAAssigner( + center_radius=2.5, + candidate_topk=1, + iou_weight=3.0, + cls_weight=1.0, + iou_calculator=dict(type='mmdet.BboxOverlaps2D')) + pred_instances = InstanceData( + bboxes=torch.Tensor([[123, 123, 143, 143], [114, 151, 161, 171]]), + scores=torch.FloatTensor([[0.2], [0.8]]), + priors=torch.Tensor([[30, 30, 8, 8], [55, 55, 8, 8]])) + gt_instances = InstanceData( + bboxes=torch.Tensor([[0, 0, 1, 1]]), + labels=torch.LongTensor([0]), + keypoints_visible=torch.zeros((1, 17)), + keypoints=torch.zeros((1, 17, 2))) + assign_result = assigner.assign( + pred_instances=pred_instances, gt_instances=gt_instances) + + expected_gt_inds = torch.LongTensor([0, 0]) + assert_allclose(assign_result.gt_inds, expected_gt_inds) + + def test_assign_with_empty_gt(self): + assigner = PoseSimOTAAssigner( + center_radius=2.5, + candidate_topk=1, + iou_weight=3.0, + cls_weight=1.0, + iou_calculator=dict(type='mmdet.BboxOverlaps2D')) + pred_instances = InstanceData( + bboxes=torch.Tensor([[[30, 40, 50, 60]], [[4, 5, 6, 7]]]), + scores=torch.FloatTensor([[0.2], [0.8]]), + priors=torch.Tensor([[0, 12, 23, 34], [4, 5, 6, 7]])) + gt_instances = InstanceData( + bboxes=torch.empty(0, 4), + labels=torch.empty(0), + keypoints_visible=torch.empty(0, 17), + keypoints=torch.empty(0, 17, 2)) + + assign_result = assigner.assign( + pred_instances=pred_instances, gt_instances=gt_instances) + expected_gt_inds = torch.LongTensor([0, 0]) + assert_allclose(assign_result.gt_inds, expected_gt_inds) diff --git a/tools/analysis_tools/browse_dataset.py b/tools/analysis_tools/browse_dataset.py index fc8b52c56..21a1d709d 100644 --- a/tools/analysis_tools/browse_dataset.py +++ b/tools/analysis_tools/browse_dataset.py @@ -19,6 +19,7 @@ # TODO: Support for printing the change in key of results +# TODO: Some bug. 
If you meet some bug, please use the original def parse_args(): parser = argparse.ArgumentParser(description='Browse a dataset') parser.add_argument('config', help='train config file path') @@ -140,6 +141,18 @@ def make_grid(imgs, names): return visualizer.get_image() +def swap_pipeline_position(dataset_cfg): + load_ann_tfm_name = 'LoadAnnotations' + pipeline = dataset_cfg.get('pipeline') + if (pipeline is None): + return dataset_cfg + all_transform_types = [tfm['type'] for tfm in pipeline] + if load_ann_tfm_name in all_transform_types: + load_ann_tfm_index = all_transform_types.index(load_ann_tfm_name) + load_ann_tfm = pipeline.pop(load_ann_tfm_index) + pipeline.insert(1, load_ann_tfm) + + class InspectCompose(Compose): """Compose multiple transforms sequentially. @@ -185,6 +198,8 @@ def main(): init_default_scope(cfg.get('default_scope', 'mmyolo')) dataset_cfg = cfg.get(args.phase + '_dataloader').get('dataset') + if (args.phase in ['test', 'val']): + swap_pipeline_position(dataset_cfg) dataset = DATASETS.build(dataset_cfg) visualizer = VISUALIZERS.build(cfg.visualizer) visualizer.dataset_meta = dataset.metainfo diff --git a/tools/analysis_tools/browse_dataset_simple.py b/tools/analysis_tools/browse_dataset_simple.py new file mode 100644 index 000000000..ebacbde3a --- /dev/null +++ b/tools/analysis_tools/browse_dataset_simple.py @@ -0,0 +1,89 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os.path as osp + +from mmdet.models.utils import mask2ndarray +from mmdet.structures.bbox import BaseBoxes +from mmengine.config import Config, DictAction +from mmengine.registry import init_default_scope +from mmengine.utils import ProgressBar + +from mmyolo.registry import DATASETS, VISUALIZERS + + +def parse_args(): + parser = argparse.ArgumentParser(description='Browse a dataset') + parser.add_argument('config', help='train config file path') + parser.add_argument( + '--output-dir', + default=None, + type=str, + help='If there is no display interface, you can save it') + parser.add_argument('--not-show', default=False, action='store_true') + parser.add_argument( + '--show-interval', + type=float, + default=0, + help='the interval of show (s)') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + # register all modules in mmdet into the registries + init_default_scope(cfg.get('default_scope', 'mmyolo')) + + dataset = DATASETS.build(cfg.train_dataloader.dataset) + visualizer = VISUALIZERS.build(cfg.visualizer) + visualizer.dataset_meta = dataset.metainfo + + progress_bar = ProgressBar(len(dataset)) + for item in dataset: + img = item['inputs'].permute(1, 2, 0).numpy() + data_sample = item['data_samples'].numpy() + gt_instances = data_sample.gt_instances + img_path = osp.basename(item['data_samples'].img_path) + + out_file = osp.join( + args.output_dir, + osp.basename(img_path)) if args.output_dir is not None else None + + img = img[..., [2, 1, 0]] # bgr to rgb + gt_bboxes = gt_instances.get('bboxes', None) + if gt_bboxes is not None and isinstance(gt_bboxes, BaseBoxes): + gt_instances.bboxes = gt_bboxes.tensor + gt_masks = gt_instances.get('masks', None) + if gt_masks is not None: + masks = mask2ndarray(gt_masks) + gt_instances.masks = masks.astype(bool) + data_sample.gt_instances = gt_instances + + visualizer.add_datasample( + osp.basename(img_path), + img, + data_sample, + draw_pred=False, + show=not args.not_show, + wait_time=args.show_interval, + out_file=out_file) + + progress_bar.update() + + +if __name__ == '__main__': + main() diff --git a/tools/model_converters/convert_kd_ckpt_to_student.py b/tools/model_converters/convert_kd_ckpt_to_student.py index e44f66d02..d2f787e47 100644 --- a/tools/model_converters/convert_kd_ckpt_to_student.py +++ b/tools/model_converters/convert_kd_ckpt_to_student.py @@ -1,8 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. 
 import argparse
+import os.path as osp
 from pathlib import Path
 
-import torch
+from mmengine.runner import CheckpointLoader, save_checkpoint
+from mmengine.utils import mkdir_or_exist
 
 
 def parse_args():
@@ -18,7 +20,8 @@ def parse_args():
 
 def main():
     args = parse_args()
-    checkpoint = torch.load(args.checkpoint, map_location='cpu')
+    checkpoint = CheckpointLoader.load_checkpoint(
+        args.checkpoint, map_location='cpu')
 
     new_state_dict = dict()
     new_meta = checkpoint['meta']
@@ -32,7 +35,9 @@ def main():
     checkpoint['state_dict'] = new_state_dict
 
     if args.inplace:
-        torch.save(checkpoint, args.checkpoint)
+        assert osp.exists(args.checkpoint), \
+            f'cannot find the checkpoint path: {args.checkpoint}'
+        save_checkpoint(checkpoint, args.checkpoint)
     else:
         ckpt_path = Path(args.checkpoint)
         ckpt_name = ckpt_path.stem
@@ -40,8 +45,9 @@ def main():
             ckpt_dir = Path(args.out_path)
         else:
             ckpt_dir = ckpt_path.parent
-        new_ckpt_path = ckpt_dir / f'{ckpt_name}_student.pth'
-        torch.save(checkpoint, new_ckpt_path)
+        mkdir_or_exist(ckpt_dir)
+        new_ckpt_path = osp.join(ckpt_dir, f'{ckpt_name}_student.pth')
+        save_checkpoint(checkpoint, new_ckpt_path)
 
 
 if __name__ == '__main__':
diff --git a/tools/model_converters/yolov5_to_mmyolo.py b/tools/model_converters/yolov5_to_mmyolo.py
index c1d4e41d4..a4e62a2f7 100644
--- a/tools/model_converters/yolov5_to_mmyolo.py
+++ b/tools/model_converters/yolov5_to_mmyolo.py
@@ -25,6 +25,7 @@
     'model.21': 'neck.downsample_layers.1',
     'model.23': 'neck.bottom_up_layers.1',
     'model.24.m': 'bbox_head.head_module.convs_pred',
+    'model.24.proto': 'bbox_head.head_module.proto_preds',
 }
 
 convert_dict_p6 = {
@@ -54,6 +55,7 @@
     'model.30': 'neck.downsample_layers.2',
     'model.32': 'neck.bottom_up_layers.2',
     'model.33.m': 'bbox_head.head_module.convs_pred',
+    'model.33.proto': 'bbox_head.head_module.proto_preds',
 }
 
 
@@ -94,6 +96,10 @@ def convert(src, dst):
         if '.m.' in new_key:
             new_key = new_key.replace('.m.', '.blocks.')
             new_key = new_key.replace('.cv', '.conv')
+        elif 'bbox_head.head_module.proto_preds.cv' in new_key:
+            new_key = new_key.replace(
+                'bbox_head.head_module.proto_preds.cv',
+                'bbox_head.head_module.proto_preds.conv')
         else:
             new_key = new_key.replace('.cv1', '.main_conv')
             new_key = new_key.replace('.cv2', '.short_conv')
diff --git a/tools/model_converters/yolov5u_to_mmyolo.py b/tools/model_converters/yolov5u_to_mmyolo.py
new file mode 100644
index 000000000..806c76cb4
--- /dev/null
+++ b/tools/model_converters/yolov5u_to_mmyolo.py
@@ -0,0 +1,88 @@
+# Copyright (c) OpenMMLab. All rights reserved.
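+# Key-mapping converter for ultralytics YOLOv5u checkpoints: convert_dict_p5
+# below maps ultralytics layer indices ('model.0' ... 'model.24') to the
+# corresponding mmyolo module paths (backbone / neck / bbox_head).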
+import argparse +from collections import OrderedDict + +import torch + +convert_dict_p5 = { + 'model.0': 'backbone.stem', + 'model.1': 'backbone.stage1.0', + 'model.2': 'backbone.stage1.1', + 'model.3': 'backbone.stage2.0', + 'model.4': 'backbone.stage2.1', + 'model.5': 'backbone.stage3.0', + 'model.6': 'backbone.stage3.1', + 'model.7': 'backbone.stage4.0', + 'model.8': 'backbone.stage4.1', + 'model.9': 'backbone.stage4.2', + 'model.10': 'neck.reduce_layers.2', + 'model.13': 'neck.top_down_layers.0.0', + 'model.14': 'neck.top_down_layers.0.1', + 'model.17': 'neck.top_down_layers.1', + 'model.18': 'neck.downsample_layers.0', + 'model.20': 'neck.bottom_up_layers.0', + 'model.21': 'neck.downsample_layers.1', + 'model.23': 'neck.bottom_up_layers.1', + 'model.24': 'bbox_head.head_module', +} + + +def convert(src, dst): + """Convert keys in pretrained YOLOv5u models to mmyolo style.""" + convert_dict = convert_dict_p5 + + print('Converting P5 model') + try: + yolov5_model = torch.load(src)['model'] + blobs = yolov5_model.state_dict() + except ModuleNotFoundError: + raise RuntimeError( + 'This script must be placed under the ultralytics repo,' + ' because loading the official pretrained model need' + ' `model.py` to build model.') + state_dict = OrderedDict() + + for key, weight in blobs.items(): + + num, module = key.split('.')[1:3] + prefix = f'model.{num}' + new_key = key.replace(prefix, convert_dict[prefix]) + + if '.m.' in new_key: + new_key = new_key.replace('.m.', '.blocks.') + new_key = new_key.replace('.cv', '.conv') + elif 'bbox_head.head_module' in new_key: + new_key = new_key.replace('.cv2', '.reg_preds') + new_key = new_key.replace('.cv3', '.cls_preds') + elif 'backbone.stage4.2' in new_key: + new_key = new_key.replace('.cv', '.conv') + else: + new_key = new_key.replace('.cv1', '.main_conv') + new_key = new_key.replace('.cv2', '.short_conv') + new_key = new_key.replace('.cv3', '.final_conv') + + if 'bbox_head.head_module.dfl.conv.weight' == new_key: + print('Drop "bbox_head.head_module.dfl.conv.weight", ' + 'because it is useless') + continue + state_dict[new_key] = weight + print(f'Convert {key} to {new_key}') + + # save checkpoint + checkpoint = dict() + checkpoint['state_dict'] = state_dict + torch.save(checkpoint, dst) + + +# Note: This script must be placed under the ultralytics repo to run. 
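+# Example usage sketch (the file names below are just the defaults declared
+# in main(); copy this script into the ultralytics repo root first):
+#   python yolov5u_to_mmyolo.py --src yolov5su.pt --dst mmyolov5su.pth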
+def main(): + parser = argparse.ArgumentParser(description='Convert model keys') + parser.add_argument( + '--src', default='yolov5su.pt', help='src yolov5u model path') + parser.add_argument('--dst', default='mmyolov5su.pth', help='save path') + args = parser.parse_args() + convert(args.src, args.dst) + + +if __name__ == '__main__': + main() diff --git a/tools/model_converters/yolov6_v3_to_mmyolo.py b/tools/model_converters/yolov6_v3_to_mmyolo.py new file mode 100644 index 000000000..bc8766424 --- /dev/null +++ b/tools/model_converters/yolov6_v3_to_mmyolo.py @@ -0,0 +1,145 @@ +import argparse +from collections import OrderedDict + +import torch + + +def convert(src, dst): + import sys + sys.path.append('yolov6') + try: + ckpt = torch.load(src, map_location=torch.device('cpu')) + except ModuleNotFoundError: + raise RuntimeError( + 'This script must be placed under the meituan/YOLOv6 repo,' + ' because loading the official pretrained model need' + ' some python files to build model.') + # The saved model is the model before reparameterization + model = ckpt['ema' if ckpt.get('ema') else 'model'].float() + new_state_dict = OrderedDict() + is_ns = False + for k, v in model.state_dict().items(): + name = k + if 'detect' in k: + if 'proj' in k: + continue + if 'reg_preds_lrtb' in k: + is_ns = True + name = k.replace('detect', 'bbox_head.head_module') + if k.find('anchors') >= 0 or k.find('anchor_grid') >= 0: + continue + + if 'ERBlock_2' in k: + name = k.replace('ERBlock_2', 'stage1.0') + if '.cv' in k: + name = name.replace('.cv', '.conv') + if '.m.' in k: + name = name.replace('.m.', '.block.') + elif 'ERBlock_3' in k: + name = k.replace('ERBlock_3', 'stage2.0') + if '.cv' in k: + name = name.replace('.cv', '.conv') + if '.m.' in k: + name = name.replace('.m.', '.block.') + elif 'ERBlock_4' in k: + name = k.replace('ERBlock_4', 'stage3.0') + if '.cv' in k: + name = name.replace('.cv', '.conv') + if '.m.' in k: + name = name.replace('.m.', '.block.') + elif 'ERBlock_5' in k: + name = k.replace('ERBlock_5', 'stage4.0') + if '.cv' in k: + name = name.replace('.cv', '.conv') + if '.m.' in k: + name = name.replace('.m.', '.block.') + if 'stage4.0.2' in name: + name = name.replace('stage4.0.2', 'stage4.1') + name = name.replace('cv', 'conv') + elif 'reduce_layer0' in k: + name = k.replace('reduce_layer0', 'reduce_layers.2') + elif 'Rep_p4' in k: + name = k.replace('Rep_p4', 'top_down_layers.0.0') + if '.cv' in k: + name = name.replace('.cv', '.conv') + if '.m.' in k: + name = name.replace('.m.', '.block.') + elif 'reduce_layer1' in k: + name = k.replace('reduce_layer1', 'top_down_layers.0.1') + if '.cv' in k: + name = name.replace('.cv', '.conv') + if '.m.' in k: + name = name.replace('.m.', '.block.') + elif 'Rep_p3' in k: + name = k.replace('Rep_p3', 'top_down_layers.1') + if '.cv' in k: + name = name.replace('.cv', '.conv') + if '.m.' in k: + name = name.replace('.m.', '.block.') + elif 'Bifusion0' in k: + name = k.replace('Bifusion0', 'upsample_layers.0') + if '.cv' in k: + name = name.replace('.cv', '.conv') + if '.m.' in k: + name = name.replace('.m.', '.block.') + if '.upsample_transpose.' in k: + name = name.replace('.upsample_transpose.', '.') + elif 'Bifusion1' in k: + name = k.replace('Bifusion1', 'upsample_layers.1') + if '.cv' in k: + name = name.replace('.cv', '.conv') + if '.m.' in k: + name = name.replace('.m.', '.block.') + if '.upsample_transpose.' 
in k: + name = name.replace('.upsample_transpose.', '.') + elif 'Rep_n3' in k: + name = k.replace('Rep_n3', 'bottom_up_layers.0') + if '.cv' in k: + name = name.replace('.cv', '.conv') + if '.m.' in k: + name = name.replace('.m.', '.block.') + elif 'Rep_n4' in k: + name = k.replace('Rep_n4', 'bottom_up_layers.1') + if '.cv' in k: + name = name.replace('.cv', '.conv') + if '.m.' in k: + name = name.replace('.m.', '.block.') + elif 'downsample2' in k: + name = k.replace('downsample2', 'downsample_layers.0') + elif 'downsample1' in k: + name = k.replace('downsample1', 'downsample_layers.1') + + new_state_dict[name] = v + + # The yolov6_v3_n/s has two regression heads. + # One called 'reg_preds_lrtb' is a regular anchor-free head, + # which is used for inference. + # One called 'reg_preds' is a DFL style head, which + # is only used in training. + if is_ns: + tmp_state_dict = OrderedDict() + for k, v in new_state_dict.items(): + name = k + if 'reg_preds_lrtb' in k: + name = k.replace('reg_preds_lrtb', 'reg_preds') + elif 'reg_preds' in k: + name = k.replace('reg_preds', 'distill_ns_head') + tmp_state_dict[name] = v + new_state_dict = tmp_state_dict + + data = {'state_dict': new_state_dict} + torch.save(data, dst) + + +# Note: This script must be placed under the yolov6 repo to run. +def main(): + parser = argparse.ArgumentParser(description='Convert model keys') + parser.add_argument( + '--src', default='yolov6s.pt', help='src yolov6 model path') + parser.add_argument('--dst', default='mmyolov6.pt', help='save path') + args = parser.parse_args() + convert(args.src, args.dst) + + +if __name__ == '__main__': + main() diff --git a/tools/model_converters/yolov8_to_mmyolo.py b/tools/model_converters/yolov8_to_mmyolo.py index d15413674..4ed64f249 100644 --- a/tools/model_converters/yolov8_to_mmyolo.py +++ b/tools/model_converters/yolov8_to_mmyolo.py @@ -53,6 +53,19 @@ def convert(src, dst): if '.m.' in new_key: new_key = new_key.replace('.m.', '.blocks.') new_key = new_key.replace('.cv', '.conv') + elif 'bbox_head.head_module.proto.cv' in new_key: + new_key = new_key.replace( + 'bbox_head.head_module.proto.cv', + 'bbox_head.head_module.proto_preds.conv') + elif 'bbox_head.head_module.proto' in new_key: + new_key = new_key.replace('bbox_head.head_module.proto', + 'bbox_head.head_module.proto_preds') + elif 'bbox_head.head_module.cv4.' in new_key: + new_key = new_key.replace( + 'bbox_head.head_module.cv4', + 'bbox_head.head_module.mask_coeff_preds') + new_key = new_key.replace('.2.weight', '.2.conv.weight') + new_key = new_key.replace('.2.bias', '.2.conv.bias') elif 'bbox_head.head_module' in new_key: new_key = new_key.replace('.cv2', '.reg_preds') new_key = new_key.replace('.cv3', '.cls_preds') @@ -75,7 +88,7 @@ def convert(src, dst): torch.save(checkpoint, dst) -# Note: This script must be placed under the YOLOv8 repo to run. +# Note: This script must be placed under the ultralytics repo to run. 
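+# Example usage sketch (assuming this script keeps the same --src/--dst
+# arguments as the other converters in this directory; the segmentation
+# checkpoint name below is hypothetical):
+#   python yolov8_to_mmyolo.py --src yolov8s-seg.pt --dst mmyolov8s-seg.pth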
def main(): parser = argparse.ArgumentParser(description='Convert model keys') parser.add_argument( diff --git a/tools/test.py b/tools/test.py index c05defe3c..f0ac8bde4 100644 --- a/tools/test.py +++ b/tools/test.py @@ -4,6 +4,7 @@ import os.path as osp from mmdet.engine.hooks.utils import trigger_visualization_hook +from mmdet.utils import setup_cache_size_limit_of_dynamo from mmengine.config import Config, ConfigDict, DictAction from mmengine.evaluator import DumpResults from mmengine.runner import Runner @@ -63,7 +64,10 @@ def parse_args(): choices=['none', 'pytorch', 'slurm', 'mpi'], default='none', help='job launcher') - parser.add_argument('--local_rank', type=int, default=0) + # When using PyTorch version >= 2.0.0, the `torch.distributed.launch` + # will pass the `--local-rank` parameter to `tools/train.py` instead + # of `--local_rank`. + parser.add_argument('--local_rank', '--local-rank', type=int, default=0) args = parser.parse_args() if 'LOCAL_RANK' not in os.environ: os.environ['LOCAL_RANK'] = str(args.local_rank) @@ -73,6 +77,10 @@ def parse_args(): def main(): args = parse_args() + # Reduce the number of repeated compilations and improve + # training speed. + setup_cache_size_limit_of_dynamo() + # load config cfg = Config.fromfile(args.config) # replace the ${key} with the value of cfg.key diff --git a/tools/train.py b/tools/train.py index 1060b631a..61f94980d 100644 --- a/tools/train.py +++ b/tools/train.py @@ -4,6 +4,7 @@ import os import os.path as osp +from mmdet.utils import setup_cache_size_limit_of_dynamo from mmengine.config import Config, DictAction from mmengine.logging import print_log from mmengine.runner import Runner @@ -44,7 +45,10 @@ def parse_args(): choices=['none', 'pytorch', 'slurm', 'mpi'], default='none', help='job launcher') - parser.add_argument('--local_rank', type=int, default=0) + # When using PyTorch version >= 2.0.0, the `torch.distributed.launch` + # will pass the `--local-rank` parameter to `tools/train.py` instead + # of `--local_rank`. + parser.add_argument('--local_rank', '--local-rank', type=int, default=0) args = parser.parse_args() if 'LOCAL_RANK' not in os.environ: os.environ['LOCAL_RANK'] = str(args.local_rank) @@ -55,6 +59,10 @@ def parse_args(): def main(): args = parse_args() + # Reduce the number of repeated compilations and improve + # training speed. + setup_cache_size_limit_of_dynamo() + # load config cfg = Config.fromfile(args.config) # replace the ${key} with the value of cfg.key
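With the `--local_rank`/`--local-rank` alias above, the same launch command works across PyTorch versions; a minimal distributed sketch (the GPU count and config path are placeholders, not taken from this diff):

    python -m torch.distributed.launch --nproc_per_node=8 tools/train.py path/to/your_config.py --launcher pytorch

On PyTorch >= 2.0.0 the launcher injects `--local-rank`, while older versions pass `--local_rank`; both spellings now resolve to the same argument, and `LOCAL_RANK` is exported from it when the environment variable is missing.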