From 4d06a209702b7b93b969bdffac5741eb8d6e86b7 Mon Sep 17 00:00:00 2001
From: Louis-Dupont <35190946+Louis-Dupont@users.noreply.github.com>
Date: Wed, 19 Apr 2023 13:13:25 +0300
Subject: [PATCH] Feature/sg 814 support yoloformat loader (#847)

* first draft

* improve naming

* fix name

* remove comments

* wip

* add comment
---
 src/super_gradients/common/object_names.py    |  2 +
 ...ction_yolo_format_base_dataset_params.yaml | 92 +++++++++++++++++++
 .../training/dataloaders/dataloaders.py       | 24 ++++-
 .../detection_datasets/detection_dataset.py   |  2 +-
 .../yolo_format_detection.py                  | 15 +--
 5 files changed, 126 insertions(+), 9 deletions(-)
 create mode 100644 src/super_gradients/recipes/dataset_params/coco_detection_yolo_format_base_dataset_params.yaml

diff --git a/src/super_gradients/common/object_names.py b/src/super_gradients/common/object_names.py
index 47870aa06c..c558d22e2f 100644
--- a/src/super_gradients/common/object_names.py
+++ b/src/super_gradients/common/object_names.py
@@ -334,6 +334,8 @@ class Dataloaders:
     COCO2017_VAL_SSD_LITE_MOBILENET_V2 = "coco2017_val_ssd_lite_mobilenet_v2"
     COCO2017_POSE_TRAIN = "coco2017_pose_train"
     COCO2017_POSE_VAL = "coco2017_pose_val"
+    COCO_DETECTION_YOLO_FORMAT_TRAIN = "coco_detection_yolo_format_train"
+    COCO_DETECTION_YOLO_FORMAT_VAL = "coco_detection_yolo_format_val"
     IMAGENET_TRAIN = "imagenet_train"
     IMAGENET_VAL = "imagenet_val"
     IMAGENET_EFFICIENTNET_TRAIN = "imagenet_efficientnet_train"
diff --git a/src/super_gradients/recipes/dataset_params/coco_detection_yolo_format_base_dataset_params.yaml b/src/super_gradients/recipes/dataset_params/coco_detection_yolo_format_base_dataset_params.yaml
new file mode 100644
index 0000000000..6ee9fbd1d4
--- /dev/null
+++ b/src/super_gradients/recipes/dataset_params/coco_detection_yolo_format_base_dataset_params.yaml
@@ -0,0 +1,92 @@
+
+train_dataset_params:
+  data_dir: /data/coco # TO FILL: Where the data is stored.
+  images_dir: images/train2017 # TO FILL: Local path to directory that includes all the images. Path relative to `data_dir`. Can be the same as `labels_dir`.
+  labels_dir: labels/train2017 # TO FILL: Local path to directory that includes all the labels. Path relative to `data_dir`. Can be the same as `images_dir`.
+  classes: [ person, bicycle, car, motorcycle, airplane, bus, train, truck, boat, traffic light, fire hydrant, stop sign,
+             parking meter, bench, bird, cat, dog, horse, sheep, cow, elephant, bear, zebra, giraffe, backpack, umbrella, handbag,
+             tie, suitcase, frisbee, skis, snowboard, sports ball, kite, baseball bat, baseball glove, skateboard, surfboard,
+             tennis racket, bottle, wine glass, cup, fork, knife, spoon, bowl, banana, apple, sandwich, orange, broccoli, carrot,
+             hot dog, pizza, donut, cake, chair, couch, potted plant, bed, dining table, toilet, tv, laptop, mouse, remote,
+             keyboard, cell phone, microwave, oven, toaster, sink, refrigerator, book, clock, vase, scissors, teddy bear,
+             hair drier, toothbrush] # TO FILL: List of classes used in your dataset.
+  input_dim: [640, 640]
+  cache_dir:
+  cache: False
+  transforms:
+    - DetectionMosaic:
+        input_dim: ${dataset_params.train_dataset_params.input_dim}
+        prob: 1.
+    - DetectionRandomAffine:
+        degrees: 10.                  # rotation degrees, randomly sampled from [-degrees, degrees]
+        translate: 0.1                # image translation fraction
+        scales: [ 0.1, 2 ]            # random rescale range (keeps size by padding/cropping) after mosaic transform.
+        shear: 2.0                    # shear degrees, randomly sampled from [-degrees, degrees]
+        target_size: ${dataset_params.train_dataset_params.input_dim}
+        filter_box_candidates: True   # whether to filter out transformed bboxes by edge size, area ratio, and aspect ratio.
+        wh_thr: 2                     # edge size threshold when filter_box_candidates = True (pixels)
+        area_thr: 0.1                 # threshold for area ratio between the original and the transformed bbox, when filter_box_candidates = True
+        ar_thr: 20                    # aspect ratio threshold when filter_box_candidates = True
+    - DetectionMixup:
+        input_dim: ${dataset_params.train_dataset_params.input_dim}
+        mixup_scale: [ 0.5, 1.5 ]     # random rescale range for the additional sample in mixup
+        prob: 1.0                     # probability to apply per-sample mixup
+        flip_prob: 0.5                # probability to apply horizontal flip
+    - DetectionHSV:
+        prob: 1.0                     # probability to apply HSV transform
+        hgain: 5                      # HSV transform hue gain (randomly sampled from [-hgain, hgain])
+        sgain: 30                     # HSV transform saturation gain (randomly sampled from [-sgain, sgain])
+        vgain: 30                     # HSV transform value gain (randomly sampled from [-vgain, vgain])
+    - DetectionHorizontalFlip:
+        prob: 0.5                     # probability to apply horizontal flip
+    - DetectionPaddedRescale:
+        input_dim: ${dataset_params.train_dataset_params.input_dim}
+        max_targets: 120
+    - DetectionTargetsFormatTransform:
+        input_dim: ${dataset_params.train_dataset_params.input_dim}
+        output_format: LABEL_CXCYWH
+  class_inclusion_list:
+  max_num_samples:
+
+train_dataloader_params:
+  batch_size: 25
+  num_workers: 8
+  shuffle: True
+  drop_last: True
+  pin_memory: True
+  collate_fn:
+    _target_: super_gradients.training.utils.detection_utils.DetectionCollateFN
+
+val_dataset_params:
+  data_dir: /data/coco # TO FILL: Where the data is stored.
+  images_dir: images/val2017 # TO FILL: Local path to directory that includes all the images. Path relative to `data_dir`. Can be the same as `labels_dir`.
+  labels_dir: labels/val2017 # TO FILL: Local path to directory that includes all the labels. Path relative to `data_dir`. Can be the same as `images_dir`.
+  classes: [ person, bicycle, car, motorcycle, airplane, bus, train, truck, boat, traffic light, fire hydrant, stop sign,
+             parking meter, bench, bird, cat, dog, horse, sheep, cow, elephant, bear, zebra, giraffe, backpack, umbrella, handbag,
+             tie, suitcase, frisbee, skis, snowboard, sports ball, kite, baseball bat, baseball glove, skateboard, surfboard,
+             tennis racket, bottle, wine glass, cup, fork, knife, spoon, bowl, banana, apple, sandwich, orange, broccoli, carrot,
+             hot dog, pizza, donut, cake, chair, couch, potted plant, bed, dining table, toilet, tv, laptop, mouse, remote,
+             keyboard, cell phone, microwave, oven, toaster, sink, refrigerator, book, clock, vase, scissors, teddy bear,
+             hair drier, toothbrush] # TO FILL: List of classes used in your dataset.
+  input_dim: [640, 640]
+  cache_dir:
+  cache: False
+  transforms:
+    - DetectionPaddedRescale:
+        input_dim: ${dataset_params.val_dataset_params.input_dim}
+    - DetectionTargetsFormatTransform:
+        max_targets: 50
+        input_dim: ${dataset_params.val_dataset_params.input_dim}
+        output_format: LABEL_CXCYWH
+  class_inclusion_list:
+  max_num_samples:
+
+val_dataloader_params:
+  batch_size: 25
+  num_workers: 8
+  drop_last: False
+  pin_memory: True
+  collate_fn:
+    _target_: super_gradients.training.utils.detection_utils.DetectionCollateFN
+
+_convert_: all
diff --git a/src/super_gradients/training/dataloaders/dataloaders.py b/src/super_gradients/training/dataloaders/dataloaders.py
index 3b5ea4c44b..29f5d03694 100644
--- a/src/super_gradients/training/dataloaders/dataloaders.py
+++ b/src/super_gradients/training/dataloaders/dataloaders.py
@@ -17,7 +17,7 @@
     Cifar10,
     Cifar100,
 )
-from super_gradients.training.datasets.detection_datasets import COCODetectionDataset, RoboflowDetectionDataset
+from super_gradients.training.datasets.detection_datasets import COCODetectionDataset, RoboflowDetectionDataset, YoloDarknetFormatDetectionDataset
 from super_gradients.training.datasets.detection_datasets.pascal_voc_detection import (
     PascalVOCUnifiedDetectionTrainDataset,
     PascalVOCDetectionDataset,
@@ -270,6 +270,28 @@ def roboflow_val_yolox(dataset_params: Dict = None, dataloader_params: Dict = No
     )
 
 
+@register_dataloader(Dataloaders.COCO_DETECTION_YOLO_FORMAT_TRAIN)
+def coco_detection_yolo_format_train(dataset_params: Dict = None, dataloader_params: Dict = None) -> DataLoader:
+    return get_data_loader(
+        config_name="coco_detection_yolo_format_base_dataset_params",
+        dataset_cls=YoloDarknetFormatDetectionDataset,
+        train=True,
+        dataset_params=dataset_params,
+        dataloader_params=dataloader_params,
+    )
+
+
+@register_dataloader(Dataloaders.COCO_DETECTION_YOLO_FORMAT_VAL)
+def coco_detection_yolo_format_val(dataset_params: Dict = None, dataloader_params: Dict = None) -> DataLoader:
+    return get_data_loader(
+        config_name="coco_detection_yolo_format_base_dataset_params",
+        dataset_cls=YoloDarknetFormatDetectionDataset,
+        train=False,
+        dataset_params=dataset_params,
+        dataloader_params=dataloader_params,
+    )
+
+
 @register_dataloader(Dataloaders.IMAGENET_TRAIN)
 def imagenet_train(dataset_params: Dict = None, dataloader_params: Dict = None, config_name="imagenet_dataset_params"):
     return get_data_loader(
diff --git a/src/super_gradients/training/datasets/detection_datasets/detection_dataset.py b/src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
index a842131971..cb3b2e1677 100644
--- a/src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
+++ b/src/super_gradients/training/datasets/detection_datasets/detection_dataset.py
@@ -119,7 +119,7 @@ def __init__(
         self.data_dir = data_dir
 
         if not Path(data_dir).exists():
-            raise FileNotFoundError(f"data_dir={data_dir} not found. Please make sure that data_dir points toward your dataset.")
+            raise RuntimeError(f"data_dir={data_dir} not found. Please make sure that data_dir points toward your dataset.")
 
         # Number of images that are available (regardless of ignored images)
         self.n_available_samples = self._setup_data_source()
diff --git a/src/super_gradients/training/datasets/detection_datasets/yolo_format_detection.py b/src/super_gradients/training/datasets/detection_datasets/yolo_format_detection.py
index 45c106d078..d567fec6da 100644
--- a/src/super_gradients/training/datasets/detection_datasets/yolo_format_detection.py
+++ b/src/super_gradients/training/datasets/detection_datasets/yolo_format_detection.py
@@ -146,7 +146,7 @@ def _setup_data_source(self) -> int:
             logger.warning(f"{len(labels_not_in_images)} label files are not associated to any image.")
 
         # Only keep names that are in both the images and the labels
-        valid_base_names = list(unique_image_file_base_names & unique_label_file_base_names)
+        valid_base_names = unique_image_file_base_names & unique_label_file_base_names
         if len(valid_base_names) != len(all_images_file_names):
             logger.warning(
                 f"As a consequence, "
@@ -153,13 +153,14 @@ def _setup_data_source(self) -> int:
                 f"{len(valid_base_names)}/{len(all_images_file_names)} images and "
                 f"{len(valid_base_names)}/{len(all_labels_file_names)} label files will be used."
             )
 
-        self.images_file_names = list(
-            sorted(image_full_name for image_full_name in all_images_file_names if remove_file_extension(image_full_name) in valid_base_names)
-        )
-        self.labels_file_names = list(
-            sorted(label_full_name for label_full_name in all_labels_file_names if remove_file_extension(label_full_name) in valid_base_names)
-        )
+        self.images_file_names = []
+        self.labels_file_names = []
+        for image_full_name in all_images_file_names:
+            base_name = remove_file_extension(image_full_name)
+            if base_name in valid_base_names:
+                self.images_file_names.append(image_full_name)
+                self.labels_file_names.append(base_name + ".txt")
         return len(self.images_file_names)
 
     def _load_annotation(self, sample_id: int) -> dict:
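
Usage sketch (reviewer note, not part of the patch): a minimal example of how the two new factories might be called on a custom dataset stored in YOLO/Darknet format. All paths and class names below are hypothetical placeholders; the dict keys mirror the `dataset_params` entries of the new YAML recipe, and any field left out should fall back to the defaults in coco_detection_yolo_format_base_dataset_params.yaml.

```python
from super_gradients.training.dataloaders.dataloaders import (
    coco_detection_yolo_format_train,
    coco_detection_yolo_format_val,
)

# Placeholder dataset description -- despite the "coco" prefix, any YOLO/Darknet-format
# dataset should work; the prefix only refers to the recipe's default values.
dataset_params = {
    "data_dir": "/path/to/my_dataset",  # root folder of the dataset
    "images_dir": "images/train",       # relative to data_dir
    "labels_dir": "labels/train",       # relative to data_dir
    "classes": ["cat", "dog"],          # class ids in the .txt labels index into this list
}

train_loader = coco_detection_yolo_format_train(
    dataset_params=dataset_params,
    dataloader_params={"batch_size": 16, "num_workers": 4},
)

val_loader = coco_detection_yolo_format_val(
    dataset_params={**dataset_params, "images_dir": "images/val", "labels_dir": "labels/val"},
    dataloader_params={"batch_size": 16},
)
```

Because both factories are registered through `@register_dataloader`, they should also be reachable by their string names (`coco_detection_yolo_format_train` / `coco_detection_yolo_format_val`) from recipe configs.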
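To make the rewritten `_setup_data_source` pairing rule concrete: only base names present in both directories survive, and every kept image is paired with `<base_name>.txt`. Below is a standalone toy rendering; the file lists and the simplified `remove_file_extension` are illustrative stand-ins, not the library's helpers.

```python
images = ["001.jpg", "002.png", "003.jpg"]  # hypothetical contents of images_dir
labels = ["001.txt", "003.txt", "004.txt"]  # hypothetical contents of labels_dir

def remove_file_extension(file_name: str) -> str:
    # Simplified stand-in for the helper used in yolo_format_detection.py.
    return file_name.rsplit(".", 1)[0]

# Only keep base names present on both sides, as in the patched code.
valid_base_names = {remove_file_extension(f) for f in images} & {remove_file_extension(f) for f in labels}

pairs = [
    (image, remove_file_extension(image) + ".txt")
    for image in images
    if remove_file_extension(image) in valid_base_names
]
print(pairs)  # [('001.jpg', '001.txt'), ('003.jpg', '003.txt')]
# '002.png' (no label) and '004.txt' (no image) are dropped; the real dataset logs a warning for each side.
```

Beyond keeping `valid_base_names` a set (faster membership tests than the old `list(...)`), the loop fixes a subtle alignment hazard in the two independently sorted comprehensions it replaces: if two images shared a base name with different extensions, the old code kept two image entries but only one label entry, shifting every subsequent `images_file_names[i]`/`labels_file_names[i]` pair. Pairing each image directly with its `<base_name>.txt` keeps the two lists index-aligned by construction.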