From 653f1bf5fe18df0a4a6ce4f26a3f3b1ca462c729 Mon Sep 17 00:00:00 2001
From: Glenn Jocher <glenn.jocher@ultralytics.com>
Date: Mon, 25 Jul 2022 18:20:01 +0200
Subject: [PATCH] New `HUBDatasetStats()` class (#8716)

* New `HUBDatasetStats()` class

Usage examples:
```
from utils.dataloaders import *

stats = HUBDatasetStats('coco128.yaml', autodownload=True)  # method 1
stats = HUBDatasetStats('path/to/coco128_with_yaml.zip')  # method 2

stats.get_json(save=False)
stats.process_images()
```

@kalenmike

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update dataloaders.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update dataloaders.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update dataloaders.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update dataloaders.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 utils/dataloaders.py | 146 +++++++++++++++++++++----------------------
 1 file changed, 70 insertions(+), 76 deletions(-)

diff --git a/utils/dataloaders.py b/utils/dataloaders.py
index c32f60fe4ec7..9ccfe2545d75 100755
--- a/utils/dataloaders.py
+++ b/utils/dataloaders.py
@@ -977,21 +977,35 @@ def verify_image_label(args):
         return [None, None, None, None, nm, nf, ne, nc, msg]


-def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False, profile=False, hub=False):
+class HUBDatasetStats():
     """ Return dataset statistics dictionary with images and instances counts per split per class
     To run in parent directory: export PYTHONPATH="$PWD/yolov5"
-    Usage1: from utils.dataloaders import *; dataset_stats('coco128.yaml', autodownload=True)
-    Usage2: from utils.dataloaders import *; dataset_stats('path/to/coco128_with_yaml.zip')
+    Usage1: from utils.dataloaders import *; HUBDatasetStats('coco128.yaml', autodownload=True)
+    Usage2: from utils.dataloaders import *; HUBDatasetStats('path/to/coco128_with_yaml.zip')
     Arguments
         path:           Path to data.yaml or data.zip (with data.yaml inside data.zip)
         autodownload:   Attempt to download dataset if not found locally
-        verbose:        Print stats dictionary
     """

-    def _round_labels(labels):
-        # Update labels to integer class and 6 decimal place floats
-        return [[int(c), *(round(x, 4) for x in points)] for c, *points in labels]
+    def __init__(self, path='coco128.yaml', autodownload=False):
+        # Initialize class
+        zipped, data_dir, yaml_path = self._unzip(Path(path))
+        try:
+            with open(check_yaml(yaml_path), errors='ignore') as f:
+                data = yaml.safe_load(f)  # data dict
+                if zipped:
+                    data['path'] = data_dir
+        except Exception as e:
+            raise Exception("error/HUB/dataset_stats/yaml_load") from e
+
+        check_dataset(data, autodownload)  # download dataset if missing
+        self.hub_dir = Path(data['path'] + '-hub')
+        self.im_dir = self.hub_dir / 'images'
+        self.im_dir.mkdir(parents=True, exist_ok=True)  # makes /images
+        self.stats = {'nc': data['nc'], 'names': data['names']}  # statistics dictionary
+        self.data = data

+    @staticmethod
     def _find_yaml(dir):
         # Return data.yaml file
         files = list(dir.glob('*.yaml')) or list(dir.rglob('*.yaml'))  # try root level first and then recursive
@@ -1002,7 +1016,7 @@ def _find_yaml(dir):
         assert len(files) == 1, f'Multiple *.yaml files found: {files}, only 1 *.yaml file allowed in {dir}'
         return files[0]

-    def _unzip(path):
+    def _unzip(self, path):
         # Unzip data.zip
         if not str(path).endswith('.zip'):  # path is data.yaml
             return False, None, path
@@ -1010,11 +1024,11 @@ def _unzip(path):
         ZipFile(path).extractall(path=path.parent)  # unzip
         dir = path.with_suffix('')  # dataset directory == zip name
         assert dir.is_dir(), f'Error unzipping {path}, {dir} not found. path/to/abc.zip MUST unzip to path/to/abc/'
-        return True, str(dir), _find_yaml(dir)  # zipped, data_dir, yaml_path
+        return True, str(dir), self._find_yaml(dir)  # zipped, data_dir, yaml_path

-    def _hub_ops(f, max_dim=1920):
+    def _hub_ops(self, f, max_dim=1920):
         # HUB ops for 1 image 'f': resize and save at reduced quality in /dataset-hub for web/app viewing
-        f_new = im_dir / Path(f).name  # dataset-hub image filename
+        f_new = self.im_dir / Path(f).name  # dataset-hub image filename
         try:  # use PIL
             im = Image.open(f)
             r = max_dim / max(im.height, im.width)  # ratio
@@ -1030,69 +1044,49 @@ def _hub_ops(f, max_dim=1920):
             im = cv2.resize(im, (int(im_width * r), int(im_height * r)), interpolation=cv2.INTER_AREA)
             cv2.imwrite(str(f_new), im)

-    zipped, data_dir, yaml_path = _unzip(Path(path))
-    try:
-        with open(check_yaml(yaml_path), errors='ignore') as f:
-            data = yaml.safe_load(f)  # data dict
-            if zipped:
-                data['path'] = data_dir  # TODO: should this be dir.resolve()?`
-    except Exception:
-        raise Exception("error/HUB/dataset_stats/yaml_load")
-
-    check_dataset(data, autodownload)  # download dataset if missing
-    hub_dir = Path(data['path'] + ('-hub' if hub else ''))
-    stats = {'nc': data['nc'], 'names': data['names']}  # statistics dictionary
-    for split in 'train', 'val', 'test':
-        if data.get(split) is None:
-            stats[split] = None  # i.e. no test set
-            continue
-        x = []
-        dataset = LoadImagesAndLabels(data[split])  # load dataset
-        for label in tqdm(dataset.labels, total=dataset.n, desc='Statistics'):
-            x.append(np.bincount(label[:, 0].astype(int), minlength=data['nc']))
-        x = np.array(x)  # shape(128x80)
-        stats[split] = {
-            'instance_stats': {
-                'total': int(x.sum()),
-                'per_class': x.sum(0).tolist()},
-            'image_stats': {
-                'total': dataset.n,
-                'unlabelled': int(np.all(x == 0, 1).sum()),
-                'per_class': (x > 0).sum(0).tolist()},
-            'labels': [{
-                str(Path(k).name): _round_labels(v.tolist())} for k, v in zip(dataset.im_files, dataset.labels)]}
-
-        if hub:
-            im_dir = hub_dir / 'images'
-            im_dir.mkdir(parents=True, exist_ok=True)
-            for _ in tqdm(ThreadPool(NUM_THREADS).imap(_hub_ops, dataset.im_files), total=dataset.n, desc='HUB Ops'):
+    def get_json(self, save=False, verbose=False):
+        # Return dataset JSON for Ultralytics HUB
+        def _round(labels):
+            # Update labels to integer class and 6 decimal place floats
+            return [[int(c), *(round(x, 4) for x in points)] for c, *points in labels]
+
+        for split in 'train', 'val', 'test':
+            if self.data.get(split) is None:
+                self.stats[split] = None  # i.e. no test set
+                continue
+            dataset = LoadImagesAndLabels(self.data[split])  # load dataset
+            x = np.array([
+                np.bincount(label[:, 0].astype(int), minlength=self.data['nc'])
+                for label in tqdm(dataset.labels, total=dataset.n, desc='Statistics')])  # shape(128x80)
+            self.stats[split] = {
+                'instance_stats': {
+                    'total': int(x.sum()),
+                    'per_class': x.sum(0).tolist()},
+                'image_stats': {
+                    'total': dataset.n,
+                    'unlabelled': int(np.all(x == 0, 1).sum()),
+                    'per_class': (x > 0).sum(0).tolist()},
+                'labels': [{
+                    str(Path(k).name): _round(v.tolist())} for k, v in zip(dataset.im_files, dataset.labels)]}
+
+        # Save, print and return
+        if save:
+            stats_path = self.hub_dir / 'stats.json'
+            print(f'Saving {stats_path.resolve()}...')
+            with open(stats_path, 'w') as f:
+                json.dump(self.stats, f)  # save stats.json
+        if verbose:
+            print(json.dumps(self.stats, indent=2, sort_keys=False))
+        return self.stats
+
+    def process_images(self):
+        # Compress images for Ultralytics HUB
+        for split in 'train', 'val', 'test':
+            if self.data.get(split) is None:
+                continue
+            dataset = LoadImagesAndLabels(self.data[split])  # load dataset
+            desc = f'{split} images'
+            for _ in tqdm(ThreadPool(NUM_THREADS).imap(self._hub_ops, dataset.im_files), total=dataset.n, desc=desc):
                 pass
-
-    # Profile
-    stats_path = hub_dir / 'stats.json'
-    if profile:
-        for _ in range(1):
-            file = stats_path.with_suffix('.npy')
-            t1 = time.time()
-            np.save(file, stats)
-            t2 = time.time()
-            x = np.load(file, allow_pickle=True)
-            print(f'stats.npy times: {time.time() - t2:.3f}s read, {t2 - t1:.3f}s write')
-
-            file = stats_path.with_suffix('.json')
-            t1 = time.time()
-            with open(file, 'w') as f:
-                json.dump(stats, f)  # save stats *.json
-            t2 = time.time()
-            with open(file) as f:
-                x = json.load(f)  # load hyps dict
-            print(f'stats.json times: {time.time() - t2:.3f}s read, {t2 - t1:.3f}s write')
-
-    # Save, print and return
-    if hub:
-        print(f'Saving {stats_path.resolve()}...')
-        with open(stats_path, 'w') as f:
-            json.dump(stats, f)  # save stats.json
-    if verbose:
-        print(json.dumps(stats, indent=2, sort_keys=False))
-    return stats
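
A minimal usage sketch of the new class follows, mirroring the commit-message examples above. It assumes the yolov5 repo root is on PYTHONPATH (export PYTHONPATH="$PWD/yolov5" from the parent directory, per the docstring) and that coco128 is available locally or can be fetched via autodownload:
```
from utils.dataloaders import HUBDatasetStats

stats = HUBDatasetStats('coco128.yaml', autodownload=True)  # reads data.yaml, downloads dataset if missing
stats.get_json(save=True, verbose=True)  # builds per-split stats, writes <dataset>-hub/stats.json, prints it
stats.process_images()  # resizes images (longest side capped at max_dim=1920) into <dataset>-hub/images
```
get_json() also returns the stats dictionary, so callers can consume it directly without re-reading stats.json; process_images() returns the image directory it wrote.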
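
To make the JSON structure concrete, here is a toy illustration of what get_json() returns for a hypothetical 2-class dataset with a 3-image train split and no val/test splits; every value is invented for illustration:
```
# Toy illustration (all values invented) of the dictionary get_json() returns
example_stats = {
    'nc': 2,                       # class count, from data.yaml
    'names': ['cat', 'dog'],       # class names, from data.yaml
    'train': {
        'instance_stats': {
            'total': 5,            # total label instances in the split
            'per_class': [3, 2]},  # instance count per class
        'image_stats': {
            'total': 3,            # images in the split
            'unlabelled': 1,       # images with no labels
            'per_class': [2, 1]},  # images containing at least one instance of each class
        'labels': [                # per-image labels: [class, x, y, w, h], rounded to 4 decimals
            {'im1.jpg': [[0, 0.5, 0.5, 0.2, 0.3], [0, 0.1, 0.2, 0.05, 0.1]]},
            {'im2.jpg': [[0, 0.3, 0.4, 0.1, 0.1], [1, 0.7, 0.6, 0.4, 0.5], [1, 0.2, 0.8, 0.1, 0.2]]},
            {'im3.jpg': []}]},     # the unlabelled image
    'val': None,                   # split not defined in data.yaml
    'test': None}                  # split not defined in data.yaml
```
Note that a split missing from data.yaml is recorded as None rather than omitted, and label coordinates pass through _round(), i.e. round(x, 4).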