From 8c6f9e15bfc0000d18b976a95b9d7c17d407ec91 Mon Sep 17 00:00:00 2001
From: Glenn Jocher
Date: Thu, 8 Jul 2021 11:42:30 +0200
Subject: [PATCH] Update `dataset_stats()` for zipped datasets (#3926)

* Update `dataset_stats()` for zipped datasets

@KalenMike

* cleanup
---
 utils/datasets.py | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/utils/datasets.py b/utils/datasets.py
index 0bcfdcc1cda6..a527230b868a 100755
--- a/utils/datasets.py
+++ b/utils/datasets.py
@@ -888,9 +888,11 @@ def verify_image_label(args):
 
 def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False):
     """ Return dataset statistics dictionary with images and instances counts per split per class
-    Usage: from utils.datasets import *; dataset_stats('coco128.yaml', verbose=True)
+    Usage1: from utils.datasets import *; dataset_stats('coco128.yaml', verbose=True)
+    Usage2: from utils.datasets import *; dataset_stats('../datasets/coco128.zip', verbose=True)
+
     Arguments
-        path:           Path to data.yaml
+        path:           Path to data.yaml or data.zip (with data.yaml inside data.zip)
         autodownload:   Attempt to download dataset if not found locally
         verbose:        Print stats dictionary
     """
@@ -899,8 +901,20 @@ def round_labels(labels):
         # Update labels to integer class and 6 decimal place floats
         return [[int(c), *[round(x, 6) for x in points]] for c, *points in labels]
 
-    with open(check_file(path)) as f:
+    def unzip(path):
+        # Unzip data.zip TODO: CONSTRAINT: path/to/abc.zip MUST unzip to 'path/to/abc/'
+        if str(path).endswith('.zip'):  # path is data.zip
+            assert os.system(f'unzip -q {path} -d {path.parent}') == 0, f'Error unzipping {path}'
+            data_dir = path.with_suffix('')  # dataset directory
+            return True, data_dir, list(data_dir.rglob('*.yaml'))[0]  # zipped, data_dir, yaml_path
+        else:  # path is data.yaml
+            return False, None, path
+
+    zipped, data_dir, yaml_path = unzip(Path(path))
+    with open(check_file(yaml_path)) as f:
         data = yaml.safe_load(f)  # data dict
+        if zipped:
+            data['path'] = data_dir  # TODO: should this be dir.resolve()?
     check_dataset(data, autodownload)  # download dataset if missing
     nc = data['nc']  # number of classes
     stats = {'nc': nc, 'names': data['names']}  # statistics dictionary
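
Usage sketch (not part of the patch): a minimal example of how the updated `dataset_stats()` is expected to be called after this change, mirroring the Usage1/Usage2 lines in the docstring above. The relative paths are assumptions taken from those lines; per the constraint noted in unzip(), the archive must extract to a same-named directory (e.g. '../datasets/coco128.zip' -> '../datasets/coco128/') containing a data YAML file.

    from utils.datasets import dataset_stats

    # Existing behaviour: pass a data YAML directly
    stats = dataset_stats('coco128.yaml', autodownload=True, verbose=True)

    # New behaviour: pass a zipped dataset; unzip() extracts it next to the zip,
    # locates the first *.yaml inside, and points data['path'] at the extracted dir
    stats = dataset_stats('../datasets/coco128.zip', verbose=True)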