From 653f1bf5fe18df0a4a6ce4f26a3f3b1ca462c729 Mon Sep 17 00:00:00 2001
From: Glenn Jocher <glenn.jocher@ultralytics.com>
Date: Mon, 25 Jul 2022 18:20:01 +0200
Subject: [PATCH] New `HUBDatasetStats()` class (#8716)

* New `HUBDatasetStats()` class

Usage examples:
```
from utils.dataloaders import *

stats = HUBDatasetStats('coco128.yaml', autodownload=True)  # method 1
stats = HUBDatasetStats('path/to/coco128_with_yaml.zip')  # method 2

stats.get_json(save=False)
stats.process_images()
```

@kalenmike

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update dataloaders.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update dataloaders.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update dataloaders.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update dataloaders.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 utils/dataloaders.py | 146 +++++++++++++++++++++----------------------
 1 file changed, 70 insertions(+), 76 deletions(-)

diff --git a/utils/dataloaders.py b/utils/dataloaders.py
index c32f60fe4ec7..9ccfe2545d75 100755
--- a/utils/dataloaders.py
+++ b/utils/dataloaders.py
@@ -977,21 +977,35 @@ def verify_image_label(args):
         return [None, None, None, None, nm, nf, ne, nc, msg]


-def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False, profile=False, hub=False):
+class HUBDatasetStats():
     """ Return dataset statistics dictionary with images and instances counts per split per class
     To run in parent directory: export PYTHONPATH="$PWD/yolov5"
-    Usage1: from utils.dataloaders import *; dataset_stats('coco128.yaml', autodownload=True)
-    Usage2: from utils.dataloaders import *; dataset_stats('path/to/coco128_with_yaml.zip')
+    Usage1: from utils.dataloaders import *; HUBDatasetStats('coco128.yaml', autodownload=True)
+    Usage2: from utils.dataloaders import *; HUBDatasetStats('path/to/coco128_with_yaml.zip')
     Arguments
         path:           Path to data.yaml or data.zip (with data.yaml inside data.zip)
         autodownload:   Attempt to download dataset if not found locally
-        verbose:        Print stats dictionary
     """

-    def _round_labels(labels):
-        # Update labels to integer class and 6 decimal place floats
-        return [[int(c), *(round(x, 4) for x in points)] for c, *points in labels]
+    def __init__(self, path='coco128.yaml', autodownload=False):
+        # Initialize class
+        zipped, data_dir, yaml_path = self._unzip(Path(path))
+        try:
+            with open(check_yaml(yaml_path), errors='ignore') as f:
+                data = yaml.safe_load(f)  # data dict
+                if zipped:
+                    data['path'] = data_dir
+        except Exception as e:
+            raise Exception("error/HUB/dataset_stats/yaml_load") from e
+
+        check_dataset(data, autodownload)  # download dataset if missing
+        self.hub_dir = Path(data['path'] + '-hub')
+        self.im_dir = self.hub_dir / 'images'
+        self.im_dir.mkdir(parents=True, exist_ok=True)  # makes /images
+        self.stats = {'nc': data['nc'], 'names': data['names']}  # statistics dictionary
+        self.data = data

+    @staticmethod
     def _find_yaml(dir):
         # Return data.yaml file
         files = list(dir.glob('*.yaml')) or list(dir.rglob('*.yaml'))  # try root level first and then recursive
@@ -1002,7 +1016,7 @@ def _find_yaml(dir):
         assert len(files) == 1, f'Multiple *.yaml files found: {files}, only 1 *.yaml file allowed in {dir}'
         return files[0]

-    def _unzip(path):
+    def _unzip(self, path):
         # Unzip data.zip
         if not str(path).endswith('.zip'):  # path is data.yaml
             return False, None, path
@@ -1010,11 +1024,11 @@ def _unzip(path):
         ZipFile(path).extractall(path=path.parent)  # unzip
         dir = path.with_suffix('')  # dataset directory == zip name
         assert dir.is_dir(), f'Error unzipping {path}, {dir} not found. path/to/abc.zip MUST unzip to path/to/abc/'
-        return True, str(dir), _find_yaml(dir)  # zipped, data_dir, yaml_path
+        return True, str(dir), self._find_yaml(dir)  # zipped, data_dir, yaml_path

-    def _hub_ops(f, max_dim=1920):
+    def _hub_ops(self, f, max_dim=1920):
         # HUB ops for 1 image 'f': resize and save at reduced quality in /dataset-hub for web/app viewing
-        f_new = im_dir / Path(f).name  # dataset-hub image filename
+        f_new = self.im_dir / Path(f).name  # dataset-hub image filename
         try:  # use PIL
             im = Image.open(f)
             r = max_dim / max(im.height, im.width)  # ratio
@@ -1030,69 +1044,49 @@ def _hub_ops(f, max_dim=1920):
             im = cv2.resize(im, (int(im_width * r), int(im_height * r)), interpolation=cv2.INTER_AREA)
             cv2.imwrite(str(f_new), im)

-    zipped, data_dir, yaml_path = _unzip(Path(path))
-    try:
-        with open(check_yaml(yaml_path), errors='ignore') as f:
-            data = yaml.safe_load(f)  # data dict
-            if zipped:
-                data['path'] = data_dir  # TODO: should this be dir.resolve()?`
-    except Exception:
-        raise Exception("error/HUB/dataset_stats/yaml_load")
-
-    check_dataset(data, autodownload)  # download dataset if missing
-    hub_dir = Path(data['path'] + ('-hub' if hub else ''))
-    stats = {'nc': data['nc'], 'names': data['names']}  # statistics dictionary
-    for split in 'train', 'val', 'test':
-        if data.get(split) is None:
-            stats[split] = None  # i.e. no test set
-            continue
-        x = []
-        dataset = LoadImagesAndLabels(data[split])  # load dataset
-        for label in tqdm(dataset.labels, total=dataset.n, desc='Statistics'):
-            x.append(np.bincount(label[:, 0].astype(int), minlength=data['nc']))
-        x = np.array(x)  # shape(128x80)
-        stats[split] = {
-            'instance_stats': {
-                'total': int(x.sum()),
-                'per_class': x.sum(0).tolist()},
-            'image_stats': {
-                'total': dataset.n,
-                'unlabelled': int(np.all(x == 0, 1).sum()),
-                'per_class': (x > 0).sum(0).tolist()},
-            'labels': [{
-                str(Path(k).name): _round_labels(v.tolist())} for k, v in zip(dataset.im_files, dataset.labels)]}
-
-        if hub:
-            im_dir = hub_dir / 'images'
-            im_dir.mkdir(parents=True, exist_ok=True)
-            for _ in tqdm(ThreadPool(NUM_THREADS).imap(_hub_ops, dataset.im_files), total=dataset.n, desc='HUB Ops'):
+    def get_json(self, save=False, verbose=False):
+        # Return dataset JSON for Ultralytics HUB
+        def _round(labels):
+            # Update labels to integer class and 6 decimal place floats
+            return [[int(c), *(round(x, 4) for x in points)] for c, *points in labels]
+
+        for split in 'train', 'val', 'test':
+            if self.data.get(split) is None:
+                self.stats[split] = None  # i.e. no test set
+                continue
+            dataset = LoadImagesAndLabels(self.data[split])  # load dataset
+            x = np.array([
+                np.bincount(label[:, 0].astype(int), minlength=self.data['nc'])
+                for label in tqdm(dataset.labels, total=dataset.n, desc='Statistics')])  # shape(128x80)
+            self.stats[split] = {
+                'instance_stats': {
+                    'total': int(x.sum()),
+                    'per_class': x.sum(0).tolist()},
+                'image_stats': {
+                    'total': dataset.n,
+                    'unlabelled': int(np.all(x == 0, 1).sum()),
+                    'per_class': (x > 0).sum(0).tolist()},
+                'labels': [{
+                    str(Path(k).name): _round(v.tolist())} for k, v in zip(dataset.im_files, dataset.labels)]}
+
+        # Save, print and return
+        if save:
+            stats_path = self.hub_dir / 'stats.json'
+            print(f'Saving {stats_path.resolve()}...')
+            with open(stats_path, 'w') as f:
+                json.dump(self.stats, f)  # save stats.json
+        if verbose:
+            print(json.dumps(self.stats, indent=2, sort_keys=False))
+        return self.stats
+
+    def process_images(self):
+        # Compress images for Ultralytics HUB
+        for split in 'train', 'val', 'test':
+            if self.data.get(split) is None:
+                continue
+            dataset = LoadImagesAndLabels(self.data[split])  # load dataset
+            desc = f'{split} images'
+            for _ in tqdm(ThreadPool(NUM_THREADS).imap(self._hub_ops, dataset.im_files), total=dataset.n, desc=desc):
                 pass
-
-    # Profile
-    stats_path = hub_dir / 'stats.json'
-    if profile:
-        for _ in range(1):
-            file = stats_path.with_suffix('.npy')
-            t1 = time.time()
-            np.save(file, stats)
-            t2 = time.time()
-            x = np.load(file, allow_pickle=True)
-            print(f'stats.npy times: {time.time() - t2:.3f}s read, {t2 - t1:.3f}s write')
-
-            file = stats_path.with_suffix('.json')
-            t1 = time.time()
-            with open(file, 'w') as f:
-                json.dump(stats, f)  # save stats *.json
-            t2 = time.time()
-            with open(file) as f:
-                x = json.load(f)  # load hyps dict
-            print(f'stats.json times: {time.time() - t2:.3f}s read, {t2 - t1:.3f}s write')
-
-    # Save, print and return
-    if hub:
-        print(f'Saving {stats_path.resolve()}...')
-        with open(stats_path, 'w') as f:
-            json.dump(stats, f)  # save stats.json
-    if verbose:
-        print(json.dumps(stats, indent=2, sort_keys=False))
-    return stats
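
A minimal usage sketch of the new class follows, mirroring the commit-message examples above. It assumes the yolov5 repo root is on PYTHONPATH (export PYTHONPATH="$PWD/yolov5" from the parent directory, per the docstring) and that coco128 is available locally or can be fetched via autodownload:
```
from utils.dataloaders import HUBDatasetStats

stats = HUBDatasetStats('coco128.yaml', autodownload=True)  # reads data.yaml, downloads dataset if missing
stats.get_json(save=True, verbose=True)  # builds per-split stats, writes <dataset>-hub/stats.json, prints it
stats.process_images()  # resizes images (longest side capped at max_dim=1920) into <dataset>-hub/images
```
get_json() also returns the stats dictionary, so callers can consume it directly without re-reading stats.json; process_images() returns the image directory it wrote.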
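
To make the JSON structure concrete, here is a toy illustration of what get_json() returns for a hypothetical 2-class dataset with a 3-image train split and no val/test splits; every value is invented for illustration:
```
# Toy illustration (all values invented) of the dictionary get_json() returns
example_stats = {
    'nc': 2,                       # class count, from data.yaml
    'names': ['cat', 'dog'],       # class names, from data.yaml
    'train': {
        'instance_stats': {
            'total': 5,            # total label instances in the split
            'per_class': [3, 2]},  # instance count per class
        'image_stats': {
            'total': 3,            # images in the split
            'unlabelled': 1,       # images with no labels
            'per_class': [2, 1]},  # images containing at least one instance of each class
        'labels': [                # per-image labels: [class, x, y, w, h], rounded to 4 decimals
            {'im1.jpg': [[0, 0.5, 0.5, 0.2, 0.3], [0, 0.1, 0.2, 0.05, 0.1]]},
            {'im2.jpg': [[0, 0.3, 0.4, 0.1, 0.1], [1, 0.7, 0.6, 0.4, 0.5], [1, 0.2, 0.8, 0.1, 0.2]]},
            {'im3.jpg': []}]},     # the unlabelled image
    'val': None,                   # split not defined in data.yaml
    'test': None}                  # split not defined in data.yaml
```
Note that a split missing from data.yaml is recorded as None rather than omitted, and label coordinates pass through _round(), i.e. round(x, 4).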