From 846fec6cc259beddd72b5504f37506162ad92a5d Mon Sep 17 00:00:00 2001
From: Glenn Jocher
Date: Wed, 28 Jul 2021 02:04:10 +0200
Subject: [PATCH] Train from `--data path/to/dataset.zip` feature (#4185)

* Train from `--data path/to/dataset.zip` feature

* Update dataset_stats()

* cleanup

* cleanup2
---
 data/{Argoverse_HD.yaml => Argoverse.yaml} |  2 +-
 hubconf.py                                 |  2 +-
 models/experimental.py                     |  2 +-
 train.py                                   | 11 ++--
 utils/datasets.py                          | 66 ++++++++++++++++------
 utils/{google_utils.py => downloads.py}    |  6 +-
 utils/general.py                           | 40 +++++++++----
 utils/loggers/wandb/wandb_utils.py         | 62 ++++++++++----------
 val.py                                     |  4 +-
 9 files changed, 122 insertions(+), 73 deletions(-)
 rename data/{Argoverse_HD.yaml => Argoverse.yaml} (97%)
 rename utils/{google_utils.py => downloads.py} (98%)

diff --git a/data/Argoverse_HD.yaml b/data/Argoverse.yaml
similarity index 97%
rename from data/Argoverse_HD.yaml
rename to data/Argoverse.yaml
index e379b1ec99df..c42624c5783f 100644
--- a/data/Argoverse_HD.yaml
+++ b/data/Argoverse.yaml
@@ -1,6 +1,6 @@
 # YOLOv5 🚀 by Ultralytics https://ultralytics.com, licensed under GNU GPL v3.0
 # Argoverse-HD dataset (ring-front-center camera) http://www.cs.cmu.edu/~mengtial/proj/streaming/
-# Example usage: python train.py --data Argoverse_HD.yaml
+# Example usage: python train.py --data Argoverse.yaml
 # parent
 # ├── yolov5
 # └── datasets
diff --git a/hubconf.py b/hubconf.py
index 55536c3a42f3..7ef512655ae2 100644
--- a/hubconf.py
+++ b/hubconf.py
@@ -27,7 +27,7 @@ def _create(name, pretrained=True, channels=3, classes=80, autoshape=True, verbo
     from models.yolo import Model, attempt_load
     from utils.general import check_requirements, set_logging
-    from utils.google_utils import attempt_download
+    from utils.downloads import attempt_download
     from utils.torch_utils import select_device
 
     file = Path(__file__).absolute()
diff --git a/models/experimental.py b/models/experimental.py
index 0d996d913b0c..276ca954b173 100644
--- a/models/experimental.py
+++ b/models/experimental.py
@@ -5,7 +5,7 @@
 import torch.nn as nn
 
 from models.common import Conv, DWConv
-from utils.google_utils import attempt_download
+from utils.downloads import attempt_download
 
 
 class CrossConv(nn.Module):
diff --git a/train.py b/train.py
index bd1fa9c74328..020883ce98ba 100644
--- a/train.py
+++ b/train.py
@@ -35,7 +35,7 @@
 from utils.general import labels_to_class_weights, increment_path, labels_to_image_weights, init_seeds, \
     strip_optimizer, get_latest_run, check_dataset, check_file, check_git_status, check_img_size, \
     check_requirements, print_mutation, set_logging, one_cycle, colorstr
-from utils.google_utils import attempt_download
+from utils.downloads import attempt_download
 from utils.loss import ComputeLoss
 from utils.plots import plot_labels, plot_evolution
 from utils.torch_utils import ModelEMA, select_device, intersect_dicts, torch_distributed_zero_first, de_parallel
@@ -78,9 +78,9 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
     plots = not evolve  # create plots
     cuda = device.type != 'cpu'
     init_seeds(1 + RANK)
-    with open(data, encoding='ascii', errors='ignore') as f:
-        data_dict = yaml.safe_load(f)
-
+    with torch_distributed_zero_first(RANK):
+        data_dict = check_dataset(data)  # check
+    train_path, val_path = data_dict['train'], data_dict['val']
     nc = 1 if single_cls else int(data_dict['nc'])  # number of classes
     names = ['item'] if single_cls and len(data_dict['names']) != 1 else data_dict['names']  # class names
     assert len(names) == nc, f'{len(names)} names found for nc={nc} dataset in {data}'  # check
@@ -106,9 +106,6 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
         LOGGER.info(f'Transferred {len(csd)}/{len(model.state_dict())} items from {weights}')  # report
     else:
         model = Model(cfg, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device)  # create
-    with torch_distributed_zero_first(RANK):
-        check_dataset(data_dict)  # check
-    train_path, val_path = data_dict['train'], data_dict['val']
 
     # Freeze
     freeze = []  # parameter names to freeze (full or partial)
diff --git a/utils/datasets.py b/utils/datasets.py
index 5b5ded4bbc41..fffe39a61459 100755
--- a/utils/datasets.py
+++ b/utils/datasets.py
@@ -884,11 +884,11 @@ def verify_image_label(args):
         return [None, None, None, None, nm, nf, ne, nc, msg]
 
 
-def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False):
+def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False, profile=False, hub=False):
     """ Return dataset statistics dictionary with images and instances counts per split per class
-    Usage1: from utils.datasets import *; dataset_stats('coco128.yaml', verbose=True)
-    Usage2: from utils.datasets import *; dataset_stats('../datasets/coco128.zip', verbose=True)
-
+    To run in parent directory: export PYTHONPATH="$PWD/yolov5"
+    Usage1: from utils.datasets import *; dataset_stats('coco128.yaml', autodownload=True)
+    Usage2: from utils.datasets import *; dataset_stats('../datasets/coco128_with_yaml.zip')
     Arguments
         path:           Path to data.yaml or data.zip (with data.yaml inside data.zip)
         autodownload:   Attempt to download dataset if not found locally
@@ -897,35 +897,42 @@ def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False):
 
     def round_labels(labels):
         # Update labels to integer class and 6 decimal place floats
-        return [[int(c), *[round(x, 6) for x in points]] for c, *points in labels]
+        return [[int(c), *[round(x, 4) for x in points]] for c, *points in labels]
 
     def unzip(path):
         # Unzip data.zip TODO: CONSTRAINT: path/to/abc.zip MUST unzip to 'path/to/abc/'
         if str(path).endswith('.zip'):  # path is data.zip
+            assert Path(path).is_file(), f'Error unzipping {path}, file not found'
             assert os.system(f'unzip -q {path} -d {path.parent}') == 0, f'Error unzipping {path}'
-            data_dir = path.with_suffix('')  # dataset directory
-            return True, data_dir, list(data_dir.rglob('*.yaml'))[0]  # zipped, data_dir, yaml_path
+            dir = path.with_suffix('')  # dataset directory
+            return True, str(dir), next(dir.rglob('*.yaml'))  # zipped, data_dir, yaml_path
         else:  # path is data.yaml
             return False, None, path
 
+    def hub_ops(f, max_dim=1920):
+        # HUB ops for 1 image 'f'
+        im = Image.open(f)
+        r = max_dim / max(im.height, im.width)  # ratio
+        if r < 1.0:  # image too large
+            im = im.resize((int(im.width * r), int(im.height * r)))
+        im.save(im_dir / Path(f).name, quality=75)  # save
+
     zipped, data_dir, yaml_path = unzip(Path(path))
     with open(check_file(yaml_path), encoding='ascii', errors='ignore') as f:
         data = yaml.safe_load(f)  # data dict
     if zipped:
         data['path'] = data_dir  # TODO: should this be dir.resolve()?
     check_dataset(data, autodownload)  # download dataset if missing
-    nc = data['nc']  # number of classes
-    stats = {'nc': nc, 'names': data['names']}  # statistics dictionary
+    hub_dir = Path(data['path'] + ('-hub' if hub else ''))
+    stats = {'nc': data['nc'], 'names': data['names']}  # statistics dictionary
     for split in 'train', 'val', 'test':
         if data.get(split) is None:
             stats[split] = None  # i.e. no test set
             continue
         x = []
-        dataset = LoadImagesAndLabels(data[split], augment=False, rect=True)  # load dataset
-        if split == 'train':
-            cache_path = Path(dataset.label_files[0]).parent.with_suffix('.cache')  # *.cache path
+        dataset = LoadImagesAndLabels(data[split])  # load dataset
         for label in tqdm(dataset.labels, total=dataset.n, desc='Statistics'):
-            x.append(np.bincount(label[:, 0].astype(int), minlength=nc))
+            x.append(np.bincount(label[:, 0].astype(int), minlength=data['nc']))
         x = np.array(x)  # shape(128x80)
         stats[split] = {'instance_stats': {'total': int(x.sum()), 'per_class': x.sum(0).tolist()},
                         'image_stats': {'total': dataset.n, 'unlabelled': int(np.all(x == 0, 1).sum()),
@@ -933,10 +940,37 @@ def unzip(path):
                         'labels': [{str(Path(k).name): round_labels(v.tolist())} for k, v in
                                    zip(dataset.img_files, dataset.labels)]}
 
+        if hub:
+            im_dir = hub_dir / 'images'
+            im_dir.mkdir(parents=True, exist_ok=True)
+            for _ in tqdm(ThreadPool(NUM_THREADS).imap(hub_ops, dataset.img_files), total=dataset.n, desc='HUB Ops'):
+                pass
+
+    # Profile
+    stats_path = hub_dir / 'stats.json'
+    if profile:
+        for _ in range(1):
+            file = stats_path.with_suffix('.npy')
+            t1 = time.time()
+            np.save(file, stats)
+            t2 = time.time()
+            x = np.load(file, allow_pickle=True)
+            print(f'stats.npy times: {time.time() - t2:.3f}s read, {t2 - t1:.3f}s write')
+
+            file = stats_path.with_suffix('.json')
+            t1 = time.time()
+            with open(file, 'w') as f:
+                json.dump(stats, f)  # save stats *.json
+            t2 = time.time()
+            with open(file, 'r') as f:
+                x = json.load(f)  # load stats dict
+            print(f'stats.json times: {time.time() - t2:.3f}s read, {t2 - t1:.3f}s write')
+
     # Save, print and return
-    with open(cache_path.with_suffix('.json'), 'w') as f:
-        json.dump(stats, f)  # save stats *.json
+    if hub:
+        print(f'Saving {stats_path.resolve()}...')
+        with open(stats_path, 'w') as f:
+            json.dump(stats, f)  # save stats.json
     if verbose:
         print(json.dumps(stats, indent=2, sort_keys=False))
-        # print(yaml.dump([stats], sort_keys=False, default_flow_style=False))
     return stats
diff --git a/utils/google_utils.py b/utils/downloads.py
similarity index 98%
rename from utils/google_utils.py
rename to utils/downloads.py
index aefc7de2db2e..00156962380b 100644
--- a/utils/google_utils.py
+++ b/utils/downloads.py
@@ -1,4 +1,4 @@
-# Google utils: https://cloud.google.com/storage/docs/reference/libraries
+# Download utils
 
 import os
 import platform
@@ -115,6 +115,10 @@ def get_token(cookie="./cookie"):
                 return line.split()[-1]
     return ""
 
+
+# Google utils: https://cloud.google.com/storage/docs/reference/libraries ----------------------------------------------
+#
+#
 # def upload_blob(bucket_name, source_file_name, destination_blob_name):
 #     # Uploads a file to a bucket
 #     # https://cloud.google.com/storage/docs/uploading-objects#storage-upload-object-python
diff --git a/utils/general.py b/utils/general.py
index db81f7679cd7..6b00ddf2ff72 100755
--- a/utils/general.py
+++ b/utils/general.py
@@ -24,7 +24,7 @@
 import torchvision
 import yaml
 
-from utils.google_utils import gsutil_getsize
+from utils.downloads import gsutil_getsize
 from utils.metrics import box_iou, fitness
 from utils.torch_utils import init_torch_seeds
 
@@ -224,16 +224,30 @@ def check_file(file):
 
 
 def check_dataset(data, autodownload=True):
-    # Download dataset if not found locally
-    path = Path(data.get('path', ''))  # optional 'path' field
-    if path:
-        for k in 'train', 'val', 'test':
-            if data.get(k):  # prepend path
-                data[k] = str(path / data[k]) if isinstance(data[k], str) else [str(path / x) for x in data[k]]
+    # Download and/or unzip dataset if not found locally
+    # Usage: https://github.com/ultralytics/yolov5/releases/download/v1.0/coco128_with_yaml.zip
+
+    # Download (optional)
+    extract_dir = ''
+    if isinstance(data, (str, Path)) and str(data).endswith('.zip'):  # i.e. gs://bucket/dir/coco128.zip
+        download(data, dir='../datasets', unzip=True, delete=False, curl=False, threads=1)
+        data = next((Path('../datasets') / Path(data).stem).rglob('*.yaml'))
+        extract_dir, autodownload = data.parent, False
+
+    # Read yaml (optional)
+    if isinstance(data, (str, Path)):
+        with open(data, encoding='ascii', errors='ignore') as f:
+            data = yaml.safe_load(f)  # dictionary
+
+    # Parse yaml
+    path = extract_dir or Path(data.get('path') or '')  # optional 'path' default to '.'
+    for k in 'train', 'val', 'test':
+        if data.get(k):  # prepend path
+            data[k] = str(path / data[k]) if isinstance(data[k], str) else [str(path / x) for x in data[k]]
 
     assert 'nc' in data, "Dataset 'nc' key missing."
     if 'names' not in data:
-        data['names'] = [str(i) for i in range(data['nc'])]  # assign class names if missing
+        data['names'] = [f'class{i}' for i in range(data['nc'])]  # assign class names if missing
     train, val, test, s = [data.get(x) for x in ('train', 'val', 'test', 'download')]
     if val:
         val = [Path(x).resolve() for x in (val if isinstance(val, list) else [val])]  # val path
@@ -256,13 +270,17 @@ def check_dataset(data, autodownload=True):
         else:
             raise Exception('Dataset not found.')
 
+    return data  # dictionary
+
 
 def download(url, dir='.', unzip=True, delete=True, curl=False, threads=1):
-    # Multi-threaded file download and unzip function
+    # Multi-threaded file download and unzip function, used in data.yaml for autodownload
     def download_one(url, dir):
         # Download 1 file
         f = dir / Path(url).name  # filename
-        if not f.exists():
+        if Path(url).is_file():  # exists in current path
+            Path(url).rename(f)  # move to dir
+        elif not f.exists():
             print(f'Downloading {url} to {f}...')
             if curl:
                 os.system(f"curl -L '{url}' -o '{f}' --retry 9 -C -")  # curl download, retry and resume on fail
@@ -286,7 +304,7 @@ def download_one(url, dir):
         pool.close()
         pool.join()
     else:
-        for u in tuple(url) if isinstance(url, str) else url:
+        for u in [url] if isinstance(url, (str, Path)) else url:
             download_one(u, dir)
diff --git a/utils/loggers/wandb/wandb_utils.py b/utils/loggers/wandb/wandb_utils.py
index cd5939155169..f4f228df4e24 100644
--- a/utils/loggers/wandb/wandb_utils.py
+++ b/utils/loggers/wandb/wandb_utils.py
@@ -100,7 +100,7 @@ class WandbLogger():
     """
 
     def __init__(self, opt, run_id, data_dict, job_type='Training'):
-        '''
+        """
         - Initialize WandbLogger instance
         - Upload dataset if opt.upload_dataset is True
         - Setup training processes if job_type is 'Training'
@@ -111,7 +111,7 @@ def __init__(self, opt, run_id, data_dict, job_type='Training'):
         data_dict (Dict) -- Dictionary containing info about the dataset to be used
         job_type (str) -- To set the job_type for this run
 
-        '''
+        """
         # Pre-training routine --
         self.job_type = job_type
         self.wandb, self.wandb_run = wandb, None if not wandb else wandb.run
@@ -157,7 +157,7 @@ def __init__(self, opt, run_id, data_dict, job_type='Training'):
                 self.data_dict = self.check_and_upload_dataset(opt)
 
     def check_and_upload_dataset(self, opt):
-        '''
+        """
         Check if the dataset format is compatible and upload it as W&B artifact
 
         arguments:
         opt (namespace) -- Commandline arguments for current run
 
         returns:
         Updated dataset info dictionary where local dataset paths are replaced by WANDB_ARTIFACT_PREFIX links.
-        '''
+        """
         assert wandb, 'Install wandb to upload dataset'
         config_path = self.log_dataset_artifact(check_file(opt.data),
                                                 opt.single_cls,
@@ -176,7 +176,7 @@
         return wandb_data_dict
 
     def setup_training(self, opt, data_dict):
-        '''
+        """
         Setup the necessary processes for training YOLO models:
           - Attempt to download model checkpoint and dataset artifacts if opt.resume starts with WANDB_ARTIFACT_PREFIX
           - Update data_dict, to contain info of previous run if resumed and the paths of dataset artifact if downloaded
@@ -188,7 +188,7 @@ def setup_training(self, opt, data_dict):
         returns:
         data_dict (Dict) -- contains the updated info about the dataset to be used for training
-        '''
+        """
         self.log_dict, self.current_epoch = {}, 0
         self.bbox_interval = opt.bbox_interval
         if isinstance(opt.resume, str):
@@ -224,7 +224,7 @@
         return data_dict
 
     def download_dataset_artifact(self, path, alias):
-        '''
+        """
        download the dataset artifact if the path starts with WANDB_ARTIFACT_PREFIX
 
        arguments:
@@ -234,7 +234,7 @@
        returns:
        (str, wandb.Artifact) -- path of the downloaded dataset and its corresponding artifact object if dataset
        is found otherwise returns (None, None)
-        '''
+        """
         if isinstance(path, str) and path.startswith(WANDB_ARTIFACT_PREFIX):
             artifact_path = Path(remove_prefix(path, WANDB_ARTIFACT_PREFIX) + ":" + alias)
             dataset_artifact = wandb.use_artifact(artifact_path.as_posix().replace("\\", "/"))
@@ -244,12 +244,12 @@
         return None, None
 
     def download_model_artifact(self, opt):
-        '''
+        """
        download the model checkpoint artifact if the resume path starts with WANDB_ARTIFACT_PREFIX
 
        arguments:
        opt (namespace) -- Commandline arguments for this run
-        '''
+        """
        if opt.resume.startswith(WANDB_ARTIFACT_PREFIX):
            model_artifact = wandb.use_artifact(remove_prefix(opt.resume, WANDB_ARTIFACT_PREFIX) + ":latest")
            assert model_artifact is not None, 'Error: W&B model artifact doesn\'t exist'
@@ -262,7 +262,7 @@
         return None, None
 
     def log_model(self, path, opt, epoch, fitness_score, best_model=False):
-        '''
+        """
        Log the model checkpoint as W&B artifact
 
        arguments:
@@ -271,7 +271,7 @@
        epoch (int) -- Current epoch number
        fitness_score (float) -- fitness score for current epoch
        best_model (boolean) -- Boolean representing if the current checkpoint is the best yet.
-        '''
+        """
        model_artifact = wandb.Artifact('run_' + wandb.run.id + '_model', type='model', metadata={
            'original_url': str(path),
            'epochs_trained': epoch + 1,
@@ -286,7 +286,7 @@
         print("Saving model artifact on epoch ", epoch + 1)
 
     def log_dataset_artifact(self, data_file, single_cls, project, overwrite_config=False):
-        '''
+        """
        Log the dataset as W&B artifact and return the new data file with W&B links
 
        arguments:
@@ -298,10 +298,8 @@ def log_dataset_artifact(self, data_file, single_cls, project, overwrite_config=
        returns:
        the new .yaml file with artifact links. It can be used to start training directly from artifacts
-        '''
-        with open(data_file, encoding='ascii', errors='ignore') as f:
-            data = yaml.safe_load(f)  # data dict
-        check_dataset(data)
+        """
+        data = check_dataset(data_file)  # parse and check
        nc, names = (1, ['item']) if single_cls else (int(data['nc']), data['names'])
        names = {k: v for k, v in enumerate(names)}  # to index dictionary
        self.train_artifact = self.create_dataset_table(LoadImagesAndLabels(
@@ -330,17 +328,17 @@
         return path
 
     def map_val_table_path(self):
-        '''
+        """
        Map the validation dataset Table, i.e. name of file -> its id in the W&B Table.
        Useful for referencing artifacts for evaluation.
-        '''
+        """
        self.val_table_path_map = {}
        print("Mapping dataset")
        for i, data in enumerate(tqdm(self.val_table.data)):
            self.val_table_path_map[data[3]] = data[0]
 
     def create_dataset_table(self, dataset, class_to_id, name='dataset'):
-        '''
+        """
        Create and return W&B artifact containing W&B Table of the dataset.
 
        arguments:
@@ -350,7 +348,7 @@
        returns:
        dataset artifact to be logged or used
-        '''
+        """
        # TODO: Explore multiprocessing to split this loop in parallel. This is essential for speeding up the logging
        artifact = wandb.Artifact(name=name, type="dataset")
        img_files = tqdm([dataset.path]) if isinstance(dataset.path, str) and Path(dataset.path).is_dir() else None
@@ -382,14 +380,14 @@
         return artifact
 
     def log_training_progress(self, predn, path, names):
-        '''
+        """
        Build evaluation Table. Uses reference from validation dataset table.
 
        arguments:
        predn (list): list of predictions in the native space in the format - [xmin, ymin, xmax, ymax, confidence, class]
        path (str): local path of the current evaluation image
        names (dict(int, str)): hash map that maps class ids to labels
-        '''
+        """
        class_set = wandb.Classes([{'id': id, 'name': name} for id, name in names.items()])
        box_data = []
        total_conf = 0
@@ -412,17 +410,17 @@
         )
 
     def val_one_image(self, pred, predn, path, names, im):
-        '''
+        """
        Log validation data for one image. Updates the result Table if the validation dataset is uploaded and logs the bbox media panel.
 
        arguments:
        pred (list): list of scaled predictions in the format - [xmin, ymin, xmax, ymax, confidence, class]
        predn (list): list of predictions in the native space - [xmin, ymin, xmax, ymax, confidence, class]
        path (str): local path of the current evaluation image
-        '''
+        """
        if self.val_table and self.result_table:  # Log Table if Val dataset is uploaded as artifact
            self.log_training_progress(predn, path, names)
-
+
        if len(self.bbox_media_panel_images) < self.max_imgs_to_log and self.current_epoch > 0:
            if self.current_epoch % self.bbox_interval == 0:
                box_data = [{"position": {"minX": xyxy[0], "minY": xyxy[1], "maxX": xyxy[2], "maxY": xyxy[3]},
@@ -434,23 +432,23 @@
             self.bbox_media_panel_images.append(wandb.Image(im, boxes=boxes, caption=path.name))
 
     def log(self, log_dict):
-        '''
+        """
        save the metrics to the logging dictionary
 
        arguments:
        log_dict (Dict) -- metrics/media to be logged in current step
-        '''
+        """
        if self.wandb_run:
            for key, value in log_dict.items():
                self.log_dict[key] = value
 
     def end_epoch(self, best_result=False):
-        '''
+        """
        commit the log_dict, model artifacts and Tables to W&B and flush the log_dict.
 
        arguments:
        best_result (boolean): Boolean representing if the result of this evaluation is best or not
-        '''
+        """
        if self.wandb_run:
            with all_logging_disabled():
                if self.bbox_media_panel_images:
@@ -468,9 +466,9 @@
             self.result_artifact = wandb.Artifact("run_" + wandb.run.id + "_progress", "evaluation")
 
     def finish_run(self):
-        '''
+        """
        Log metrics if any and finish the current W&B run
-        '''
+        """
        if self.wandb_run:
            if self.log_dict:
                with all_logging_disabled():
diff --git a/val.py b/val.py
index c58bcdb209c2..ee2287644b92 100644
--- a/val.py
+++ b/val.py
@@ -123,9 +123,7 @@ def run(data,
         #     model = nn.DataParallel(model)
 
     # Data
-    with open(data, encoding='ascii', errors='ignore') as f:
-        data = yaml.safe_load(f)
-    check_dataset(data)  # check
+    data = check_dataset(data)  # check
 
     # Half
     half &= device.type != 'cpu'  # half precision only supported on CUDA
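
Usage sketch (illustrative, not part of the patch). The dataset name and paths below are hypothetical; the one hard constraint, per the unzip() TODO in utils/datasets.py above, is that path/to/abc.zip must unzip to path/to/abc/ with a dataset *.yaml inside:

    # Python sketch; assumes a local coco128_with_yaml.zip laid out per the constraint above
    from utils.general import check_dataset

    # check_dataset() now accepts a *.zip path, a *.yaml path, or an already-parsed dict,
    # and returns the parsed dataset dictionary in every case
    data_dict = check_dataset('../datasets/coco128_with_yaml.zip')  # downloads/unzips if needed
    print(data_dict['nc'], data_dict['names'])   # class count and class names
    print(data_dict['train'], data_dict['val'])  # split paths, prefixed with the extract directory

The same zip can be passed straight to the command-line entry points, e.g. `python train.py --data path/to/dataset.zip` or `python val.py --data path/to/dataset.zip --weights yolov5s.pt`, since both now call check_dataset() directly.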