Added user option to specify dataset download path #4774

Closed · wants to merge 21 commits

2 changes: 1 addition & 1 deletion requirements.txt
@@ -4,7 +4,7 @@
 matplotlib>=3.2.2
 numpy>=1.18.5
 opencv-python>=4.1.2
-Pillow>=8.0.0
+#Pillow>=8.0.0
 PyYAML>=5.3.1
 scipy>=1.4.1
 torch>=1.7.0
8 changes: 7 additions & 1 deletion train.py
@@ -97,7 +97,7 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
     cuda = device.type != 'cpu'
     init_seeds(1 + RANK)
     with torch_distributed_zero_first(RANK):
-        data_dict = data_dict or check_dataset(data)  # check if None
+        data_dict = data_dict or (check_dataset(data, opt.sandbox) if opt.sandbox else check_dataset(data))  # check if None, check for sandbox dir
     train_path, val_path = data_dict['train'], data_dict['val']
     nc = 1 if single_cls else int(data_dict['nc'])  # number of classes
     names = ['item'] if single_cls and len(data_dict['names']) != 1 else data_dict['names']  # class names
@@ -451,6 +451,7 @@ def parse_opt(known=False):
     parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode')
     parser.add_argument('--workers', type=int, default=8, help='maximum number of dataloader workers')
     parser.add_argument('--project', default='runs/train', help='save to project/name')
+    parser.add_argument('--sandbox', default=None, help='save all files including datasets here, overwrites project')
     parser.add_argument('--entity', default=None, help='W&B entity')
     parser.add_argument('--name', default='exp', help='save to project/name')
     parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment')
@@ -490,6 +491,11 @@ def main(opt, callbacks=Callbacks()):
     if opt.evolve:
         opt.project = 'runs/evolve'
         opt.exist_ok = opt.resume
+
+    # Overwrite project if sandbox is set
+    if opt.sandbox:
+        opt.project = opt.sandbox
+
     opt.save_dir = str(increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok))
 
     # DDP mode
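
A minimal usage sketch of the new option (the sandbox path is illustrative, not part of the PR):

    python train.py --data coco128.yaml --weights yolov5s.pt --epochs 3 --sandbox /mnt/scratch/yolov5_sandbox

With --sandbox set, main() replaces opt.project with the sandbox path, so increment_path() creates the run directory under /mnt/scratch/yolov5_sandbox/exp*, and train() calls check_dataset(data, opt.sandbox), so an auto-downloaded dataset is unpacked under the same directory instead of the default ../datasets.
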
76 changes: 38 additions & 38 deletions utils/datasets.py
@@ -862,46 +862,46 @@ def verify_image_label(args):
# Verify one image-label pair
im_file, lb_file, prefix = args
nm, nf, ne, nc, msg, segments = 0, 0, 0, 0, '', [] # number (missing, found, empty, corrupt), message, segments
try:
# verify images
im = Image.open(im_file)
im.verify() # PIL verify
shape = exif_size(im) # image size
assert (shape[0] > 9) & (shape[1] > 9), f'image size {shape} <10 pixels'
assert im.format.lower() in IMG_FORMATS, f'invalid image format {im.format}'
if im.format.lower() in ('jpg', 'jpeg'):
with open(im_file, 'rb') as f:
f.seek(-2, 2)
if f.read() != b'\xff\xd9': # corrupt JPEG
Image.open(im_file).save(im_file, format='JPEG', subsampling=0, quality=100) # re-save image
msg = f'{prefix}WARNING: corrupt JPEG restored and saved {im_file}'

# verify labels
if os.path.isfile(lb_file):
nf = 1 # label found
with open(lb_file, 'r') as f:
l = [x.split() for x in f.read().strip().splitlines() if len(x)]
if any([len(x) > 8 for x in l]): # is segment
classes = np.array([x[0] for x in l], dtype=np.float32)
segments = [np.array(x[1:], dtype=np.float32).reshape(-1, 2) for x in l] # (cls, xy1...)
l = np.concatenate((classes.reshape(-1, 1), segments2boxes(segments)), 1) # (cls, xywh)
l = np.array(l, dtype=np.float32)
if len(l):
assert l.shape[1] == 5, 'labels require 5 columns each'
assert (l >= 0).all(), 'negative labels'
assert (l[:, 1:] <= 1).all(), 'non-normalized or out of bounds coordinate labels'
assert np.unique(l, axis=0).shape[0] == l.shape[0], 'duplicate labels'
else:
ne = 1 # label empty
l = np.zeros((0, 5), dtype=np.float32)
# try:
# verify images
im = Image.open(im_file)
im.verify() # PIL verify
shape = exif_size(im) # image size
assert (shape[0] > 9) & (shape[1] > 9), f'image size {shape} <10 pixels'
assert im.format.lower() in IMG_FORMATS, f'invalid image format {im.format}'
if im.format.lower() in ('jpg', 'jpeg'):
with open(im_file, 'rb') as f:
f.seek(-2, 2)
if f.read() != b'\xff\xd9': # corrupt JPEG
Image.open(im_file).save(im_file, format='JPEG', subsampling=0, quality=100) # re-save image
msg = f'{prefix}WARNING: corrupt JPEG restored and saved {im_file}'

# verify labels
if os.path.isfile(lb_file):
nf = 1 # label found
with open(lb_file, 'r') as f:
l = [x.split() for x in f.read().strip().splitlines() if len(x)]
if any([len(x) > 8 for x in l]): # is segment
classes = np.array([x[0] for x in l], dtype=np.float32)
segments = [np.array(x[1:], dtype=np.float32).reshape(-1, 2) for x in l] # (cls, xy1...)
l = np.concatenate((classes.reshape(-1, 1), segments2boxes(segments)), 1) # (cls, xywh)
l = np.array(l, dtype=np.float32)
if len(l):
assert l.shape[1] == 5, 'labels require 5 columns each'
assert (l >= 0).all(), 'negative labels'
assert (l[:, 1:] <= 1).all(), 'non-normalized or out of bounds coordinate labels'
assert np.unique(l, axis=0).shape[0] == l.shape[0], 'duplicate labels'
else:
nm = 1 # label missing
ne = 1 # label empty
l = np.zeros((0, 5), dtype=np.float32)
return im_file, l, shape, segments, nm, nf, ne, nc, msg
except Exception as e:
nc = 1
msg = f'{prefix}WARNING: Ignoring corrupted image and/or label {im_file}: {e}'
return [None, None, None, None, nm, nf, ne, nc, msg]
else:
nm = 1 # label missing
l = np.zeros((0, 5), dtype=np.float32)
return im_file, l, shape, segments, nm, nf, ne, nc, msg
# except Exception as e:
# nc = 1
# msg = f'{prefix}WARNING: Ignoring corrupted image and/or label {im_file}: {e}'
# return [None, None, None, None, nm, nf, ne, nc, msg]


def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False, profile=False, hub=False):
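
With the try/except commented out above, an exception raised by a corrupt image or label now propagates out of verify_image_label() instead of being converted into a warning row. A hedged sketch of how a caller could restore the old behaviour without touching the function (the wrapper name is hypothetical, not part of the PR):

    from utils.datasets import verify_image_label

    def verify_image_label_safe(args):
        # Sketch: reproduce the removed except-branch outside the function so that
        # one corrupt image/label pair is skipped instead of stopping the scan.
        im_file, lb_file, prefix = args
        try:
            return verify_image_label(args)
        except Exception as e:
            nm, nf, ne, nc = 0, 0, 0, 1  # number (missing, found, empty, corrupt)
            msg = f'{prefix}WARNING: Ignoring corrupted image and/or label {im_file}: {e}'
            return [None, None, None, None, nm, nf, ne, nc, msg]
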
32 changes: 21 additions & 11 deletions utils/general.py
@@ -300,42 +300,52 @@ def check_file(file, suffix=''):
     return files[0]  # return file
 
 
-def check_dataset(data, autodownload=True):
+def check_dataset(data, save_root="../datasets", autodownload=True):
     # Download and/or unzip dataset if not found locally
     # Usage: https://github.com/ultralytics/yolov5/releases/download/v1.0/coco128_with_yaml.zip
 
     # Download (optional)
     extract_dir = ''
     if isinstance(data, (str, Path)) and str(data).endswith('.zip'):  # i.e. gs://bucket/dir/coco128.zip
-        download(data, dir='../datasets', unzip=True, delete=False, curl=False, threads=1)
-        data = next((Path('../datasets') / Path(data).stem).rglob('*.yaml'))
+        download(data, dir=save_root, unzip=True, delete=False, curl=False, threads=1)
+        data = next((Path(save_root) / Path(data).stem).rglob('*.yaml'))
         extract_dir, autodownload = data.parent, False
 
     # Read yaml (optional)
     if isinstance(data, (str, Path)):
         with open(data, errors='ignore') as f:
             data = yaml.safe_load(f)  # dictionary
 
+    assert 'nc' in data, "Dataset 'nc' key missing."
+    if 'names' not in data:
+        data['names'] = [f'class{i}' for i in range(data['nc'])]  # assign class names if missing
+    train, val, test, s = [data.get(x) for x in ('train', 'val', 'test', 'download')]
+
+    # Define path by priority: 1.Custom by save_root argument, 2.YAML.path, 3.Default
+    is_default_root = save_root == '../datasets'  # Returns bool
+    has_yaml_path = data and data.get('path')
+
+    if is_default_root and has_yaml_path:
+        # Set path as yaml path
+        path = Path(data.get('path'))
+    else:
+        path = Path(save_root) / Path(s).stem  # append dataset subdir to save_root
+
     # Parse yaml
-    path = extract_dir or Path(data.get('path') or '')  # optional 'path' default to '.'
     for k in 'train', 'val', 'test':
         if data.get(k):  # prepend path
             data[k] = str(path / data[k]) if isinstance(data[k], str) else [str(path / x) for x in data[k]]
 
-    assert 'nc' in data, "Dataset 'nc' key missing."
-    if 'names' not in data:
-        data['names'] = [f'class{i}' for i in range(data['nc'])]  # assign class names if missing
-    train, val, test, s = [data.get(x) for x in ('train', 'val', 'test', 'download')]
     if val:
-        val = [Path(x).resolve() for x in (val if isinstance(val, list) else [val])]  # val path
+        val = [Path(path / x).resolve() for x in (val if isinstance(val, list) else [val])]  # val path
         if not all(x.exists() for x in val):
             print('\nWARNING: Dataset not found, nonexistent paths: %s' % [str(x) for x in val if not x.exists()])
             if s and autodownload:  # download script
                 if s.startswith('http') and s.endswith('.zip'):  # URL
                     f = Path(s).name  # filename
-                    print(f'Downloading {s} ...')
+                    print(f'Downloading {s} to {f}...')
                     torch.hub.download_url_to_file(s, f)
-                    root = path.parent if 'path' in data else '..'  # unzip directory i.e. '../'
+                    root = path.parent  # unzip directory i.e. '../'
                     Path(root).mkdir(parents=True, exist_ok=True)  # create root
                     r = os.system(f'unzip -q {f} -d {root} && rm {f}')  # unzip
                 elif s.startswith('bash '):  # bash script
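
To make the new path priority concrete, here is a minimal sketch of how the resolved dataset root changes with save_root (the helper name and directory values are illustrative, not code from the PR):

    from pathlib import Path

    def resolve_root(save_root, yaml_path, download_url):
        # Mirrors the priority comment above: 1. custom save_root, 2. yaml 'path', 3. default
        if save_root == '../datasets' and yaml_path:        # default root -> trust the yaml 'path'
            return Path(yaml_path)
        return Path(save_root) / Path(download_url).stem     # custom root -> <save_root>/<zip stem>

    print(resolve_root('../datasets', '../datasets/coco128', 'https://ultralytics.com/assets/coco128.zip'))
    # ../datasets/coco128  (unchanged default behaviour)
    print(resolve_root('/mnt/scratch/yolo', '../datasets/coco128', 'https://ultralytics.com/assets/coco128.zip'))
    # /mnt/scratch/yolo/coco128  (dataset lands under the custom root)
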