Added user option to specify dataset download path #4774

Closed · wants to merge 21 commits

2 changes: 1 addition & 1 deletion requirements.txt
@@ -4,7 +4,7 @@
 matplotlib>=3.2.2
 numpy>=1.18.5
 opencv-python>=4.1.2
-Pillow>=8.0.0
+#Pillow>=8.0.0
 PyYAML>=5.3.1
 scipy>=1.4.1
 torch>=1.7.0
8 changes: 7 additions & 1 deletion train.py
@@ -97,7 +97,7 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
     cuda = device.type != 'cpu'
     init_seeds(1 + RANK)
     with torch_distributed_zero_first(RANK):
-        data_dict = data_dict or check_dataset(data)  # check if None
+        data_dict = data_dict or (check_dataset(data, opt.sandbox) if opt.sandbox else check_dataset(data))  # check if None, check for sandbox dir
     train_path, val_path = data_dict['train'], data_dict['val']
     nc = 1 if single_cls else int(data_dict['nc'])  # number of classes
     names = ['item'] if single_cls and len(data_dict['names']) != 1 else data_dict['names']  # class names
@@ -451,6 +451,7 @@ def parse_opt(known=False):
     parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode')
     parser.add_argument('--workers', type=int, default=8, help='maximum number of dataloader workers')
     parser.add_argument('--project', default='runs/train', help='save to project/name')
+    parser.add_argument('--sandbox', default=None, help='save all files including datasets here, overwrites project')
     parser.add_argument('--entity', default=None, help='W&B entity')
     parser.add_argument('--name', default='exp', help='save to project/name')
     parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment')
@@ -490,6 +491,11 @@ def main(opt, callbacks=Callbacks()):
     if opt.evolve:
         opt.project = 'runs/evolve'
         opt.exist_ok = opt.resume
+
+    # Overwrite project if sandbox is set
+    if opt.sandbox:
+        opt.project = opt.sandbox
+
     opt.save_dir = str(increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok))
 
     # DDP mode
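
A minimal usage sketch of the new option (the sandbox path is illustrative, not part of the PR):

    python train.py --data coco128.yaml --weights yolov5s.pt --epochs 3 --sandbox /mnt/scratch/yolov5_sandbox

With --sandbox set, main() replaces opt.project with the sandbox path, so increment_path() creates the run directory under /mnt/scratch/yolov5_sandbox/exp*, and train() calls check_dataset(data, opt.sandbox), so an auto-downloaded dataset is unpacked under the same directory instead of the default ../datasets.
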
76 changes: 38 additions & 38 deletions utils/datasets.py
@@ -862,46 +862,46 @@ def verify_image_label(args):
# Verify one image-label pair
im_file, lb_file, prefix = args
nm, nf, ne, nc, msg, segments = 0, 0, 0, 0, '', [] # number (missing, found, empty, corrupt), message, segments
try:
# verify images
im = Image.open(im_file)
im.verify() # PIL verify
shape = exif_size(im) # image size
assert (shape[0] > 9) & (shape[1] > 9), f'image size {shape} <10 pixels'
assert im.format.lower() in IMG_FORMATS, f'invalid image format {im.format}'
if im.format.lower() in ('jpg', 'jpeg'):
with open(im_file, 'rb') as f:
f.seek(-2, 2)
if f.read() != b'\xff\xd9': # corrupt JPEG
Image.open(im_file).save(im_file, format='JPEG', subsampling=0, quality=100) # re-save image
msg = f'{prefix}WARNING: corrupt JPEG restored and saved {im_file}'

# verify labels
if os.path.isfile(lb_file):
nf = 1 # label found
with open(lb_file, 'r') as f:
l = [x.split() for x in f.read().strip().splitlines() if len(x)]
if any([len(x) > 8 for x in l]): # is segment
classes = np.array([x[0] for x in l], dtype=np.float32)
segments = [np.array(x[1:], dtype=np.float32).reshape(-1, 2) for x in l] # (cls, xy1...)
l = np.concatenate((classes.reshape(-1, 1), segments2boxes(segments)), 1) # (cls, xywh)
l = np.array(l, dtype=np.float32)
if len(l):
assert l.shape[1] == 5, 'labels require 5 columns each'
assert (l >= 0).all(), 'negative labels'
assert (l[:, 1:] <= 1).all(), 'non-normalized or out of bounds coordinate labels'
assert np.unique(l, axis=0).shape[0] == l.shape[0], 'duplicate labels'
else:
ne = 1 # label empty
l = np.zeros((0, 5), dtype=np.float32)
# try:
# verify images
im = Image.open(im_file)
im.verify() # PIL verify
shape = exif_size(im) # image size
assert (shape[0] > 9) & (shape[1] > 9), f'image size {shape} <10 pixels'
assert im.format.lower() in IMG_FORMATS, f'invalid image format {im.format}'
if im.format.lower() in ('jpg', 'jpeg'):
with open(im_file, 'rb') as f:
f.seek(-2, 2)
if f.read() != b'\xff\xd9': # corrupt JPEG
Image.open(im_file).save(im_file, format='JPEG', subsampling=0, quality=100) # re-save image
msg = f'{prefix}WARNING: corrupt JPEG restored and saved {im_file}'

# verify labels
if os.path.isfile(lb_file):
nf = 1 # label found
with open(lb_file, 'r') as f:
l = [x.split() for x in f.read().strip().splitlines() if len(x)]
if any([len(x) > 8 for x in l]): # is segment
classes = np.array([x[0] for x in l], dtype=np.float32)
segments = [np.array(x[1:], dtype=np.float32).reshape(-1, 2) for x in l] # (cls, xy1...)
l = np.concatenate((classes.reshape(-1, 1), segments2boxes(segments)), 1) # (cls, xywh)
l = np.array(l, dtype=np.float32)
if len(l):
assert l.shape[1] == 5, 'labels require 5 columns each'
assert (l >= 0).all(), 'negative labels'
assert (l[:, 1:] <= 1).all(), 'non-normalized or out of bounds coordinate labels'
assert np.unique(l, axis=0).shape[0] == l.shape[0], 'duplicate labels'
else:
nm = 1 # label missing
ne = 1 # label empty
l = np.zeros((0, 5), dtype=np.float32)
return im_file, l, shape, segments, nm, nf, ne, nc, msg
except Exception as e:
nc = 1
msg = f'{prefix}WARNING: Ignoring corrupted image and/or label {im_file}: {e}'
return [None, None, None, None, nm, nf, ne, nc, msg]
else:
nm = 1 # label missing
l = np.zeros((0, 5), dtype=np.float32)
return im_file, l, shape, segments, nm, nf, ne, nc, msg
# except Exception as e:
# nc = 1
# msg = f'{prefix}WARNING: Ignoring corrupted image and/or label {im_file}: {e}'
# return [None, None, None, None, nm, nf, ne, nc, msg]


def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False, profile=False, hub=False):
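
With the try/except commented out above, an exception raised by a corrupt image or label now propagates out of verify_image_label() instead of being converted into a warning row. A hedged sketch of how a caller could restore the old behaviour without touching the function (the wrapper name is hypothetical, not part of the PR):

    from utils.datasets import verify_image_label

    def verify_image_label_safe(args):
        # Sketch: reproduce the removed except-branch outside the function so that
        # one corrupt image/label pair is skipped instead of stopping the scan.
        im_file, lb_file, prefix = args
        try:
            return verify_image_label(args)
        except Exception as e:
            nm, nf, ne, nc = 0, 0, 0, 1  # number (missing, found, empty, corrupt)
            msg = f'{prefix}WARNING: Ignoring corrupted image and/or label {im_file}: {e}'
            return [None, None, None, None, nm, nf, ne, nc, msg]
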
32 changes: 21 additions & 11 deletions utils/general.py
@@ -300,42 +300,52 @@ def check_file(file, suffix=''):
     return files[0]  # return file
 
 
-def check_dataset(data, autodownload=True):
+def check_dataset(data, save_root="../datasets", autodownload=True):
     # Download and/or unzip dataset if not found locally
     # Usage: https://github.com/ultralytics/yolov5/releases/download/v1.0/coco128_with_yaml.zip
 
     # Download (optional)
     extract_dir = ''
     if isinstance(data, (str, Path)) and str(data).endswith('.zip'):  # i.e. gs://bucket/dir/coco128.zip
-        download(data, dir='../datasets', unzip=True, delete=False, curl=False, threads=1)
-        data = next((Path('../datasets') / Path(data).stem).rglob('*.yaml'))
+        download(data, dir=save_root, unzip=True, delete=False, curl=False, threads=1)
+        data = next((Path(save_root) / Path(data).stem).rglob('*.yaml'))
         extract_dir, autodownload = data.parent, False
 
     # Read yaml (optional)
     if isinstance(data, (str, Path)):
         with open(data, errors='ignore') as f:
             data = yaml.safe_load(f)  # dictionary
 
+    assert 'nc' in data, "Dataset 'nc' key missing."
+    if 'names' not in data:
+        data['names'] = [f'class{i}' for i in range(data['nc'])]  # assign class names if missing
+    train, val, test, s = [data.get(x) for x in ('train', 'val', 'test', 'download')]
+
+    # Define path by priority: 1.Custom by save_root argument, 2.YAML.path, 3.Default
+    is_default_root = save_root == '../datasets'  # Returns bool
+    has_yaml_path = data and data.get('path')
+
+    if is_default_root and has_yaml_path:
+        # Set path as yaml path
+        path = Path(data.get('path'))
+    else:
+        path = Path(save_root) / Path(s).stem  # append dataset subdir to save_root
+
     # Parse yaml
-    path = extract_dir or Path(data.get('path') or '')  # optional 'path' default to '.'
     for k in 'train', 'val', 'test':
         if data.get(k):  # prepend path
             data[k] = str(path / data[k]) if isinstance(data[k], str) else [str(path / x) for x in data[k]]
 
-    assert 'nc' in data, "Dataset 'nc' key missing."
-    if 'names' not in data:
-        data['names'] = [f'class{i}' for i in range(data['nc'])]  # assign class names if missing
-    train, val, test, s = [data.get(x) for x in ('train', 'val', 'test', 'download')]
     if val:
-        val = [Path(x).resolve() for x in (val if isinstance(val, list) else [val])]  # val path
+        val = [Path(path / x).resolve() for x in (val if isinstance(val, list) else [val])]  # val path
         if not all(x.exists() for x in val):
             print('\nWARNING: Dataset not found, nonexistent paths: %s' % [str(x) for x in val if not x.exists()])
             if s and autodownload:  # download script
                 if s.startswith('http') and s.endswith('.zip'):  # URL
                     f = Path(s).name  # filename
-                    print(f'Downloading {s} ...')
+                    print(f'Downloading {s} to {f}...')
                     torch.hub.download_url_to_file(s, f)
-                    root = path.parent if 'path' in data else '..'  # unzip directory i.e. '../'
+                    root = path.parent  # unzip directory i.e. '../'
                     Path(root).mkdir(parents=True, exist_ok=True)  # create root
                     r = os.system(f'unzip -q {f} -d {root} && rm {f}')  # unzip
                 elif s.startswith('bash '):  # bash script
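
To make the new path priority concrete, here is a minimal sketch of how the resolved dataset root changes with save_root (the helper name and directory values are illustrative, not code from the PR):

    from pathlib import Path

    def resolve_root(save_root, yaml_path, download_url):
        # Mirrors the priority comment above: 1. custom save_root, 2. yaml 'path', 3. default
        if save_root == '../datasets' and yaml_path:        # default root -> trust the yaml 'path'
            return Path(yaml_path)
        return Path(save_root) / Path(download_url).stem     # custom root -> <save_root>/<zip stem>

    print(resolve_root('../datasets', '../datasets/coco128', 'https://ultralytics.com/assets/coco128.zip'))
    # ../datasets/coco128  (unchanged default behaviour)
    print(resolve_root('/mnt/scratch/yolo', '../datasets/coco128', 'https://ultralytics.com/assets/coco128.zip'))
    # /mnt/scratch/yolo/coco128  (dataset lands under the custom root)
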