def autosplit(path='../coco128', weights=(0.9, 0.1, 0.0), annotated_only=False):
    """ Autosplit a dataset into train/val/test splits and save path/autosplit_*.txt files
    Usage: from utils.datasets import *; autosplit('../coco128')
    Arguments
        path:           Path to images directory (searched recursively)
        weights:        Train, val, test weights (list)
        annotated_only: Only use images with an annotated txt file
    Side effects
        Deletes any existing path/autosplit_{train,val,test}.txt, then writes one image path per line
        to the file of the split each image was assigned to. A split that receives no images gets no file.
    """
    path = Path(path)  # images dir

    # One recursive walk instead of one per extension. The suffix is compared case-insensitively so
    # files such as 'IMG.JPG' are not silently skipped on case-sensitive filesystems, and the result
    # is sorted so a seeded `random` yields a reproducible split.
    extensions = {ext.lower() for ext in img_formats}
    files = sorted(x for x in path.rglob('*.*') if x.suffix[1:].lower() in extensions)  # image files only
    n = len(files)  # number of files
    indices = random.choices([0, 1, 2], weights=weights, k=n)  # assign each image to a split

    txt = ['autosplit_train.txt', 'autosplit_val.txt', 'autosplit_test.txt']  # 3 txt files
    for x in txt:  # remove existing
        if (path / x).exists():
            (path / x).unlink()

    print(f'Autosplitting images from {path}' + (', using *.txt labeled images only' if annotated_only else ''))

    # Collect the lines per split first so each output file is opened once rather than once per image
    lines = [[] for _ in txt]
    for i, img in tqdm(zip(indices, files), total=n):
        if not annotated_only or Path(img2label_paths([str(img)])[0]).exists():  # check label
            lines[i].append(str(img) + '\n')  # add image to split

    for name, rows in zip(txt, lines):
        if rows:  # only create a file for splits that actually received images
            with open(path / name, 'a') as f:
                f.writelines(rows)