def autosplit(path='../coco128', weights=(0.9, 0.1, 0.0), annotated_only=False):
    """ Autosplit a dataset into train/val/test splits and save path/autosplit_*.txt files

    Usage: from utils.datasets import *; autosplit('../coco128')

    # Arguments
        path:           Path to images directory
        weights:        Train, val, test weights (list)
        annotated_only: Only use images with an annotated txt file

    Every image found is randomly assigned to one of the three splits before
    the annotation filter is applied, so with annotated_only=True the
    realised split sizes follow `weights` only approximately.
    """
    path = Path(path)  # images dir

    # Image files only: one recursive glob per extension listed in img_formats
    files = [f for img_ext in img_formats for f in path.rglob(f"*.{img_ext}")]
    n = len(files)  # number of files

    indices = random.choices([0, 1, 2], weights=weights, k=n)  # assign each image to a split

    txt = ['autosplit_train.txt', 'autosplit_val.txt', 'autosplit_test.txt']  # 3 txt files
    for x in txt:  # remove existing
        if (path / x).exists():
            (path / x).unlink()

    if annotated_only:
        print("Only annotated images with a .txt file associated will be used to create the dataset")

    # Gather the lines per split first so that each output file is opened once
    split_lines = [[] for _ in txt]
    for i, img in tqdm(zip(indices, files), total=n):
        # keep the image unless filtering is on and the path returned by img2label_paths is missing
        if not annotated_only or Path(img2label_paths([str(img)])[0]).exists():
            split_lines[i].append(str(img) + '\n')  # add image to split

    for name, entries in zip(txt, split_lines):
        if entries:  # a split file is only created when it has at least one image
            with open(path / name, 'a') as f:
                f.writelines(entries)