From 14a46601b2ed7239d0532973eb69df907661e3cb Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Tue, 1 Nov 2022 13:59:04 +0100 Subject: [PATCH 1/4] Add `min_items` filter option @AyushExel @Laughing-q dataset filter Signed-off-by: Glenn Jocher --- utils/dataloaders.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/utils/dataloaders.py b/utils/dataloaders.py index 403252ff6227..aaa8c5c45dcd 100644 --- a/utils/dataloaders.py +++ b/utils/dataloaders.py @@ -444,6 +444,7 @@ def __init__(self, single_cls=False, stride=32, pad=0.0, + min_items=0, prefix=''): self.img_size = img_size self.augment = augment @@ -475,7 +476,7 @@ def __init__(self, # self.img_files = sorted([x for x in f if x.suffix[1:].lower() in IMG_FORMATS]) # pathlib assert self.im_files, f'{prefix}No images found' except Exception as e: - raise Exception(f'{prefix}Error loading data from {path}: {e}\n{HELP_URL}') + raise Exception(f'{prefix}Error loading data from {path}: {e}\n{HELP_URL}') from e # Check cache self.label_files = img2label_paths(self.im_files) # labels @@ -505,6 +506,18 @@ def __init__(self, self.shapes = np.array(shapes) self.im_files = list(cache.keys()) # update self.label_files = img2label_paths(cache.keys()) # update + + # Filter images + if min_items: + include = np.array([len(x) > min_items for x in self.labels]).nonzero()[0].astype(int) + LOGGER.info(f'{prefix}{nf - len(include)}/{nf} images filtered from dataset') + self.im_files = [self.im_files[i] for i in include] + self.label_files = [self.label_files[i] for i in include] + self.labels = [self.labels[i] for i in include] + self.segments = [self.segments[i] for i in include] + self.shapes = self.shapes[include] # wh + + # Create indices n = len(shapes) # number of images bi = np.floor(np.arange(n) / batch_size).astype(int) # batch index nb = bi[-1] + 1 # number of batches From c2ef920d0e44c26d376f7d6779178a39a07d9f75 Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Tue, 1 Nov 2022 14:02:27 +0100 Subject: [PATCH 2/4] Update dataloaders.py Signed-off-by: Glenn Jocher --- utils/dataloaders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/dataloaders.py b/utils/dataloaders.py index aaa8c5c45dcd..6b6e83e30456 100644 --- a/utils/dataloaders.py +++ b/utils/dataloaders.py @@ -518,7 +518,7 @@ def __init__(self, self.shapes = self.shapes[include] # wh # Create indices - n = len(shapes) # number of images + n = len(self.shapes) # number of images bi = np.floor(np.arange(n) / batch_size).astype(int) # batch index nb = bi[-1] + 1 # number of batches self.batch = bi # batch index of image From 37c1bfb198b3883fd5e6382d23a59c60aafe2762 Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Tue, 1 Nov 2022 14:18:41 +0100 Subject: [PATCH 3/4] fix --- utils/segment/dataloaders.py | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/utils/segment/dataloaders.py b/utils/segment/dataloaders.py index a63d6ec013fd..aa7ba985d4fe 100644 --- a/utils/segment/dataloaders.py +++ b/utils/segment/dataloaders.py @@ -81,24 +81,25 @@ def create_dataloader(path, class LoadImagesAndLabelsAndMasks(LoadImagesAndLabels): # for training/testing def __init__( - self, - path, - img_size=640, - batch_size=16, - augment=False, - hyp=None, - rect=False, - image_weights=False, - cache_images=False, - single_cls=False, - stride=32, - pad=0, - prefix="", - downsample_ratio=1, - overlap=False, + self, + path, + img_size=640, + batch_size=16, + augment=False, + hyp=None, + rect=False, + image_weights=False, + cache_images=False, + single_cls=False, + stride=32, + pad=0, + min_items=0, + prefix="", + downsample_ratio=1, + overlap=False, ): super().__init__(path, img_size, batch_size, augment, hyp, rect, image_weights, cache_images, single_cls, - stride, pad, prefix) + stride, pad, min_items, prefix) self.downsample_ratio = downsample_ratio self.overlap = overlap From cb9d354e8091f3fba677af5daf478fb713560b4e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 1 Nov 2022 13:19:04 +0000 Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- utils/segment/dataloaders.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/utils/segment/dataloaders.py b/utils/segment/dataloaders.py index aa7ba985d4fe..9de6f0fbf903 100644 --- a/utils/segment/dataloaders.py +++ b/utils/segment/dataloaders.py @@ -81,22 +81,22 @@ def create_dataloader(path, class LoadImagesAndLabelsAndMasks(LoadImagesAndLabels): # for training/testing def __init__( - self, - path, - img_size=640, - batch_size=16, - augment=False, - hyp=None, - rect=False, - image_weights=False, - cache_images=False, - single_cls=False, - stride=32, - pad=0, - min_items=0, - prefix="", - downsample_ratio=1, - overlap=False, + self, + path, + img_size=640, + batch_size=16, + augment=False, + hyp=None, + rect=False, + image_weights=False, + cache_images=False, + single_cls=False, + stride=32, + pad=0, + min_items=0, + prefix="", + downsample_ratio=1, + overlap=False, ): super().__init__(path, img_size, batch_size, augment, hyp, rect, image_weights, cache_images, single_cls, stride, pad, min_items, prefix)