assign test split for folder dataset (#220)

* define test split for folder
* assign normal_test
* dir names fixed
* fix test
openvinotoolkit · Apr 12, 2022 · 2da55ae · 2da55ae
1 parent 487ff45
commit 2da55ae
Show file tree

Hide file tree

Showing 3 changed files with 71 additions and 35 deletions.
diff --git a/anomalib/data/__init__.py b/anomalib/data/__init__.py
@@ -69,9 +69,10 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> LightningDataModule
     elif config.dataset.format.lower() == "folder":
         datamodule = FolderDataModule(
             root=config.dataset.path,
-            normal=config.dataset.normal,
-            abnormal=config.dataset.abnormal,
+            normal_dir=config.dataset.normal_dir,
+            abnormal_dir=config.dataset.abnormal_dir,
             task=config.dataset.task,
+            normal_test_dir=config.dataset.normal_test_dir,
             mask_dir=config.dataset.mask,
             extensions=config.dataset.extensions,
             split_ratio=config.dataset.split_ratio,

diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py
@@ -57,9 +57,37 @@ def _check_and_convert_path(path: Union[str, Path]) -> Path:
     return path
 
 
+def _prepare_files_labels(
+    path: Union[str, Path], path_type: str, extensions: Optional[Tuple[str, ...]] = None
+) -> Tuple[list, list]:
+    """Return the image files under ``path`` with one label each; raise ``RuntimeError`` if none are found.
+
+    Args:
+        path (Union[str, Path]): Path to the directory containing images. It is searched recursively.
+        path_type (str): Label given to every file found ("normal", "abnormal", "normal_test").
+        extensions (Optional[Tuple[str, ...]], optional): File suffixes to accept (exact match on
+            ``Path.suffix``). ``IMG_EXTENSIONS`` is used when this is ``None``.
+
+    Returns:
+        List, List: Paths of the matching files, and ``path_type`` repeated once per matching file.
+    """
+    path = _check_and_convert_path(path)
+    if extensions is None:
+        extensions = IMG_EXTENSIONS
+
+    filenames = [f for f in path.glob(r"**/*") if f.suffix in extensions]
+    if len(filenames) == 0:
+        raise RuntimeError(f"Found 0 {path_type} images in {path}")
+
+    labels = [path_type] * len(filenames)
+
+    return filenames, labels
+
+
 def make_dataset(
     normal_dir: Union[str, Path],
     abnormal_dir: Union[str, Path],
+    normal_test_dir: Optional[Union[str, Path]] = None,
     mask_dir: Optional[Union[str, Path]] = None,
     split: Optional[str] = None,
     split_ratio: float = 0.2,
@@ -72,6 +100,9 @@ def make_dataset(
     Args:
         normal_dir (Union[str, Path]): Path to the directory containing normal images.
         abnormal_dir (Union[str, Path]): Path to the directory containing abnormal images.
+        normal_test_dir (Optional[Union[str, Path]], optional): Path to the directory containing
+            normal images for the test dataset. Normal test images will be a split of `normal_dir`
+            if `None`. Defaults to None.
         mask_dir (Optional[Union[str, Path]], optional): Path to the directory containing
             the mask annotations. Defaults to None.
         split (Optional[str], optional): Dataset split (ie., either train or test). Defaults to None.
@@ -87,40 +118,31 @@ def make_dataset(
     Returns:
         DataFrame: an output dataframe containing samples for the requested split (ie., train or test)
     """
-    normal_dir = _check_and_convert_path(normal_dir)
-    abnormal_dir = _check_and_convert_path(abnormal_dir)
-
-    if extensions is None:
-        extensions = IMG_EXTENSIONS
-
-    # Get filenames from normal and abnormal directory.
-    normal_filenames = [f for f in normal_dir.glob(r"**/*") if f.suffix in extensions]
-    abnormal_filenames = [f for f in abnormal_dir.glob(r"**/*") if f.suffix in extensions]
-    filenames = normal_filenames + abnormal_filenames
 
-    if len(normal_filenames) == 0:
-        raise RuntimeError(f"Found 0 normal images in {normal_dir}")
+    filenames = []
+    labels = []
+    dirs = {"normal": normal_dir, "abnormal": abnormal_dir}
 
-    if len(abnormal_filenames) == 0:
-        raise RuntimeError(f"Found 0 annormal images in {abnormal_dir}")
+    if normal_test_dir:
+        dirs = {**dirs, **{"normal_test": normal_test_dir}}
 
-    # Add normal and abnormal labels to the samples as `label` column.
-    normal_labels = ["normal"] * len(normal_filenames)
-    abnormal_labels = ["abnormal"] * len(abnormal_filenames)
-    labels = normal_labels + abnormal_labels
+    for dir_type, path in dirs.items():
+        filename, label = _prepare_files_labels(path, dir_type, extensions)
+        filenames += filename
+        labels += label
 
     samples = DataFrame({"image_path": filenames, "label": labels})
 
     # Create label index for normal (0) and abnormal (1) images.
-    samples.loc[(samples.label == "normal"), "label_index"] = 0
+    samples.loc[(samples.label == "normal") | (samples.label == "normal_test"), "label_index"] = 0
     samples.loc[(samples.label == "abnormal"), "label_index"] = 1
     samples.label_index = samples.label_index.astype(int)
 
     # If a path to mask is provided, add it to the sample dataframe.
     if mask_dir is not None:
         mask_dir = _check_and_convert_path(mask_dir)
-        normal_gt = ["" for f in normal_filenames]
-        abnormal_gt = [str(mask_dir / f.name) for f in abnormal_filenames]
+        normal_gt = ["" for f in samples.loc[samples.label_index == 0]["image_path"]]
+        abnormal_gt = [str(mask_dir / f.name) for f in samples.loc[samples.label_index == 1]["image_path"]]
         gt_filenames = normal_gt + abnormal_gt
 
         samples["mask_path"] = gt_filenames
@@ -133,10 +155,12 @@ def make_dataset(
     # By default, all the normal samples are assigned as train.
     #   and all the abnormal samples are test.
     samples.loc[(samples.label == "normal"), "split"] = "train"
-    samples.loc[(samples.label == "abnormal"), "split"] = "test"
-    samples = split_normal_images_in_train_set(
-        samples=samples, split_ratio=split_ratio, seed=seed, normal_label="normal"
-    )
+    samples.loc[(samples.label == "abnormal") | (samples.label == "normal_test"), "split"] = "test"
+
+    if not normal_test_dir:
+        samples = split_normal_images_in_train_set(
+            samples=samples, split_ratio=split_ratio, seed=seed, normal_label="normal"
+        )
 
     # If `create_validation_set` is set to True, the test set is split into half.
     if create_validation_set:
@@ -159,6 +183,7 @@ def __init__(
         abnormal_dir: Union[Path, str],
         split: str,
         pre_process: PreProcessor,
+        normal_test_dir: Optional[Union[Path, str]] = None,
         split_ratio: float = 0.2,
         mask_dir: Optional[Union[Path, str]] = None,
         extensions: Optional[Tuple[str, ...]] = None,
@@ -174,6 +199,8 @@ def __init__(
             split (Optional[str], optional): Dataset split (ie., either train or test). Defaults to None.
             pre_process (Optional[PreProcessor], optional): Image Pro-processor to apply transform.
                 Defaults to None.
+            normal_test_dir (Optional[Union[str, Path]], optional): Path to the directory containing
+                normal images for the test dataset. Defaults to None.
             split_ratio (float, optional): Ratio to split normal training images and add to the
                 test set in case test set doesn't contain any normal images.
                 Defaults to 0.2.
@@ -207,6 +234,7 @@ def __init__(
         self.samples = make_dataset(
             normal_dir=normal_dir,
             abnormal_dir=abnormal_dir,
+            normal_test_dir=normal_test_dir,
             mask_dir=mask_dir,
             split=split,
             split_ratio=split_ratio,
@@ -268,9 +296,10 @@ class FolderDataModule(LightningDataModule):
     def __init__(
         self,
         root: Union[str, Path],
-        normal: str = "normal",
-        abnormal: str = "abnormal",
+        normal_dir: str = "normal",
+        abnormal_dir: str = "abnormal",
         task: str = "classification",
+        normal_test_dir: Optional[Union[Path, str]] = None,
         mask_dir: Optional[Union[Path, str]] = None,
         extensions: Optional[Tuple[str, ...]] = None,
         split_ratio: float = 0.2,
@@ -287,12 +316,14 @@ def __init__(
 
         Args:
             root (Union[str, Path]): Path to the root folder containing normal and abnormal dirs.
-            normal (str, optional): Name of the directory containing normal images.
+            normal_dir (str, optional): Name of the directory containing normal images.
                 Defaults to "normal".
-            abnormal (str, optional): Name of the directory containing abnormal images.
+            abnormal_dir (str, optional): Name of the directory containing abnormal images.
                 Defaults to "abnormal".
             task (str, optional): Task type. Could be either classification or segmentation.
                 Defaults to "classification".
+            normal_test_dir (Optional[Union[str, Path]], optional): Path to the directory containing
+                normal images for the test dataset. Defaults to None.
             mask_dir (Optional[Union[str, Path]], optional): Path to the directory containing
                 the mask annotations. Defaults to None.
             extensions (Optional[Tuple[str, ...]], optional): Type of the image extensions to read from the
@@ -382,8 +413,11 @@ def __init__(
         super().__init__()
 
         self.root = _check_and_convert_path(root)
-        self.normal_dir = self.root / normal
-        self.abnormal_dir = self.root / abnormal
+        self.normal_dir = self.root / normal_dir
+        self.abnormal_dir = self.root / abnormal_dir
+        self.normal_test = normal_test_dir
+        if normal_test_dir:
+            self.normal_test = self.root / normal_test_dir
         self.mask_dir = mask_dir
         self.extensions = extensions
         self.split_ratio = split_ratio
@@ -457,6 +491,7 @@ def setup(self, stage: Optional[str] = None) -> None:
             normal_dir=self.normal_dir,
             abnormal_dir=self.abnormal_dir,
             split="test",
+            normal_test_dir=self.normal_test,
             split_ratio=self.split_ratio,
             mask_dir=self.mask_dir,
             pre_process=self.pre_process_val,

diff --git a/tests/pre_merge/datasets/test_dataset.py b/tests/pre_merge/datasets/test_dataset.py
@@ -56,8 +56,8 @@ def folder_data_module():
     root = get_dataset_path(dataset="bottle")
     datamodule = FolderDataModule(
         root=root,
-        normal="good",
-        abnormal="broken_large",
+        normal_dir="good",
+        abnormal_dir="broken_large",
         mask_dir=os.path.join(root, "ground_truth/broken_large"),
         task="segmentation",
         split_ratio=0.2,