Merge pull request #25 from geoaigroup/Haidar

Data_processing files adjustments
geoaigroup · Feb 5, 2024 · c0e4292 · c0e4292
2 parents ef34d0c + f4ff536
commit c0e4292
Show file tree

Hide file tree

Showing 4 changed files with 64 additions and 46 deletions.
diff --git a/data_processing/augmentation.py b/data_processing/augmentation.py
@@ -39,11 +39,11 @@ class TorchRandomRotate(nn.Module):
 
     Args:
         degrees (tuple or list): Range of possible rotation angles.
-        probability (float): Probability of applying the rotation.
-        interpolation (InterpolationMode): Interpolation mode for rotation.
-        center (tuple or None): Center of rotation. If None, the center is the center of the image.
-        fill (float): Value to fill the image during rotation.
-        mask_fill (float): Value to fill the mask during rotation.
+        probability (float, default=1): Probability of applying the rotation.
+        interpolation (InterpolationMode, default=BILINEAR): Interpolation mode for rotation.
+        center (tuple or None, default=None): Center of rotation. If None, the center of the image is used.
+        fill (float, default=0): Value to fill the image during rotation.
+        mask_fill (float, default=0): Value to fill the mask during rotation.
 
     Returns:
         tuple: Tuple containing the rotated image and mask (if provided).
@@ -119,10 +119,10 @@ class RandomMaskIgnore(nn.Module):
     It modifies the input mask tensor in-place, and the modified tensor is returned.
 
     Args:
-        min_length (int): Minimum length of the randomly generated bounding box.
-        max_length (int): Maximum length of the randomly generated bounding box.
-        proba (float): Probability of applying the random mask modification.
-        ignore_index (int): Value used to fill the masked region.
+        min_length (int, default=50): Minimum length of the randomly generated bounding box.
+        max_length (int, default=100): Maximum length of the randomly generated bounding box.
+        proba (float, default=0.5): Probability of applying the random mask modification.
+        ignore_index (int, default=-10): Value used to fill the masked region.
     """
 
     def __init__(self,min_length=50,max_length=100,proba=0.5,ignore_index=-10):
@@ -196,9 +196,9 @@ class MaskPixelDrop(nn.Module):
     MaskPixelDrop randomly drops pixels in the input mask tensor based on specified probabilities for positive and negative drops.
     
     Args:
-        neg_drop (int or tuple): Probability range for dropping negative pixels. Default is (0, 50).
-        pos_drop (int or tuple): Probability range for dropping positive pixels. Default is (0, 50).
-        ignore_index (int): Value used to fill the dropped pixels. Default is -10.
+        neg_drop (int or tuple, default for int =50): Probability range for dropping negative pixels. Default is (0, 50).
+        pos_drop (int or tuple, default for int =50): Probability range for dropping positive pixels. Default is (0, 50).
+        ignore_index (int, default=-10): Value used to fill the dropped pixels.
 
     Returns:
         torch.Tensor: Modified mask tensor with dropped pixels.

diff --git a/data_processing/post_process.py b/data_processing/post_process.py
@@ -7,7 +7,6 @@
 from skimage.segmentation import watershed
 from skimage.measure import label
 from PIL import Image,ImageDraw
-
 import pandas as pd
 from shapely.geometry import shape
 from shapely.wkt import dumps
@@ -23,10 +22,10 @@ def post_process(self, pred, thresh=0.5, thresh_b=0.6, mina=100, mina_b=50):
 
         Args:
             pred (numpy.ndarray): Prediction mask with shape (height, width, channels).
-            thresh (float): Threshold for considering pixels as part of the final segmentation.
-            thresh_b (float): Threshold for considering pixels as borders between objects.
-            mina (int): Minimum area threshold for retaining segmented regions.
-            mina_b (int): Minimum area threshold for retaining basins.
+            thresh (float, default=0.5): Threshold for considering pixels as part of the final segmentation.
+            thresh_b (float, default=0.6): Threshold for considering pixels as borders between objects.
+            mina (int, default=100): Minimum area threshold for retaining segmented regions.
+            mina_b (int, default=50): Minimum area threshold for retaining basins.
 
         Returns:
             numpy.ndarray: Refined segmentation mask.
@@ -147,8 +146,8 @@ def instance_mask_to_gdf(instance_mask, transform=None, crs=None):
 
         Args:
             instance_mask (numpy.ndarray): Input instance mask with shape (H, W), where each instance is labeled by a unique id/number.
-            transform (affine.Affine or None): Geospatial transform of the raster. Default is None.
-            crs (str or None): CRS of the raster. Default is None.
+            transform (affine.Affine, Optional): Geospatial transform of the raster. Default is None.
+            crs (str, Optional): CRS of the raster. Default is None.
 
         Returns:
             geopandas.GeoDataFrame: GeoDataFrame of the shapes projected to the specified CRS using the transform.

diff --git a/data_processing/splitting.py b/data_processing/splitting.py
@@ -1,35 +1,49 @@
 from sklearn.model_selection import StratifiedKFold
 import pandas as pd
 
-def split_K_stratified_folds(df, nfolds, seed, id_key, split_key, label_keys, verbose=False):
+class Splitting():
     """
-    Split the DataFrame into K stratified folds based on specified keys.
-
+    Holds a DataFrame together with the split configuration used by the dataset-splitting methods.
+ 
     Args:
         df (pd.DataFrame): Input DataFrame.
         nfolds (int): Number of folds.
         seed (int): Random seed for reproducibility.
         id_key (str): Key representing the identifier for grouping.
         split_key (str): Key for stratification.
         label_keys (list): List of keys for labels.
-        verbose (bool): If True, print fold statistics.
-
-    Returns:
-        pd.DataFrame: DataFrame with an additional 'fold' column indicating the fold number.
+        verbose (bool, default=False): If True, print fold statistics.
     """
-    X = df.groupby(id_key)[split_key].first().index.values
-    y = df.groupby(id_key)[split_key].first().values
-    skf = StratifiedKFold(n_splits=nfolds, random_state=seed, shuffle=True)
-
-    for i, (tfold, vfold) in enumerate(skf.split(X, y)):
-        df.loc[df[id_key].isin(X[vfold]), 'fold'] = int(i)
-
-    folds = [int(fold) for fold in df.groupby('fold').first().index.values]
-    if verbose:
-        for fold in folds:
-            for label_key in label_keys:
-                print(f'fold:\t{fold}')
-                print(f'Label Key:{label_key}')
-                print(df.loc[df['fold'] == fold].set_index(['fold', label_key]).groupby(level=label_key).count())
-    df.reset_index(drop=True, inplace=True)
-    return df
+    def __init__(self, df, nfolds, seed, id_key, split_key, label_keys, verbose=False):
+        self.df = df
+        self.nfolds = nfolds
+        self.seed = seed
+        self.id_key = id_key
+        self.split_key = split_key
+        self.label_keys = label_keys
+        self.verbose = verbose
+
+    def split_K_stratified_folds(self):
+        """
+        Assign every row to one of K stratified folds: rows sharing an id_key value get the same fold, stratified on each group's first split_key value.
+        
+        Returns:
+            pd.DataFrame: The stored DataFrame, modified in place, with an added 'fold' column and a reset index.
+        """
+        X = self.df.groupby(self.id_key)[self.split_key].first().index.values
+        y = self.df.groupby(self.id_key)[self.split_key].first().values
+        skf = StratifiedKFold(n_splits=self.nfolds, random_state=self.seed, shuffle=True)
+
+        for i, (tfold, vfold) in enumerate(skf.split(X, y)):
+            self.df.loc[self.df[self.id_key].isin(X[vfold]), 'fold'] = int(i)
+
+        folds = [int(fold) for fold in self.df.groupby('fold').first().index.values]
+        if self.verbose:
+            for fold in folds:
+                for label_key in self.label_keys:
+                    print(f'fold:\t{fold}')
+                    print(f'Label Key:{label_key}')
+                    print(self.df.loc[self.df['fold'] == fold].set_index(['fold', label_key]).groupby(level=label_key).count())
+        self.df.reset_index(drop=True, inplace=True)
+        return self.df
+
diff --git a/data_processing/test_splitting.py b/data_processing/test_splitting.py
@@ -14,26 +14,31 @@ def setUp(self):
             'label': np.random.choice([0, 1], size=num_samples)
         }
         self.df = pd.DataFrame(data)
+        self.splitter = None
 
     def test_split_k_folds(self):
-        df_result = split_K_stratified_folds(self.df, nfolds=5, seed=42, id_key='id', split_key='split_key', label_keys=['label'])
+        self.splitter = Splitting(self.df, nfolds=5, seed=42, id_key='id', split_key='split_key', label_keys=['label'])
+        df_result = self.splitter.split_K_stratified_folds()
         unique_folds = df_result['fold'].unique()
 
         self.assertEqual(len(unique_folds), 5)  # Check if correct number of folds are created
 
     def test_split_k_folds_verbose(self):
-        df_result = split_K_stratified_folds(self.df, nfolds=5, seed=42, id_key='id', split_key='split_key', label_keys=['label'], verbose=True)
+        self.splitter = Splitting(self.df, nfolds=5, seed=42, id_key='id', split_key='split_key', label_keys=['label'], verbose=True)
+        df_result =self.splitter.split_K_stratified_folds()
 
         unique_folds = df_result['fold'].unique()
 
         self.assertEqual(len(unique_folds), 5)  # Check if correct number of folds are created
 
     def test_split_k_folds_labels(self):
-        df_result = split_K_stratified_folds(self.df, nfolds=5, seed=42, id_key='id', split_key='split_key', label_keys=['label'])
+        self.splitter = Splitting(self.df, nfolds=5, seed=42, id_key='id', split_key='split_key', label_keys=['label'])
+        df_result =self.splitter.split_K_stratified_folds()
         unique_labels = df_result.set_index(['fold', 'label']).groupby(level='label').count()
 
         self.assertTrue(unique_labels.min()['id'] > 1)  # Check that every label has more than one sample overall (counts are per label, not per fold)
 
     def test_split_k_folds_reset_index(self):
-        df_result = split_K_stratified_folds(self.df, nfolds=5, seed=42, id_key='id', split_key='split_key', label_keys=['label'])
+        self.splitter = Splitting(self.df, nfolds=5, seed=42, id_key='id', split_key='split_key', label_keys=['label'])
+        df_result =self.splitter.split_K_stratified_folds()
         self.assertTrue('id' in df_result.columns)  # Check if 'id' column is present after resetting index