diff --git a/data_processing/augmentation.py b/data_processing/augmentation.py
index 3688081..04b0710 100644
--- a/data_processing/augmentation.py
+++ b/data_processing/augmentation.py
@@ -39,11 +39,11 @@ class TorchRandomRotate(nn.Module):
 
     Args:
         degrees (tuple or list): Range of possible rotation angles.
-        probability (float): Probability of applying the rotation.
-        interpolation (InterpolationMode): Interpolation mode for rotation.
-        center (tuple or None): Center of rotation. If None, the center is the center of the image.
-        fill (float): Value to fill the image during rotation.
-        mask_fill (float): Value to fill the mask during rotation.
+        probability (float, default=1): Probability of applying the rotation.
+        interpolation (InterpolationMode, default=BILINEAR): Interpolation mode for rotation.
+        center (tuple, Optional): Center of rotation. If None, the center is the center of the image.
+        fill (float, default=0): Value to fill the image during rotation.
+        mask_fill (float, default=0): Value to fill the mask during rotation.
 
     Returns:
         tuple: Tuple containing the rotated image and mask (if provided).
@@ -119,10 +119,10 @@ class RandomMaskIgnore(nn.Module):
     It modifies the input mask tensor in-place, and the modified tensor is returned.
 
     Args:
-        min_length (int): Minimum length of the randomly generated bounding box.
-        max_length (int): Maximum length of the randomly generated bounding box.
-        proba (float): Probability of applying the random mask modification.
-        ignore_index (int): Value used to fill the masked region.
+        min_length (int, default=50): Minimum length of the randomly generated bounding box.
+        max_length (int, default=100): Maximum length of the randomly generated bounding box.
+        proba (float, default=0.5): Probability of applying the random mask modification.
+        ignore_index (int, default=-10): Value used to fill the masked region.
     """
 
     def __init__(self,min_length=50,max_length=100,proba=0.5,ignore_index=-10):
@@ -196,9 +196,9 @@ class MaskPixelDrop(nn.Module):
     MaskPixelDrop randomly drops pixels in the input mask tensor based on specified probabilities for positive and negative drops.
 
     Args:
-        neg_drop (int or tuple): Probability range for dropping negative pixels. Default is (0, 50).
-        pos_drop (int or tuple): Probability range for dropping positive pixels. Default is (0, 50).
-        ignore_index (int): Value used to fill the dropped pixels. Default is -10.
+        neg_drop (int or tuple, default for int =50): Probability range for dropping negative pixels. Default is (0, 50).
+        pos_drop (int or tuple, default for int =50): Probability range for dropping positive pixels. Default is (0, 50).
+        ignore_index (int, default=-10): Value used to fill the dropped pixels. Default is -10.
 
     Returns:
         torch.Tensor: Modified mask tensor with dropped pixels.
diff --git a/data_processing/post_process.py b/data_processing/post_process.py
index 3c9cdf6..9806c24 100644
--- a/data_processing/post_process.py
+++ b/data_processing/post_process.py
@@ -7,7 +7,6 @@
 from skimage.segmentation import watershed
 from skimage.measure import label
 from PIL import Image,ImageDraw
-
 import pandas as pd
 from shapely.geometry import shape
 from shapely.wkt import dumps
@@ -23,10 +22,10 @@ def post_process(self, pred, thresh=0.5, thresh_b=0.6, mina=100, mina_b=50):
 
     Args:
         pred (numpy.ndarray): Prediction mask with shape (height, width, channels).
-        thresh (float): Threshold for considering pixels as part of the final segmentation.
-        thresh_b (float): Threshold for considering pixels as borders between objects.
-        mina (int): Minimum area threshold for retaining segmented regions.
-        mina_b (int): Minimum area threshold for retaining basins.
+        thresh (float, default=0.5): Threshold for considering pixels as part of the final segmentation.
+        thresh_b (float, default=0.6): Threshold for considering pixels as borders between objects.
+        mina (int, default=100): Minimum area threshold for retaining segmented regions.
+        mina_b (int, default=50): Minimum area threshold for retaining basins.
 
     Returns:
         numpy.ndarray: Refined segmentation mask.
@@ -147,8 +146,8 @@ def instance_mask_to_gdf(instance_mask, transform=None, crs=None):
 
     Args:
         instance_mask (numpy.ndarray): Input instance mask with shape (H, W), where each instance is labeled by a unique id/number.
-        transform (affine.Affine or None): Geospatial transform of the raster. Default is None.
-        crs (str or None): CRS of the raster. Default is None.
+        transform (affine.Affine, Optional): Geospatial transform of the raster. Default is None.
+        crs (str, Optional): CRS of the raster. Default is None.
 
     Returns:
         geopandas.GeoDataFrame: GeoDataFrame of the shapes projected to the specified CRS using the transform.
diff --git a/data_processing/splitting.py b/data_processing/splitting.py
index 6c083f6..c3e5515 100644
--- a/data_processing/splitting.py
+++ b/data_processing/splitting.py
@@ -1,10 +1,10 @@
 from sklearn.model_selection import StratifiedKFold
 import pandas as pd
 
-def split_K_stratified_folds(df, nfolds, seed, id_key, split_key, label_keys, verbose=False):
+class Splitting():
     """
-    Split the DataFrame into K stratified folds based on specified keys.
-
+    Splitting class for all split methods
+
     Args:
         df (pd.DataFrame): Input DataFrame.
         nfolds (int): Number of folds.
@@ -12,24 +12,38 @@ def split_K_stratified_folds(df, nfolds, seed, id_key, split_key, label_keys, ve
         id_key (str): Key representing the identifier for grouping.
         split_key (str): Key for stratification.
         label_keys (list): List of keys for labels.
-        verbose (bool): If True, print fold statistics.
-
-    Returns:
-        pd.DataFrame: DataFrame with an additional 'fold' column indicating the fold number.
+        verbose (bool, default=False): If True, print fold statistics.
     """
-    X = df.groupby(id_key)[split_key].first().index.values
-    y = df.groupby(id_key)[split_key].first().values
-    skf = StratifiedKFold(n_splits=nfolds, random_state=seed, shuffle=True)
-
-    for i, (tfold, vfold) in enumerate(skf.split(X, y)):
-        df.loc[df[id_key].isin(X[vfold]), 'fold'] = int(i)
-
-    folds = [int(fold) for fold in df.groupby('fold').first().index.values]
-    if verbose:
-        for fold in folds:
-            for label_key in label_keys:
-                print(f'fold:\t{fold}')
-                print(f'Label Key:{label_key}')
-                print(df.loc[df['fold'] == fold].set_index(['fold', label_key]).groupby(level=label_key).count())
-    df.reset_index(drop=True, inplace=True)
-    return df
+    def __init__(self, df, nfolds, seed, id_key, split_key, label_keys, verbose=False):
+        self.df = df
+        self.nfolds = nfolds
+        self.seed = seed
+        self.id_key = id_key
+        self.split_key = split_key
+        self.label_keys = label_keys
+        self.verbose = verbose
+
+    def split_K_stratified_folds(self):
+        """
+        Split the DataFrame into K stratified folds based on specified keys.
+
+        Returns:
+            pd.DataFrame: DataFrame with an additional 'fold' column indicating the fold number.
+        """
+        X = self.df.groupby(self.id_key)[self.split_key].first().index.values
+        y = self.df.groupby(self.id_key)[self.split_key].first().values
+        skf = StratifiedKFold(n_splits=self.nfolds, random_state=self.seed, shuffle=True)
+
+        for i, (tfold, vfold) in enumerate(skf.split(X, y)):
+            self.df.loc[self.df[self.id_key].isin(X[vfold]), 'fold'] = int(i)
+
+        folds = [int(fold) for fold in self.df.groupby('fold').first().index.values]
+        if self.verbose:
+            for fold in folds:
+                for label_key in self.label_keys:
+                    print(f'fold:\t{fold}')
+                    print(f'Label Key:{label_key}')
+                    print(self.df.loc[self.df['fold'] == fold].set_index(['fold', label_key]).groupby(level=label_key).count())
+        self.df.reset_index(drop=True, inplace=True)
+        return self.df
+
diff --git a/data_processing/test_splitting.py b/data_processing/test_splitting.py
index 28b65d8..cafba85 100644
--- a/data_processing/test_splitting.py
+++ b/data_processing/test_splitting.py
@@ -14,26 +14,31 @@ def setUp(self):
             'label': np.random.choice([0, 1], size=num_samples)
         }
         self.df = pd.DataFrame(data)
+        self.splitter = None
 
     def test_split_k_folds(self):
-        df_result = split_K_stratified_folds(self.df, nfolds=5, seed=42, id_key='id', split_key='split_key', label_keys=['label'])
+        self.splitter = Splitting(self.df, nfolds=5, seed=42, id_key='id', split_key='split_key', label_keys=['label'])
+        df_result = self.splitter.split_K_stratified_folds()
         unique_folds = df_result['fold'].unique()
         self.assertEqual(len(unique_folds), 5) # Check if correct number of folds are created
 
     def test_split_k_folds_verbose(self):
-        df_result = split_K_stratified_folds(self.df, nfolds=5, seed=42, id_key='id', split_key='split_key', label_keys=['label'], verbose=True)
+        self.splitter = Splitting(self.df, nfolds=5, seed=42, id_key='id', split_key='split_key', label_keys=['label'], verbose=True)
+        df_result = self.splitter.split_K_stratified_folds()
         unique_folds = df_result['fold'].unique()
         self.assertEqual(len(unique_folds), 5) # Check if correct number of folds are created
 
     def test_split_k_folds_labels(self):
-        df_result = split_K_stratified_folds(self.df, nfolds=5, seed=42, id_key='id', split_key='split_key', label_keys=['label'])
+        self.splitter = Splitting(self.df, nfolds=5, seed=42, id_key='id', split_key='split_key', label_keys=['label'])
+        df_result = self.splitter.split_K_stratified_folds()
         unique_labels = df_result.set_index(['fold', 'label']).groupby(level='label').count()
         self.assertTrue(unique_labels.min()['id'] > 1) # Check if each label has samples in each fold
 
     def test_split_k_folds_reset_index(self):
-        df_result = split_K_stratified_folds(self.df, nfolds=5, seed=42, id_key='id', split_key='split_key', label_keys=['label'])
+        self.splitter = Splitting(self.df, nfolds=5, seed=42, id_key='id', split_key='split_key', label_keys=['label'])
+        df_result = self.splitter.split_K_stratified_folds()
         self.assertTrue('id' in df_result.columns) # Check if 'id' column is present after resetting index