Skip to content

Commit

Permalink
Merge pull request #25 from geoaigroup/Haidar
Browse files Browse the repository at this point in the history
Data_processing files adjustments
  • Loading branch information
MhmdDimassi authored Feb 5, 2024
2 parents ef34d0c + f4ff536 commit c0e4292
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 46 deletions.
24 changes: 12 additions & 12 deletions data_processing/augmentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,11 @@ class TorchRandomRotate(nn.Module):
Args:
degrees (tuple or list): Range of possible rotation angles.
probability (float): Probability of applying the rotation.
interpolation (InterpolationMode): Interpolation mode for rotation.
center (tuple or None): Center of rotation. If None, the center is the center of the image.
fill (float): Value to fill the image during rotation.
mask_fill (float): Value to fill the mask during rotation.
probability (float, default=1): Probability of applying the rotation.
interpolation (InterpolationMode, default=BILINEAR): Interpolation mode for rotation.
center (tuple, Optional): Center of rotation. If None, the center is the center of the image.
fill (float, default=0): Value to fill the image during rotation.
mask_fill (float, default=0): Value to fill the mask during rotation.
Returns:
tuple: Tuple containing the rotated image and mask (if provided).
Expand Down Expand Up @@ -119,10 +119,10 @@ class RandomMaskIgnore(nn.Module):
It modifies the input mask tensor in-place, and the modified tensor is returned.
Args:
min_length (int): Minimum length of the randomly generated bounding box.
max_length (int): Maximum length of the randomly generated bounding box.
proba (float): Probability of applying the random mask modification.
ignore_index (int): Value used to fill the masked region.
min_length (int, default=50): Minimum length of the randomly generated bounding box.
max_length (int, default=100): Maximum length of the randomly generated bounding box.
proba (float, default=0.5): Probability of applying the random mask modification.
ignore_index (int, default=-10): Value used to fill the masked region.
"""

def __init__(self,min_length=50,max_length=100,proba=0.5,ignore_index=-10):
Expand Down Expand Up @@ -196,9 +196,9 @@ class MaskPixelDrop(nn.Module):
MaskPixelDrop randomly drops pixels in the input mask tensor based on specified probabilities for positive and negative drops.
Args:
neg_drop (int or tuple): Probability range for dropping negative pixels. Default is (0, 50).
pos_drop (int or tuple): Probability range for dropping positive pixels. Default is (0, 50).
ignore_index (int): Value used to fill the dropped pixels. Default is -10.
neg_drop (int or tuple, default=50 for int): Probability range for dropping negative pixels. Default range is (0, 50).
pos_drop (int or tuple, default=50 for int): Probability range for dropping positive pixels. Default range is (0, 50).
ignore_index (int, default=-10): Value used to fill the dropped pixels.
Returns:
torch.Tensor: Modified mask tensor with dropped pixels.
Expand Down
13 changes: 6 additions & 7 deletions data_processing/post_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from skimage.segmentation import watershed
from skimage.measure import label
from PIL import Image,ImageDraw

import pandas as pd
from shapely.geometry import shape
from shapely.wkt import dumps
Expand All @@ -23,10 +22,10 @@ def post_process(self, pred, thresh=0.5, thresh_b=0.6, mina=100, mina_b=50):
Args:
pred (numpy.ndarray): Prediction mask with shape (height, width, channels).
thresh (float): Threshold for considering pixels as part of the final segmentation.
thresh_b (float): Threshold for considering pixels as borders between objects.
mina (int): Minimum area threshold for retaining segmented regions.
mina_b (int): Minimum area threshold for retaining basins.
thresh (float, default=0.5): Threshold for considering pixels as part of the final segmentation.
thresh_b (float, default=0.6): Threshold for considering pixels as borders between objects.
mina (int, default=100): Minimum area threshold for retaining segmented regions.
mina_b (int, default=50): Minimum area threshold for retaining basins.
Returns:
numpy.ndarray: Refined segmentation mask.
Expand Down Expand Up @@ -147,8 +146,8 @@ def instance_mask_to_gdf(instance_mask, transform=None, crs=None):
Args:
instance_mask (numpy.ndarray): Input instance mask with shape (H, W), where each instance is labeled by a unique id/number.
transform (affine.Affine or None): Geospatial transform of the raster. Default is None.
crs (str or None): CRS of the raster. Default is None.
transform (affine.Affine, Optional): Geospatial transform of the raster. Default is None.
crs (str, Optional): CRS of the raster. Default is None.
Returns:
geopandas.GeoDataFrame: GeoDataFrame of the shapes projected to the specified CRS using the transform.
Expand Down
60 changes: 37 additions & 23 deletions data_processing/splitting.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,49 @@
from sklearn.model_selection import StratifiedKFold
import pandas as pd

def split_K_stratified_folds(df, nfolds, seed, id_key, split_key, label_keys, verbose=False):
class Splitting():
"""
Split the DataFrame into K stratified folds based on specified keys.
Class that groups the DataFrame splitting methods.
Args:
df (pd.DataFrame): Input DataFrame.
nfolds (int): Number of folds.
seed (int): Random seed for reproducibility.
id_key (str): Key representing the identifier for grouping.
split_key (str): Key for stratification.
label_keys (list): List of keys for labels.
verbose (bool): If True, print fold statistics.
Returns:
pd.DataFrame: DataFrame with an additional 'fold' column indicating the fold number.
verbose (bool, default=False): If True, print fold statistics.
"""
X = df.groupby(id_key)[split_key].first().index.values
y = df.groupby(id_key)[split_key].first().values
skf = StratifiedKFold(n_splits=nfolds, random_state=seed, shuffle=True)

for i, (tfold, vfold) in enumerate(skf.split(X, y)):
df.loc[df[id_key].isin(X[vfold]), 'fold'] = int(i)

folds = [int(fold) for fold in df.groupby('fold').first().index.values]
if verbose:
for fold in folds:
for label_key in label_keys:
print(f'fold:\t{fold}')
print(f'Label Key:{label_key}')
print(df.loc[df['fold'] == fold].set_index(['fold', label_key]).groupby(level=label_key).count())
df.reset_index(drop=True, inplace=True)
return df
def __init__(self, df, nfolds, seed, id_key, split_key, label_keys, verbose=False):
self.df = df
self.nfolds = nfolds
self.seed = seed
self.id_key = id_key
self.split_key = split_key
self.label_keys = label_keys
self.verbose = verbose

def split_K_stratified_folds(self):
    """
    Assign every row of ``self.df`` to one of ``self.nfolds`` stratified folds.

    One stratification value is taken per ``self.id_key`` group (the first
    ``self.split_key`` value of that group), so rows sharing an id receive
    the same fold number. The result is written to a 'fold' column of
    ``self.df``, which is modified in place and has its index reset.

    Returns:
        pd.DataFrame: ``self.df`` with an additional 'fold' column indicating the fold number.
    """
    frame = self.df

    # One (id, stratification value) pair per group.
    per_id = frame.groupby(self.id_key)[self.split_key].first()
    ids = per_id.index.values
    strata = per_id.values

    splitter = StratifiedKFold(n_splits=self.nfolds, random_state=self.seed, shuffle=True)
    # Only the held-out indices of each split are used to label rows.
    for fold_no, (_, held_out) in enumerate(splitter.split(ids, strata)):
        in_fold = frame[self.id_key].isin(ids[held_out])
        frame.loc[in_fold, 'fold'] = int(fold_no)

    fold_ids = [int(value) for value in frame.groupby('fold').first().index.values]
    if self.verbose:
        # Per fold and per label column, print the row counts of each label value.
        for fold_id in fold_ids:
            for label_key in self.label_keys:
                print(f'fold:\t{fold_id}')
                print(f'Label Key:{label_key}')
                fold_rows = frame.loc[frame['fold'] == fold_id]
                print(fold_rows.set_index(['fold', label_key]).groupby(level=label_key).count())

    frame.reset_index(drop=True, inplace=True)
    return frame

13 changes: 9 additions & 4 deletions data_processing/test_splitting.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,26 +14,31 @@ def setUp(self):
'label': np.random.choice([0, 1], size=num_samples)
}
self.df = pd.DataFrame(data)
self.splitter = None

def test_split_k_folds(self):
df_result = split_K_stratified_folds(self.df, nfolds=5, seed=42, id_key='id', split_key='split_key', label_keys=['label'])
self.splitter = Splitting(self.df, nfolds=5, seed=42, id_key='id', split_key='split_key', label_keys=['label'])
df_result = self.splitter.split_K_stratified_folds()
unique_folds = df_result['fold'].unique()

self.assertEqual(len(unique_folds), 5) # Check if correct number of folds are created

def test_split_k_folds_verbose(self):
df_result = split_K_stratified_folds(self.df, nfolds=5, seed=42, id_key='id', split_key='split_key', label_keys=['label'], verbose=True)
self.splitter = Splitting(self.df, nfolds=5, seed=42, id_key='id', split_key='split_key', label_keys=['label'], verbose=True)
df_result =self.splitter.split_K_stratified_folds()

unique_folds = df_result['fold'].unique()

self.assertEqual(len(unique_folds), 5) # Check if correct number of folds are created

def test_split_k_folds_labels(self):
df_result = split_K_stratified_folds(self.df, nfolds=5, seed=42, id_key='id', split_key='split_key', label_keys=['label'])
self.splitter = Splitting(self.df, nfolds=5, seed=42, id_key='id', split_key='split_key', label_keys=['label'])
df_result =self.splitter.split_K_stratified_folds()
unique_labels = df_result.set_index(['fold', 'label']).groupby(level='label').count()

self.assertTrue(unique_labels.min()['id'] > 1) # Check if each label has samples in each fold

def test_split_k_folds_reset_index(self):
df_result = split_K_stratified_folds(self.df, nfolds=5, seed=42, id_key='id', split_key='split_key', label_keys=['label'])
self.splitter = Splitting(self.df, nfolds=5, seed=42, id_key='id', split_key='split_key', label_keys=['label'])
df_result =self.splitter.split_K_stratified_folds()
self.assertTrue('id' in df_result.columns) # Check if 'id' column is present after resetting index

0 comments on commit c0e4292

Please sign in to comment.