From 16e7a63d65fbad4f8f1a2f828729444e4359871b Mon Sep 17 00:00:00 2001
From: aghand0ur
Date: Mon, 11 Dec 2023 18:22:46 +0200
Subject: [PATCH] going thru data_processing

---
 .../__pycache__/preprocessing.cpython-310.pyc |  Bin 4804 -> 0 bytes
 {augmentation => data_processing}/cropping.py |   81 --------
 data_processing/filtering.py                  |    9 -
 data_processing/post_process.py               |   75 +++-----
 .../{preprocessing.py => pre_processing.py}   |   86 +--------
 data_processing/split_data.py                 |  181 ------------------
 data_processing/split_to_folds.py             |   42 ----
 post_process.py                               |  123 ------------
 8 files changed, 31 insertions(+), 566 deletions(-)
 delete mode 100644 data_processing/__pycache__/preprocessing.cpython-310.pyc
 rename {augmentation => data_processing}/cropping.py (64%)
 delete mode 100644 data_processing/filtering.py
 rename data_processing/{preprocessing.py => pre_processing.py} (64%)
 delete mode 100644 data_processing/split_data.py
 delete mode 100644 data_processing/split_to_folds.py
 delete mode 100644 post_process.py

diff --git a/data_processing/__pycache__/preprocessing.cpython-310.pyc b/data_processing/__pycache__/preprocessing.cpython-310.pyc
deleted file mode 100644
index a19775be51537fb3a9ecf83b6a68dfcf0e5be707..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

[literal 4804: binary payload of the deleted .pyc omitted]

diff --git a/augmentation/cropping.py b/data_processing/cropping.py
similarity index 64%
rename from augmentation/cropping.py
rename to data_processing/cropping.py
index 6310f74..c4eb046 100644
--- a/augmentation/cropping.py
+++ b/data_processing/cropping.py
@@ -1,84 +1,3 @@
-import numpy as np
-import random
-from skimage.measure import label as label_fn
-
-
-def random_crop(image_stack, mask, image_size):
-    '''
-    THIS FUNCTION DEFINES RANDOM IMAGE CROPPING.
-    :param image_stack: input image in size [Time Stamp, Image Dimension (Channel), Height, Width]
-    :param mask: input mask of the image, to filter out uninterested areas [Height, Width]
-    :param image_size: It determine how the data is partitioned into the NxN windows
-    :return: image_stack, mask
-    '''
-
-    H, W = image_stack.shape[2:]
-
-    # skip random crop is image smaller than crop size
-    if H - image_size // 2 <= image_size:
-        return image_stack, mask
-    if W - image_size // 2 <= image_size:
-        return image_stack, mask
-    flag = True
-    for i in range(0,100):
-        h = np.random.randint(image_size, H - image_size // 2)
-        w = np.random.randint(image_size, W - image_size // 2)
-
-        image_stack = image_stack[:, :, h - int(np.floor(image_size // 2)):int(np.ceil(h + image_size // 2)),
-                      w - int(np.floor(image_size // 2)):int(np.ceil(w + image_size // 2))]
-        mask = mask[h - int(np.floor(image_size // 2)):int(np.ceil(h + image_size // 2)),
-               w - int(np.floor(image_size // 2)):int(np.ceil(w + image_size // 2))]
-        if 1 in mask:
-            break
-    return image_stack, mask
-
-def random_crop_around_aoi(img,mask,size = 32,min_area=0):
-    h,w = img.shape[2:]
-    mask_original = mask.copy()
-    size_h,size_w = size,size
-
-    if h <= size and w <= size:
-        return img,mask
-    if h < size:
-        size_h = h
-    if w < size:
-        size_w = w
-
-    if mask.max() == 0:
-        t,b,l,r = 0,h-1,0,w-1
-    else:
-        mask = label_fn(mask,connectivity=2)
-        values = [value for value in np.unique(mask)[1:] if mask[mask==value].sum()/value >= min_area]
-
-        if len(values) == 0:
-            t,b,l,r = 0,h-1,0,w-1
-        else:
-            sval = values[random.randint(0,len(values)-1)]
-            mask[mask!=sval] = 0
-            mask = ((mask / sval) * 255.0).astype(np.uint8)
-            pos = np.nonzero(mask)
-            t, b, l, r = pos[0].min(),pos[0].max(),pos[1].min(),pos[1].max()
-
-    h_aoi,w_aoi = b-t,r-l
-    pt = random.randint(t+h_aoi//2, b-h_aoi//2),random.randint(l+w_aoi//2, r-w_aoi//2)
-
-    max_up = pt[0]
-    max_left = pt[1]
-    min_up = max(0,size_h - (h - pt[0]))
-    min_left = max(0,size_w - (w - pt[1]))
-
-    t_crop = pt[0] - min(max_up, random.randint(min_up, size_h-1))
-    l_crop = pt[1] - min(max_left, random.randint(min_left, size_w-1))
-
-    cropped_img = img[:,:,t_crop:t_crop+size_h,l_crop:l_crop+size_w]
-    cropped_mask = mask_original[t_crop:t_crop+size_h,l_crop:l_crop+size_w]
-
-    return cropped_img,cropped_mask
-
-
-
-###Crop images keep georefrenced
-
 import rasterio
 import os
 from rasterio.windows import Window
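
Note: after this hunk, cropping.py keeps only the rasterio-based georeferenced cropping (the context lines show its imports; the body sits below the hunk and is unchanged). For reference, the usual pattern behind rasterio window cropping looks like the sketch below; the function name, window placement, and output path are illustrative assumptions, not the module's actual code.

    import os
    import rasterio
    from rasterio.windows import Window

    def crop_georeferenced(src_path, out_dir, size=512):
        # Read a size x size window and write it out with an updated
        # transform so the crop stays georeferenced.
        with rasterio.open(src_path) as src:
            window = Window(0, 0, size, size)
            data = src.read(window=window)
            profile = src.profile.copy()
            profile.update(height=size, width=size,
                           transform=src.window_transform(window))
            os.makedirs(out_dir, exist_ok=True)
            out_path = os.path.join(out_dir, 'crop_0_0.tif')
            with rasterio.open(out_path, 'w', **profile) as dst:
                dst.write(data)
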
diff --git a/data_processing/filtering.py b/data_processing/filtering.py
deleted file mode 100644
index ccc32ec..0000000
--- a/data_processing/filtering.py
+++ /dev/null
@@ -1,9 +0,0 @@
-import numpy as np
-
-def noise_filter(washed,mina):
-    values = np.unique(washed)
-    for val in values[1:]:
-        area = (washed[washed == val]>0).sum()
-        if(area<=mina):
-            washed[washed == val] = 0
-    return washed
\ No newline at end of file
diff --git a/data_processing/post_process.py b/data_processing/post_process.py
index bb029c6..e4cef0d 100644
--- a/data_processing/post_process.py
+++ b/data_processing/post_process.py
@@ -18,66 +18,51 @@ from shapely.wkt import dumps
 from shapely.ops import cascaded_union
 import geopandas as gpd
 
-def post_process(raw,thresh = 0.5,mina=40,save=None):
+
+def post_process(pred,thresh = 0.5,thresh_b = 0.6,mina=100,mina_b=50):
+    if len(pred.shape) < 2:
+        return None
+    if len(pred.shape) == 2:
+        pred = pred[...,np.newaxis]
 
-    try:
-        ch = raw.shape[2]
-    except:
-        ch=1
-    if(ch == 2):
-        rraw = ranger(raw)
-
-        rbuilds = raw[...,0]
-        rborders = raw[...,1]
-
-        nuclei = rbuilds * (1 - rborders)
-
-        builds = raw[...,0]
-
-        basins = label(nuclei>0.1,background = 0, connectivity = 2)
-        #Image.fromarray(basins>0).show()
-        #basins = noise_filter(basins, mina = 2 )
-        basins = label(basins,background = 0, connectivity = 2)
-        washed = watershed(image = -builds,
+    ch = pred.shape[2]
+    buildings = pred[...,0]
+    if ch > 1:
+        borders = pred[...,1]
+        nuclei = buildings * (1.0 - borders)
+
+        if ch == 3:
+            spacing = pred[...,2]
+            nuclei *= (1.0 - spacing)
+
+        basins = label(nuclei>thresh_b,background = 0, connectivity = 2)
+        if mina_b > 0:
+            basins = noise_filter(basins, mina = mina_b)
+            basins = label(basins,background = 0, connectivity = 2)
+
+        washed = watershed(image = -buildings,
                            markers = basins,
-                           mask = builds>thresh,
+                           mask = buildings>thresh,
                            watershed_line=False)
-        washed = label(washed,background = 0, connectivity = 2)
-        washed = noise_filter(washed, mina=thresh)
-        washed = label(washed,background = 0, connectivity = 2)
-        #col = colorize(washed)
-        #Image.fromarray(col).show()
-
+    elif(ch == 1):
-        builds = raw[...,0]
-        washed = label(builds > thresh,background = 0, connectivity = 2)
-        washed = noise_filter(washed, mina=thresh)
-        washed = label(washed,background = 0, connectivity = 2)
-        #col = colorize(washed)
-        #Image.fromarray(col).show()
-
-    else:
-        raise NotImplementedError(
-        )
+        washed = buildings > thresh
+
+
+    washed = label(washed,background = 0, connectivity = 2)
+    washed = noise_filter(washed, mina=mina)
+    washed = label(washed,background = 0, connectivity = 2)
 
     return washed
 
 def noise_filter(washed,mina):
     values = np.unique(washed)
-    #a =0
-    #print(values)
     for val in values[1:]:
-        #a+=1
         area = (washed[washed == val]>0).sum()
         if(area<=mina):
            washed[washed == val] = 0
-    #print(a)
     return washed
 
-def ranger(x):
-    x1 = x.copy()
-    return np.tanh((x1 - 0.5)/0.1) * (0.5)+0.5
-
 def extract_poly(mask):
     shps = shapes(mask.astype(np.int16),mask>0)
     polys =[]
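
Note: the refactor replaces the 2-channel-only watershed path with a generic one that accepts 1, 2, or 3 channels and exposes the marker threshold (thresh_b) and the two minimum-area filters (mina, mina_b) as parameters. A quick sanity check of the new entry point, assuming the channel order building/border/spacing implied by the hunk; the random array is purely illustrative input:

    import numpy as np
    from data_processing.post_process import post_process

    # Fake 3-channel prediction: building, border, spacing probabilities in [0, 1].
    pred = np.random.rand(256, 256, 3).astype(np.float32)

    # Returns a labeled instance mask (0 = background, 1..N = building instances).
    instances = post_process(pred, thresh=0.5, thresh_b=0.6, mina=100, mina_b=50)
    print(instances.shape, instances.max())
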
diff --git a/data_processing/preprocessing.py b/data_processing/pre_processing.py
similarity index 64%
rename from data_processing/preprocessing.py
rename to data_processing/pre_processing.py
index ad29996..4922cc5 100644
--- a/data_processing/preprocessing.py
+++ b/data_processing/pre_processing.py
@@ -48,91 +48,7 @@ np.random.seed(42)
 
-
-
-"""
-https://github.com/geoaigroup/challenges/blob/main/ai4foodsecurity-challenge/lstm-cnn.ipynb
-
-The data are stored as numpy arrays with dimension height x width x bands x timesteps.
-All of the reflectance values are in the range [0,1]. We also add
-two spectral indices (NDWI and LSWI) and SAR bands (VV, VH, incidence angle/IA).
-"""
-
-def add_lswi_channel(X):
-    _X = np.ndarray([HEIGHT, WIDTH, X.shape[2]+1, N_TIMESTEPS])
-    # copy the values from the original array
-    for i in range(X.shape[2]):
-        _X[:,:,i,:] = X[:,:,i,:]
-    # calculate values for LSWI channel
-    for i in range(N_TIMESTEPS):
-        lswi = (X[:,:,NIR,i]-X[:,:,SWIR1,i])/(X[:,:,NIR,i]+X[:,:,SWIR1,i])
-        _X[:,:,-1,i] = lswi
-    # make sure we didn't introduce any NaNs
-    _X[np.where(np.isnan(_X))] = 0
-    return _X
-
-def add_ndwi_channel(X):
-    _X = np.ndarray([HEIGHT, WIDTH, X.shape[2]+1, N_TIMESTEPS])
-    # copy the values from the original array
-    for i in range(X.shape[2]):
-        _X[:,:,i,:] = X[:,:,i,:]
-    # calculate values for NDWI channel
-    for i in range(N_TIMESTEPS):
-        ndwi = (X[:,:,GREEN,i]-X[:,:,SWIR1,i])/(X[:,:,GREEN,i]+X[:,:,SWIR1,i])
-        _X[:,:,-1,i] = ndwi
-    # make sure we didn't introduce any NaNs
-    _X[np.where(np.isnan(_X))] = 0
-    return _X
-
-def add_sar_channel(X, band, path):
-    _X = np.ndarray([HEIGHT, WIDTH, X.shape[2]+1, N_TIMESTEPS])
-    # copy the values from the original array
-    for i in range(X.shape[2]):
-        _X[:,:,i,:] = X[:,:,i,:]
-    # load the corresponding SAR band
-    if band=='vv' or band=='VV':
-        sarpath = path.replace('pheno_timeseries', 'vv_timeseries')
-        #sarpath = path.replace('fixed_timeseries', 'vv_timeseries')
-    elif band=='vh' or band=='VH':
-        sarpath = path.replace('pheno_timeseries', 'vh_timeseries')
-        #sarpath = path.replace('fixed_timeseries', 'vh_timeseries')
-    elif band=='ia' or band=='IA':
-        sarpath = path.replace('pheno_timeseries', 'ia_timeseries')
-        #sarpath = path.replace('fixed_timeseries', 'ia_timeseries')
-    sar = np.load(sarpath).astype(np.float32)
-    for i in range(N_TIMESTEPS):
-        _X[:,:,-1,i] = sar[...,i]
-    # make sure we didn't introduce any NaNs
-    _X[np.where(np.isnan(_X))] = 0
-    return _X
-
-def load_data(x_path, y_path, flatten=True, convert_nans=True):
-    # Load the time series image data
-    X = np.load(x_path).astype(np.float32)
-    # Load the associated labels
-    Y = np.load(y_path).astype(np.int8)
-
-    # Convert all the NaNs to zeros
-    if convert_nans:
-        X[np.where(np.isnan(X))] = 0
-
-    X[np.where(X==0)] = 0.00000001
-    # Add band indices
-    X = add_lswi_channel(X)
-    X = add_ndwi_channel(X)
-    X = add_sar_channel(X, 'vv', x_path)
-    X = add_sar_channel(X, 'vh', x_path)
-    X = add_sar_channel(X, 'ia', x_path)
-    if flatten:
-        # Reduce the h x w x b x t dataset to h*w x b x t
-        X = np.reshape(X, (X.shape[0]*X.shape[1], X.shape[2], X.shape[3]))
-        Y = np.reshape(Y, (Y.shape[0]*Y.shape[1]))
-    assert X.shape[0] == Y.shape[0]
-    return X, Y
-
-
-
-class loading_large_tile:
+class LargeTiffLoader:
 
     def __init__(self,input_image_directory,input_mask_directory,image_suffix='.tif',mask_suffix='.tif'):
         self.image_directory=input_image_directory
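
Note: the rename from loading_large_tile to LargeTiffLoader brings the class name in line with PEP 8. Only the constructor is visible in this hunk, so the instantiation below is a sketch based on that signature alone; the directory paths are placeholders:

    from data_processing.pre_processing import LargeTiffLoader

    # Constructor signature as shown in the hunk; paths are illustrative only.
    loader = LargeTiffLoader(
        input_image_directory='/data/tiles/images',
        input_mask_directory='/data/tiles/masks',
        image_suffix='.tif',
        mask_suffix='.tif',
    )
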
diff --git a/data_processing/split_data.py b/data_processing/split_data.py
deleted file mode 100644
index 4546073..0000000
--- a/data_processing/split_data.py
+++ /dev/null
@@ -1,181 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Sun Oct 13 12:15:42 2019
-
-@author: hasan
-"""
-import json
-import os
-import pandas as pd
-import shutil
-import random
-
-def merge_lists(a,b):
-    if(len(a)>=len(b)):
-        for item in b:
-            a.append(item)
-        return a
-    else:
-        for item in a:
-            b.append(item)
-        return b
-
-def get_dicts(dict_dir):
-    dict_dir=os.path.expanduser((os.path.join(dict_dir,'labels_dictionary')))
-    print(dict_dir)
-    with open(dict_dir,'r') as dfile:
-        labels=dfile.read()
-    label_file=json.loads(labels)
-    counter=0
-    dis_dict=dict()
-    reg_dict=dict()
-    names_ac_dis=dict()
-    names_ac_reg=dict()
-    for label in label_file:
-        img_name=label_file[label]['img_name']
-        # and len(label_file[label]['classes'])>0
-        if(img_name.split('.')[0].endswith('_pre_disaster')):
-            this_region=img_name.split('_')[0]
-            this_disaster=label_file[label]['disaster']
-            if(this_region not in reg_dict.keys()):
-                reg_dict[this_region]=1
-                names_ac_reg[this_region]=[]
-                names_ac_reg[this_region].append(img_name.split('.')[0])
-            else:
-                reg_dict[this_region]+=1
-                names_ac_reg[this_region].append(img_name.split('.')[0])
-            if(this_disaster not in dis_dict.keys()):
-                dis_dict[this_disaster]=1
-                names_ac_dis[this_disaster]=[]
-                names_ac_dis[this_disaster].append(img_name.split('.')[0])
-            else:
-                dis_dict[this_disaster]+=1
-                names_ac_dis[this_disaster].append(img_name.split('.')[0])
-            counter+=1
-    '''
-    df_reg=pd.DataFrame.from_dict(reg_dict,orient='index')
-    df_dis=pd.DataFrame.from_dict(dis_dict,orient='index')
-    df_reg.plot(kind='bar',title='Data distributions acc to regions')
-    df_dis.plot(kind='bar',title='Data distributions acc to disaster types')
-    mean1=int(df_reg.mean(axis=0)[0])
-    mean2=int(df_dis.mean(axis=0)[0])
-    print('mean1= '+str(mean1))
-    print('mean2= '+str(mean2))
-    print("number of images: "+str(counter))
-    print(names_ac_dis['tsunami'])
-    '''
-    return dis_dict,reg_dict,names_ac_dis,names_ac_reg,counter
-
-def copy_files(src,dst,file_list,extension=''):
-    print('start')
-    for files in file_list:
-        src_file_path = src + files+extension+'.png'
-        dst_file_path = dst + files+extension+'.png'
-        if os.path.exists(dst_file_path):
-            print(dst_file_path+" already exists")
-        else:
-            #print("Copying: " + dst_file_path)
-            try:
-                shutil.copyfile(src_file_path,dst_file_path)
-                print(dst_file_path)
-            except IOError:
-                a=0
-                print(src_file_path + " does not exist")
-                #input("Please, press enter to continue.")
-
-def get_split(stat,names,count,ratio=0.9,var=0.1):
-    df_reg=pd.DataFrame.from_dict(stat,orient='index')
-    #mean=int(df_reg.mean(axis=0)[0])+1
-    #print(str(mean))
-    #imprE=int(float((1-ratio)*float(mean)))
-    evalstat=dict()
-    for key in stat.keys():
-        temp=stat[key]
-        evalstat[key]={}
-        if(temp>=24):
-            evalstat[key]['count']=30
-            evalstat[key]['keep']=False
-        else:
-            evalstat[key]['count']=temp
-            evalstat[key]['keep']=True
-    '''
-    if(temp>=(mean*2)):
-        evalstat[key]['count']=int(imprE*(1+var))
-        evalstat[key]['keep']=False
-    else:
-        if(int(float(float(temp)*(1+var)))>=mean):
-            evalstat[key]['count']=int(float((1-ratio)*temp))
-            evalstat[key]['keep']=False
-        else:
-            if(int(float(float(temp)*(1+var)))>=int(mean/2)):
-                evalstat[key]['count']=int(float((1-0.5)*temp))
-                evalstat[key]['keep']=False
-            else:
-                evalstat[key]['count']=temp
-                evalstat[key]['keep']=True
-    '''
-    print('new distribution' + str(evalstat))
-    plot_dict=dict()
-    counter=0
-    for key in evalstat.keys():
-        plot_dict[key]=evalstat[key]['count']
-        counter+=evalstat[key]['count']
-    print("counter is : "+str(counter))
-    df_new=pd.DataFrame.from_dict(plot_dict,orient='index')
-    df_reg.plot(kind='bar',title='Data distributions acc to regions')
-    df_new.plot(kind='bar',title='Data distributions acc to regions in eval set')
-
-
-    evallist=[]
-    trainlist=[]
-    for key in evalstat.keys():
-        if(evalstat[key]['keep']==True):
-            templist=names[key]
-            random.shuffle(templist)
-            evallist=merge_lists(evallist,templist)
-            trainlist=merge_lists(trainlist,templist)
-        else:
-            templist=names[key]
-            random.shuffle(templist)
-            evallist=merge_lists(evallist,templist[:int(evalstat[key]['count'])])
-            trainlist=merge_lists(trainlist,templist[int(evalstat[key]['count']):])
-    random.shuffle(evallist)
-    random.shuffle(trainlist)
-
-    print("length of val set: "+str(len(evallist)))
-    print("length of train set: "+str(len(trainlist)))
-    x=input("please press any key")
-
-    return trainlist,evallist
-
-
-
-if __name__ == '__main__':
-    d_pth='/usr/local/NotSynced/xView2/xview2_data'
-    a,b,c,d,e = get_dicts(d_pth)
-    train,eval_ = get_split(b,d,e)
-    '''
-    lists=[]
-    for key in d.keys():
-        temp=d[key]
-        random.shuffle(temp)
-        lists=merge_lists(lists,temp)
-    random.shuffle(lists)
-    '''
-    main_path=os.path.expanduser(os.getcwd())
-    all_mskpth=os.path.join(main_path,'masks/')
-    src='/usr/local/NotSynced/xView2/train/images/'
-    destTrain=''.join([d_pth,'/train/data/'])
-    destEval=''.join([d_pth,'/val/data/'])
-    copy_files(src,destTrain,train)
-    copy_files(src,destEval,eval_)
-
-    destTrain=''.join([d_pth,'/train/masks/'])
-    destEval=''.join([d_pth,'/val/masks/'])
-
-    copy_files(all_mskpth,destTrain,train,extension='_mask')
-    copy_files(all_mskpth,destEval,eval_,extension='_mask')
-
diff --git a/data_processing/split_to_folds.py b/data_processing/split_to_folds.py
deleted file mode 100644
index a72c8c4..0000000
--- a/data_processing/split_to_folds.py
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Wed Feb 5 11:16:29 2020
-
-@author: hasan
-"""
-from sklearn.model_selection import StratifiedKFold
-import os
-
-from tqdm import tqdm
-import pandas as pd
-
-def main():
-    path='/home/jamada/Desktop/OpenCitiesAI/Dataset/Tilestrain_tier_4/images'
-    items=[]
-    for image in os.listdir(path):
-        if('.png' in image ):
-            isplit=image.split('_')
-            region_id=f'{isplit[0]}_{isplit[1]}'
-            items.append({'id':image,'region_id':region_id,'x':isplit[2],'y':isplit[3].split('.')[0]})
-
-    df=pd.DataFrame(items,columns=['id','region_id','x','y'])
-    df['tile_id'] =df['region_id'].astype(str)+'_'+ df['x'].astype(str) + '_' + df['y'].astype(str)
-    X = df.groupby('tile_id')['region_id'].first().index.values
-    y = df.groupby('tile_id')['region_id'].first().values
-
-    skf=StratifiedKFold(n_splits=5, random_state=98, shuffle=True)
-    for i,(tfold,vfold) in enumerate(skf.split(X,y)):
-        df.loc[df['tile_id'].isin(X[vfold]),'fold']=int(i)
-
-    df.to_csv('folds4.csv')
-    folds=[int(fold) for fold in df.groupby('fold').first().index.values]
-
-
-    for fold in folds:
-        print(f'fold:\t{fold}')
-        print(df.loc[df['fold']==fold].set_index(['fold','region_id']).count(level='region_id'))
-
-
-if __name__=='__main__':
-    main()
\ No newline at end of file
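
Note: both split utilities are removed without a replacement in this patch. If equivalent fold assignment is needed later, the core of the deleted script boils down to a region-stratified StratifiedKFold. A minimal sketch, with a toy DataFrame standing in for the tile listing the deleted script built from disk:

    import pandas as pd
    from sklearn.model_selection import StratifiedKFold

    # Toy stand-in for the tile listing; ids and regions are illustrative.
    df = pd.DataFrame({
        'tile_id':   ['acc_1_0_0', 'acc_1_0_1', 'mon_2_0_0', 'mon_2_0_1'],
        'region_id': ['acc_1', 'acc_1', 'mon_2', 'mon_2'],
    })

    skf = StratifiedKFold(n_splits=2, random_state=98, shuffle=True)
    # Stratify on region so every fold sees every region.
    for i, (_, vfold) in enumerate(skf.split(df['tile_id'], df['region_id'])):
        df.loc[df.index[vfold], 'fold'] = i
    print(df)
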
diff --git a/post_process.py b/post_process.py
deleted file mode 100644
index cf0cdce..0000000
--- a/post_process.py
+++ /dev/null
@@ -1,123 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Thu Oct 22 15:27:22 2020
-
-@author: jamada
-"""
-
-
-import numpy as np
-from skimage.morphology import watershed,dilation,square,erosion
-from skimage.measure import label
-from PIL import Image,ImageDraw
-from Data.utils import colorize
-from rasterio.features import shapes
-import pandas as pd
-from shapely.geometry import shape
-from shapely.wkt import dumps
-from shapely.ops import cascaded_union
-import geopandas as gpd
-def post_process(raw,thresh = 0.5,mina=40,save=None):
-
-    try:
-        ch = raw.shape[2]
-    except:
-        ch=1
-    if(ch == 2):
-        rraw = ranger(raw)
-
-        rbuilds = raw[...,0]
-        rborders = raw[...,1]
-
-        nuclei = rbuilds * (1 - rborders)
-
-        builds = raw[...,0]
-
-        basins = label(nuclei>0.1,background = 0, connectivity = 2)
-        #Image.fromarray(basins>0).show()
-        #basins = noise_filter(basins, mina = 2 )
-        basins = label(basins,background = 0, connectivity = 2)
-        washed = watershed(image = -builds,
-                           markers = basins,
-                           mask = builds>thresh,
-                           watershed_line=False)
-        washed = label(washed,background = 0, connectivity = 2)
-        washed = noise_filter(washed, mina=thresh)
-        washed = label(washed,background = 0, connectivity = 2)
-        #col = colorize(washed)
-        #Image.fromarray(col).show()
-
-    elif(ch == 1):
-        builds = raw[...,0]
-        washed = label(builds > thresh,background = 0, connectivity = 2)
-        washed = noise_filter(washed, mina=thresh)
-        washed = label(washed,background = 0, connectivity = 2)
-        #col = colorize(washed)
-        #Image.fromarray(col).show()
-
-    else:
-        raise NotImplementedError(
-        )
-
-    return washed
-
-def noise_filter(washed,mina):
-    values = np.unique(washed)
-    #a =0
-    #print(values)
-    for val in values[1:]:
-        #a+=1
-        area = (washed[washed == val]>0).sum()
-        if(area<=mina):
-            washed[washed == val] = 0
-    #print(a)
-    return washed
-
-def ranger(x):
-    x1 = x.copy()
-    return np.tanh((x1 - 0.5)/0.1) * (0.5)+0.5
-
-def extract_poly(mask):
-    shps = shapes(mask.astype(np.int16),mask>0)
-    polys =[]
-
-    for shp,value in shps:
-
-        p = shape(shp).buffer(0.0)
-
-        typ = p.geom_type
-        if(typ == 'Polygon' or typ == 'MultiPolygon'):
-            polys.append(p.simplify(0.01))
-        else:
-            continue
-    if(len(polys) == 0):
-        return None
-    else:
-        return cascaded_union(polys)
-    #break
-
-
-def mask_to_polys(iid,mask,mina = 4):
-    vals = sorted(np.unique(mask))
-    polys = []
-    areas = []
-    for i in vals[1:]:
-        poly = extract_poly(mask == i)
-
-        if(poly is not None):
-            if(poly.area > mina):
-                polys.append(poly)
-                areas.append(poly.area)
-    gdf = gpd.GeoDataFrame(
-        {'Id' : list(range(1,len(polys)+1)),
-         'geometry' : polys,
-         'area' : areas
-        })
-    return gdf
\ No newline at end of file
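
Note: this removes the stale root-level duplicate of post_process.py; extract_poly survives in data_processing/post_process.py (it appears as context in the hunk above). A hedged sketch of vectorizing a labeled mask with that surviving helper, assuming it keeps the signature shown; the per-label loop mirrors the deleted mask_to_polys and is illustrative, not part of the module:

    import numpy as np
    from data_processing.post_process import post_process, extract_poly

    pred = np.random.rand(256, 256, 2).astype(np.float32)  # toy prediction
    instances = post_process(pred)  # labeled instance mask, 0 = background

    # Vectorize each instance; extract_poly returns None for empty masks.
    polys = [p for val in np.unique(instances)[1:]
             if (p := extract_poly(instances == val)) is not None]
    print(len(polys), 'building polygons')
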