-
Notifications
You must be signed in to change notification settings - Fork 30
/
util.py
470 lines (408 loc) · 21.1 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
"""
Util file for misc functions
"""
from constants import *
import numpy as np
import itertools
import matplotlib.pyplot as plt
import argparse
import pickle
import pandas as pd
import time
import random
from datetime import datetime
from sklearn.model_selection import GroupShuffleSplit
from constants import *
import torch
def random_seed(seed_value, use_cuda):
    """Seed Python, numpy, and torch RNGs for reproducible runs.

    Args:
        seed_value - (int or None) seed to apply; None leaves RNG state untouched
        use_cuda - (bool) if True, also seed CUDA RNGs and force cuDNN into
            deterministic mode
    """
    if seed_value is None:
        return
    np.random.seed(seed_value)    # numpy (cpu) RNG
    torch.manual_seed(seed_value) # torch cpu RNG
    random.seed(seed_value)       # Python stdlib RNG
    if use_cuda:
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)  # all visible GPUs
        # benchmark autotuning picks kernels nondeterministically; disable it
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
def dates2doy(dates):
    """Transforms list of dates in YYYY-MM-DD format to a vector of days of year

    Args:
        dates - (iterable of str) dates formatted as 'YYYY-MM-DD'
    Returns:
        (np.ndarray) day-of-year (1-366) for each input date
    """
    days_of_year = []
    for date in dates:
        year, month, day = (int(part) for part in date.split('-'))
        # tm_yday is the 1-based ordinal day within the year
        days_of_year.append(datetime(year, month, day).timetuple().tm_yday)
    return np.array(days_of_year)
def str2bool(v):
    """Parse a yes/no style command-line string into a bool.

    Raises argparse.ArgumentTypeError for unrecognized values so argparse
    reports a clean usage error.
    """
    value = v.lower()
    if value in ('yes', 'true', 't', 'y', '1'):
        return True
    if value in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')
def softmax(x):
    """
    Computes softmax values for a vector x.
    Args:
        x - (numpy array) a vector of real values
    Returns: a vector of probabilities, of the same dimensions as x
    """
    # Shift by the max so exp() never overflows (softmax is shift-invariant)
    shifted = x - np.max(x)
    exps = np.exp(shifted)
    return exps / np.sum(exps)
def create_categorical_df_col(df, from_col, to_col):
    """
    Creates a categorical column in a dataframe from an existing column.

    For example, column 'classes' of possibilities 'cat', 'dog', 'bird'
    can be categorized in a new column 'class_nums' of possibilities 0, 1, 2.

    Args:
        df - pandas data frame (mutated in place)
        from_col - (str) column name to categorize into integer codes
        to_col - (str) column name that will hold the new categorical labels
    Returns:
        df - the same data frame, with the additional column of integer codes
    """
    # Convert the source column to a pandas Categorical in place
    df[from_col] = pd.Categorical(df[from_col])
    # .cat.codes assigns a stable integer per distinct category (sorted order)
    df[to_col] = df[from_col].astype('category').cat.codes
    return df
def plot_confusion_matrix(cm, classes,
                          normalize=True,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    """
    if normalize:
        # Convert raw counts to per-true-class fractions (each row sums to 1)
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    fig = plt.figure()
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    ticks = np.arange(len(classes))
    plt.xticks(ticks, classes, rotation=45)
    plt.yticks(ticks, classes)
    cell_fmt = '.2f' if normalize else 'd'
    threshold = cm.max() / 2.
    for row, col in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # White text on dark cells, black on light ones, for readability
        plt.text(col, row, format(cm[row, col], cell_fmt),
                 horizontalalignment="center",
                 color="white" if cm[row, col] > threshold else "black")
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    return fig
def split_with_group(df, group, train_frac, test_frac, data_cols, lbl_cols, random_seed=None, shuffle=True, save=False):
    """
    Splits a dataframe into train, val, and test splits while keeping groups
    separated between splits.

    For example, a data frame may contain the column 'poly_ID' that should be
    kept separated between dataset splits.
    train_frac + test_frac must be <= 1. When < 1, the remainder of the dataset
    goes into a validation set.

    Args:
        df - (pandas dataframe) the dataframe to be split into train, val, test splits
        group - (str) the column name to separate by
        train_frac - (float) percentage between 0-1 to put into the training set (train_frac + test_frac <= 1)
        test_frac - (float) percentage between 0-1 to put into the test set (train_frac + test_frac <= 1)
        data_cols - (indexed column(s), i.e. 3:-1) the column(s) of the data frame that contain the data
        lbl_cols - (int, i.e. -1) the column of the data frame that contains the labels
        random_seed - (int) when splitting and if shuffling the dataset after splitting, use this random_seed to do so
        shuffle - (boolean) if True, shuffle the dataset once it's already split
        save - (boolean) if True, save output splits into a pickle file
    Returns:
        X_train - (np.ndarray) training data
        y_train - (np.ndarray) training labels
        X_val - (np.ndarray) validation data
        y_val - (np.ndarray) validation labels
        X_test - (np.ndarray) test data
        y_test - (np.ndarray) test labels
    """
    X = df
    groups = df[group]
    splitter = GroupShuffleSplit(n_splits=3, test_size=test_frac,
                                 train_size=train_frac, random_state=random_seed)
    train_inds, test_inds = next(splitter.split(X, groups=groups))
    # Rows not assigned to train or test become the validation set.
    # (np.setdiff1d replaces the original O(n*m) Python membership loop.)
    val_inds = np.setdiff1d(np.arange(X.shape[0]),
                            np.concatenate([train_inds, test_inds]))
    # BUG FIX: use `is not None` so that a seed of 0 is honored
    if random_seed is not None:
        np.random.seed(random_seed)
    if shuffle:
        np.random.shuffle(train_inds)
        np.random.shuffle(val_inds)
        np.random.shuffle(test_inds)
    X_train, y_train = X.values[train_inds, data_cols], X.values[train_inds, lbl_cols].astype(int)
    X_val, y_val = X.values[val_inds, data_cols], X.values[val_inds, lbl_cols].astype(int)
    X_test, y_test = X.values[test_inds, data_cols], X.values[test_inds, lbl_cols].astype(int)
    if save:
        # BUG FIX: str.join takes a single iterable; the original passed three
        # positional arguments, which raises TypeError at runtime.
        fname = '_'.join(['dataset_splits', time.strftime("%Y%m%d-%H%M%S")]) + '.pickle'
        with open(fname, "wb") as f:
            pickle.dump((X_train, y_train, X_val, y_val, X_test, y_test), f)
    return X_train, y_train, X_val, y_val, X_test, y_test
def crop_ind(y, name_list=(1, 2, 3, 4, 5)):
    """
    Crop row index for y; we are just interested in some croptypes.

    Args:
        y - (array-like) y_label (croptype) of train/val/test or pixel arrays
        name_list - (sequence) the croptypes of interest, normally 1-5, the
            top 5 crops in the country
    Returns:
        crop_index - (tuple of np.ndarray) indices into y where the label is
            one of name_list, as returned by np.where
    """
    # np.isin vectorizes the per-element membership test; the tuple default
    # replaces the original mutable-list default argument.
    return np.where(np.isin(y, name_list))
def get_y_label(home, country, data_set, data_type, ylabel_dir, raster_npy_dir):
    """
    Get the y label for different sets (small/full) and types (train/val/test).

    Args:
        home - (str) the base directory of data
        country - (str) string for the country 'Ghana', 'Tanzania', 'SouthSudan'
        data_set - (str) balanced 'small' or unbalanced 'full' dataset
        data_type - (str) 'train'/'val'/'test'
        ylabel_dir - (str) dir to save ylabel
        raster_npy_dir - (str) string for the mask raster dir 'raster_npy' or 'raster_64x64_npy'
    Output:
        ylabel_dir/..
        saved as a grid_nums*row*col 3D array
    """
    grid_list_path = os.path.join(home, country, country + '_' + data_set + '_' + data_type)
    gridded_IDs = sorted(np.load(grid_list_path))
    mask_dir = os.path.join(home, country, raster_npy_dir)
    # One 64x64 crop-type mask per grid id, stacked along axis 0
    mask_array = np.zeros((len(gridded_IDs), 64, 64))
    for idx, gridded_ID in enumerate(gridded_IDs):
        fname = os.path.join(mask_dir, country + '_64x64_' + gridded_ID + '_label.npy')
        # Crop each grid's raster to a common 64x64 footprint
        mask_array[idx, :, :] = np.load(fname)[0:64, 0:64]
    output_fname = "_".join([data_set, data_type, 'croptypemask',
                             'g' + str(len(gridded_IDs)), 'r64', 'c64' + '.npy'])
    np.save(os.path.join(ylabel_dir, output_fname), mask_array)
    return mask_array
def mask_tif_npy(home, country, csv_source, crop_dict_dir, raster_dir):
    """
    Transfer cropmask from .tif files by field_id to cropmask .npy by crop_id given crop_dict
    Args:
        home - (str) the base directory of data
        country - (str) string for the country 'Ghana', 'Tanzania', 'SouthSudan'
        csv_source - (str) string for the csv field id file corresponding with the country
        crop_dict_dir - (str) string for the crop_dict dictionary {0: 'unlabeled', 1: 'groundnuts' ...}
        raster_dir - (str) string for the mask raster dir 'raster' or 'raster_64x64'
    Outputs:
        ./raster_npy/..
    """
    fname = os.path.join(home, country, csv_source)
    crop_csv = pd.read_csv(fname)
    mask_dir = os.path.join(home, country, raster_dir)
    mask_dir_npy = os.path.join(home, country, raster_dir+'_npy')
    # Collect the .tif rasters and sort them (lexicographically) by the id
    # embedded in the last underscore-separated token of the filename
    mask_fnames = [f for f in os.listdir(mask_dir) if f.endswith('.tif')]
    mask_ids = [f.split('_')[-1].replace('.tif', '') for f in mask_fnames]
    mask_fnames = [mask_fnames[ID] for ID in np.argsort(mask_ids)]
    mask_ids = np.array([mask_ids[ID] for ID in np.argsort(mask_ids)])
    # crop_dict is loaded via np.load and unwrapped with .item() below, so it
    # is presumably a 0-d object array wrapping a dict -- TODO confirm
    crop_dict = np.load(os.path.join(home, crop_dict_dir))
    # For each crop type (in crop_dict order), the geom_ids belonging to it
    clustered_geom_id = [np.array(crop_csv['geom_id'][crop_csv['crop']==crop_name]) for crop_name in crop_dict.item().values()]
    for mask_fname in mask_fnames:
        with rasterio.open(os.path.join(mask_dir,mask_fname)) as src:
            # First band of the raster holds the per-pixel geom_id
            mask_array = src.read()[0,:,:]
        mask_array_geom_id = np.unique(mask_array)
        # Start from an all-NaN crop-id mask; pixels whose geom_id matches no
        # crop cluster (and negative ids) remain NaN
        mask_array_crop_id = np.zeros(mask_array.shape)
        mask_array_crop_id[:] = np.nan
        for geom_id in mask_array_geom_id:
            if geom_id>0:
                # Index of the first crop cluster containing this geom_id is
                # the crop number written into the output mask
                crop_num = np.where([geom_id in clustered_geom_id[i] for i in np.arange(len(clustered_geom_id))])[0][0]
                mask_array_crop_id[mask_array==geom_id] = crop_num
            elif geom_id == 0:
                # geom_id 0 marks unlabeled / background pixels
                mask_array_crop_id[mask_array==geom_id] = 0
        np.save(os.path.join(mask_dir_npy,mask_fname.replace('.tif', '.npy')), mask_array_crop_id)
def fill_NA(X):
    """
    Fill NaN values with the mean of each band (column).

    Args:
        X - (numpy array) array of real values, possibly containing NaNs
    Returns: numpy array of the same dimensions as X with each NaN replaced
        by the mean of the non-NaN values in its column
    """
    # BUG FIX: the original referenced a bare `ma`, which is not imported in
    # this module; use the fully qualified numpy masked-array module instead.
    col_means = np.ma.array(X, mask=np.isnan(X)).mean(axis=0)
    return np.where(np.isnan(X), col_means, X)
def get_train_parser():
    """Build and return the argparse parser for training runs.

    Covers general training hyperparameters (optimizer, lr, batch size),
    input-source toggles (Sentinel-1/2, Planet, clouds, day-of-year),
    CLSTM / FCN-CRNN model options, and attention settings. The returned
    parser is not yet parsed; callers invoke .parse_args() themselves.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str,
                        help="model's name",
                        required=True)
    parser.add_argument('--dataset', type=str,
                        help="Full or small?",
                        choices=('full', 'small'),
                        default='full')
    parser.add_argument('--country', type=str,
                        help="country to predict over",
                        default="ghana")
    parser.add_argument('--epochs', type=int, default=130,
                        help="# of times to train over the dataset")
    parser.add_argument('--batch_size', type=int, default=5,
                        help="batch size to use")
    parser.add_argument('--optimizer', type=str,
                        help="Optimizer to use for training",
                        default="adam",
                        choices=('sgd', 'adam'))
    parser.add_argument('--lr', type=float, default=0.003,
                        help="Initial learning rate to use")
    parser.add_argument('--momentum', type=float,
                        help="Momentum to use when training",
                        default=.9)
    # NOTE(review): both --lrdecay (per-batch, default 1) and --lr_decay
    # (factor on plateau, default .5, further below) exist; confirm which one
    # each training script actually consumes before changing either.
    parser.add_argument('--lrdecay', type=float,
                        help="Learning rate decay per **batch**",
                        default=1)
    parser.add_argument('--shuffle', type=str2bool,
                        help="shuffle dataset between epochs?",
                        default=True)
    parser.add_argument('--use_s1', type=str2bool,
                        help="use s1 data?",
                        default=False)
    parser.add_argument('--use_s2', type=str2bool,
                        help="use s2 data?",
                        default=True)
    parser.add_argument('--s1_agg', type=str2bool,
                        help="aggregate s1 data across time?",
                        default=False)
    parser.add_argument('--s2_agg', type=str2bool,
                        help="aggregate s2 data across time?",
                        default=False)
    parser.add_argument('--agg_days', type=int,
                        help="Number of days to aggregate in each time bin",
                        default=15)
    parser.add_argument('--num_workers', type=int,
                        help="Number of workers to use for pulling data",
                        default=8)
    # TODO: find correct string name
    parser.add_argument('--device', type=str,
                        help="Cuda or CPU",
                        default='cuda')
    parser.add_argument('--save_dir', type=str,
                        help="Directory to save the models in. If unspecified, saves the model to ./runs.",
                        default='./runs')
    parser.add_argument('--save_best', type=str2bool, default=True,
                        help="Whether to save plots and images for best epoch according to validation f1-score")
    parser.add_argument('--name', type=str,
                        help="Name of experiment. Used to uniquely save the model. Defaults to current time + model name if not set.")
    parser.add_argument('--time_slice', type=int,
                        help="which time slice for training FCN/UNet",
                        default=None)
    parser.add_argument('--weight_decay', type=float,
                        help="l2 regularization weight",
                        default=0)
    parser.add_argument('--loss_weight', type=str2bool,
                        help="weighted cross entropy loss",
                        default=True)
    parser.add_argument('--gamma', type=int,
                        help="weighting factor for focal loss",
                        default=2)
    parser.add_argument('--weight_scale', type=float,
                        help="power to raise weights by",
                        default=1)
    parser.add_argument('--lr_decay', type=float,
                        help="Factor to multiply lr by on decay",
                        default=.5)
    parser.add_argument('--apply_transforms', type=str2bool,
                        help="Apply horizontal flipping / rotation",
                        default=True)
    parser.add_argument('--normalize', type=str2bool,
                        help="Apply normalization to input based on overall band means and stds",
                        default=True)
    parser.add_argument('--patience', type=int,
                        help="Number of epochs before decreasing lr.",
                        default=3)
    parser.add_argument('--env_name', type=str, default=None,
                        help="Environment name for visdom visualization")
    parser.add_argument('--seed', type=int, default=1,
                        help="Random seed to use for reproducability")
    # Cloud / date-sampling options for Sentinel-2 inputs
    parser.add_argument('--sample_w_clouds', type=str2bool, default=False,
                        help="Use clouds for sampling Sentinel-2 dates")
    parser.add_argument('--include_clouds', type=str2bool, default=False,
                        help="Include clouds as input feature")
    parser.add_argument('--least_cloudy', type=str2bool, default=False,
                        help="Whether to use least cloudy samples (True) or sample from cloudiness (False)")
    parser.add_argument('--include_doy', type=str2bool, default=False,
                        help="Include day of year as input feature")
    parser.add_argument('--num_timesteps', type=int, default=40,
                        help="Number of timesteps to include")
    # Args for CLSTM model
    parser.add_argument('--hidden_dims', type=int,
                        help="Number of channels in hidden state used in convolutional RNN",
                        default=128)
    parser.add_argument('--crnn_kernel_sizes', type=int,
                        help="Convolutional kernel size used within a recurrent cell",
                        default=3)
    parser.add_argument('--conv_kernel_size', type=int,
                        help="Convolutional kernel size used within a convolutional layer",
                        default=3)
    parser.add_argument('--crnn_num_layers', type=int,
                        help="Number of convolutional RNN cells to stack",
                        default=1)
    parser.add_argument('--bidirectional', type=str2bool,
                        help='Use bidirectional?',
                        default=False)
    parser.add_argument('--avg_hidden_states', type=str2bool, default=True,
                        help="average hidden states for each timestep?")
    # Arguments for number of bands to use
    parser.add_argument('--s2_num_bands', type=int, default=10,
                        help="Number of bands to use from Sentinel-2")
    # Args for FCN CRNN model
    parser.add_argument('--early_feats', type=str2bool, default=False,
                        help="Use early features in the CLSTM from center after encoder")
    parser.add_argument('--fcn_out_feats', type=int, default=256,
                        help="Number of output features from fcn to be fed into CLSTM")
    #parser.add_argument('--fcn_model_name', type=str, default='unet',
    #                    help="Model to use for fcn part of fcn + crnn")
    parser.add_argument('--crnn_model_name', type=str, default='clstm',
                        help="Model to use for crnn part of fcn + crnn")
    parser.add_argument('--pretrained', type=str2bool, default=True,
                        help="Pretrained unet model for fcn-crnn")
    parser.add_argument('--pretrained_model_path', type=str,
                        default=None, #PRETRAINED_GERMANY_PATH,
                        help='Path to pretrained model')
    parser.add_argument('--dropout', type=float, default=.5,
                        help="Dropout probability to be used")
    # Evaluation / experiment-control flags
    parser.add_argument('--model_path', type=str)
    parser.add_argument('--eval_on_test', type=str2bool, default=False)
    parser.add_argument('--percent_of_dataset', type=float, default=1)
    parser.add_argument('--all_samples', type=str2bool, default=False)
    parser.add_argument('--num_repeat', type=int, default=1)
    # Planet imagery options
    parser.add_argument('--use_planet', type=str2bool, default=False,
                        help="use planet data?")
    parser.add_argument('--resize_planet', type=str2bool, default=True,
                        help="Whether to resize planet to grid_size to correspond with other inputs")
    parser.add_argument('--planet_agg', type=str2bool,
                        help="aggregate planet data across time?",
                        default=False)
    parser.add_argument('--include_indices', type=str2bool, default=False,
                        help="Include ndvi and gcvi as input features")
    parser.add_argument('--fix_feats', type=str2bool, default=False,
                        help="Fix pretrained features")
    parser.add_argument('--clip_val', type=str2bool, default=True,
                        help="Whether or not to use gradient clipping, value is computed based on the number of parameters")
    # Attention / CRNN placement options
    parser.add_argument('--main_crnn', type=str2bool, default=False,
                        help="Whether or not to use a CRNN in the main encoder at the bottom of the U of the UNet model (for early feats) or just before the prediction (for not early feats)")
    parser.add_argument('--main_attn_type', type=str, default='None',
                        help="Attention type to use for main clstm layer, must be 'None', 'temporal', 'self', or 'vector'")
    parser.add_argument('--enc_attn_type', type=str, default='None',
                        help="Attention type to use for encoder layers, must be 'None', 'temporal', 'self', or 'vector', self may use up too much memory")
    parser.add_argument('--d_attn_dim', type=int, default=32,
                        help="Number of features in w_s1 output for temporal attention")
    parser.add_argument('--r_attn_dim', type=int, default=1,
                        help="Number of features in w_s1 output for temporal attention")
    parser.add_argument('--dk_attn_dim', type=int, default=256,
                        help="Number of dk features for self attention")
    parser.add_argument('--dv_attn_dim', type=int, default=256,
                        help="Number of dv features for self attention")
    parser.add_argument('--enc_crnn', type=str2bool, default=False,
                        help="Use crnn for encoder layers in addition to the main encodings")
    parser.add_argument('--enc_attn', type=str2bool, default=False,
                        help="Use attn for encoder layers in addition to the main encodings")
    parser.add_argument('--var_length', action="store_true", default=False,
                        help="use variable length sequences")
    return parser