-
Notifications
You must be signed in to change notification settings - Fork 7
/
DINCAE_tutorial.jl
241 lines (197 loc) · 7.48 KB
/
DINCAE_tutorial.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
# # Tutorial on DINCAE
#
# This script/notebook reconstructs missing data in satellite data using a neural
# network architecture called Convolutional Auto-Encoder described in the
# following articles:
#
# * Barth, A., Alvera-Azcárate, A., Licer, M., and Beckers, J.-M.: DINCAE 1.0: a convolutional neural network with error estimates to reconstruct sea surface temperature satellite observations, Geosci. Model Dev., 13, 1609–1622, https://doi.org/10.5194/gmd-13-1609-2020, 2020.
# * Barth, A., Alvera-Azcárate, A., Troupin, C., & Beckers, J.-M. (2022). DINCAE 2.0: multivariate convolutional neural network with error estimates to reconstruct sea surface temperature satellite and altimetry observations. Geoscientific Model Development, 15(5), 2183–2196. https://doi.org/10.5194/gmd-15-2183-2022
#
# The example here uses MODIS sea surface temperature from the Physical
# Oceanography Distributed Active Archive Center (PO.DAAC) JPL, NASA.
# More information is available at https://dx.doi.org/10.5067/MODST-1D4D9
#
# This notebook/script is intended to be run on a GPU with CUDA support (NVIDIA GPU)
# with a least 8 GB of RAM.
# The modules necessary to run the tutorial can be installed with:
#
# ```julia
# using Pkg
# Pkg.activate("/file/path/") # needs to be repeated for every session
# Pkg.instantiate() # install all package
# ```
#
# where `"/file/path/"` is the path containing this file and the file `Project.toml`.
# *Alternatively*, one can also install the modules in the default environment with:
#
# ```julia
# using Pkg
# Pkg.add(["CUDA","NCDatasets","PyPlot"])
# ```
#
# See also https://github.com/gher-uliege/DINCAE.jl#installation
#
# Load the necessary modules
using CUDA
using DINCAE
using DINCAE_utils
using Dates
using NCDatasets
using PyPlot
# ## Data download
#
# First we define the domain and time interval.
#
# The example is tested here for a short time frame but for realistic
# applications one should use a much longer time-range (like 10, 20 years or more)
# longitude range (east, west)
lon_range = [-7, -0.8]
# latitude range (south, north)
lat_range = [33.8, 38.2]
# time range (start, end)
time_range = [DateTime(2000,2,25), DateTime(2020,12,31)]
#time_range = [DateTime(2000,2,25), DateTime(2000,3,31)]
#time_range = [DateTime(2001,1,1), DateTime(2001,12,31)]
# local directory
localdir = expanduser("~/Data/SST-AlboranSea-example")
# create directory
mkpath(localdir)
# filename of the subset
fname_subset = joinpath(localdir,"modis_subset.nc")
# filename of the clean data
fname = joinpath(localdir,"modis_cleanup.nc")
# filename of the data with added clouds for cross-validation
fname_cv = joinpath(localdir,"modis_cleanup_add_clouds.nc")
varname = "sst"
# Results of DINCAE will be placed in a sub-directory under `localdir`
outdir = joinpath(localdir,"Results")
mkpath(outdir)
# The variable `url` is the OPeNDAP data URL of the MODIS data. Note the final
# `#fillmismatch` (look here https://github.com/Unidata/netcdf-c/issues/1299#issuecomment-458312804
# for `#fillmismatch` the suffix).
# The downloading can take several minutes.
# `Error { code = 500; message = "Java heap space"; }`
if !isfile(fname_subset)
download("https://dox.ulg.ac.be/index.php/s/ckHBdhDzAKERwPb/download",fname_subset)
end
# ```julia
# url = "https://thredds.jpl.nasa.gov/thredds/dodsC/ncml_aggregation/OceanTemperature/modis/terra/11um/4km/aggregate__MODIS_TERRA_L3_SST_THERMAL_DAILY_4KM_DAYTIME_V2019.0.ncml#fillmismatch"
# ds = NCDataset(url)
# # find indices withing the longitude, latitude and time range
# i = findall(lon_range[1] .<= ds["lon"][:] .<= lon_range[end]);
# j = findall(lat_range[1] .<= ds["lat"][:] .<= lat_range[end]);
# n = findall(time_range[1] .<= ds["time"][:] .<= time_range[end]);
# # Write subset to disk
# write(fname_subset,ds,idimensions = Dict(
# "lon" => i,
# "lat" => j,
# "time" => n))
# close(ds)
# @info "NetCDF subset ($(length(n)) slices) written $fname_subset"
# ```
# ## Data preparation
#
# Load the NetCDF variable `sst` and `qual_sst`
ds = NCDataset(fname_subset)
sst = ds["sst"][:,:,:];
qual = ds["qual_sst"][:,:,:];
# We ignore all data points with missing quality flags,
# quality indicator exceeding 3 and temperature
# higher than 40°C.
sst_t = copy(sst)
sst_t[(qual .> 3) .& .!ismissing.(qual)] .= missing
sst_t[(sst_t .> 40) .& .!ismissing.(sst_t)] .= missing
@info "number of missing observations: $(count(ismissing,sst_t))"
@info "number of valid observations: $(count(.!ismissing,sst_t))"
# Clean-up the data to write them to disk.
varname = "sst"
fname = joinpath(localdir,"modis_cleanup.nc")
ds2 = NCDataset(fname,"c")
write(ds2,ds,exclude = ["sst","qual"])
defVar(ds2,varname,sst_t,("lon","lat","time"))
close(ds2)
# Add a land-sea mask to the file. Grid points with less than 5% of
# valid data are considered as land.
DINCAE_utils.add_mask(fname,varname; minseafrac = 0.05)
# Choose cross-validation points by adding clouds to the cleanest
# images (copied from the cloudiest images). This function will generate
# a file `fname_cv`.
DINCAE_utils.addcvpoint(fname,varname; mincvfrac = 0.10);
# # Reconstruct missing data
#
#
# F is the floating point number type for the neural network. Here we use
# single precision.
const F = Float32
# Test if CUDA is functional to use the GPU, otherwise the CPU is used.
if CUDA.functional()
Atype = CuArray{F}
else
@warn "No supported GPU found. We will use the CPU which is very slow. Please check https://developer.nvidia.com/cuda-gpus"
Atype = Array{F}
end
# Setting the parameters of neural network.
# See the documentation of `DINCAE.reconstruct` for more information.
epochs = 1000
batch_size = 32
enc_nfilter_internal = round.(Int,32 * 2 .^ (0:4))
clip_grad = 5.0
regularization_L2_beta = 0
ntime_win = 3
upsampling_method = :nearest
loss_weights_refine = (0.3,0.7)
save_epochs = 200:10:epochs
data = [
(filename = fname_cv,
varname = varname,
obs_err_std = 1,
jitter_std = 0.05,
isoutput = true,
)
]
data_test = data;
fnames_rec = [joinpath(outdir,"data-avg.nc")]
data_all = [data,data_test]
# Use these parameters for a quick test:
# epochs = 10
# save_epochs = epochs:epochs
# Start the training and reconstruction of the neural network.
loss = DINCAE.reconstruct(
Atype,data_all,fnames_rec;
epochs = epochs,
batch_size = batch_size,
enc_nfilter_internal = enc_nfilter_internal,
clip_grad = clip_grad,
save_epochs = save_epochs,
upsampling_method = upsampling_method,
loss_weights_refine = loss_weights_refine,
ntime_win = ntime_win,
)
# Plot the loss function
plot(loss)
ylim(extrema(loss[2:end]))
xlabel("epochs")
ylabel("loss");
# # Post process results
#
# Compute the RMS (Root Mean Squared error) with the independent validation data
case = (
fname_orig = fname,
fname_cv = fname_cv,
varname = varname,
)
fnameavg = joinpath(outdir,"data-avg.nc")
cvrms = DINCAE_utils.cvrms(case,fnameavg)
@info "Cross-validation RMS error is: $cvrms"
# Next we plot all time instances. The figures will be placed in the
# directory `figdir`
figdir = joinpath(outdir,"Fig")
DINCAE_utils.plotres(case,fnameavg, clim = nothing, figdir = figdir,
clim_quantile = (0.01,0.99),
which_plot = :cv)
@info "Figures are in $(figdir)"
# Example reconstruction for 2001-09-12
# ![reconstruction for the 2001-09-12](Fig/data-avg_2001-09-12.png)
# Panel (a) is the original data where we have added clouds (panel (b)). The
# reconstuction based on the data in panel (b) is shown in panel (c) together
# with its expected standard deviation error (panel (d)).