-
Notifications
You must be signed in to change notification settings - Fork 0
/
05.Processing-count-results.Rmd
457 lines (368 loc) · 21.3 KB
/
05.Processing-count-results.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
# Processing count results
For now, we are interested in calculating the following parameters for the count results:
1. The number of birds within every PPI pixel.
2. The average mass of the birds within every PPI pixel.
We can derive the total numbers of birds comparatively easily from the bird counts by Sovon, but to calculate the average mass of the birds we need to link a database of life-history traits. For the latter we first need to translate the vernacular (modern) names of bird species (used by Sovon) to the scientific ones, so we can link both.
## Processing environment
```{r setup_annotating_count_results, warning=FALSE, message=FALSE}
library(rgbif)
library(stringr)
library(readxl)
library(dplyr)
library(tidyr)
library(readr)
library(kableExtra)
```
## Loading the Sovon count data
The count data is spread over a few sheets in an `xlsx` file, which we load here. For clarity, we rename all the columns to English and we filter out all counts (i.e. areas) where birds are not positively identified to species level. Although it would be possible to 'fill' these uncertain counts based on proportions, determining how to do that is not necessary for our purposes. Instead, we will just remove these counts altogether. Finally, subspecies identifiers for these species are removed, as we assume variation is limited and the database of life-history traits does not contain parameters for subspecies separately.
```{r preparing_wb_counts}
sovon_data <- "data/raw/sovon/tel_dec_jan_1718.xlsx"
data <- data.frame()
sheets <- excel_sheets(sovon_data)
sheets <- sheets[-c(1, 5)] # Sheet 1 and 5 contain PTT and roost counts respectively, so we ignore these for now, as they have to be processed differently
# Explicit column types to suppress warnings thrown because of lacking euring codes for records with no birds
coltypes <- c("numeric", "text", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "text", "numeric", "numeric", "numeric")
for (i in seq_along(sheets)) {
data <- rbind(data, read_excel(sovon_data, sheet = sheets[i], col_types = coltypes))
}
data %>%
drop_na() %>% # A few rows somehow contain no birds or species 'geen vogels'
rename(count_id = TELLING_ID, area_nr = GEBIEDSCODE, year = JAAR, month = MAAND, day = DAG, start_time = BEGINTIJD, end_time = EINDTIJD,
"euring" = "EURING", species = SOORT, number = Aantal, xcoor = XCOOR, ycoor = YCOOR) %>%
group_by(area_nr) %>%
filter(!any(str_ends(species, "spec."))) %>% # Filter out all counts with unidentified birds
filter(!any(length(str_subset(species, " of ")) > 0)) %>% # Filter out all counts with either/or totals
filter(!any(str_starts(species, "hybride"))) %>% # Filter out all counts with hybrids
ungroup() %>%
rowwise() %>%
mutate(species = str_split(species, "\\(")[[1]][1] %>% str_trim()) -> wb_data # And remove all subspecies identifications
head(wb_data, 10) %>%
kable(format = "html", col.names = colnames(wb_data)) %>%
kable_styling() %>%
kableExtra::scroll_box(width = "100%", height = "400px")
```
Following this logic, we can process the PTT counts similarly and see which species are contained in those.
```{r preparing_ptt_counts}
read_excel(sovon_data, sheet = 1) %>%
rename(count_id = tellingid, route = route, count_point = telpunt, season = seizoen, year = teljaar, month = maand,
day = dag, euring = euring, species = soort, number = aantal, xcoor = xcoor, ycoor = ycoor) %>%
group_by(route, count_point) %>%
filter(!any(str_ends(species, "spec."))) %>% # Filter out all counts with unidentified birds
filter(!any(length(str_subset(species, " of ")) > 0)) %>% # Filter out all counts with either/or totals
filter(!any(str_starts(species, "hybride"))) %>% # Filter out all counts with hybrids
ungroup() %>%
rowwise() %>%
mutate(species = str_split(species, "\\(")[[1]][1] %>% str_trim()) -> ptt_data # And remove all subspecies identifications
head(ptt_data, 10) %>%
kable(format = "html", col.names = colnames(ptt_data)) %>%
kable_styling() %>%
kableExtra::scroll_box(width = "100%", height = "400px")
```
## Filtering/preprocessing species names
The following section is the result of an iterative process aimed at matching species in our count data with species in the database of life-history characteristics. Unfortunately, automatic tools only get us so far, a bit of tweaking has to be done by hand. To reduce manual corrections needed, *only species to which >1% of the birds belong in a single count* have been corrected manually. In other words: if a species which cannot be matched always accounts for less than 1% of the total number of birds within a count, this species is discarded from the whole dataset.
Besides the 1% criteria, both the waterbird and PTT counts contain some exotics (e.g. 'Helmparelhoen'/Helmeted guineafowl) for which our life-history characteristics dataset does not contain any measurements anyways (there are others that we do have measurements of), some species that are very unlikely to ever take flight during NYE (e.g. 'Kip'/Domesticated chicken) and some mammals for which the same applies. We will remove these from the dataset manually.
```{r filtering_species}
exotics <- c("Helmparelhoen", "Kaapse Casarca", "Manengans", "Ringtaling", "Buffelkopeend", "Kokardezaagbek", "Chileense Flamingo",
"Kaapse Taling", "Bahamapijlstaart", "Muskuseend", "Zwarte Zwaan", "Knobbelgans", "Zwaangans")
unlikely_flight_candidate <- c("Kip")
mammals <- c("Damhert", "Haas", "Ree", "Bever", "Bruine Rat", "Muskusrat", "Mol", "Vos", "Kat", "Otter", "Grijze Zeehond", "Konijn",
"Eekhoorn", "Edelhert", "Gewone Zeehond", "Wild Zwijn", "Steenmarter", "Moeflon")
input_error <- c("Steltstrandloper")
remove_species <- c(exotics, unlikely_flight_candidate, mammals, input_error)
ptt_data %>%
filter(!species %in% remove_species) -> ptt_data
wb_data %>%
filter(!species %in% remove_species) -> wb_data
```
With all these species removed or adjusted, we can create a species lookup table. We fetch the scientific names and corresponding GBIF species IDs from the [Checklist Dutch Species Register](https://doi.org/10.15468/rjdpzy), which is the GBIF dataset with key `4dd32523-a3a3-43b7-84df-4cda02f15cf7`. We furthermore remove all unnecessary information, such as subspecies from the scientific names as well.
```{r create_species_lut, results='hold'}
unique_species <- unique(c(wb_data$species, ptt_data$species))
build_species_lut <- function(specieslist, datasetKey = NULL, class_name = NULL) {
# As this function can possibly return many different records, we pick the scientific name and GBIF ID (nubKey) that are the most
# common in the returned results. This should most often result in an OK result of the name lookup function.
Mode <- function(x) {
ux <- unique(na.omit(x))
ux[which.max(tabulate(match(x, ux)))]
}
n <- length(specieslist)
species_lut <- data.frame(lookupname = character(n),
scientificname = character(n),
gbif_key = numeric(n), stringsAsFactors = FALSE)
# Somehow the higherTaxonKey changes regularly, so we have to query this first
higherTaxonKey <- NULL
if (!is.null(class_name)) {
class_record <- name_lookup(class_name, datasetKey = datasetKey)
higherTaxonKey <- class_record$data$classKey
}
for(i in seq_along(specieslist)) {
gbif_data <- tryCatch({
gbif <- name_lookup(specieslist[i],
datasetKey = datasetKey,
higherTaxonKey = higherTaxonKey,
return = "data")
list(paste(str_split(Mode(gbif$data$scientificName), pattern = " ")[[1]][1:2], collapse = " "), Mode(gbif$data$nubKey))
}, error = function(e) {
list("", NaN)
})
species_lut[i, ] <- c(specieslist[i], gbif_data[1], as.numeric(as.character(gbif_data[2])))
}
return(species_lut)
}
species_lut <- build_species_lut(unique_species, datasetKey = "4dd32523-a3a3-43b7-84df-4cda02f15cf7", class_name = "Aves")
head(species_lut, 10) %>%
kable(format = "html", col.names = colnames(species_lut)) %>%
kable_styling() %>%
kableExtra::scroll_box(width = "100%", height = "400px")
```
Finally, the lookup table also contains some scientific names which unfortunately will not match with the life-history characteristics dataset, so these too we will adjust manually.
```{r manual_species_tweaks}
# Change to similar species which is in the LHT database
species_lut[species_lut$lookupname == "Toendrarietgans", "scientificname"] <- "Anser fabalis" # Taiga Bean Goose
species_lut[species_lut$lookupname == "Kleine Canadese Gans", "scientificname"] <- "Branta leucopsis" # Barnacle Goose
species_lut[species_lut$lookupname == "Kleine Barmsijs", "scientificname"] <- "Acanthis flammea" # Redpoll
species_lut[species_lut$lookupname == "Pontische Meeuw", "scientificname"] <- "Larus argentatus" # Herring Gull
species_lut[species_lut$lookupname == "Indische Gans", "scientificname"] <- "Anser albifrons" # Greater White-fronted Goose
species_lut[species_lut$lookupname == "Canadese Gans", "scientificname"] <- "Branta canadensis" # Canada Goose
species_lut[species_lut$lookupname == "Topper", "scientificname"] <- "Aythya marila" # Greater Scaup
species_lut[species_lut$lookupname == "Tafeleend", "scientificname"] <- "Aythya ferina" # Greater Scaup
# Change scientific name for same species to match with the LHT database
species_lut[species_lut$lookupname == "Kleine Zwaan", "scientificname"] <- "Cygnus columbianus" # Bewick's Swan
species_lut[species_lut$lookupname == "Kokmeeuw", "scientificname"] <- "Larus ridibundus" # Black-headed Gull
species_lut[species_lut$lookupname == "Smient", "scientificname"] <- "Mareca penelope" # Wigeon
species_lut[species_lut$lookupname == "Krakeend", "scientificname"] <- "Mareca strepera" # Gadwall
species_lut[species_lut$lookupname == "Slobeend", "scientificname"] <- "Spatula clypeata" # Northern Shoveler
species_lut[species_lut$lookupname == "Winterkoning", "scientificname"] <- "Troglodytes troglodytes" # Wren
species_lut[species_lut$lookupname == "Grote Jager", "scientificname"] <- "Catharacta skua" # Great Skua
species_lut[species_lut$lookupname == "Roodborsttapuit", "scientificname"] <- "Saxicola torquatus" # Stonechat
species_lut[species_lut$lookupname == "Strandleeuwerik", "scientificname"] <- "Eremophila alpestris" # Horned Lark
```
## Adding vernacular family names
Fetching vernacular names of scientific families from GBIF is even more convoluted than the name lookup we have done above, so instead we have manually derived a list of vernacular names for families based on the generated species lookup table.
```{r load_vernacular_family_names}
read_delim("data/raw/sovon/familyvernacular_lut.csv", delim = ";", col_types = cols(lookupname = col_character(), familyvernacular = col_character())) %>%
right_join(species_lut, by = "lookupname") -> species_lut
```
## Linking life-history traits to species
We use the *Life-history characteristics of European birds*-dataset [@storchova_2018] to calculate the mean mass of all birds in a PPI pixel. This dataset is stored on Dryad and we can [download](https://doi.org/10.5061/dryad.n6k3n) it there. Unfortunately at the time of writing the `rdryad` package is severely out-of-date with the new Dryad API, so we cannot nicely automate this download yet. Anyways, the files should be downloaded manually and added to `data/raw/life-history-characteristics/`.
We calculate the mean of the mass of both sexed and unsexed birds and assume that represents the mean mass for a species.
```{r prepare_life_history_characteristics, results='hold'}
read_tsv("data/raw/life-history-characteristics/Life-history characteristics of European birds.txt",
col_types = cols_only('Species' = col_character(), 'WeightU_MEAN' = col_double(), 'WeightM_MEAN' = col_double(), 'WeightF_MEAN' = col_double())) %>%
rowwise %>%
mutate(mean_weight = mean(c(WeightU_MEAN, WeightF_MEAN, WeightM_MEAN))) %>%
dplyr::select(Species, mean_weight) %>%
rename(species = Species) %>%
filter(!any(str_ends(species, "ssp"))) %>% # Filter out birds not identified to species
rowwise() %>%
mutate(species = paste(str_split(species, pattern = " ")[[1]][1:2], collapse = " ")) %>%
ungroup() %>%
group_by(species) %>%
summarise(mean_weight = mean(mean_weight), .groups = "drop_last",
mean_crs = mean_weight ^ (2/3)) %>%
drop_na() -> lhc
lhc[lhc$species == "Aquila nipalenis", "species"] <- "Aquila nipalensis" # Small error in dataset -> notified author
head(lhc, 10) %>%
kable(format = "html", col.names = colnames(lhc)) %>%
kable_styling() %>%
kableExtra::scroll_box(width = "100%", height = "400px")
```
Now we can try to link the names once again with what can be found in GBIF.
```{r link_life_history_characteristics_with_gbif}
unique_species_lhc <- unique(lhc$species)
lhc_species_lut <- build_species_lut(unique_species_lhc, dataset = NULL)
head(lhc_species_lut, 10) %>%
kable(format = "html", col.names = colnames(lhc_species_lut)) %>%
kable_styling() %>%
kableExtra::scroll_box(width = "100%", height = "400px")
```
With the GBIF IDs/keys in place for both the life-history characteristics, as well as the Sovon counts, we can now link the datasets together. We start with the waterbirds.
```{r link_sovon_counts_with_life_history_characteristics_wb_counts}
lhc %>%
left_join(lhc_species_lut, by = c("species" = "scientificname")) %>%
dplyr::select(species, mean_weight, mean_crs, gbif_key) -> lhc
wb_data %>%
left_join(species_lut, by = c("species" = "lookupname")) %>%
left_join(dplyr::select(lhc, mean_weight, mean_crs, gbif_key), by = c("gbif_key" = "gbif_key")) %>%
left_join(dplyr::select(lhc, mean_weight, mean_crs, species), by = c("scientificname" = "species")) %>%
dplyr::select(-c(mean_weight.x, mean_crs.x)) %>%
rename(mean_weight = mean_weight.y, mean_crs = mean_crs.y) -> wb_data_lhc
head(wb_data_lhc, 10) %>%
kable(format = "html", col.names = colnames(wb_data_lhc)) %>%
kable_styling() %>%
kableExtra::scroll_box(width = "100%", height = "400px")
```
And then the PTT counts.
```{r link_sovon_counts_with_life_history_characteristics_ptt_counts}
ptt_data %>%
left_join(species_lut, by = c("species" = "lookupname")) %>%
left_join(dplyr::select(lhc, mean_weight, mean_crs, gbif_key), by = c("gbif_key" = "gbif_key")) %>%
left_join(dplyr::select(lhc, mean_weight, mean_crs, species), by = c("scientificname" = "species")) %>%
dplyr::select(-c(mean_weight.x, mean_crs.x)) %>%
rename(mean_weight = mean_weight.y, mean_crs = mean_crs.y) -> ptt_data_lhc
head(ptt_data_lhc, 10) %>%
kable(format = "html", col.names = colnames(ptt_data_lhc)) %>%
kable_styling() %>%
kableExtra::scroll_box(width = "100%", height = "400px")
```
Now we can verify if our *1% criteria* (see above) for species matching is met. We do this by calculating the proportions of birds belonging to a certain species out of the total numbers of birds counted within a count. This should result in an empty dataframe, which will stop the code chunk below from running if that is *not* the case.
```{r verify_one_percent_criteria_wb_counts}
wb_data_lhc %>%
as.data.frame() %>%
group_by(count_id) %>%
mutate(total_birds_count = sum(number)) %>%
group_by(count_id, species) %>%
mutate(proportion_species = sum(number) / total_birds_count) %>%
ungroup() %>%
arrange(count_id) %>%
filter((is.na(mean_weight) & (proportion_species > 0.01))) -> wb_data_lhc_verify
stopifnot(nrow(wb_data_lhc_verify) == 0)
rm(wb_data_lhc_verify)
```
And once again, we can do the same for the PTT counts.
```{r verify_one_percent_criteria_ptt_counts}
ptt_data_lhc %>%
as.data.frame() %>%
group_by(count_id) %>%
mutate(total_birds_count = sum(number)) %>%
group_by(count_id, species) %>%
mutate(proportion_species = sum(number) / total_birds_count) %>%
ungroup() %>%
arrange(count_id) %>%
filter((is.na(mean_weight) & (proportion_species > 0.01))) -> ptt_data_lhc_verify
stopifnot(nrow(ptt_data_lhc_verify) == 0)
rm(ptt_data_lhc_verify)
```
With that out of the way we can *finally* remove the remaining empty rows and save the PTT and waterbird counts in their final form.
```{r remove_empty_rows}
ptt_data_lhc %>%
drop_na() -> ptt_data
wb_data_lhc %>%
drop_na() -> wb_data
```
And we save the data for further use.
```{r save_processed_sovon_data}
saveRDS(ptt_data, file = "data/processed/sovon/ptt.RDS")
saveRDS(wb_data, file = "data/processed/sovon/wb.RDS")
```
## Calculating total bird biomass
While we are at it, we will already calculate the total biomass contained within each of the count areas, so we can then add these values to the PPIs through the IDs of the count areas/routes. We use the waterbird count from January 2018, and the point-transect counts from 2017, as these are the best in terms of coverage.
```{r calculate_total_biomass}
wb_year <- 2018
ptt_year <- 2017
wb_data %>%
mutate(area_nr = as.character(area_nr)) %>%
filter(year == wb_year) %>%
rowwise() %>%
mutate(specific_biomass = number * mean_weight,
specific_crs = number * mean_crs) %>%
ungroup() %>%
group_by(area_nr, year) %>%
summarise(total_biomass = sum(specific_biomass),
total_crs = sum(specific_crs),
weighted_mean_weight = weighted.mean(mean_weight, number),
weighted_mean_crs = weighted.mean(mean_crs, number),
total_birds = sum(number),
.groups = "drop_last") %>%
ungroup() %>%
group_by(area_nr) %>% # Below is only required if year-filter is NOT set, otherwise does nothing
summarise(total_biomass = mean(total_biomass),
total_crs = mean(total_crs),
weighted_mean_weight = mean(weighted_mean_weight),
weighted_mean_crs = mean(weighted_mean_crs),
total_birds = mean(total_birds),
.groups = "drop_last") %>%
identity() -> wb_biomass
ptt_data %>%
filter(year == ptt_year) %>%
rowwise() %>%
mutate(specific_biomass = number * mean_weight,
specific_crs = number * mean_crs) %>%
ungroup() %>%
group_by(route, year) %>%
summarise(total_biomass = sum(specific_biomass),
total_crs = sum(specific_crs),
weighted_mean_weight = weighted.mean(mean_weight, number),
weighted_mean_crs = weighted.mean(mean_crs, number),
total_birds = sum(number),
.groups = "drop_last") %>%
ungroup() %>%
group_by(route) %>% # Below is only required if year-filter is NOT set, otherwise does nothing
summarise(total_biomass = mean(total_biomass),
total_crs = mean(total_crs),
weighted_mean_weight = mean(weighted_mean_weight),
weighted_mean_crs = mean(weighted_mean_crs),
total_birds = mean(total_birds),
.groups = "drop_last") %>%
identity() -> ptt_biomass
```
And now we will save these datasets of `total_biomass` as well.
```{r save_total_biomass}
saveRDS(wb_biomass, file = "data/processed/sovon/wb_biomass.RDS")
saveRDS(ptt_biomass, file = "data/processed/sovon/ptt_biomass.RDS")
```
## Calculating proportions across bird families
Now that we have identified which families birds belong to, we can calculate the proportion of birds that belong to these families across all the PTT and waterbird counts. This can give an idea of the spatial composition of bird communities to illustrate analysis results with.
```{r calculate_proportions_per_family, results='hold'}
wb <- wb_data
ptt <- ptt_data
families <- unique(species_lut$familyvernacular)
wb %>%
mutate(area_nr = as.character(area_nr)) %>%
filter(year == wb_year) %>%
group_by(area_nr, year) %>%
mutate(total_birds = sum(number)) %>%
ungroup() %>%
group_by(area_nr, year, familyvernacular) %>%
mutate(familyprop = sum(number) / total_birds) %>%
ungroup() %>%
dplyr::select(area_nr, familyvernacular, familyprop, total_birds) %>%
distinct() %>%
arrange(area_nr) %>%
pivot_wider(names_from = familyvernacular, values_from = familyprop) %>%
replace(is.na(.), 0) %>%
identity() -> wb_props
ptt %>%
filter(year == ptt_year) %>%
group_by(route, year) %>%
mutate(total_birds = sum(number)) %>%
ungroup() %>%
group_by(route, year, familyvernacular) %>%
mutate(familyprop = sum(number) / total_birds) %>%
ungroup() %>%
dplyr::select(route, familyvernacular, familyprop, total_birds) %>%
distinct() %>%
arrange(route) %>%
pivot_wider(names_from = familyvernacular, values_from = familyprop) %>%
replace(is.na(.), 0) %>%
identity() -> ptt_props
```
And now we will save these datasets of family proportions as well.
```{r save_family_proportions}
saveRDS(wb_props, file = "data/processed/sovon/wb_props.RDS")
saveRDS(ptt_props, file = "data/processed/sovon/ptt_props.RDS")
```
## Export lookup table characteristic groups
We store a lookup table of characteristic groups and scientific names as .csv for a WebTable in the paper.
```{r}
ptt %>%
dplyr::select(species, scientificname, familyvernacular, mean_weight, mean_rcs = mean_crs) %>%
distinct(scientificname, .keep_all = TRUE) %>%
group_by(familyvernacular) %>%
mutate(family_mean_weight = mean(mean_weight)) %>%
ungroup() %>%
arrange(desc(family_mean_weight), desc(mean_weight)) %>%
write.csv("data/processed/sovon/lut_species_family.csv")
```
## Summary statistics for WebPanel
Number of PTT counts included
```{r number_of_ptt_counts}
ptt %>%
distinct(count_id) %>%
nrow()
```
Number of waterbird counts included
```{r number_of_waterbird_counts}
wb %>%
distinct(area_nr) %>%
nrow()
```