Skip to content

Commit

Permalink
Merge pull request #193 from birdflow-science/189-outliers
Browse files Browse the repository at this point in the history
Add trim_quantile argument to preprocess_species()
  • Loading branch information
ethanplunkett committed Jun 14, 2024
2 parents 5f1d670 + 4acfda8 commit 80b4a25
Show file tree
Hide file tree
Showing 8 changed files with 192 additions and 9 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: BirdFlowR
Title: Predict and Visualize Bird Movement
Version: 0.1.0.9064
Version: 0.1.0.9065
Authors@R:
c(person("Ethan", "Plunkett", email = "plunkett@umass.edu", role = c("aut", "cre"),
comment = c(ORCID = "0000-0003-4405-2251")),
Expand Down
13 changes: 13 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,16 @@
# BirdFlowR 0.1.0.9065
2024-06-14

Add `trim_quantile` argument to `preprocess_species()` and `process_rasters()`
to allow trimming values above a quantile in the values to the quantile's value.
E.g. with `trim_quantile = 0.99` any value above the 99th percentile will be
trimmed to value of the 99th percentile.

The motivation is to trim high outliers when they are believed to be artifacts
of the S&T models and eBird data. Robins are a good example.

Closes #198 "Outliers".

# BirdFlowR 0.1.0.9064
2024-06-14

Expand Down
29 changes: 27 additions & 2 deletions R/preprocess_species.R
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,17 @@
#' quality using based on the four `<season>_model_quality` columns in
#' [ebirdst_runs][ebirdst::ebirdst_runs] ignored with 2021 \pkg{ebirdst}
#' version year.
#' @param trim_quantile With the default of `NULL` there is no outlier trimming,
#' otherwise a single value between 0 and 1 to indicate the quantile
#' to truncate at or a series of 52 such values corresponding with each week.
#' Trimming outliers is always done week by week with the values above the
#' `trim_quantile` quantile set to the value
#' of that quantile. Reasonable non `NULL` values will be close to 1 e.g. 0.99,
#' 0.995, 0.999.
#' Set `trim_quantile` to eliminate high outliers that are believed
#' to be model artifacts.
#' See [Issue #189](https://github.com/birdflow-science/BirdFlowR/issues/189)
#' for detailed justification.
#' @inheritDotParams lookup_timestep_sequence -x
#'
#' @return Returns a BirdFlow model object that lacks marginals, but is
Expand Down Expand Up @@ -113,6 +124,7 @@ preprocess_species <- function(species = NULL,
gpu_ram = 12,
skip_quality_checks = FALSE,
min_season_quality = 3,
trim_quantile = NULL,
...

) {
Expand Down Expand Up @@ -144,6 +156,18 @@ preprocess_species <- function(species = NULL,
if (is.na(species))
stop("species cannot be NA")

if (!is.null(trim_quantile)) {
stopifnot(is.numeric(trim_quantile),
!anyNA(trim_quantile),
all(trim_quantile > 0),
all(trim_quantile <= 1),
length(trim_quantile) == 1 || length(trim_quantile) == 52)

if (length(trim_quantile) == 1)
trim_quantile <- rep(trim_quantile, 52)

}

stopifnot(is.logical(hdf5),
length(hdf5) == 1)

Expand Down Expand Up @@ -388,7 +412,8 @@ preprocess_species <- function(species = NULL,
sp_path = sp_path,
clip = clip,
project_method = project_method,
download_patterns = download_patterns)
download_patterns = download_patterns,
trim_quantile = trim_quantile)
mask <- a$mask

# Save distribution and confidence intervals to export object
Expand Down Expand Up @@ -451,7 +476,7 @@ preprocess_species <- function(species = NULL,
bf_msg(message)

#----------------------------------------------------------------------------#
# Truncate
# Truncate (dates)
#----------------------------------------------------------------------------#
truncated <- !all(seq_len(n_timesteps(export)) %in%
lookup_timestep_sequence(x = export, ...))
Expand Down
34 changes: 32 additions & 2 deletions R/process_rasters.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
#' species range.
#' @param project_method Method to use when reprojecting. Set locally by code
#' within `preprocess_species()`
#'
#' @inheritParams preprocess_species
#' @return A list with
#' \item{distr, uci, lci}{The are the species distribution, and upper and lower
#' confidence intervals on that distribution in their flattened form. Each
Expand All @@ -28,7 +28,8 @@ process_rasters <- function(res,
sp_path,
clip,
project_method,
download_patterns) {
download_patterns,
trim_quantile = NULL) {


res_m <- res * 1000
Expand Down Expand Up @@ -235,6 +236,35 @@ process_rasters <- function(res,
lci <- terra::values(abunds_lci_low_res)[m, , drop = FALSE]
lci[is.na(lci)] <- 0

#----------------------------------------------------------------------------#
# Trim outliers
#----------------------------------------------------------------------------#
if (!is.null(trim_quantile)) {

nt <- ncol(distr)

stopifnot(is.numeric(trim_quantile),
!anyNA(trim_quantile),
all(trim_quantile > 0),
all(trim_quantile <= 1),
length(trim_quantile) == 1 || length(trim_quantile) == nt)

if (length(trim_quantile) == 1)
trim_quantile <- rep(trim_quantile, nt)

truncate_and_renormalize <- function(x, p = 0.95) {
q <- quantile(x[!x == 0], p = p)
x[x > q] <- q
x <- x / sum(x, na.rm = TRUE)
x
}

for (i in seq_len(ncol(distr))) {
distr[, i] <- truncate_and_renormalize(distr[, i], p = trim_quantile[i])
}
} # end trim outliers


return(list(distr = distr,
uci = uci,
lci = lci,
Expand Down
3 changes: 3 additions & 0 deletions inst/WORDLIST
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
th
bbdfc
al
Albers
Expand Down Expand Up @@ -107,6 +108,8 @@ preprocessed
probabilistically
programmatically
proj
quantile
quantile's
ragg
rasterize
rasters
Expand Down
13 changes: 13 additions & 0 deletions man/preprocess_species.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 19 additions & 1 deletion man/process_rasters.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

87 changes: 84 additions & 3 deletions tests/testthat/test-preprocess_species.R
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ test_that("preprocess_species() works with clip", {
xmax <- 1550000
ymax <- 1300000
poly <- matrix(c(xmin, xmin, xmax, xmax, xmin,
ymin, ymax, ymax, ymin, ymin), ncol = 2) |>
ymin, ymax, ymax, ymin, ymin), ncol = 2) |>
list() |>
sf::st_polygon()

Expand Down Expand Up @@ -179,8 +179,8 @@ test_that("preprocess_species() works with clip", {
clip = clip,
season = "prebreeding",
gpu_ram = 2
)
)
)


# Test that expect file was created
Expand Down Expand Up @@ -214,6 +214,87 @@ test_that("preprocess_species() works with crs arg", {
})


test_that("preprocess_species() works with trim_quantile", {
skip_on_cran()
skip_on_ci()
local_quiet()

bf <- preprocess_species(species = "example_data",
res = 200,
hdf5 = FALSE,
trim_quantile = NULL)


expect_no_error(
bf_trim <- preprocess_species(species = "example_data",
res = 200,
hdf5 = FALSE,
trim_quantile = 0.99)
)




# Expect highest value to be lower in trimmed dataset
expect_true((dt < d)[which.max(d)])

# Expect lowest non-zero value to be higher in trimmed dataset
sv <- d != 0
expect_true((dt[sv] > d[sv])[which.min(d[sv])])

if (FALSE) {
# Code to visualize differences
# With the test dataset there's almost nothing.
d <- get_distr(bf, 10)
dt <- get_distr(bf_trim, 10)
col <- rep(NA, length(d))
col[d < dt] <- "Blue"
col[d == dt] <- "Black"
col[d > dt] <- "Red"
plot(d, dt, col = col)
abline(a = 0, b = 1)
cbind(d, dt)
get_distr(bf, c(1, 10, 20, 40)) |> plot_distr(bf)
get_distr(bf_trim, c(1, 10, 20, 40)) |> plot_distr(bf_trim)
}

skip("Slow test of quatile trimming on Robins - always skipped.")

# This test requires a valid ebirdst key as well

# The robin distribution is super concentrated in the first timestep
# and was the motivation for adding quantile based trimming.
# This test is mostly a visual confirmation that it's working
# as expected with that species.

bf <- preprocess_species("amerob", hdf5 = FALSE, res = 200)
bft <- preprocess_species("amerob", hdf5 = FALSE, res = 200,
trim_quantile = c(0.994, rep(1, 51)))

get_distr(bf, c(1, 10)) |> plot_distr(bf)
get_distr(bft, c(1, 10)) |> plot_distr(bft)

# Only first distribution was trimmed
expect_equal(get_distr(bf, 2), get_distr(bft, 2))


# Expect highest value to be lower in trimmed dataset
expect_true((dt < d)[which.max(d)])

# Expect lowest non-zero value to be higher in trimmed dataset
sv <- d != 0
expect_true((dt[sv] > d[sv])[which.min(d[sv])])

# Visualize outlier truncation
if (FALSE) {
d <- get_distr(bf, 1)
dt <- get_distr(bft, 1)
plot(d, dt)
abline(0, 1)
}

})

test_that("preprocess_species() works with clip and crs", {

# This was a problem that manifested while running lots of species with a
Expand All @@ -228,7 +309,7 @@ test_that("preprocess_species() works with clip and crs", {
# Note this test is slow and requires an active ebirdst access key
# See ebirdst::set_ebirdst_access_key()

skip("Slow test to documents an old bug. Always skipped.")
skip("Slow test to document an old bug. Always skipped.")

species <- "banswa"

Expand Down

0 comments on commit 80b4a25

Please sign in to comment.