Skip to content

Commit

Permalink
Merge pull request #157 from birdflow-science/align
Browse files Browse the repository at this point in the history
Add snap_to_birdflow() and lookup_date()
  • Loading branch information
ethanplunkett committed Dec 22, 2023
2 parents 0b1020b + 48e0606 commit 9f1a0d8
Show file tree
Hide file tree
Showing 10 changed files with 572 additions and 6 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: BirdFlowR
Title: Predict and Visualize Bird Movement
Version: 0.1.0.9045
Version: 0.1.0.9046
Authors@R:
c(person("Ethan", "Plunkett", email = "plunkett@umass.edu", role = c("aut", "cre"),
comment = c(ORCID = "0000-0003-4405-2251")),
Expand Down
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ export(is_location_valid)
export(latlon_to_xy)
export(load_collection_index)
export(load_model)
export(lookup_date)
export(lookup_season_timesteps)
export(lookup_timestep)
export(lookup_timestep_sequence)
Expand All @@ -79,6 +80,7 @@ export(route)
export(route_migration)
export(row_to_y)
export(sample_distr)
export(snap_to_birdflow)
export(sparsify)
export(species)
export(species_info)
Expand Down Expand Up @@ -116,6 +118,7 @@ importFrom(sf,st_as_sf)
importFrom(sf,st_bbox)
importFrom(sf,st_crs)
importFrom(stats,cor)
importFrom(stats,median)
importFrom(stats,predict)
importFrom(terra,crs)
importFrom(terra,ext)
Expand Down
17 changes: 17 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,20 @@
# BirdFlowR 0.1.0.9046
2023-12-21

New functions:

* `snap_to_birdflow()` snap movement data to a BirdFlow model
week and cells and optionally aggregate the data from each week
(by unique bird or track).

* `lookup_date()` convert timesteps to dates.

Changes

* `lookup_timestep()` gains a new argument `allow_failure`. FALSE, the default,
  mimics prior behavior. Set to TRUE to return NA if the date cannot be
  looked up.

# BirdFlowR 0.1.0.9045
2023-12-18

Expand Down
18 changes: 18 additions & 0 deletions R/lookup_date.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@

#' Retrieve date associated with a timestep
#'
#' @param timestep Integer between 1 and `n_timesteps(bf)`
#' @param bf A BirdFlow object
#'
#' @return A Date object
#' @seealso [get_dates()], [lookup_timestep()], [lookup_timestep_sequence()]
#' @export
#'
#'
#' @examples
#' bf <- BirdFlowModels::amewoo
#' lookup_date(1:5, bf)
lookup_date <- function(timestep, bf){
dates <- get_dates(bf)
lubridate::as_date(dates$date[match(timestep, dates$timestep)])
}
20 changes: 16 additions & 4 deletions R/lookup_timestep.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,17 @@
#' a numeric timestep, a character representing a timestep e.g. "t1", or "all"
#' for all timesteps in the model.
#' @param bf A BirdFlow object.
#' @param allow_failure If TRUE function will return NA values when it fails
#' to resolve a timestep for any element of `x`. With the default, FALSE,
#' the function will throw an error if not all elements of `x` are resolved
#' to timesteps.
#' @return A vector of timesteps corresponding to elements in `x`.
#' @export
#' @examples
#' bf <- BirdFlowModels::amewoo
#' lookup_timestep(c("2001-3-23", "2022-12-05"), bf)
#'
lookup_timestep <- function(x, bf) {
lookup_timestep <- function(x, bf, allow_failure = FALSE) {
stopifnot(inherits(bf, "BirdFlow"))
dates <- get_dates(bf) # standardize to current date format
original_x <- x
Expand All @@ -54,15 +58,23 @@ lookup_timestep <- function(x, bf) {
x <- dates$timestep[match(week, dates$week)]
}

# Eliminate out of range values
x[!x %in% dates$timestep] <- NA

# Check that x is numeric (should always be TRUE by now)
if(!is.numeric(x))
stop("Date lookup failed. Likely the class of x was unexpected.")

# Force integers to numeric
x <- as.numeric(x)

# Validate
if (!is.numeric(x) || any(is.na(x)) || !all(x %in% dates$timestep)) {
if ( any(is.na(x)) && !allow_failure) {
xtext <- ifelse(length(original_x) > 3,
paste0(paste(original_x[1:3], collapse = ", "),
", ..."),
paste0(original_x, collapse = ", "))
stop("Date lookup failed for x = ", xtext)
}
if (is.integer(x))
x <- as.numeric(x)
return(x)
}
268 changes: 268 additions & 0 deletions R/snap_to_birdflow.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,268 @@
#' Assign cells and timesteps to coordinates and dates
#'
#' This function aligns bird movement data that has both location and date
#' information with a BirdFlow model's cells and timesteps.
#' Optionally, it will aggregate multiple points from a track within the same
#' week into a single point and date prior to determining the location ID.
#'
#' If `aggregate` is NULL then each row in the output will correspond with
#' the same row in the input. With aggregation rows can be lost in two ways
#' (1) if the coordinates or timestep can't be resolved (first three errors
#' on list) then the row is dropped prior to aggregation.
#' (2) via aggregation all the rows within a week for a track will be collapsed
#' to one. These aggregated locations can still trigger the other three
#' errors on the list.
#'
#' A location id will only be assigned if the location is a valid location for
#' the model on the associated date.
#'
#' @param d A data frame with bird movement data it must have the columns
#' specified by `x_col`, `y_col`, `date_col`, and `id_cols`. The default
#' values for the first three are `"lon"`, `"lat"`, and `"date"`.
#' @param bf A BirdFlow model
#' @param x_col The name of the column that holds x or longitude coordinates.
#' Default is `"lon"`.
#' @param y_col The name of the y or latitude column, default is `"lat"`.
#' @param date_col Name of the date column, defaults to `"date"`
#' @param id_cols The name of identification columns. One or more columns that
#' collectively define a unique track. If `aggregate` is not `NULL` these along
#' with `timestep` (calculated from the `date_col`) are used to define the
#' groups. If aggregate is `NULL` these columns are not used but are retained
#' in the output.
#' @param crs The coordinate reference system used by `x_col` and
#' `y_col`. The default `"EPSG:4326"` corresponds to
#' [WGS 1984](https://epsg.io/4326)
#' @param aggregate Leave the default `NULL` for no aggregation - each row in
#' `d` is processed separately and represented in the returned object.
#' If `aggregate` is set to one of the values below then the locations (rows)
#' in `d` that fall in the same week and track (see `id_cols`) will be
#' aggregated together such that there is only one location per week for each
#' track.
#'
#' The possible values for `aggregate` are:
#' * `mean` The mean of the "x", "y", and "date" columns are used for the week.
#' * `median` The median of the "x", "y", and "date" columns are used for the
#'   week.
#' * `midweek` The observation that is closest to the middle of the week is
#'   used to represent the week. With ties the observation that occurs first
#'   is used.
#'
#' Pending ideas, not yet implemented:
#' * `gmedian` [geometric median](https://cran.r-project.org/web/packages/Gmedian/index.html)
#' * `central` The point closest to the centroid of all the points is used to
#'   represent the week.
#' @return A data frame with columns
#' \item{`<id_cols>`}{The columns identified with `id_cols` will be retained
#'   and be leftmost.}
#' \item{date}{This column will have information from `date_col` but not retain
#'   its name or original formatting. If aggregate is NULL the input dates
#'   will be retained, if not the date will vary with the aggregation method
#'   but will represent the dates that went into the summary and not the
#'   mid-week date associated with `timestep`.
#'   For example with `aggregate = "mean"` the date will be the average date of
#'   the points in the week.}
#' \item{timestep}{The model timestep associated with `date`.}
#' \item{x, y}{The x and y coordinates of the point. These will always be in
#'   `crs(bf)` and will represent the original or aggregated location and
#'   not the cell center.}
#' \item{i}{The location index of the cell associated with the `x` and `y` in
#'   `bf`. See [i_to_xy()].}
#' \item{n}{The number of rows in `d` that contributed to each output row.
#'   If aggregate is NULL every value will be 1. }
#' \item{error}{TRUE if there was an error.}
#' \item{message}{NA or the error message. The possible messages are:
#' * "err_date" - The date could not be parsed with [lubridate::as_date()]
#' * "err_truncated" - `bf` is [truncated](truncate_birdflow) and the date
#'   falls outside of the portion of the year the model covers.
#' * "err_coords" - The coordinates could not be reprojected into `crs(bf)`
#'   and thus likely are corrupt in some way.
#' * "err_not_active" - the location does not fall within an active cell
#'   as defined by the static mask.
#' * "err_dynamic" - the location does not fall within the dynamic mask on
#'   the associated date.
#' * "err_sparse" - the location falls within the dynamic mask but that
#'   location and date combination has been eliminated by
#'   [sparsification](sparsify).
#'
#' The function will always return the error message that appears first on
#' this list, even though in some cases multiple errors can be triggered.
#'
#' With aggregation the first three errors prevent a row from being used and
#' it will be dropped prior to aggregation with a warning.
#' }
#' @export
#' @importFrom stats median
#' @examples
#' bf <- BirdFlowModels::rewbla |> add_dynamic_mask()
#' obs <- BirdFlowModels::rewbla_observations
#' a <- snap_to_birdflow(obs, bf, id_cols = "bird_id")
#'
snap_to_birdflow <- function(d, bf,
                             x_col = "lon", y_col = "lat",
                             date_col = "date",
                             id_cols = "id",
                             crs = "EPSG:4326",
                             aggregate = NULL) {


  # Check that all the required columns exist in d
  for (arg in c("x_col", "y_col", "date_col", "id_cols")) {
    value <- get(arg)
    if (!all(value %in% names(d))) {
      stop("\"", paste(setdiff(value, names(d)), collapse = "\", \""),
           "\" is not a column in d")
    }
  }

  stopifnot(is.numeric(d[[x_col]]),
            is.numeric(d[[y_col]]))

  # Helper - make an error table with n rows, all flags initially FALSE
  make_error_table <- function(n) {
    f <- rep(FALSE, n)
    data.frame(error = f,
               err_date = f,
               err_truncated = f,
               err_coords = f,
               err_not_active = f,
               err_dynamic = f,
               err_sparse = f)
  }

  # Helper - propagate an error in any "err_..." column to "error" column
  update_errors <- function(e) {
    e$error <- apply(e[, !names(e) == "error", drop = FALSE], 1, any)
    e
  }


  errors <- make_error_table(nrow(d))

  # Process dates into timesteps; record failures rather than stopping
  dates <- suppressWarnings(lubridate::as_date(d[[date_col]]))
  errors$err_date[is.na(dates)] <- TRUE
  errors <- update_errors(errors)
  d$timestep <- lookup_timestep(dates, bf, allow_failure = TRUE)
  # A parseable date that still has no timestep falls outside a truncated model
  errors$err_truncated <- !errors$err_date & is.na(d$timestep)
  errors <- update_errors(errors)

  # Convert to sf
  d_sf <- sf::st_as_sf(d, coords = c(x_col, y_col))
  sf::st_crs(d_sf) <- sf::st_crs(crs)

  # Transform to bf crs
  d_t <- sf::st_transform(d_sf, sf::st_crs(crs(bf)))
  coords <- sf::st_coordinates(d_t)
  colnames(coords) <- tolower(colnames(coords))
  stopifnot(colnames(coords) == c("x", "y"))
  errors$err_coords <- apply(coords, 1, anyNA)
  errors <- update_errors(errors)

  # Add transformed coordinates as "x" and "y" columns; rename any
  # pre-existing x or y columns so they aren't clobbered
  names(d)[names(d) == "x"] <- "original_x"
  names(d)[names(d) == "y"] <- "original_y"
  d <- cbind(d, coords)
  rm(coords)


  if (is.null(aggregate)) {
    # Number of observations is 1 if no aggregation
    d$n <- 1
    # Standardized "date" output column (parsed; original formatting and
    # column name are not retained - see @return)
    d$date <- dates
  } else { # aggregation

    #### Aggregate ####

    # Note: since it's unclear what to do with rows that have unresolved
    # dates or locations when we aggregate, drop them here with a warning.
    keep <- !errors$error
    if (!all(keep)) {
      warning(sum(!keep), " input rows have dates or coordinates that can't ",
              "be resolved and will be dropped prior to aggregation.")
      d <- d[keep, , drop = FALSE]
      errors <- errors[keep, , drop = FALSE]
    }

    # Standardized "date" column used both for aggregation and in the output
    d$date <- lubridate::as_date(d[[date_col]])

    # One group per track (id_cols) and model week (timestep)
    d <- dplyr::group_by(d, dplyr::pick({{id_cols}}), .data$timestep)

    d <- switch(
      aggregate,
      "mean" = {
        dplyr::summarize(d,
                         x = mean(.data$x),
                         y = mean(.data$y),
                         date = mean(.data$date),
                         n = dplyr::n())
      },
      "median" = {
        dplyr::summarize(d,
                         x = median(.data$x),
                         y = median(.data$y),
                         date = median(.data$date),
                         n = dplyr::n())
      },
      "midweek" = {
        # Mid-week date for each observation's timestep
        d$mid <- lookup_date(d$timestep, bf)

        # find which element in x is closest to y (ties -> first)
        closest <- function(x, y) {
          which.min(abs(x - y))
        }

        d <- dplyr::summarize(
          d,
          x = .data$x[closest(.data$date, .data$mid)],
          y = .data$y[closest(.data$date, .data$mid)],
          date = .data$date[closest(.data$date, .data$mid)],
          n = dplyr::n())
        d$mid <- NULL
        d
      },

      # If none of the above match
      stop("aggregate should be \"mean\", \"median\", or \"midweek\"")
    )

    d <- as.data.frame(d)

    # Make new error table for remaining rows
    # We dropped all the errors before aggregation so there are none (yet)
    errors <- make_error_table(nrow(d))

  }

  # resolve i (location index); NA means outside the static (active) mask
  d$i <- xy_to_i(d[, c("x", "y"), drop = FALSE], bf = bf)
  errors$err_not_active <- is.na(d$i)
  errors <- update_errors(errors)

  # Check against dynamic mask
  dm <- get_dynamic_mask(bf)
  no_err <- !errors$error # only evaluate where there aren't other problems
  errors$err_dynamic[no_err] <- !dm[cbind(d$i[no_err], d$timestep[no_err])]
  errors <- update_errors(errors)

  # Check against sparsification
  # sm is the sparsified mask, locations where the dynamic mask is 1
  # but the marginal derived distribution is 0
  sm <- (get_distr(bf, from_marginals = TRUE) > 0) & dm
  no_err <- !errors$error # only evaluate where there aren't other problems
  errors$err_sparse[no_err] <- !sm[cbind(d$i[no_err], d$timestep[no_err])]
  errors <- update_errors(errors)

  # Add error flag and first triggered error message (NA if none) to d
  d$error <- errors$error
  errors$error <- NULL
  d$message <- names(errors)[apply(errors, 1, function(x) which(x)[1])]

  # Standardize output columns and their order
  expected_cols <- c(id_cols, "date", "timestep", "x", "y", "i", "n",
                     "error", "message")


  stopifnot(all(expected_cols %in% names(d)))

  return(d[, expected_cols, drop = FALSE])
}
Loading

0 comments on commit 9f1a0d8

Please sign in to comment.