From 61fcdfa3325cd8cb09e22dfb9c2ba6700a1aae1b Mon Sep 17 00:00:00 2001 From: Ethan Plunkett Date: Fri, 12 Apr 2024 19:01:51 +0000 Subject: [PATCH] Update calc_flux() and plot_flux() Use sparse array in calc_flux() to reduce memory requirments. Improve labels in plot_flux(). --- DESCRIPTION | 6 +- NEWS.md | 20 +++-- R/animate_distr.R | 2 +- R/animate_flux.R | 2 +- R/calc_flux.R | 49 +++++++++-- R/is_between.R | 81 +++++++++++++------ R/plot_flux.R | 26 +++++- man/animate_distr.Rd | 2 +- man/calc_flux.Rd | 28 +++++-- man/is_between.Rd | 2 +- ...elper-skip_if_wrong_ebirdst_for_snapshot.R | 2 +- tests/testthat/test-calc_flux.R | 11 ++- .../header-attrs-2.25/header-attrs.js | 12 +++ 13 files changed, 187 insertions(+), 56 deletions(-) create mode 100644 vignettes/BirdFlowOverview_files/header-attrs-2.25/header-attrs.js diff --git a/DESCRIPTION b/DESCRIPTION index be127d9d..774b6d59 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,10 +1,9 @@ Package: BirdFlowR Title: Predict and Visualize Bird Movement -Version: 0.1.0.9055 +Version: 0.1.0.9056 Authors@R: c(person("Ethan", "Plunkett", email = "plunkett@umass.edu", role = c("aut", "cre"), - comment = c(ORCID = "0000-0003-4405-2251")), - person("Daniel", "Sheldon", role = c("aut")) ) + comment = c(ORCID = "0000-0003-4405-2251"))) Description: BirdFlowR predicts bird movement and distribution from previously fitted models. License: MIT + file LICENSE Encoding: UTF-8 @@ -17,6 +16,7 @@ Suggests: knitr, ragg, rmarkdown, + SparseArray, testthat (>= 3.0.0), withr Config/testthat/edition: 3 diff --git a/NEWS.md b/NEWS.md index d8edfd9b..bb4f792a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,13 @@ +# BirdFlowR 0.1.0.9056 +2024-04-04 +## Flux III +* `is_between()` now uses a `SparseArray::SparseArray()` for the logical array. + to reduce memory usage +* Labeling updates to `plot_flux()` and `animate_flux()`. + + # BirdFlowR 0.1.0.9055 -2023-03-29 +2024-03-29 ## Flux II * Flux values are now divided by the radius to standardize units at at P/km/week/ where P is the proportion of the population. @@ -53,7 +61,7 @@ calculate and work with non-directional flux / net movement. # BirdFlowR 0.1.0.9053 -2023-03-15 +2024-03-15 * New `shrink_birdlfow()` reverses `extend_birdflow()`, returning the model to it's original extent. @@ -62,14 +70,14 @@ calculate and work with non-directional flux / net movement. the original. # BirdFlowR 0.1.0.9052 -2023-03-14 +2024-03-14 * `export_birdflow()` gets new arguments to control output file names. Default values mimic old behavior. # BirdFlowR 0.1.0.9051 -2023-02-28 +2024-02-28 * Add `get_mask()` * Add `extend_bird_flow()` @@ -89,12 +97,12 @@ calculate and work with non-directional flux / net movement. appropriately with both file structures. # BirdFlowR 0.1.0.9050 -2023-02-22 +2024-02-22 * Add test for `preprocess_species()` with a non-default CRS. # BirdFlowR 0.1.0.9049 -2023-01-11 +2024-01-11 #### Drop "_clip" diff --git a/R/animate_distr.R b/R/animate_distr.R index ffe1f90e..f1e896de 100644 --- a/R/animate_distr.R +++ b/R/animate_distr.R @@ -61,7 +61,7 @@ #' # Display #' print(gif) #' -#' # mv +#' # Save #' gif_file <- tempfile("animation", fileext = ".gif") #' gganimate::save_animation(gif, gif_file) #' file.remove(gif_file) # cleanup diff --git a/R/animate_flux.R b/R/animate_flux.R index 15ef1d09..b9ac1fad 100644 --- a/R/animate_flux.R +++ b/R/animate_flux.R @@ -14,7 +14,7 @@ animate_flux <- function(flux, bf, title = species(bf), ...) { anim <- p + ggplot2::facet_null() + - gganimate::transition_manual(frames = .data$date) + + gganimate::transition_manual(frames = .data$label) + ggplot2::labs(title = title, subtitle = "{current_frame}") diff --git a/R/calc_flux.R b/R/calc_flux.R index 89199887..c9158897 100644 --- a/R/calc_flux.R +++ b/R/calc_flux.R @@ -25,16 +25,34 @@ #' major geographic features such as coasts, large lakes, mountain ranges, and #' ecological system boundaries that might result in non-linear migration paths. #' +#' `calc_flux()` assumes that a line passes by a point if any part of the line +#' is within the radius of the point. This assumption breaks down if the +#' radius is much larger than the movement lengths as points that are ahead of +#' the line may still be within a radius of the line. In the extreme a large +#' enough radius on a point outside of the entire range will capture all the +#' movement. This isn't a problem with +#' the default points and radius as the points ahead of the line will never be +#' within the radius. +#' +#' The default points for `calc_flux()` are aligned with the cell centers as +#' are the movement lines. This alignment means that a very small radius will +#' result in an overestimate of flux. The default value of half the cell size +#' is sufficient for this not to be a problem, as we are capturing and +#' standardizing the units based on the entire cell area that that point +#' represents. +#' #' @param bf A BirdFlow model #' @param points A set of points to calculate movement through. If `points` is #' `NULL` they will default to the BirdFlow model cells that are either active -#' or fall between two active cells. Should be a data frame with `x` and `y` +#' or fall between two active cells. Otherwise a data frame with `x` and `y` #' columns containing point coordinates in [crs(bf)][crs()]. -#' @param radius The radius in meters around the points, used to determine if -#' a path is passing through the point. +#' @param radius The radius in meters around the points used to assess whether +#' a movement line passes by (or through) the point. If a point is farther than +#' `radius` from a great circle line between two cells centers then it is not +#' between them. #' @param n_directions The number of directional bins to use for recording #' movement direction. Must be either `1` indicating no direction information -#' or an even number. +#' or an even number. This is a placeholder, currently only `1` is supported. #' @param format The format to return the results in one of: #' \describe{ #' \item{`"points"`}{Returns a list with `flux` a matrix or array of @@ -68,7 +86,12 @@ #' } #' calc_flux <- function(bf, points = NULL, radius = NULL, n_directions = 1, - format = NULL, batch_size = 1e6, check_radius = TRUE) { + format = NULL, batch_size = 5e5, check_radius = TRUE) { + + if (!requireNamespace("SparseArray", quietly = TRUE)) { + stop("The SparseArray package is required to use calc_flux(). ", + "Please install it prior to calling this function.") + } if (n_directions != 1) stop("Only one directional flux is supported at the moment.") @@ -90,6 +113,8 @@ calc_flux <- function(bf, points = NULL, radius = NULL, n_directions = 1, points <- result$points radius_km <- result$radius / 1000 + bf_msg("Calculating Flux\n") + timesteps <- lookup_timestep_sequence(bf) transitions <- lookup_transitions(bf) marginals <- gsub("^T", "M", transitions) @@ -104,7 +129,10 @@ calc_flux <- function(bf, points = NULL, radius = NULL, n_directions = 1, n_pts <- dim(between)[3] + + bf_msg(" Processing Marginals\n") for (i in seq_along(marginals)) { + from <- timesteps[i] to <- timesteps[i + 1] mar <- get_marginal(bf, marginals[i]) @@ -116,10 +144,19 @@ calc_flux <- function(bf, points = NULL, radius = NULL, n_directions = 1, stopifnot(all(dim(sb)[1:2] == dim(mar))) # verify for (j in seq_len(n_pts)){ - net_movement[j, i] <- sum(mar[sb[, , j]]) + # This is a little hacky the marginal is a sparse Matrix::Matrix + # or a standard Matrix (depending on whether bf is sparse) + # sb and between are SparseArray::SparseArray + # SparseArray doesn't support logical sub-setting ("yet"), + # Matrix does. The solution is to convert the selection + # to a standard matrix or a sparse Matrix. The first is empirically a + # lot faster + sel <- as.matrix(sb[, , j]) #, sparse = TRUE) + net_movement[j, i] <- sum(mar[sel]) } } + bf_msg(" Formatting flux\n") # Standardize to P of population to pass through KM of transect in a week net_movement <- net_movement / (radius_km * 2) diff --git a/R/is_between.R b/R/is_between.R index d7d5903b..aed59c00 100644 --- a/R/is_between.R +++ b/R/is_between.R @@ -61,9 +61,15 @@ if (FALSE) { #' \item{radius}{The radius of the circle in meters.} #' @keywords internal is_between <- function(bf, points = NULL, radius = NULL, n_directions = 1, - skip_unconnected = TRUE, batch_size = 1e6, + skip_unconnected = TRUE, batch_size = 1e5, check_radius = TRUE) { bf_msg("Generating between array.\n") + + if (!requireNamespace("SparseArray", quietly = TRUE)) { + stop("The SparseArray package is required to use is_between(). ", + "Please install it prior to calling this function.") + } + # Force use of s2 o_use_s2 <- sf::sf_use_s2() on.exit(sf::sf_use_s2(o_use_s2)) @@ -90,14 +96,18 @@ is_between <- function(bf, points = NULL, radius = NULL, n_directions = 1, "Set check_radius to FALSE to ignore this advice.") } } - bf_msg("Creating points\n") + if (is.null(points)) { + bf_msg(" Creating points\n") + # Create points at all cell centers in the rectangular raster points <- rasterize_distr(get_distr(bf, 1), bf, format = "dataframe") points <- points[, c("x", "y", "i")] active <- points[!is.na(points$i), , drop = FALSE] + bf_msg(" Transforming to spherical coordinates\n") + # Convert to lat long (spherical) coordinates active_sph <- sf::st_as_sf(active, coords = c("x", "y"), crs = crs(bf)) |> sf::st_transform(crs = "EPSG:4326") @@ -128,19 +138,18 @@ is_between <- function(bf, points = NULL, radius = NULL, n_directions = 1, points <- points[sv, , drop = FALSE] } + bf_msg(" Preparing points\n") - - - - bf_msg("Preparing points\n") - - # Initialize array (All FALSE) to hold betweenness - between <- array(data = FALSE, - dim = c(n_active(bf), n_active(bf), nrow(points)), - dimnames = list(from = paste0("F_", seq_len(n_active(bf))), - to = paste0("T_", seq_len(n_active(bf))), - loc = paste0("L_", seq_len(nrow(points))))) + # Initialize sparse array (All FALSE) to hold betweenness + # This is a hack because there's no creation method that allows setting a + # dimension, but I can make an empty "random" array. + between <- SparseArray::randomSparseArray( + dim = c(n_active(bf), n_active(bf), nrow(points)), + density = 0) != 0 + dimnames(between) <- list(from = paste0("F_", seq_len(n_active(bf))), + to = paste0("T_", seq_len(n_active(bf))), + loc = paste0("L_", seq_len(nrow(points)))) # Generate table of active cell x, y, and i (in birdflow CRS) @@ -175,9 +184,6 @@ is_between <- function(bf, points = NULL, radius = NULL, n_directions = 1, # They represent stop overs or seasonal residence, not migratory movement pairs <- pairs[pairs$from != pairs$to, , drop = FALSE] - - - # Drop pairs that aren't ever connected if (skip_unconnected) { # Create matrix indicating which active cells are connected to @@ -202,21 +208,21 @@ is_between <- function(bf, points = NULL, radius = NULL, n_directions = 1, to = col(ever_connected)[ever_connected]) connected_pairs$id <- with(connected_pairs, paste(from, "-", to)) - pairs <- pairs[pairs$id %in% connected_pairs$id, , drop = FALSE] } - # To avoid - # Add batch number + # Add batch number to pairs of points pairs$batch <- rep(seq_len(nrow(pairs)), each = batch_size, length.out = nrow(pairs)) n_batch <- max(pairs$batch) - bf_msg("Processing points in ", n_batch, " batches \n") + # Process pairs of active points in batches - converting to great circle + # lines connecting each pair and then determining which evaluation points + # are within the distance + bf_msg(" Processing points in ", n_batch, " batches \n") for (batch in seq_len(n_batch)) { - bf_msg(round(batch / n_batch * 100, 2), "%\n") b_pairs <- pairs[pairs$batch == batch, , drop = FALSE] # Lines connecting pairs of points lines <- vector(mode = "list", length = nrow(b_pairs)) @@ -233,11 +239,38 @@ is_between <- function(bf, points = NULL, radius = NULL, n_directions = 1, # great circle) to each of the great circle lines connecting the pairs. within_dist <- sf::st_is_within_distance(lines_sfc, points_sph, - dist = units::set_units(radius, "m")) + dist = units::set_units(radius, "m")) + + + + # Update between + # + # There seems to be overhead for interacting with the sparse array + # Making a matrix with three columns which will contain the index + # of every TRUE cell and then updating between once is much + # faster than updating between in a loop. + + + # Make an index matrix of additional TRUE cells + # each row is the three dimensional index of a TRUE (between) cell + add_list <- vector(mode = "list", length = nrow(b_pairs)) for (i in seq_len(nrow(b_pairs))) { - between[b_pairs$from[i], b_pairs$to[i], within_dist[[i]]] <- TRUE - between[b_pairs$to[i], b_pairs$from[i], within_dist[[i]]] <- TRUE + n <- length(within_dist[[i]]) + add_list[[i]] <- cbind(rep(b_pairs$from[i], n), + rep(b_pairs$to[i], n), + within_dist[[i]]) } + + # Forward transitions + add <- do.call(rbind, add_list) + between[add] <- TRUE # individual cell indexing + + # Backwards transitions + add <- add[, c(2, 1, 3)] + between[add] <- TRUE + + bf_msg(" ", round(batch / n_batch * 100, 2), "%\n") + } # End batch loop if (FALSE) { diff --git a/R/plot_flux.R b/R/plot_flux.R index b9a4d356..7165b0d4 100644 --- a/R/plot_flux.R +++ b/R/plot_flux.R @@ -45,6 +45,15 @@ plot_flux <- function(flux, distr <- apply(distr, 2, function(x) x / max(x, na.rm = TRUE)) } + # Add " " labels as ordered factor + dates <- lubridate::as_date(flux$date) + label <- paste0(lubridate::month(dates, label = TRUE, abbr = FALSE), " ", + lubridate::mday(dates)) + ud <- sort(unique(dates)) + ul <- paste0(lubridate::month(ud, label = TRUE, abbr = FALSE), " ", + lubridate::mday(ud)) + label <- ordered(label, levels = ul) + flux$label <- label if (is.null(limits)) { limits <- range(flux$flux, na.rm = TRUE) @@ -104,21 +113,23 @@ plot_flux <- function(flux, ggplot2::ggplot(ggplot2::aes(x = .data$x, y = .data$y, fill = .data$flux)) + ggplot2::geom_raster() + - ggplot2::scale_fill_gradientn(colors = gradient_colors) + ggplot2::scale_fill_gradientn(colors = gradient_colors, name = value_label) # Add facet wrap and title if (length(transitions > 1)) { # Multiple transitions, facet wrap on date and add species title p <- p + - ggplot2::facet_wrap(ggplot2::vars(.data$date)) + + ggplot2::facet_wrap(ggplot2::vars(.data$label)) + ggplot2::ggtitle(title) } else { # Single transition add species title AND date subtitle p <- p + - ggplot2::ggtitle(title, subtitle = flux$date[1]) + ggplot2::ggtitle(title, subtitle = flux$label[1]) } + + # Add coastline if (!is.null(coast_color) && !is.null(coast_linewidth)) { @@ -131,5 +142,14 @@ plot_flux <- function(flux, color = coast_color) } + # coord_sf is required to adjust coordinates while using geom_sf + # Here we are preventing expanding the extent of the plot. + # Setting the CRS is only necessary when the coastline isn't plotted because + # of NULL coast_color or coast_linewidth + p <- p + + ggplot2::coord_sf(expand = FALSE, crs = crs(bf)) + + ggplot2::theme(axis.title = ggplot2::element_blank(), + strip.background = ggplot2::element_blank()) + return(p) } diff --git a/man/animate_distr.Rd b/man/animate_distr.Rd index 1811e757..c78dcc24 100644 --- a/man/animate_distr.Rd +++ b/man/animate_distr.Rd @@ -115,7 +115,7 @@ spread_anim <- animate_distr(density_spread, bf, limit = c(0, 0.05)) # Display print(gif) - # mv + # Save gif_file <- tempfile("animation", fileext = ".gif") gganimate::save_animation(gif, gif_file) file.remove(gif_file) # cleanup diff --git a/man/calc_flux.Rd b/man/calc_flux.Rd index 7ab1f4cc..47dc87cf 100644 --- a/man/calc_flux.Rd +++ b/man/calc_flux.Rd @@ -10,7 +10,7 @@ calc_flux( radius = NULL, n_directions = 1, format = NULL, - batch_size = 1e+06, + batch_size = 5e+05, check_radius = TRUE ) } @@ -19,15 +19,17 @@ calc_flux( \item{points}{A set of points to calculate movement through. If \code{points} is \code{NULL} they will default to the BirdFlow model cells that are either active -or fall between two active cells. Should be a data frame with \code{x} and \code{y} +or fall between two active cells. Otherwise a data frame with \code{x} and \code{y} columns containing point coordinates in \link[=crs]{crs(bf)}.} -\item{radius}{The radius in meters around the points, used to determine if -a path is passing through the point.} +\item{radius}{The radius in meters around the points used to assess whether +a movement line passes by (or through) the point. If a point is farther than +\code{radius} from a great circle line between two cells centers then it is not +between them.} \item{n_directions}{The number of directional bins to use for recording movement direction. Must be either \code{1} indicating no direction information -or an even number.} +or an even number. This is a placeholder, currently only \code{1} is supported.} \item{format}{The format to return the results in one of: \describe{ @@ -89,6 +91,22 @@ between the center of the the source and destination raster cells. Caution should be used when interpreting the results especially around major geographic features such as coasts, large lakes, mountain ranges, and ecological system boundaries that might result in non-linear migration paths. + +\code{calc_flux()} assumes that a line passes by a point if any part of the line +is within the radius of the point. This assumption breaks down if the +radius is much larger than the movement lengths as points that are ahead of +the line may still be within a radius of the line. In the extreme a large +enough radius on a point outside of the entire range will capture all the +movement. This isn't a problem with +the default points and radius as the points ahead of the line will never be +within the radius. + +The default points for \code{calc_flux()} are aligned with the cell centers as +are the movement lines. This alignment means that a very small radius will +result in an overestimate of flux. The default value of half the cell size +is sufficient for this not to be a problem, as we are capturing and +standardizing the units based on the entire cell area that that point +represents. } \examples{ diff --git a/man/is_between.Rd b/man/is_between.Rd index 37e5e310..6ee8b8c6 100644 --- a/man/is_between.Rd +++ b/man/is_between.Rd @@ -10,7 +10,7 @@ is_between( radius = NULL, n_directions = 1, skip_unconnected = TRUE, - batch_size = 1e+06, + batch_size = 1e+05, check_radius = TRUE ) } diff --git a/tests/testthat/helper-skip_if_wrong_ebirdst_for_snapshot.R b/tests/testthat/helper-skip_if_wrong_ebirdst_for_snapshot.R index a1786a23..452ccb97 100644 --- a/tests/testthat/helper-skip_if_wrong_ebirdst_for_snapshot.R +++ b/tests/testthat/helper-skip_if_wrong_ebirdst_for_snapshot.R @@ -2,7 +2,7 @@ skip_if_wrong_ebirdst_for_snapshot <- function() { # nolint: object_length_linter, line_length_linter v <- ebirdst_pkg_ver() if (v < "3.2022.0") { - skip(paste0( + testthat::skip(paste0( "ebirdst version ", as.character(v), " produces dated snapshots.")) diff --git a/tests/testthat/test-calc_flux.R b/tests/testthat/test-calc_flux.R index 28556174..d8d9f0c8 100644 --- a/tests/testthat/test-calc_flux.R +++ b/tests/testthat/test-calc_flux.R @@ -23,6 +23,12 @@ test_that("Test sensativity of flux to radius", { testthat::skip("In depth flux radius analysis - always skipped") + # This is an exploration of how flux varies across a very wide range + # of values for the radius parameter. + # It's a "test" only in the sense that it improves understanding of the + # functions behavior, but this seemed as good a place to save it as any. + # Conclusions are at bottom. + # Sparsify and truncate to speed things up bf <- BirdFlowModels::amewoo bf <- truncate_birdflow(bf, start = 1, end = 5) @@ -138,14 +144,11 @@ test_that("Test sensativity of flux to radius", { # Two conclusions: # 1. as implemented a diameter equal to the cell size is a sweet spot, and # keeping the diameter within the range between > 0.5 and < 2 cell widths - # is a good idea. + # is a good idea (and radius half that). # 2. If we want the algorithm to work with larger diameters it would make # sense to not allow points beyond the end of the line to be counted # (even if they are within the radius of the line). I suspect this # would slow things down unless I implemented it in C++ as I don't think # I can do it with any kind of vectorization in R. - - - }) diff --git a/vignettes/BirdFlowOverview_files/header-attrs-2.25/header-attrs.js b/vignettes/BirdFlowOverview_files/header-attrs-2.25/header-attrs.js new file mode 100644 index 00000000..dd57d92e --- /dev/null +++ b/vignettes/BirdFlowOverview_files/header-attrs-2.25/header-attrs.js @@ -0,0 +1,12 @@ +// Pandoc 2.9 adds attributes on both header and div. We remove the former (to +// be compatible with the behavior of Pandoc < 2.8). +document.addEventListener('DOMContentLoaded', function(e) { + var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); + var i, h, a; + for (i = 0; i < hs.length; i++) { + h = hs[i]; + if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 + a = h.attributes; + while (a.length > 0) h.removeAttribute(a[0].name); + } +});