Skip to content

Commit

Permalink
Merge pull request #70 from NIEHS/reflow-path
Browse files Browse the repository at this point in the history
0.6.2
  • Loading branch information
sigmafelix authored Apr 23, 2024
2 parents 0b6c6a7 + 2f3d320 commit 3d8e614
Show file tree
Hide file tree
Showing 21 changed files with 819 additions and 125 deletions.
9 changes: 5 additions & 4 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
Package: chopin
Title: CHOPIN: Computation for Climate and Health research On Parallelized
INfrastructure
Version: 0.5.0.20240311
Title: CHOPIN: Computation for Climate and Health research On Parallelized INfrastructure
Version: 0.6.2.20240423
Authors@R: c(
person("Insang", "Song", , "geoissong@gmail.com", role = c("aut", "cre"),
comment = c(ORCID = "0000-0001-8732-3256")),
Expand All @@ -26,6 +25,7 @@ LazyDataCompression: xz
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.1
Imports:
anticlust,
doFuture,
dplyr (>= 1.1.0),
exactextractr (>= 0.8.2),
Expand All @@ -46,10 +46,11 @@ Suggests:
future.callr,
knitr,
rmarkdown,
spatstat.random,
testthat (>= 3.0.0),
units,
withr
VignetteBuilder: knitr
Config/testthat/edition: 3
LitrVersionUsed: 0.9.0
LitrId: b95c1c1ccdf00f4553673c2a048681ca
LitrId: 5af3a7d30fa132cc525ba06ef54fd718
4 changes: 4 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ export(par_cut_coords)
export(par_def_q)
export(par_fallback)
export(par_grid)
export(par_group_balanced)
export(par_group_grid)
export(par_hierarchy)
export(par_make_grid)
export(par_make_gridset)
Expand All @@ -33,6 +35,7 @@ export(summarize_sedc)
export(vect_valid_repair)
import(doFuture)
import(future)
importFrom(anticlust,balanced_clustering)
importFrom(dplyr,across)
importFrom(dplyr,all_of)
importFrom(dplyr,as_tibble)
Expand Down Expand Up @@ -73,6 +76,7 @@ importFrom(sf,st_set_crs)
importFrom(sf,st_transform)
importFrom(sf,st_within)
importFrom(stars,st_as_stars)
importFrom(stats,dist)
importFrom(stats,quantile)
importFrom(stats,setNames)
importFrom(stats,weighted.mean)
Expand Down
223 changes: 200 additions & 23 deletions R/gridding.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
#' @param mode character(1). Mode of region construction.
#' One of
#' * `"grid"` (simple grid regardless of the number of features in each grid)
#' * `"density"` (clustering-based varying grids),
#' * `"grid_advanced"` (merging adjacent grids with
#' smaller number of features than `grid_min_features`).
#' The argument `grid_min_features` should be specified.
Expand Down Expand Up @@ -61,7 +60,7 @@
par_make_gridset <-
function(
input,
mode = c("grid", "grid_advanced", "density", "grid_quantile"),
mode = c("grid", "grid_advanced", "grid_quantile"),
nx = 10L,
ny = 10L,
grid_min_features = 30L,
Expand All @@ -74,13 +73,11 @@ par_make_gridset <-
if (!all(
is.integer(nx),
is.integer(ny)
)
) {
)) {
stop("nx, ny must be integer.\n")
}
if (!is.numeric(padding)) {
message("padding should be numeric.
We try converting padding to numeric...\n")
message("padding should be numeric. Try converting padding to numeric...\n")
padding <- as.numeric(padding)
if (any(inherits(padding, "try-error"), is.na(padding))) {
stop("padding is not convertible to numeric or converted to NA.\n")
Expand All @@ -101,10 +98,10 @@ We try converting padding to numeric...\n")
x = input,
y = NULL,
quantiles = quantiles
),
density = simpleError("density method is under development.\n")
)
)

# register CRS
if (dep_check(grid_reg) == "sf") {
grid_reg <- sf::st_set_crs(grid_reg, sf::st_crs(input))
grid_reg_conv <- dep_switch(grid_reg)
Expand All @@ -130,6 +127,82 @@ We try converting padding to numeric...\n")

}


#' Extension of par_group_balanced for padded grids
#' @description This function is an extension of `par_group_balanced`
#' to be compatible with `par_grid`, for which a set of padded grids
#' of the extent of input point subsets (as recorded in the field named `"CGRIDID"`)
#' is generated out of input points along with the output of
#' `par_group_balanced`.
#' @family Parallelization
#' @param points_in `sf` or `SpatVector` object.
#' @param ngroups integer(1). The number of groups.
#' @param padding numeric(1). A extrusion factor to make buffer to
#' clip actual datasets. Depending on the length unit of the CRS of input.
#' @returns A list of two,
#' * \code{original}: exhaustive and non-overlapping
#' grid polygons in the class of input
#' * \code{padded}: a square buffer of each polygon in
#' \code{original}. Used for computation.
#' @author Insang Song
#' @examples
#' library(terra)
#' library(sf)
#' ncpath <- system.file("gpkg/nc.gpkg", package = "sf")
#' nc <- terra::vect(ncpath)
#' nc_rp <- terra::spatSample(nc, 1000)
#' nc_gr <- par_group_grid(nc_rp, 10L, 1000)
#' nc_gr
#' @importFrom terra as.polygons
#' @importFrom terra ext
#' @importFrom terra buffer
#' @export
par_group_grid <-
function(
points_in = NULL,
ngroups = NULL,
padding
) {
if (missing(ngroups)) {
stop("ngroups should be specified.\n")
}

if (!is.numeric(padding)) {
message("padding should be numeric. Try converting padding to numeric...\n")
padding <- as.numeric(padding)
if (any(inherits(padding, "try-error"), is.na(padding), missing(padding))) {
stop("padding is not convertible to numeric or converted to NA.\n")
}
}
pgroups <- par_group_balanced(points_in, ngroups)

grid_p <- lapply(
split(pgroups, pgroups$CGRIDID),
function(x) {
terra::as.polygons(terra::ext(x))
}
)
grid_p <- Reduce(rbind, grid_p)
grid_p$CGRIDID <- sort(unique(pgroups$CGRIDID))

grid_reg_pad <-
terra::buffer(
grid_p,
width = padding,
capstyle = "square",
joinstyle = "mitre"
)
if (dep_check(points_in) != dep_check(grid_reg_pad)) {
grid_reg_pad <- dep_switch(grid_reg_pad)
}
grid_results <-
list(original = pgroups,
padded = grid_reg_pad)
return(grid_results)
}



#' @title Generate grid polygons
#' @family Parallelization
#' @description Returns a sf object that includes x- and y- index
Expand Down Expand Up @@ -309,17 +382,24 @@ par_cut_coords <- function(x = NULL, y = NULL, quantiles) {
#' This function strongly assumes that the input
#' is returned from the par_make_grid,
#' which has `"CGRIDID"` as the unique id field.
#' @note This function will not work properly if `grid_in` has
#' more than one million grids.
#' @author Insang Song
#' @param points_in `sf` or `SpatVector` object. Target points of computation.
#' @param grid_in `sf` or `SpatVector` object.
#' The grid generated by [`par_make_grid`].
#' @param grid_min_features integer(1). Threshold to merge adjacent grids.
#' @param merge_max integer(1). Maximum number of grids to merge per merged set.
#' Default is 4.
#' For example, if the number of grids to merge is 20 and `merge_max` is 10,
#' the function will split the 20 grids into two sets of 10 grids.
#' @returns A `sf` or `SpatVector` object of computation grids.
#' @examples
#' \dontrun{
#' library(sf)
#' library(igraph)
#' ligrary(dplyr)
#' library(dplyr)
#' library(spatstat.random)
#' dg <- sf::st_as_sfc(st_bbox(c(xmin = 0, ymin = 0, xmax = 8e5, ymax = 6e5)))
#' sf::st_crs(dg) <- 5070
#' dgs <- sf::st_as_sf(st_make_grid(dg, n = c(20, 15)))
Expand All @@ -333,7 +413,7 @@ par_cut_coords <- function(x = NULL, y = NULL, quantiles) {
#' #### NOT RUN ####
#' }
#' @references
#' Polsby, D. D., & Popper, F. J. (1991). The Third Criterion: Compactness as a Procedural Safeguard Against Partisan Gerrymandering. _Yale Law & Policy Review_, 9(2), 301–353. [Link](http://hdl.handle.net/20.500.13051/17448)
#' Polsby DD, Popper FJ. (1991). The Third Criterion: Compactness as a Procedural Safeguard Against Partisan Gerrymandering. _Yale Law & Policy Review_, 9(2), 301–353. [Link](http://hdl.handle.net/20.500.13051/17448)
#' @importFrom dplyr group_by
#' @importFrom dplyr summarize
#' @importFrom dplyr ungroup
Expand All @@ -354,21 +434,25 @@ par_merge_grid <-
function(
points_in = NULL,
grid_in = NULL,
grid_min_features = NULL
grid_min_features = NULL,
merge_max = 4L
) {
package_detected <- dep_check(points_in)
if (package_detected == "terra") {
points_in <- dep_switch(points_in)
grid_in <- dep_switch(grid_in)
points_pc <- dep_switch(points_in)
grid_pc <- dep_switch(grid_in)
} else {
points_pc <- points_in
grid_pc <- grid_in
}

# 1. count #points in each grid
n_points_in_grid <- lengths(sf::st_intersects(grid_in, points_in))
n_points_in_grid <- lengths(sf::st_intersects(grid_pc, points_pc))
grid_lt_threshold <- (n_points_in_grid < grid_min_features)

# 2. concatenate self and contiguity grid indices
grid_self <- sf::st_relate(grid_in, grid_in, pattern = "2********")
grid_rook <- sf::st_relate(grid_in, grid_in, pattern = "F***1****")
grid_self <- sf::st_relate(grid_pc, grid_pc, pattern = "2********")
grid_rook <- sf::st_relate(grid_pc, grid_pc, pattern = "F***1****")
# 3. merge self and rook neighbors
grid_selfrook <- mapply(c, grid_self, grid_rook, SIMPLIFY = FALSE)
# 4. conditional 1: the number of points per grid exceed the threshold?
Expand All @@ -390,7 +474,7 @@ par_merge_grid <-
return(grid_in)
}
# leave only actual index rather than logical
grid_lt_threshold_idx <- seq(1, nrow(grid_in))[grid_lt_threshold]
grid_lt_threshold_idx <- seq(1, nrow(grid_pc))[grid_lt_threshold]

# 5. filter out the ones that are below the threshold
identified <- lapply(grid_selfrook,
Expand All @@ -416,19 +500,63 @@ par_merge_grid <-
igraph::components()

identified_graph_member <- identified_graph$membership
identified_graph_member2 <- identified_graph_member

# for assigning merged grid id (original)
merge_idx <- which(rownames(grid_pc) %in% names(identified_graph_member))

# nolint start
# post-process: split membership into (almost) equal sizes
# note that identified_graph_member should preserve the order
tab_graph_member <- table(identified_graph_member)
if (any(tab_graph_member > merge_max)) {
# gets index of the grids in too large groups
graph_member_excess_idx <- which(identified_graph_member %in% names(tab_graph_member[tab_graph_member > merge_max]))
# extract the excess groups
graph_member_excess <- identified_graph_member[graph_member_excess_idx]
# for each excess group, split into smaller groups
for (i in seq_along(unique(graph_member_excess))) {
graph_member_excess_this <- which(graph_member_excess == unique(graph_member_excess)[i])
graph_member_excess_repl <- graph_member_excess[graph_member_excess_this]
# 1e6 is arbitrarily chosen; it should be large enough to avoid conflicts
# with the original membership
# I do believe this number will not be changed as 1e6+ computation grids are not practical
graph_member_excess_split <-
split(
graph_member_excess_repl,
ceiling(seq_along(graph_member_excess_repl) / merge_max) + (i * 1e6)
)

graph_member_excess_split <-
mapply(function(x, y) {
rep(sapply(y, as.numeric), length(x))
}, graph_member_excess_split, names(graph_member_excess_split),
SIMPLIFY = TRUE
)
graph_member_excess_split <- unname(unlist(graph_member_excess_split))
identified_graph_member2[which(identified_graph_member2 == unique(graph_member_excess)[i])] <-
graph_member_excess_split
}
identified_graph_member <- identified_graph_member2
} else {
identified_graph_member <- identified_graph_member
}
# nolint end
# 10. Assign membership information
merge_idx <- as.integer(names(identified_graph_member))
merge_member <- split(merge_idx, identified_graph_member)
# Here we use the modified membership
merge_member <- split(rownames(grid_pc)[merge_idx], identified_graph_member)
# 11. Label the merged grids
merge_member_label <-
unlist(lapply(merge_member, function(x) paste(x, collapse = "_")))
merge_member_label <- merge_member_label[identified_graph_member]
merge_member_label <- mapply(function(lst, label) {
rep(label, length(lst)) },
merge_member, merge_member_label, SIMPLIFY = TRUE)
merge_member_label <- unlist(merge_member_label)

# 12. Assign labels to the original sf object
grid_out <- grid_in
grid_out <- grid_pc
grid_out[["CGRIDID"]][merge_idx] <- merge_member_label

grid_out <- grid_out |>
dplyr::group_by(!!rlang::sym("CGRIDID")) |>
dplyr::summarize(n_merged = dplyr::n()) |>
Expand All @@ -438,7 +566,9 @@ par_merge_grid <-
par_merge_gridd <- grid_out[which(grid_out$n_merged > 1), ]
par_merge_gridd_area <- as.numeric(sf::st_area(par_merge_gridd))
par_merge_gridd_perimeter <-
as.numeric(sf::st_length(sf::st_cast(par_merge_gridd, "LINESTRING")))
suppressWarnings(
as.numeric(sf::st_length(sf::st_cast(par_merge_gridd, "LINESTRING")))
)
par_merge_gridd_pptest <-
(4 * pi * par_merge_gridd_area) / (par_merge_gridd_perimeter ^ 2)

Expand All @@ -450,7 +580,54 @@ par_merge_grid <-
message("The reduced computational regions have too complex shapes.
Consider increasing thresholds or using the original grids.\n")
}
if (dep_check(points_in) != dep_check(grid_out)) {
grid_out <- dep_switch(grid_out)
}

return(grid_out)
}


#' Generate groups based on balanced clustering
#' @description For balancing computational loads, the function uses
#' the `anticlust` package to cluster the input points. The number of clusters
#' is determined by the `num_cluster` argument. Each cluster will have
#' equal number of points. Grids will be generated based on the cluster extents.
#' @note This function is only for two-dimensional points.
#' The results will be inexhaustive grids.
#' @param points_in `sf` or `SpatVector` object. Target points of computation.
#' @param n_clusters integer(1). The number of clusters.
#' @returns `SpatVector` object with a field `"CGRIDID"`.
#' @examples
#' library(terra)
#' library(anticlust)
#' data(ncpoints, package = "chopin")
#' ncp <- terra::vect(ncpoints, geom = c("X", "Y"), keepgeom = FALSE, crs = "EPSG:5070")
#' par_group_balanced(ncp, 10)
#' @author Insang Song
#' @importFrom anticlust balanced_clustering
#' @importFrom terra vect
#' @importFrom terra crds
#' @importFrom stats dist
#' @export
par_group_balanced <- function(
points_in = NULL,
n_clusters = NULL
) {
if (!is.numeric(n_clusters)) {
stop("n_clusters should be numeric.\n")
}
if (n_clusters < 2) {
stop("n_clusters should be greater than 1.\n")
}
if (dep_check(points_in) == "sf") {
points_in <- terra::vect(points_in)
}
# define dissimilarity based on Euclidean distance
dissim <- dist(terra::crds(points_in), method = "euclidean")
cl <- anticlust::balanced_clustering(dissim, K = n_clusters)
points_in$CGRIDID <- cl
return(points_in)
}


Loading

0 comments on commit 3d8e614

Please sign in to comment.