Merge pull request #70 from NIEHS/reflow-path

0.6.2
NIEHS · Apr 23, 2024 · 3d8e614 · 3d8e614
2 parents 0b6c6a7 + 2f3d320
commit 3d8e614
Show file tree

Hide file tree

Showing 21 changed files with 819 additions and 125 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,6 @@
 Package: chopin
-Title: CHOPIN: Computation for Climate and Health research On Parallelized
-    INfrastructure
-Version: 0.5.0.20240311
+Title: CHOPIN: Computation for Climate and Health research On Parallelized INfrastructure
+Version: 0.6.2.20240423
 Authors@R: c(
     person("Insang", "Song", , "geoissong@gmail.com", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0001-8732-3256")),
@@ -26,6 +25,7 @@ LazyDataCompression: xz
 Roxygen: list(markdown = TRUE)
 RoxygenNote: 7.3.1
 Imports: 
+    anticlust,
     doFuture,
     dplyr (>= 1.1.0),
     exactextractr (>= 0.8.2),
@@ -46,10 +46,11 @@ Suggests:
     future.callr,
     knitr,
     rmarkdown,
+    spatstat.random,
     testthat (>= 3.0.0),
     units,
     withr
 VignetteBuilder: knitr
 Config/testthat/edition: 3
 LitrVersionUsed: 0.9.0
-LitrId: b95c1c1ccdf00f4553673c2a048681ca
+LitrId: 5af3a7d30fa132cc525ba06ef54fd718
diff --git a/NAMESPACE b/NAMESPACE
@@ -21,6 +21,8 @@ export(par_cut_coords)
 export(par_def_q)
 export(par_fallback)
 export(par_grid)
+export(par_group_balanced)
+export(par_group_grid)
 export(par_hierarchy)
 export(par_make_grid)
 export(par_make_gridset)
@@ -33,6 +35,7 @@ export(summarize_sedc)
 export(vect_valid_repair)
 import(doFuture)
 import(future)
+importFrom(anticlust,balanced_clustering)
 importFrom(dplyr,across)
 importFrom(dplyr,all_of)
 importFrom(dplyr,as_tibble)
@@ -73,6 +76,7 @@ importFrom(sf,st_set_crs)
 importFrom(sf,st_transform)
 importFrom(sf,st_within)
 importFrom(stars,st_as_stars)
+importFrom(stats,dist)
 importFrom(stats,quantile)
 importFrom(stats,setNames)
 importFrom(stats,weighted.mean)

diff --git a/R/gridding.R b/R/gridding.R
@@ -6,7 +6,6 @@
 #' @param mode character(1). Mode of region construction.
 #'  One of
 #' * `"grid"` (simple grid regardless of the number of features in each grid)
-#' * `"density"` (clustering-based varying grids),
 #' * `"grid_advanced"` (merging adjacent grids with
 #'  smaller number of features than `grid_min_features`).
 #'  The argument `grid_min_features` should be specified.
@@ -61,7 +60,7 @@
 par_make_gridset <-
   function(
       input,
-      mode = c("grid", "grid_advanced", "density", "grid_quantile"),
+      mode = c("grid", "grid_advanced", "grid_quantile"),
       nx = 10L,
       ny = 10L,
       grid_min_features = 30L,
@@ -74,13 +73,11 @@ par_make_gridset <-
     if (!all(
       is.integer(nx),
       is.integer(ny)
-    )
-    ) {
+    )) {
       stop("nx, ny must be integer.\n")
     }
     if (!is.numeric(padding)) {
-      message("padding should be numeric.
-We try converting padding to numeric...\n")
+      message("padding should be numeric. Try converting padding to numeric...\n")
       padding <- as.numeric(padding)
       if (any(inherits(padding, "try-error"), is.na(padding))) {
         stop("padding is not convertible to numeric or converted to NA.\n")
@@ -101,10 +98,10 @@ We try converting padding to numeric...\n")
           x = input,
           y = NULL,
           quantiles = quantiles
-        ),
-        density = simpleError("density method is under development.\n")
+        )
       )
 
+    # register CRS
     if (dep_check(grid_reg) == "sf") {
       grid_reg <- sf::st_set_crs(grid_reg, sf::st_crs(input))
       grid_reg_conv <- dep_switch(grid_reg)
@@ -130,6 +127,82 @@ We try converting padding to numeric...\n")
 
   }
 
+
+#' Extension of par_group_balanced for padded grids
+#' @description This function is an extension of `par_group_balanced`
+#' to be compatible with `par_grid`, for which a set of padded grids
+#' of the extent of input point subsets (as recorded in the field named `"CGRIDID"`)
+#' is generated out of input points along with the output of
+#' `par_group_balanced`.
+#' @family Parallelization
+#' @param points_in `sf` or `SpatVector` object.
+#' @param ngroups integer(1). The number of groups.
+#' @param padding numeric(1). A extrusion factor to make buffer to
+#'  clip actual datasets. Depending on the length unit of the CRS of input.
+#' @returns A list of two,
+#'  * \code{original}: exhaustive and non-overlapping
+#'  grid polygons in the class of input
+#'  * \code{padded}: a square buffer of each polygon in
+#'  \code{original}. Used for computation.
+#' @author Insang Song
+#' @examples
+#' library(terra)
+#' library(sf)
+#' ncpath <- system.file("gpkg/nc.gpkg", package = "sf")
+#' nc <- terra::vect(ncpath)
+#' nc_rp <- terra::spatSample(nc, 1000)
+#' nc_gr <- par_group_grid(nc_rp, 10L, 1000)
+#' nc_gr
+#' @importFrom terra as.polygons
+#' @importFrom terra ext
+#' @importFrom terra buffer
+#' @export
+par_group_grid <-
+  function(
+    points_in = NULL,
+    ngroups = NULL,
+    padding
+  ) {
+    if (missing(ngroups)) {
+      stop("ngroups should be specified.\n")
+    }
+
+    if (!is.numeric(padding)) {
+      message("padding should be numeric. Try converting padding to numeric...\n")
+      padding <- as.numeric(padding)
+      if (any(inherits(padding, "try-error"), is.na(padding), missing(padding))) {
+        stop("padding is not convertible to numeric or converted to NA.\n")
+      }
+    }
+    pgroups <- par_group_balanced(points_in, ngroups)
+
+    grid_p <- lapply(
+      split(pgroups, pgroups$CGRIDID),
+      function(x) {
+        terra::as.polygons(terra::ext(x))
+      }
+    )
+    grid_p <- Reduce(rbind, grid_p)
+    grid_p$CGRIDID <- sort(unique(pgroups$CGRIDID))
+
+    grid_reg_pad <-
+      terra::buffer(
+        grid_p,
+        width = padding,
+        capstyle = "square",
+        joinstyle = "mitre"
+      )
+    if (dep_check(points_in) != dep_check(grid_reg_pad)) {
+      grid_reg_pad <- dep_switch(grid_reg_pad)
+    }
+    grid_results <-
+      list(original = pgroups,
+           padded = grid_reg_pad)
+    return(grid_results)
+  }
+
+
+
 #' @title Generate grid polygons
 #' @family Parallelization
 #' @description Returns a sf object that includes x- and y- index
@@ -309,17 +382,24 @@ par_cut_coords <- function(x = NULL, y = NULL, quantiles) {
 #'  This function strongly assumes that the input
 #'  is returned from the par_make_grid,
 #'  which has `"CGRIDID"` as the unique id field.
+#' @note This function will not work properly if `grid_in` has
+#' more than one million grids.
 #' @author Insang Song
 #' @param points_in `sf` or `SpatVector` object. Target points of computation.
 #' @param grid_in `sf` or `SpatVector` object.
 #' The grid generated by [`par_make_grid`].
 #' @param grid_min_features integer(1). Threshold to merge adjacent grids.
+#' @param merge_max integer(1). Maximum number of grids to merge per merged set.
+#' Default is 4.
+#' For example, if the number of grids to merge is 20 and `merge_max` is 10,
+#' the function will split the 20 grids into two sets of 10 grids.
 #' @returns A `sf` or `SpatVector` object of computation grids.
 #' @examples
 #' \dontrun{
 #' library(sf)
 #' library(igraph)
-#' ligrary(dplyr)
+#' library(dplyr)
+#' library(spatstat.random)
 #' dg <- sf::st_as_sfc(st_bbox(c(xmin = 0, ymin = 0, xmax = 8e5, ymax = 6e5)))
 #' sf::st_crs(dg) <- 5070
 #' dgs <- sf::st_as_sf(st_make_grid(dg, n = c(20, 15)))
@@ -333,7 +413,7 @@ par_cut_coords <- function(x = NULL, y = NULL, quantiles) {
 #' #### NOT RUN ####
 #' }
 #' @references
-#' Polsby, D. D., & Popper, F. J. (1991). The Third Criterion: Compactness as a Procedural Safeguard Against Partisan Gerrymandering. _Yale Law & Policy Review_, 9(2), 301–353. [Link](http://hdl.handle.net/20.500.13051/17448)
+#' Polsby DD, Popper FJ. (1991). The Third Criterion: Compactness as a Procedural Safeguard Against Partisan Gerrymandering. _Yale Law & Policy Review_, 9(2), 301–353. [Link](http://hdl.handle.net/20.500.13051/17448)
 #' @importFrom dplyr group_by
 #' @importFrom dplyr summarize
 #' @importFrom dplyr ungroup
@@ -354,21 +434,25 @@ par_merge_grid <-
   function(
     points_in = NULL,
     grid_in = NULL,
-    grid_min_features = NULL
+    grid_min_features = NULL,
+    merge_max = 4L
   ) {
     package_detected <- dep_check(points_in)
     if (package_detected == "terra") {
-      points_in <- dep_switch(points_in)
-      grid_in <- dep_switch(grid_in)
+      points_pc <- dep_switch(points_in)
+      grid_pc <- dep_switch(grid_in)
+    } else {
+      points_pc <- points_in
+      grid_pc <- grid_in
     }
 
     # 1. count #points in each grid
-    n_points_in_grid <- lengths(sf::st_intersects(grid_in, points_in))
+    n_points_in_grid <- lengths(sf::st_intersects(grid_pc, points_pc))
     grid_lt_threshold <- (n_points_in_grid < grid_min_features)
 
     # 2. concatenate self and contiguity grid indices
-    grid_self <- sf::st_relate(grid_in, grid_in, pattern = "2********")
-    grid_rook <- sf::st_relate(grid_in, grid_in, pattern = "F***1****")
+    grid_self <- sf::st_relate(grid_pc, grid_pc, pattern = "2********")
+    grid_rook <- sf::st_relate(grid_pc, grid_pc, pattern = "F***1****")
     # 3. merge self and rook neighbors
     grid_selfrook <- mapply(c, grid_self, grid_rook, SIMPLIFY = FALSE)
     # 4. conditional 1: the number of points per grid exceed the threshold?
@@ -390,7 +474,7 @@ par_merge_grid <-
       return(grid_in)
     }
     # leave only actual index rather than logical
-    grid_lt_threshold_idx <- seq(1, nrow(grid_in))[grid_lt_threshold]
+    grid_lt_threshold_idx <- seq(1, nrow(grid_pc))[grid_lt_threshold]
 
     # 5. filter out the ones that are below the threshold
     identified <- lapply(grid_selfrook,
@@ -416,19 +500,63 @@ par_merge_grid <-
       igraph::components()
 
     identified_graph_member <- identified_graph$membership
+    identified_graph_member2 <- identified_graph_member
 
+    # for assigning merged grid id (original)  
+    merge_idx <- which(rownames(grid_pc) %in% names(identified_graph_member))
+
+    # nolint start
+    # post-process: split membership into (almost) equal sizes
+    # note that identified_graph_member should preserve the order
+    tab_graph_member <- table(identified_graph_member)
+    if (any(tab_graph_member > merge_max)) {
+      # gets index of the grids in too large groups
+      graph_member_excess_idx <- which(identified_graph_member %in% names(tab_graph_member[tab_graph_member > merge_max]))
+      # extract the excess groups
+      graph_member_excess <- identified_graph_member[graph_member_excess_idx]
+      # for each excess group, split into smaller groups
+      for (i in seq_along(unique(graph_member_excess))) {
+        graph_member_excess_this <- which(graph_member_excess == unique(graph_member_excess)[i])
+        graph_member_excess_repl <- graph_member_excess[graph_member_excess_this]
+        # 1e6 is arbitrarily chosen; it should be large enough to avoid conflicts
+        # with the original membership
+        # I do believe this number will not be changed as 1e6+ computation grids are not practical
+        graph_member_excess_split <-
+          split(
+            graph_member_excess_repl,
+            ceiling(seq_along(graph_member_excess_repl) / merge_max) + (i * 1e6)
+          )
+
+        graph_member_excess_split <-
+          mapply(function(x, y) {
+              rep(sapply(y, as.numeric), length(x))
+            }, graph_member_excess_split, names(graph_member_excess_split),
+            SIMPLIFY = TRUE
+          )
+        graph_member_excess_split <- unname(unlist(graph_member_excess_split))
+        identified_graph_member2[which(identified_graph_member2 == unique(graph_member_excess)[i])] <-
+          graph_member_excess_split
+      }
+      identified_graph_member <- identified_graph_member2
+    } else {
+      identified_graph_member <- identified_graph_member
+    }
+    # nolint end
     # 10. Assign membership information
-    merge_idx <- as.integer(names(identified_graph_member))
-    merge_member <- split(merge_idx, identified_graph_member)
+    # Here we use the modified membership
+    merge_member <- split(rownames(grid_pc)[merge_idx], identified_graph_member)
     # 11. Label the merged grids
     merge_member_label <-
       unlist(lapply(merge_member, function(x) paste(x, collapse = "_")))
-    merge_member_label <- merge_member_label[identified_graph_member]
+    merge_member_label <- mapply(function(lst, label) {
+      rep(label, length(lst)) },
+      merge_member, merge_member_label, SIMPLIFY = TRUE)
+    merge_member_label <- unlist(merge_member_label)
 
     # 12. Assign labels to the original sf object
-    grid_out <- grid_in
+    grid_out <- grid_pc
     grid_out[["CGRIDID"]][merge_idx] <- merge_member_label
-
+    
     grid_out <- grid_out |>
       dplyr::group_by(!!rlang::sym("CGRIDID")) |>
       dplyr::summarize(n_merged = dplyr::n()) |>
@@ -438,7 +566,9 @@ par_merge_grid <-
     par_merge_gridd <- grid_out[which(grid_out$n_merged > 1), ]
     par_merge_gridd_area <- as.numeric(sf::st_area(par_merge_gridd))
     par_merge_gridd_perimeter <-
-      as.numeric(sf::st_length(sf::st_cast(par_merge_gridd, "LINESTRING")))
+      suppressWarnings(
+        as.numeric(sf::st_length(sf::st_cast(par_merge_gridd, "LINESTRING")))
+      )
     par_merge_gridd_pptest <-
       (4 * pi * par_merge_gridd_area) / (par_merge_gridd_perimeter ^ 2)
 
@@ -450,7 +580,54 @@ par_merge_grid <-
       message("The reduced computational regions have too complex shapes.
       Consider increasing thresholds or using the original grids.\n")
     }
+    if (dep_check(points_in) != dep_check(grid_out)) {
+      grid_out <- dep_switch(grid_out)
+    }
 
     return(grid_out)
+  }
+
+
+#' Generate groups based on balanced clustering
+#' @description For balancing computational loads, the function uses
+#' the `anticlust` package to cluster the input points. The number of clusters
+#' is determined by the `num_cluster` argument. Each cluster will have
+#' equal number of points. Grids will be generated based on the cluster extents.
+#' @note This function is only for two-dimensional points.
+#' The results will be inexhaustive grids.
+#' @param points_in `sf` or `SpatVector` object. Target points of computation.
+#' @param n_clusters integer(1). The number of clusters.
+#' @returns `SpatVector` object with a field `"CGRIDID"`.
+#' @examples
+#' library(terra)
+#' library(anticlust)
+#' data(ncpoints, package = "chopin")
+#' ncp <- terra::vect(ncpoints, geom = c("X", "Y"), keepgeom = FALSE, crs = "EPSG:5070")
+#' par_group_balanced(ncp, 10)
+#' @author Insang Song
+#' @importFrom anticlust balanced_clustering
+#' @importFrom terra vect
+#' @importFrom terra crds
+#' @importFrom stats dist
+#' @export
+par_group_balanced <- function(
+  points_in = NULL,
+  n_clusters = NULL
+) {
+  if (!is.numeric(n_clusters)) {
+    stop("n_clusters should be numeric.\n")
+  }
+  if (n_clusters < 2) {
+    stop("n_clusters should be greater than 1.\n")
+  }
+  if (dep_check(points_in) == "sf") {
+    points_in <- terra::vect(points_in)
+  }
+  # define dissimilarity based on Euclidean distance
+  dissim <- dist(terra::crds(points_in), method = "euclidean")
+  cl <- anticlust::balanced_clustering(dissim, K = n_clusters)
+  points_in$CGRIDID <- cl
+  return(points_in)
 }
 
+