diff --git a/DESCRIPTION b/DESCRIPTION index e9e86b0..1fdc0ad 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,6 @@ Package: chopin -Title: CHOPIN: Computation for Climate and Health research On Parallelized - INfrastructure -Version: 0.5.0.20240311 +Title: CHOPIN: Computation for Climate and Health research On Parallelized INfrastructure +Version: 0.6.2.20240423 Authors@R: c( person("Insang", "Song", , "geoissong@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0001-8732-3256")), @@ -26,6 +25,7 @@ LazyDataCompression: xz Roxygen: list(markdown = TRUE) RoxygenNote: 7.3.1 Imports: + anticlust, doFuture, dplyr (>= 1.1.0), exactextractr (>= 0.8.2), @@ -46,10 +46,11 @@ Suggests: future.callr, knitr, rmarkdown, + spatstat.random, testthat (>= 3.0.0), units, withr VignetteBuilder: knitr Config/testthat/edition: 3 LitrVersionUsed: 0.9.0 -LitrId: b95c1c1ccdf00f4553673c2a048681ca +LitrId: 5af3a7d30fa132cc525ba06ef54fd718 diff --git a/NAMESPACE b/NAMESPACE index f4a06af..5d9445b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -21,6 +21,8 @@ export(par_cut_coords) export(par_def_q) export(par_fallback) export(par_grid) +export(par_group_balanced) +export(par_group_grid) export(par_hierarchy) export(par_make_grid) export(par_make_gridset) @@ -33,6 +35,7 @@ export(summarize_sedc) export(vect_valid_repair) import(doFuture) import(future) +importFrom(anticlust,balanced_clustering) importFrom(dplyr,across) importFrom(dplyr,all_of) importFrom(dplyr,as_tibble) @@ -73,6 +76,7 @@ importFrom(sf,st_set_crs) importFrom(sf,st_transform) importFrom(sf,st_within) importFrom(stars,st_as_stars) +importFrom(stats,dist) importFrom(stats,quantile) importFrom(stats,setNames) importFrom(stats,weighted.mean) diff --git a/R/gridding.R b/R/gridding.R index 8ee9eec..40de2ff 100644 --- a/R/gridding.R +++ b/R/gridding.R @@ -6,7 +6,6 @@ #' @param mode character(1). Mode of region construction. #' One of #' * `"grid"` (simple grid regardless of the number of features in each grid) -#' * `"density"` (clustering-based varying grids), #' * `"grid_advanced"` (merging adjacent grids with #' smaller number of features than `grid_min_features`). #' The argument `grid_min_features` should be specified. @@ -61,7 +60,7 @@ par_make_gridset <- function( input, - mode = c("grid", "grid_advanced", "density", "grid_quantile"), + mode = c("grid", "grid_advanced", "grid_quantile"), nx = 10L, ny = 10L, grid_min_features = 30L, @@ -74,13 +73,11 @@ par_make_gridset <- if (!all( is.integer(nx), is.integer(ny) - ) - ) { + )) { stop("nx, ny must be integer.\n") } if (!is.numeric(padding)) { - message("padding should be numeric. -We try converting padding to numeric...\n") + message("padding should be numeric. Try converting padding to numeric...\n") padding <- as.numeric(padding) if (any(inherits(padding, "try-error"), is.na(padding))) { stop("padding is not convertible to numeric or converted to NA.\n") @@ -101,10 +98,10 @@ We try converting padding to numeric...\n") x = input, y = NULL, quantiles = quantiles - ), - density = simpleError("density method is under development.\n") + ) ) + # register CRS if (dep_check(grid_reg) == "sf") { grid_reg <- sf::st_set_crs(grid_reg, sf::st_crs(input)) grid_reg_conv <- dep_switch(grid_reg) @@ -130,6 +127,82 @@ We try converting padding to numeric...\n") } + +#' Extension of par_group_balanced for padded grids +#' @description This function is an extension of `par_group_balanced` +#' to be compatible with `par_grid`, for which a set of padded grids +#' of the extent of input point subsets (as recorded in the field named `"CGRIDID"`) +#' is generated out of input points along with the output of +#' `par_group_balanced`. +#' @family Parallelization +#' @param points_in `sf` or `SpatVector` object. +#' @param ngroups integer(1). The number of groups. +#' @param padding numeric(1). A extrusion factor to make buffer to +#' clip actual datasets. Depending on the length unit of the CRS of input. +#' @returns A list of two, +#' * \code{original}: exhaustive and non-overlapping +#' grid polygons in the class of input +#' * \code{padded}: a square buffer of each polygon in +#' \code{original}. Used for computation. +#' @author Insang Song +#' @examples +#' library(terra) +#' library(sf) +#' ncpath <- system.file("gpkg/nc.gpkg", package = "sf") +#' nc <- terra::vect(ncpath) +#' nc_rp <- terra::spatSample(nc, 1000) +#' nc_gr <- par_group_grid(nc_rp, 10L, 1000) +#' nc_gr +#' @importFrom terra as.polygons +#' @importFrom terra ext +#' @importFrom terra buffer +#' @export +par_group_grid <- + function( + points_in = NULL, + ngroups = NULL, + padding + ) { + if (missing(ngroups)) { + stop("ngroups should be specified.\n") + } + + if (!is.numeric(padding)) { + message("padding should be numeric. Try converting padding to numeric...\n") + padding <- as.numeric(padding) + if (any(inherits(padding, "try-error"), is.na(padding), missing(padding))) { + stop("padding is not convertible to numeric or converted to NA.\n") + } + } + pgroups <- par_group_balanced(points_in, ngroups) + + grid_p <- lapply( + split(pgroups, pgroups$CGRIDID), + function(x) { + terra::as.polygons(terra::ext(x)) + } + ) + grid_p <- Reduce(rbind, grid_p) + grid_p$CGRIDID <- sort(unique(pgroups$CGRIDID)) + + grid_reg_pad <- + terra::buffer( + grid_p, + width = padding, + capstyle = "square", + joinstyle = "mitre" + ) + if (dep_check(points_in) != dep_check(grid_reg_pad)) { + grid_reg_pad <- dep_switch(grid_reg_pad) + } + grid_results <- + list(original = pgroups, + padded = grid_reg_pad) + return(grid_results) + } + + + #' @title Generate grid polygons #' @family Parallelization #' @description Returns a sf object that includes x- and y- index @@ -309,17 +382,24 @@ par_cut_coords <- function(x = NULL, y = NULL, quantiles) { #' This function strongly assumes that the input #' is returned from the par_make_grid, #' which has `"CGRIDID"` as the unique id field. +#' @note This function will not work properly if `grid_in` has +#' more than one million grids. #' @author Insang Song #' @param points_in `sf` or `SpatVector` object. Target points of computation. #' @param grid_in `sf` or `SpatVector` object. #' The grid generated by [`par_make_grid`]. #' @param grid_min_features integer(1). Threshold to merge adjacent grids. +#' @param merge_max integer(1). Maximum number of grids to merge per merged set. +#' Default is 4. +#' For example, if the number of grids to merge is 20 and `merge_max` is 10, +#' the function will split the 20 grids into two sets of 10 grids. #' @returns A `sf` or `SpatVector` object of computation grids. #' @examples #' \dontrun{ #' library(sf) #' library(igraph) -#' ligrary(dplyr) +#' library(dplyr) +#' library(spatstat.random) #' dg <- sf::st_as_sfc(st_bbox(c(xmin = 0, ymin = 0, xmax = 8e5, ymax = 6e5))) #' sf::st_crs(dg) <- 5070 #' dgs <- sf::st_as_sf(st_make_grid(dg, n = c(20, 15))) @@ -333,7 +413,7 @@ par_cut_coords <- function(x = NULL, y = NULL, quantiles) { #' #### NOT RUN #### #' } #' @references -#' Polsby, D. D., & Popper, F. J. (1991). The Third Criterion: Compactness as a Procedural Safeguard Against Partisan Gerrymandering. _Yale Law & Policy Review_, 9(2), 301–353. [Link](http://hdl.handle.net/20.500.13051/17448) +#' Polsby DD, Popper FJ. (1991). The Third Criterion: Compactness as a Procedural Safeguard Against Partisan Gerrymandering. _Yale Law & Policy Review_, 9(2), 301–353. [Link](http://hdl.handle.net/20.500.13051/17448) #' @importFrom dplyr group_by #' @importFrom dplyr summarize #' @importFrom dplyr ungroup @@ -354,21 +434,25 @@ par_merge_grid <- function( points_in = NULL, grid_in = NULL, - grid_min_features = NULL + grid_min_features = NULL, + merge_max = 4L ) { package_detected <- dep_check(points_in) if (package_detected == "terra") { - points_in <- dep_switch(points_in) - grid_in <- dep_switch(grid_in) + points_pc <- dep_switch(points_in) + grid_pc <- dep_switch(grid_in) + } else { + points_pc <- points_in + grid_pc <- grid_in } # 1. count #points in each grid - n_points_in_grid <- lengths(sf::st_intersects(grid_in, points_in)) + n_points_in_grid <- lengths(sf::st_intersects(grid_pc, points_pc)) grid_lt_threshold <- (n_points_in_grid < grid_min_features) # 2. concatenate self and contiguity grid indices - grid_self <- sf::st_relate(grid_in, grid_in, pattern = "2********") - grid_rook <- sf::st_relate(grid_in, grid_in, pattern = "F***1****") + grid_self <- sf::st_relate(grid_pc, grid_pc, pattern = "2********") + grid_rook <- sf::st_relate(grid_pc, grid_pc, pattern = "F***1****") # 3. merge self and rook neighbors grid_selfrook <- mapply(c, grid_self, grid_rook, SIMPLIFY = FALSE) # 4. conditional 1: the number of points per grid exceed the threshold? @@ -390,7 +474,7 @@ par_merge_grid <- return(grid_in) } # leave only actual index rather than logical - grid_lt_threshold_idx <- seq(1, nrow(grid_in))[grid_lt_threshold] + grid_lt_threshold_idx <- seq(1, nrow(grid_pc))[grid_lt_threshold] # 5. filter out the ones that are below the threshold identified <- lapply(grid_selfrook, @@ -416,19 +500,63 @@ par_merge_grid <- igraph::components() identified_graph_member <- identified_graph$membership + identified_graph_member2 <- identified_graph_member + # for assigning merged grid id (original) + merge_idx <- which(rownames(grid_pc) %in% names(identified_graph_member)) + + # nolint start + # post-process: split membership into (almost) equal sizes + # note that identified_graph_member should preserve the order + tab_graph_member <- table(identified_graph_member) + if (any(tab_graph_member > merge_max)) { + # gets index of the grids in too large groups + graph_member_excess_idx <- which(identified_graph_member %in% names(tab_graph_member[tab_graph_member > merge_max])) + # extract the excess groups + graph_member_excess <- identified_graph_member[graph_member_excess_idx] + # for each excess group, split into smaller groups + for (i in seq_along(unique(graph_member_excess))) { + graph_member_excess_this <- which(graph_member_excess == unique(graph_member_excess)[i]) + graph_member_excess_repl <- graph_member_excess[graph_member_excess_this] + # 1e6 is arbitrarily chosen; it should be large enough to avoid conflicts + # with the original membership + # I do believe this number will not be changed as 1e6+ computation grids are not practical + graph_member_excess_split <- + split( + graph_member_excess_repl, + ceiling(seq_along(graph_member_excess_repl) / merge_max) + (i * 1e6) + ) + + graph_member_excess_split <- + mapply(function(x, y) { + rep(sapply(y, as.numeric), length(x)) + }, graph_member_excess_split, names(graph_member_excess_split), + SIMPLIFY = TRUE + ) + graph_member_excess_split <- unname(unlist(graph_member_excess_split)) + identified_graph_member2[which(identified_graph_member2 == unique(graph_member_excess)[i])] <- + graph_member_excess_split + } + identified_graph_member <- identified_graph_member2 + } else { + identified_graph_member <- identified_graph_member + } + # nolint end # 10. Assign membership information - merge_idx <- as.integer(names(identified_graph_member)) - merge_member <- split(merge_idx, identified_graph_member) + # Here we use the modified membership + merge_member <- split(rownames(grid_pc)[merge_idx], identified_graph_member) # 11. Label the merged grids merge_member_label <- unlist(lapply(merge_member, function(x) paste(x, collapse = "_"))) - merge_member_label <- merge_member_label[identified_graph_member] + merge_member_label <- mapply(function(lst, label) { + rep(label, length(lst)) }, + merge_member, merge_member_label, SIMPLIFY = TRUE) + merge_member_label <- unlist(merge_member_label) # 12. Assign labels to the original sf object - grid_out <- grid_in + grid_out <- grid_pc grid_out[["CGRIDID"]][merge_idx] <- merge_member_label - + grid_out <- grid_out |> dplyr::group_by(!!rlang::sym("CGRIDID")) |> dplyr::summarize(n_merged = dplyr::n()) |> @@ -438,7 +566,9 @@ par_merge_grid <- par_merge_gridd <- grid_out[which(grid_out$n_merged > 1), ] par_merge_gridd_area <- as.numeric(sf::st_area(par_merge_gridd)) par_merge_gridd_perimeter <- - as.numeric(sf::st_length(sf::st_cast(par_merge_gridd, "LINESTRING"))) + suppressWarnings( + as.numeric(sf::st_length(sf::st_cast(par_merge_gridd, "LINESTRING"))) + ) par_merge_gridd_pptest <- (4 * pi * par_merge_gridd_area) / (par_merge_gridd_perimeter ^ 2) @@ -450,7 +580,54 @@ par_merge_grid <- message("The reduced computational regions have too complex shapes. Consider increasing thresholds or using the original grids.\n") } + if (dep_check(points_in) != dep_check(grid_out)) { + grid_out <- dep_switch(grid_out) + } return(grid_out) + } + + +#' Generate groups based on balanced clustering +#' @description For balancing computational loads, the function uses +#' the `anticlust` package to cluster the input points. The number of clusters +#' is determined by the `num_cluster` argument. Each cluster will have +#' equal number of points. Grids will be generated based on the cluster extents. +#' @note This function is only for two-dimensional points. +#' The results will be inexhaustive grids. +#' @param points_in `sf` or `SpatVector` object. Target points of computation. +#' @param n_clusters integer(1). The number of clusters. +#' @returns `SpatVector` object with a field `"CGRIDID"`. +#' @examples +#' library(terra) +#' library(anticlust) +#' data(ncpoints, package = "chopin") +#' ncp <- terra::vect(ncpoints, geom = c("X", "Y"), keepgeom = FALSE, crs = "EPSG:5070") +#' par_group_balanced(ncp, 10) +#' @author Insang Song +#' @importFrom anticlust balanced_clustering +#' @importFrom terra vect +#' @importFrom terra crds +#' @importFrom stats dist +#' @export +par_group_balanced <- function( + points_in = NULL, + n_clusters = NULL +) { + if (!is.numeric(n_clusters)) { + stop("n_clusters should be numeric.\n") + } + if (n_clusters < 2) { + stop("n_clusters should be greater than 1.\n") + } + if (dep_check(points_in) == "sf") { + points_in <- terra::vect(points_in) + } + # define dissimilarity based on Euclidean distance + dissim <- dist(terra::crds(points_in), method = "euclidean") + cl <- anticlust::balanced_clustering(dissim, K = n_clusters) + points_in$CGRIDID <- cl + return(points_in) } + diff --git a/R/scale_process.R b/R/scale_process.R index aec3e19..797e2f6 100644 --- a/R/scale_process.R +++ b/R/scale_process.R @@ -140,6 +140,7 @@ par_grid <- fun_dist, ... ) { + # grid id selection check if (is.character(grid_target_id) && !grepl(":", grid_target_id)) { stop("Character grid_target_id should be in a form of 'startid:endid'.\n") } @@ -182,7 +183,7 @@ par_grid <- args_input[[1]] <- args_input[[1]][grid, ] if (methods::is(args_input[[2]], "SpatVector")) { - gpad_in <- grids$padded[grids$padded$CGRIDID == grid$CGRIDID, ] + gpad_in <- grids$padded[grids$padded$CGRIDID %in% grid$CGRIDID, ] args_input[[2]] <- args_input[[2]][gpad_in, ] } if (!"id" %in% names(formals(fun_dist))) { @@ -202,7 +203,6 @@ par_grid <- }, error = function(e) { par_fallback(e, fun_dist, debug) - }) return(run_result) diff --git a/README.Rmd b/README.Rmd index 37b1d31..60a9aa5 100644 --- a/README.Rmd +++ b/README.Rmd @@ -71,24 +71,24 @@ future::plan(future::multicore, workers = 4L) ```{r flowchart-mermaid, echo = FALSE, eval = (Sys.getenv("IN_GALLEY") == "")} mermaid_chart <- ' -graph LR - n72366978["Is the spatial resolution finer than 100 meters?"] - n21640044["Are there 100K+ features in the input vectors?"] - n38371203["Are there multiple rasters?"] +graph TD + n72366978["Spatial resolution <= 100 meters?"] + n21640044["100K+ features?"] + n38371203["Multiple rasters?"] n9291599["exact_extract with suitable max_cells_in_memory value"] - n84295645["Are they hierarchical?"] + n84295645["Hierarchical structure?"] n82902796["single thread processing"] - n76722667["Do they have the same extent and resolution?"] - n53137122["Is a single raster larger than your free memory space?"] - n34878990["Are your split_level have similar number of members?"] - n27787116["Are they spatially clustered?"] - n38458721["Do you have enough memory, say, more than the total disk space of the rasters?"] + n76722667["The same extent and resolution?"] + n53137122["Raster files larger than your free memory space?"] + n34878990["Split levels are in similar sizes?"] + n27787116["Spatially clustered?"] + n38458721["Free memory >= the total disk space of the rasters?"] n10339179["exact_extract with low max_cells_in_memory"] - n95964029["exact_extract with high max_cells_in_memory argument value"] + n95964029["exact_extract with high max_cells_in_memory"] n89847105["par_hierarchy"] - n94475834["par_make_gridset(..., mode = #quot;grid_quantile#quot;) or par_make_gridset_mode = #quot;grid_advanced#quot;)"] - n77415399["par_make_gridset(..., mode = #quot;grid#quot;)"] - n65357807["Stack rasters then process in the single thread"] + n94475834["par_make_gridset with mode = \'grid_quantile\' or mode = \'grid_advanced\'"] + n77415399["par_make_gridset with mode = \'grid\'"] + n65357807["Stack rasters then process with single thread"] n60994964["par_multirasters"] n64849552["par_grid"] n72366978 -->|Yes| n38371203 @@ -112,11 +112,11 @@ graph LR n94475834 --> n64849552 n77415399 --> n64849552 ' -DiagrammeR::mermaid(mermaid_chart, width = 2000, height = 1400) +DiagrammeR::mermaid(mermaid_chart, width = 2500, height = 600) ``` ## To run the examples -- RStudio: download and open this document then press "Run All Chunks Above", "Run All Chunks Below", or "Restart R and Run All Chunks", whichever it is appropriate. +- RStudio: download and open this document then press "Run All Chunks Above", "Run All Chunks Below", or "Restart R and Run All Chunks", whichever is appropriate. - Visual Studio Code (with R extension): download and open this document then press "Run Above" at the last code chunk. - If you prefer command line (i.e., in Unix-like operating systems), run: @@ -448,7 +448,6 @@ system.time( y = rd1 ) ) - ``` - We will compare the results from the single-thread and multi-thread calculation. @@ -464,8 +463,7 @@ all.equal(resj$distance.x, resj$distance.y) ## Why parallelization is slower than the ordinary function run? - Parallelization may underperform when the datasets are too small to take advantage of divide-and-compute approach, where parallelization overhead is involved. Overhead here refers to the required amount of computational resources for transferring objects to multiple processes. -- Since the demonstrations above use quite small datasets, the advantage of parallelization was not as noticeable as it was expected. Should a large amount of data (spatial/temporal resolution or number of files, for example) be processed, users could see the efficiency of this package. More illustrative and truly scaled examples will be provided in vignettes and manuscripts in the near future. - +- Since the demonstrations above use quite small datasets, the advantage of parallelization was not as noticeable as it was expected. Should a large amount of data (spatial/temporal resolution or number of files, for example) be processed, users could see the efficiency of this package. Please refer to a [vignette](https://kyle-messier.github.io/chopin/articles/v02_climate_examples.html) in this package for the demonstration of various climate/weather datasets. #### Last edited: March 22, 2024 diff --git a/README.md b/README.md index 0334332..d93aab4 100644 --- a/README.md +++ b/README.md @@ -107,7 +107,7 @@ future::plan(future::multicore, workers = 4L) - RStudio: download and open this document then press “Run All Chunks Above”, “Run All Chunks Below”, or “Restart R and Run All Chunks”, - whichever it is appropriate. + whichever is appropriate. - Visual Studio Code (with R extension): download and open this document then press “Run Above” at the last code chunk. - If you prefer command line (i.e., in Unix-like operating systems), @@ -631,8 +631,9 @@ all.equal(resj$distance.x, resj$distance.y) advantage of parallelization was not as noticeable as it was expected. Should a large amount of data (spatial/temporal resolution or number of files, for example) be processed, users could see the - efficiency of this package. More illustrative and truly scaled - examples will be provided in vignettes and manuscripts in the near - future. + efficiency of this package. Please refer to a + [vignette](https://kyle-messier.github.io/chopin/articles/v02_climate_examples.html) + in this package for the demonstration of various climate/weather + datasets. #### Last edited: March 22, 2024 diff --git a/chopin_rmarkdown_litr.rmd b/chopin_rmarkdown_litr.rmd index 6014895..9f034da 100644 --- a/chopin_rmarkdown_litr.rmd +++ b/chopin_rmarkdown_litr.rmd @@ -1,7 +1,7 @@ --- title: "CHOPIN: Computation for Climate and Health research On Parallelized INfrastructure: ``r params$package_name``" author: "Insang Song" -date: "2024-03-11" +date: "2024-04-23" knit: litr::render output: litr::litr_html_document params: @@ -21,7 +21,7 @@ usethis::create_package( path = ".", fields = list( Package = params$package_name, - Version = "0.5.0.20240311", + Version = "0.6.2.20240423", Title = "CHOPIN: Computation for Climate and Health research On Parallelized INfrastructure", Description = "It enables users with basic understanding on geospatial data and sf and terra functions to parallelize geospatial operations for geospatial exposure assessment modeling and covariate computation. Parallelization is done by dividing large datasets into sub-regions with regular grids and data's own hierarchy. A computation over the large number of raster files can be parallelized with a chopin function as well.", `Authors@R` = c( @@ -79,7 +79,9 @@ usethis::use_package("future.callr", "Suggests") usethis::use_package("callr", "Suggests") usethis::use_package("DiagrammeR", "Suggests") usethis::use_package("igraph") +usethis::use_package("anticlust") usethis::use_package("withr", "Suggests") +usethis::use_package("spatstat.random", "Suggests") usethis::use_package("knitr", "Suggests") usethis::use_mit_license() usethis::use_code_of_conduct("geoissong@gmail.com") @@ -1076,7 +1078,6 @@ chopin::extract_at( #' @param mode character(1). Mode of region construction. #' One of #' * `"grid"` (simple grid regardless of the number of features in each grid) -#' * `"density"` (clustering-based varying grids), #' * `"grid_advanced"` (merging adjacent grids with #' smaller number of features than `grid_min_features`). #' The argument `grid_min_features` should be specified. @@ -1131,7 +1132,7 @@ chopin::extract_at( par_make_gridset <- function( input, - mode = c("grid", "grid_advanced", "density", "grid_quantile"), + mode = c("grid", "grid_advanced", "grid_quantile"), nx = 10L, ny = 10L, grid_min_features = 30L, @@ -1144,13 +1145,11 @@ par_make_gridset <- if (!all( is.integer(nx), is.integer(ny) - ) - ) { + )) { stop("nx, ny must be integer.\n") } if (!is.numeric(padding)) { - message("padding should be numeric. -We try converting padding to numeric...\n") + message("padding should be numeric. Try converting padding to numeric...\n") padding <- as.numeric(padding) if (any(inherits(padding, "try-error"), is.na(padding))) { stop("padding is not convertible to numeric or converted to NA.\n") @@ -1171,10 +1170,10 @@ We try converting padding to numeric...\n") x = input, y = NULL, quantiles = quantiles - ), - density = simpleError("density method is under development.\n") + ) ) + # register CRS if (dep_check(grid_reg) == "sf") { grid_reg <- sf::st_set_crs(grid_reg, sf::st_crs(input)) grid_reg_conv <- dep_switch(grid_reg) @@ -1200,6 +1199,82 @@ We try converting padding to numeric...\n") } + +#' Extension of par_group_balanced for padded grids +#' @description This function is an extension of `par_group_balanced` +#' to be compatible with `par_grid`, for which a set of padded grids +#' of the extent of input point subsets (as recorded in the field named `"CGRIDID"`) +#' is generated out of input points along with the output of +#' `par_group_balanced`. +#' @family Parallelization +#' @param points_in `sf` or `SpatVector` object. +#' @param ngroups integer(1). The number of groups. +#' @param padding numeric(1). A extrusion factor to make buffer to +#' clip actual datasets. Depending on the length unit of the CRS of input. +#' @returns A list of two, +#' * \code{original}: exhaustive and non-overlapping +#' grid polygons in the class of input +#' * \code{padded}: a square buffer of each polygon in +#' \code{original}. Used for computation. +#' @author Insang Song +#' @examples +#' library(terra) +#' library(sf) +#' ncpath <- system.file("gpkg/nc.gpkg", package = "sf") +#' nc <- terra::vect(ncpath) +#' nc_rp <- terra::spatSample(nc, 1000) +#' nc_gr <- par_group_grid(nc_rp, 10L, 1000) +#' nc_gr +#' @importFrom terra as.polygons +#' @importFrom terra ext +#' @importFrom terra buffer +#' @export +par_group_grid <- + function( + points_in = NULL, + ngroups = NULL, + padding + ) { + if (missing(ngroups)) { + stop("ngroups should be specified.\n") + } + + if (!is.numeric(padding)) { + message("padding should be numeric. Try converting padding to numeric...\n") + padding <- as.numeric(padding) + if (any(inherits(padding, "try-error"), is.na(padding), missing(padding))) { + stop("padding is not convertible to numeric or converted to NA.\n") + } + } + pgroups <- par_group_balanced(points_in, ngroups) + + grid_p <- lapply( + split(pgroups, pgroups$CGRIDID), + function(x) { + terra::as.polygons(terra::ext(x)) + } + ) + grid_p <- Reduce(rbind, grid_p) + grid_p$CGRIDID <- sort(unique(pgroups$CGRIDID)) + + grid_reg_pad <- + terra::buffer( + grid_p, + width = padding, + capstyle = "square", + joinstyle = "mitre" + ) + if (dep_check(points_in) != dep_check(grid_reg_pad)) { + grid_reg_pad <- dep_switch(grid_reg_pad) + } + grid_results <- + list(original = pgroups, + padded = grid_reg_pad) + return(grid_results) + } + + + #' @title Generate grid polygons #' @family Parallelization #' @description Returns a sf object that includes x- and y- index @@ -1379,17 +1454,24 @@ par_cut_coords <- function(x = NULL, y = NULL, quantiles) { #' This function strongly assumes that the input #' is returned from the par_make_grid, #' which has `"CGRIDID"` as the unique id field. +#' @note This function will not work properly if `grid_in` has +#' more than one million grids. #' @author Insang Song #' @param points_in `sf` or `SpatVector` object. Target points of computation. #' @param grid_in `sf` or `SpatVector` object. #' The grid generated by [`par_make_grid`]. #' @param grid_min_features integer(1). Threshold to merge adjacent grids. +#' @param merge_max integer(1). Maximum number of grids to merge per merged set. +#' Default is 4. +#' For example, if the number of grids to merge is 20 and `merge_max` is 10, +#' the function will split the 20 grids into two sets of 10 grids. #' @returns A `sf` or `SpatVector` object of computation grids. #' @examples #' \dontrun{ #' library(sf) #' library(igraph) -#' ligrary(dplyr) +#' library(dplyr) +#' library(spatstat.random) #' dg <- sf::st_as_sfc(st_bbox(c(xmin = 0, ymin = 0, xmax = 8e5, ymax = 6e5))) #' sf::st_crs(dg) <- 5070 #' dgs <- sf::st_as_sf(st_make_grid(dg, n = c(20, 15))) @@ -1403,7 +1485,7 @@ par_cut_coords <- function(x = NULL, y = NULL, quantiles) { #' #### NOT RUN #### #' } #' @references -#' Polsby, D. D., & Popper, F. J. (1991). The Third Criterion: Compactness as a Procedural Safeguard Against Partisan Gerrymandering. _Yale Law & Policy Review_, 9(2), 301–353. [Link](http://hdl.handle.net/20.500.13051/17448) +#' Polsby DD, Popper FJ. (1991). The Third Criterion: Compactness as a Procedural Safeguard Against Partisan Gerrymandering. _Yale Law & Policy Review_, 9(2), 301–353. [Link](http://hdl.handle.net/20.500.13051/17448) #' @importFrom dplyr group_by #' @importFrom dplyr summarize #' @importFrom dplyr ungroup @@ -1424,21 +1506,25 @@ par_merge_grid <- function( points_in = NULL, grid_in = NULL, - grid_min_features = NULL + grid_min_features = NULL, + merge_max = 4L ) { package_detected <- dep_check(points_in) if (package_detected == "terra") { - points_in <- dep_switch(points_in) - grid_in <- dep_switch(grid_in) + points_pc <- dep_switch(points_in) + grid_pc <- dep_switch(grid_in) + } else { + points_pc <- points_in + grid_pc <- grid_in } # 1. count #points in each grid - n_points_in_grid <- lengths(sf::st_intersects(grid_in, points_in)) + n_points_in_grid <- lengths(sf::st_intersects(grid_pc, points_pc)) grid_lt_threshold <- (n_points_in_grid < grid_min_features) # 2. concatenate self and contiguity grid indices - grid_self <- sf::st_relate(grid_in, grid_in, pattern = "2********") - grid_rook <- sf::st_relate(grid_in, grid_in, pattern = "F***1****") + grid_self <- sf::st_relate(grid_pc, grid_pc, pattern = "2********") + grid_rook <- sf::st_relate(grid_pc, grid_pc, pattern = "F***1****") # 3. merge self and rook neighbors grid_selfrook <- mapply(c, grid_self, grid_rook, SIMPLIFY = FALSE) # 4. conditional 1: the number of points per grid exceed the threshold? @@ -1460,7 +1546,7 @@ par_merge_grid <- return(grid_in) } # leave only actual index rather than logical - grid_lt_threshold_idx <- seq(1, nrow(grid_in))[grid_lt_threshold] + grid_lt_threshold_idx <- seq(1, nrow(grid_pc))[grid_lt_threshold] # 5. filter out the ones that are below the threshold identified <- lapply(grid_selfrook, @@ -1486,19 +1572,63 @@ par_merge_grid <- igraph::components() identified_graph_member <- identified_graph$membership + identified_graph_member2 <- identified_graph_member + + # for assigning merged grid id (original) + merge_idx <- which(rownames(grid_pc) %in% names(identified_graph_member)) + + # nolint start + # post-process: split membership into (almost) equal sizes + # note that identified_graph_member should preserve the order + tab_graph_member <- table(identified_graph_member) + if (any(tab_graph_member > merge_max)) { + # gets index of the grids in too large groups + graph_member_excess_idx <- which(identified_graph_member %in% names(tab_graph_member[tab_graph_member > merge_max])) + # extract the excess groups + graph_member_excess <- identified_graph_member[graph_member_excess_idx] + # for each excess group, split into smaller groups + for (i in seq_along(unique(graph_member_excess))) { + graph_member_excess_this <- which(graph_member_excess == unique(graph_member_excess)[i]) + graph_member_excess_repl <- graph_member_excess[graph_member_excess_this] + # 1e6 is arbitrarily chosen; it should be large enough to avoid conflicts + # with the original membership + # I do believe this number will not be changed as 1e6+ computation grids are not practical + graph_member_excess_split <- + split( + graph_member_excess_repl, + ceiling(seq_along(graph_member_excess_repl) / merge_max) + (i * 1e6) + ) + graph_member_excess_split <- + mapply(function(x, y) { + rep(sapply(y, as.numeric), length(x)) + }, graph_member_excess_split, names(graph_member_excess_split), + SIMPLIFY = TRUE + ) + graph_member_excess_split <- unname(unlist(graph_member_excess_split)) + identified_graph_member2[which(identified_graph_member2 == unique(graph_member_excess)[i])] <- + graph_member_excess_split + } + identified_graph_member <- identified_graph_member2 + } else { + identified_graph_member <- identified_graph_member + } + # nolint end # 10. Assign membership information - merge_idx <- as.integer(names(identified_graph_member)) - merge_member <- split(merge_idx, identified_graph_member) + # Here we use the modified membership + merge_member <- split(rownames(grid_pc)[merge_idx], identified_graph_member) # 11. Label the merged grids merge_member_label <- unlist(lapply(merge_member, function(x) paste(x, collapse = "_"))) - merge_member_label <- merge_member_label[identified_graph_member] + merge_member_label <- mapply(function(lst, label) { + rep(label, length(lst)) }, + merge_member, merge_member_label, SIMPLIFY = TRUE) + merge_member_label <- unlist(merge_member_label) # 12. Assign labels to the original sf object - grid_out <- grid_in + grid_out <- grid_pc grid_out[["CGRIDID"]][merge_idx] <- merge_member_label - + grid_out <- grid_out |> dplyr::group_by(!!rlang::sym("CGRIDID")) |> dplyr::summarize(n_merged = dplyr::n()) |> @@ -1508,7 +1638,9 @@ par_merge_grid <- par_merge_gridd <- grid_out[which(grid_out$n_merged > 1), ] par_merge_gridd_area <- as.numeric(sf::st_area(par_merge_gridd)) par_merge_gridd_perimeter <- - as.numeric(sf::st_length(sf::st_cast(par_merge_gridd, "LINESTRING"))) + suppressWarnings( + as.numeric(sf::st_length(sf::st_cast(par_merge_gridd, "LINESTRING"))) + ) par_merge_gridd_pptest <- (4 * pi * par_merge_gridd_area) / (par_merge_gridd_perimeter ^ 2) @@ -1520,10 +1652,178 @@ par_merge_grid <- message("The reduced computational regions have too complex shapes. Consider increasing thresholds or using the original grids.\n") } + if (dep_check(points_in) != dep_check(grid_out)) { + grid_out <- dep_switch(grid_out) + } return(grid_out) + } + +``` + +```{r grid-quadtree, include = FALSE, eval = FALSE} + +## anticlustering +library(anticlust) +set.seed(2024) +lds <- matrix(rnorm(4096), ncol = 2) +cl <- balanced_clustering(lds, K = 10) +plot_clusters(lds, cl) + + + +## test +library(spatstat.explore) +library(terra) + +data("ncpoints", package = "chopin") +ncpnt <- terra::vect(ncpoints, geom = c("X", "Y"), keepgeom = FALSE, crs = "EPSG:5070") +ncpnt$w <- 1L +ncppp <- as.ppp(sf::st_as_sf(ncpnt)) +ncpds <- spatstat.explore::density.ppp(ncppp, dimyx = c(200, 400)) +?pixellate.ppp +plot(ncpds) +ncpdsr <- rast(ncpds) +ncpdsr2 <- deepcopy(ncpdsr) +ncpdsrv <- values(ncpdsr) +values(ncpdsr2) <- max(ncpdsrv) - ncpdsrv + min(ncpdsrv) +ncpdsrq2 <- quadtree::quadtree(ncpdsr2,split_threshold = 5e-9, split_method = "range", combine_method = "max") +ncpdsrq <- quadtree::quadtree(ncpdsr,split_threshold = 1e-8) +par(mar = c(2.5,2.5,1,6)) +plot(ncpdsrq2) +plot(ncpdsrq) +plot(ncppp, add = T) + +ncnear <- terra::nearby(ncpnt, distance = 10000) +ncneard <- terra::distance(ncpnt[ncnear[,1],], ncpnt[ncnear[,2],], pairwise = TRUE) +ncr <- rasterize(ncpnt, rast(ext(ncpnt), resolution = 1000)) +chopin::kernelfunction(ncneard, 10000, "epanechnikov") +plot(ncr) +``` + +```{r group-balanced, send_to = "R/gridding.R"} +#' Generate groups based on balanced clustering +#' @description For balancing computational loads, the function uses +#' the `anticlust` package to cluster the input points. The number of clusters +#' is determined by the `num_cluster` argument. Each cluster will have +#' equal number of points. Grids will be generated based on the cluster extents. +#' @note This function is only for two-dimensional points. +#' The results will be inexhaustive grids. +#' @param points_in `sf` or `SpatVector` object. Target points of computation. +#' @param n_clusters integer(1). The number of clusters. +#' @returns `SpatVector` object with a field `"CGRIDID"`. +#' @examples +#' library(terra) +#' library(anticlust) +#' data(ncpoints, package = "chopin") +#' ncp <- terra::vect(ncpoints, geom = c("X", "Y"), keepgeom = FALSE, crs = "EPSG:5070") +#' par_group_balanced(ncp, 10) +#' @author Insang Song +#' @importFrom anticlust balanced_clustering +#' @importFrom terra vect +#' @importFrom terra crds +#' @importFrom stats dist +#' @export +par_group_balanced <- function( + points_in = NULL, + n_clusters = NULL +) { + if (!is.numeric(n_clusters)) { + stop("n_clusters should be numeric.\n") + } + if (n_clusters < 2) { + stop("n_clusters should be greater than 1.\n") + } + if (dep_check(points_in) == "sf") { + points_in <- terra::vect(points_in) + } + # define dissimilarity based on Euclidean distance + dissim <- dist(terra::crds(points_in), method = "euclidean") + cl <- anticlust::balanced_clustering(dissim, K = n_clusters) + points_in$CGRIDID <- cl + return(points_in) } + +``` + +```{r par_group_balanced-remainders, eval = FALSE, include = FALSE} + # extent to polygon + # convex hulls and voronoi polygons: unbalanced; abandoned + # clust <- split(points_in, points_in$clust) + # clustch <- lapply(clust, function(x) terra::convHull(x)) + # clustchr <- Reduce(rbind, clustch) + # clustchr <- terra::convHull(clustchr) + # clustchr <- terra::buffer(clustchr, width = 1e-5) + # clustcnt <- lapply(clustch, function(x) terra::centroids(x)) + # clustcnt <- Reduce(rbind, clustcnt) + # clustk <- terra::voronoi(clustcnt, bnd = clustchr) + # clustk <- terra::crop(clustk, clustchr) + # assign unique grid id + #clustk$CGRIDID <- seq(1, nrow(clustk)) + # if (dep_check(points_in) != dep_check(clustk)) { + # clustk <- dep_switch(clustk) + # } + + +``` + +```{r test-group-balanced, eval = FALSE, send_to = "tests/testthat/test-gridding.R"} +testthat::test_that("Balanced group tests", { + withr::local_package("sf") + withr::local_package("terra") + withr::local_package("anticlust") + withr::local_options(list(sf_use_s2 = FALSE)) + + rv <- terra::vect(matrix(rnorm(1000, 1e3, 350), ncol = 2)) + rs <- sf::st_as_sf(rv) + + testthat::expect_no_error( + par_group_balanced(rv, 10) + ) + testthat::expect_no_error( + par_group_balanced(rs, 10) + ) + + testthat::expect_error( + par_group_balanced(rv, "NUMBER") + ) + testthat::expect_error( + par_group_balanced(rv, 1L) + ) + testthat::expect_true(any("CGRIDID" %in% names(par_group_balanced(rv, 10)))) + + # gridded + testthat::expect_no_error( + pgg_terra <- par_group_grid(rv, 10, 100) + ) + testthat::expect_no_error( + pgg_sf <- par_group_grid(rs, 10, 100) + ) + testthat::expect_equal(length(pgg_terra), 2) + testthat::expect_equal(length(pgg_sf), 2) + testthat::expect_true(all(table(pgg_terra$original$CGRIDID) == 50)) + + testthat::expect_error( + par_group_grid(rv, NULL) + ) + testthat::expect_error( + par_group_grid(rv, 5L) + ) + testthat::expect_no_error( + par_group_grid(rv, 5L, "10000") + ) + testthat::expect_error( + suppressWarnings(par_group_grid(rv, 5L, "radius")) + ) + testthat::expect_error( + par_group_grid(rv, 5L, NA) + ) + +}) + + + ``` ```{r test-quantiles, eval = FALSE, send_to = "tests/testthat/test-gridding.R"} @@ -1604,7 +1904,7 @@ par_make_grid_terra <- function(points_in, ncutsx, ncutsy) { } ``` -```{r test-compregion, send_to = "tests/testthat/test-gridding.R"} +```{r test-compregion, eval= FALSE, send_to = "tests/testthat/test-gridding.R"} testthat::test_that("Grid split is well done.", { withr::local_package("sf") withr::local_package("stars") @@ -1643,7 +1943,7 @@ testthat::test_that("Grid split is well done.", { ncrp <- sf::st_as_sf(sf::st_sample(nc, 1000L)) # Points - testthat::expect_warning( + testthat::expect_no_warning( par_make_gridset( ncp, mode = "grid_advanced", @@ -1711,9 +2011,20 @@ testthat::test_that("Grid merge is well done.", { mode = "grid", nx = 20L, ny = 12L, padding = 1e4L) - testthat::expect_warning( - testthat::expect_message(par_merge_grid(ncptr2, griddedtr2$original, 15L)) + testthat::expect_message( + gridmerged2 <- par_merge_grid(ncptr2, griddedtr2$original, 15L) + ) + testthat::expect_s4_class(gridmerged2, "SpatVector") + + griddedtr22 <- + par_make_gridset(ncptr2, + mode = "grid", + nx = 40L, ny = 20L, + padding = 1e4L) + testthat::expect_message( + gridmergedx <- par_merge_grid(ncptr2, griddedtr22$original, 10L, merge_max = 10L) ) + }) ``` @@ -3532,6 +3843,7 @@ par_grid <- fun_dist, ... ) { + # grid id selection check if (is.character(grid_target_id) && !grepl(":", grid_target_id)) { stop("Character grid_target_id should be in a form of 'startid:endid'.\n") } @@ -3574,7 +3886,7 @@ par_grid <- args_input[[1]] <- args_input[[1]][grid, ] if (methods::is(args_input[[2]], "SpatVector")) { - gpad_in <- grids$padded[grids$padded$CGRIDID == grid$CGRIDID, ] + gpad_in <- grids$padded[grids$padded$CGRIDID %in% grid$CGRIDID, ] args_input[[2]] <- args_input[[2]][gpad_in, ] } if (!"id" %in% names(formals(fun_dist))) { @@ -3594,7 +3906,6 @@ par_grid <- }, error = function(e) { par_fallback(e, fun_dist, debug) - }) return(run_result) @@ -4706,7 +5017,6 @@ if (dir.exists("../figures")) { # "./tools/vignettes-sources/v02_par.Rmd.orig", # output = "./tools/vignettes-sources/v02_par.Rmd" # ) - ``` ```{r post-hoc-cleaning, echo = F, eval = FALSE} @@ -4754,5 +5064,5 @@ covr::package_coverage() ## Multinode computation ```{r slurm-submission-template, eval = FALSE, include = FALSE} # slurm submission is here - +#032323 ``` diff --git a/codemeta.json b/codemeta.json index 5084c4e..295dc93 100644 --- a/codemeta.json +++ b/codemeta.json @@ -7,7 +7,7 @@ "codeRepository": "https://github.com/kyle-messier/chopin", "issueTracker": "https://github.com/kyle-messier/chopin/issues", "license": "https://spdx.org/licenses/MIT", - "version": "0.5.0.20240311", + "version": "0.6.2.20240423", "programmingLanguage": { "@type": "ComputerLanguage", "name": "R", @@ -143,6 +143,18 @@ }, "sameAs": "https://CRAN.R-project.org/package=rmarkdown" }, + { + "@type": "SoftwareApplication", + "identifier": "spatstat.random", + "name": "spatstat.random", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=spatstat.random" + }, { "@type": "SoftwareApplication", "identifier": "testthat", @@ -189,6 +201,18 @@ "version": ">= 4.1" }, "2": { + "@type": "SoftwareApplication", + "identifier": "anticlust", + "name": "anticlust", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=anticlust" + }, + "3": { "@type": "SoftwareApplication", "identifier": "doFuture", "name": "doFuture", @@ -200,7 +224,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=doFuture" }, - "3": { + "4": { "@type": "SoftwareApplication", "identifier": "dplyr", "name": "dplyr", @@ -213,7 +237,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=dplyr" }, - "4": { + "5": { "@type": "SoftwareApplication", "identifier": "exactextractr", "name": "exactextractr", @@ -226,7 +250,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=exactextractr" }, - "5": { + "6": { "@type": "SoftwareApplication", "identifier": "future", "name": "future", @@ -238,7 +262,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=future" }, - "6": { + "7": { "@type": "SoftwareApplication", "identifier": "future.apply", "name": "future.apply", @@ -250,7 +274,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=future.apply" }, - "7": { + "8": { "@type": "SoftwareApplication", "identifier": "igraph", "name": "igraph", @@ -262,12 +286,12 @@ }, "sameAs": "https://CRAN.R-project.org/package=igraph" }, - "8": { + "9": { "@type": "SoftwareApplication", "identifier": "methods", "name": "methods" }, - "9": { + "10": { "@type": "SoftwareApplication", "identifier": "rlang", "name": "rlang", @@ -280,7 +304,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=rlang" }, - "10": { + "11": { "@type": "SoftwareApplication", "identifier": "sf", "name": "sf", @@ -293,7 +317,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=sf" }, - "11": { + "12": { "@type": "SoftwareApplication", "identifier": "stars", "name": "stars", @@ -306,7 +330,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=stars" }, - "12": { + "13": { "@type": "SoftwareApplication", "identifier": "terra", "name": "terra", @@ -321,9 +345,9 @@ }, "SystemRequirements": null }, - "fileSize": "26507.895KB", + "fileSize": "26515.43KB", "releaseNotes": "https://github.com/kyle-messier/chopin/blob/master/NEWS.md", - "readme": "https://github.com/kyle-messier/chopin/blob/main/README.md", + "readme": "https://github.com/NIEHS/chopin/blob/main/README.md", "contIntegration": ["https://github.com/Spatiotemporal-Exposures-and-Toxicology/chopin/actions/workflows/test-coverage.yaml", "https://codecov.io/github/Spatiotemporal-Exposures-and-Toxicology/chopin", "https://github.com/Spatiotemporal-Exposures-and-Toxicology/chopin/actions/workflows/check-standard.yaml"], "developmentStatus": "https://lifecycle.r-lib.org/articles/stages.html#experimental" } diff --git a/man/figures/README-flowchart-mermaid-1.png b/man/figures/README-flowchart-mermaid-1.png index eb53abe..2841cbb 100644 Binary files a/man/figures/README-flowchart-mermaid-1.png and b/man/figures/README-flowchart-mermaid-1.png differ diff --git a/man/par_cut_coords.Rd b/man/par_cut_coords.Rd index b0b5c77..ba28cf3 100644 --- a/man/par_cut_coords.Rd +++ b/man/par_cut_coords.Rd @@ -46,6 +46,7 @@ sum(table(qcv$CGRIDID)) # should be 1000 Other Parallelization: \code{\link{par_fallback}()}, \code{\link{par_grid}()}, +\code{\link{par_group_grid}()}, \code{\link{par_hierarchy}()}, \code{\link{par_make_grid}()}, \code{\link{par_make_gridset}()}, diff --git a/man/par_fallback.Rd b/man/par_fallback.Rd index ec9c90e..d447676 100644 --- a/man/par_fallback.Rd +++ b/man/par_fallback.Rd @@ -31,6 +31,7 @@ par_fallback(err, extract_at, debug = TRUE) Other Parallelization: \code{\link{par_cut_coords}()}, \code{\link{par_grid}()}, +\code{\link{par_group_grid}()}, \code{\link{par_hierarchy}()}, \code{\link{par_make_grid}()}, \code{\link{par_make_gridset}()}, diff --git a/man/par_grid.Rd b/man/par_grid.Rd index 183c670..a9bf97f 100644 --- a/man/par_grid.Rd +++ b/man/par_grid.Rd @@ -116,6 +116,7 @@ res <- Other Parallelization: \code{\link{par_cut_coords}()}, \code{\link{par_fallback}()}, +\code{\link{par_group_grid}()}, \code{\link{par_hierarchy}()}, \code{\link{par_make_grid}()}, \code{\link{par_make_gridset}()}, diff --git a/man/par_group_balanced.Rd b/man/par_group_balanced.Rd new file mode 100644 index 0000000..1ccba00 --- /dev/null +++ b/man/par_group_balanced.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in chopin_rmarkdown_litr.rmd. +\name{par_group_balanced} +\alias{par_group_balanced} +\title{Generate groups based on balanced clustering} +\usage{ +par_group_balanced(points_in = NULL, n_clusters = NULL) +} +\arguments{ +\item{points_in}{\code{sf} or \code{SpatVector} object. Target points of computation.} + +\item{n_clusters}{integer(1). The number of clusters.} +} +\value{ +\code{SpatVector} object with a field \code{"CGRIDID"}. +} +\description{ +For balancing computational loads, the function uses +the \code{anticlust} package to cluster the input points. The number of clusters +is determined by the \code{num_cluster} argument. Each cluster will have +equal number of points. Grids will be generated based on the cluster extents. +} +\note{ +This function is only for two-dimensional points. +The results will be inexhaustive grids. +} +\examples{ +library(terra) +library(anticlust) +data(ncpoints, package = "chopin") +ncp <- terra::vect(ncpoints, geom = c("X", "Y"), keepgeom = FALSE, crs = "EPSG:5070") +par_group_balanced(ncp, 10) +} +\author{ +Insang Song +} \ No newline at end of file diff --git a/man/par_group_grid.Rd b/man/par_group_grid.Rd new file mode 100644 index 0000000..6d6a9a6 --- /dev/null +++ b/man/par_group_grid.Rd @@ -0,0 +1,56 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in chopin_rmarkdown_litr.rmd. +\name{par_group_grid} +\alias{par_group_grid} +\title{Extension of par_group_balanced for padded grids} +\usage{ +par_group_grid(points_in = NULL, ngroups = NULL, padding) +} +\arguments{ +\item{points_in}{\code{sf} or \code{SpatVector} object.} + +\item{ngroups}{integer(1). The number of groups.} + +\item{padding}{numeric(1). A extrusion factor to make buffer to +clip actual datasets. Depending on the length unit of the CRS of input.} +} +\value{ +A list of two, +\itemize{ +\item \code{original}: exhaustive and non-overlapping +grid polygons in the class of input +\item \code{padded}: a square buffer of each polygon in +\code{original}. Used for computation. +} +} +\description{ +This function is an extension of \code{par_group_balanced} +to be compatible with \code{par_grid}, for which a set of padded grids +of the extent of input point subsets (as recorded in the field named \code{"CGRIDID"}) +is generated out of input points along with the output of +\code{par_group_balanced}. +} +\examples{ +library(terra) +library(sf) +ncpath <- system.file("gpkg/nc.gpkg", package = "sf") +nc <- terra::vect(ncpath) +nc_rp <- terra::spatSample(nc, 1000) +nc_gr <- par_group_grid(nc_rp, 10L, 1000) +nc_gr +} +\seealso{ +Other Parallelization: +\code{\link{par_cut_coords}()}, +\code{\link{par_fallback}()}, +\code{\link{par_grid}()}, +\code{\link{par_hierarchy}()}, +\code{\link{par_make_grid}()}, +\code{\link{par_make_gridset}()}, +\code{\link{par_merge_grid}()}, +\code{\link{par_multirasters}()} +} +\author{ +Insang Song +} +\concept{Parallelization} \ No newline at end of file diff --git a/man/par_hierarchy.Rd b/man/par_hierarchy.Rd index 3be6548..54224d5 100644 --- a/man/par_hierarchy.Rd +++ b/man/par_hierarchy.Rd @@ -118,6 +118,7 @@ Other Parallelization: \code{\link{par_cut_coords}()}, \code{\link{par_fallback}()}, \code{\link{par_grid}()}, +\code{\link{par_group_grid}()}, \code{\link{par_make_grid}()}, \code{\link{par_make_gridset}()}, \code{\link{par_merge_grid}()}, diff --git a/man/par_make_grid.Rd b/man/par_make_grid.Rd index dcec77f..117ccfe 100644 --- a/man/par_make_grid.Rd +++ b/man/par_make_grid.Rd @@ -46,6 +46,7 @@ Other Parallelization: \code{\link{par_cut_coords}()}, \code{\link{par_fallback}()}, \code{\link{par_grid}()}, +\code{\link{par_group_grid}()}, \code{\link{par_hierarchy}()}, \code{\link{par_make_gridset}()}, \code{\link{par_merge_grid}()}, diff --git a/man/par_make_gridset.Rd b/man/par_make_gridset.Rd index ce8acab..acbcbf5 100644 --- a/man/par_make_gridset.Rd +++ b/man/par_make_gridset.Rd @@ -6,7 +6,7 @@ \usage{ par_make_gridset( input, - mode = c("grid", "grid_advanced", "density", "grid_quantile"), + mode = c("grid", "grid_advanced", "grid_quantile"), nx = 10L, ny = 10L, grid_min_features = 30L, @@ -23,7 +23,6 @@ par_make_gridset( One of \itemize{ \item \code{"grid"} (simple grid regardless of the number of features in each grid) -\item \code{"density"} (clustering-based varying grids), \item \code{"grid_advanced"} (merging adjacent grids with smaller number of features than \code{grid_min_features}). The argument \code{grid_min_features} should be specified. @@ -88,6 +87,7 @@ Other Parallelization: \code{\link{par_cut_coords}()}, \code{\link{par_fallback}()}, \code{\link{par_grid}()}, +\code{\link{par_group_grid}()}, \code{\link{par_hierarchy}()}, \code{\link{par_make_grid}()}, \code{\link{par_merge_grid}()}, diff --git a/man/par_merge_grid.Rd b/man/par_merge_grid.Rd index fee0182..c19bbf6 100644 --- a/man/par_merge_grid.Rd +++ b/man/par_merge_grid.Rd @@ -4,7 +4,12 @@ \alias{par_merge_grid} \title{Merge adjacent grid polygons with given rules} \usage{ -par_merge_grid(points_in = NULL, grid_in = NULL, grid_min_features = NULL) +par_merge_grid( + points_in = NULL, + grid_in = NULL, + grid_min_features = NULL, + merge_max = 4L +) } \arguments{ \item{points_in}{\code{sf} or \code{SpatVector} object. Target points of computation.} @@ -13,6 +18,11 @@ par_merge_grid(points_in = NULL, grid_in = NULL, grid_min_features = NULL) The grid generated by \code{\link{par_make_grid}}.} \item{grid_min_features}{integer(1). Threshold to merge adjacent grids.} + +\item{merge_max}{integer(1). Maximum number of grids to merge per merged set. +Default is 4. +For example, if the number of grids to merge is 20 and \code{merge_max} is 10, +the function will split the 20 grids into two sets of 10 grids.} } \value{ A \code{sf} or \code{SpatVector} object of computation grids. @@ -24,11 +34,16 @@ This function strongly assumes that the input is returned from the par_make_grid, which has \code{"CGRIDID"} as the unique id field. } +\note{ +This function will not work properly if \code{grid_in} has +more than one million grids. +} \examples{ \dontrun{ library(sf) library(igraph) -ligrary(dplyr) +library(dplyr) +library(spatstat.random) dg <- sf::st_as_sfc(st_bbox(c(xmin = 0, ymin = 0, xmax = 8e5, ymax = 6e5))) sf::st_crs(dg) <- 5070 dgs <- sf::st_as_sf(st_make_grid(dg, n = c(20, 15))) @@ -43,13 +58,14 @@ plot(dg_merged$geometry) } } \references{ -Polsby, D. D., & Popper, F. J. (1991). The Third Criterion: Compactness as a Procedural Safeguard Against Partisan Gerrymandering. \emph{Yale Law & Policy Review}, 9(2), 301–353. \href{http://hdl.handle.net/20.500.13051/17448}{Link} +Polsby DD, Popper FJ. (1991). The Third Criterion: Compactness as a Procedural Safeguard Against Partisan Gerrymandering. \emph{Yale Law & Policy Review}, 9(2), 301–353. \href{http://hdl.handle.net/20.500.13051/17448}{Link} } \seealso{ Other Parallelization: \code{\link{par_cut_coords}()}, \code{\link{par_fallback}()}, \code{\link{par_grid}()}, +\code{\link{par_group_grid}()}, \code{\link{par_hierarchy}()}, \code{\link{par_make_grid}()}, \code{\link{par_make_gridset}()}, diff --git a/man/par_multirasters.Rd b/man/par_multirasters.Rd index 3faf357..789404a 100644 --- a/man/par_multirasters.Rd +++ b/man/par_multirasters.Rd @@ -94,6 +94,7 @@ Other Parallelization: \code{\link{par_cut_coords}()}, \code{\link{par_fallback}()}, \code{\link{par_grid}()}, +\code{\link{par_group_grid}()}, \code{\link{par_hierarchy}()}, \code{\link{par_make_grid}()}, \code{\link{par_make_gridset}()}, diff --git a/tests/testthat/test-gridding.R b/tests/testthat/test-gridding.R index cbfad66..3d09c9f 100644 --- a/tests/testthat/test-gridding.R +++ b/tests/testthat/test-gridding.R @@ -1,5 +1,61 @@ # Generated from chopin_rmarkdown_litr.rmd: do not edit by hand +testthat::test_that("Balanced group tests", { + withr::local_package("sf") + withr::local_package("terra") + withr::local_package("anticlust") + withr::local_options(list(sf_use_s2 = FALSE)) + + rv <- terra::vect(matrix(rnorm(1000, 1e3, 350), ncol = 2)) + rs <- sf::st_as_sf(rv) + + testthat::expect_no_error( + par_group_balanced(rv, 10) + ) + testthat::expect_no_error( + par_group_balanced(rs, 10) + ) + + testthat::expect_error( + par_group_balanced(rv, "NUMBER") + ) + testthat::expect_error( + par_group_balanced(rv, 1L) + ) + testthat::expect_true(any("CGRIDID" %in% names(par_group_balanced(rv, 10)))) + + # gridded + testthat::expect_no_error( + pgg_terra <- par_group_grid(rv, 10, 100) + ) + testthat::expect_no_error( + pgg_sf <- par_group_grid(rs, 10, 100) + ) + testthat::expect_equal(length(pgg_terra), 2) + testthat::expect_equal(length(pgg_sf), 2) + testthat::expect_true(all(table(pgg_terra$original$CGRIDID) == 50)) + + testthat::expect_error( + par_group_grid(rv, NULL) + ) + testthat::expect_error( + par_group_grid(rv, 5L) + ) + testthat::expect_no_error( + par_group_grid(rv, 5L, "10000") + ) + testthat::expect_error( + suppressWarnings(par_group_grid(rv, 5L, "radius")) + ) + testthat::expect_error( + par_group_grid(rv, 5L, NA) + ) + +}) + + + + testthat::test_that("Quantile cut tests", { withr::local_package("sf") withr::local_package("terra") @@ -96,7 +152,7 @@ testthat::test_that("Grid split is well done.", { ncrp <- sf::st_as_sf(sf::st_sample(nc, 1000L)) # Points - testthat::expect_warning( + testthat::expect_no_warning( par_make_gridset( ncp, mode = "grid_advanced", @@ -162,8 +218,19 @@ testthat::test_that("Grid merge is well done.", { mode = "grid", nx = 20L, ny = 12L, padding = 1e4L) - testthat::expect_warning( - testthat::expect_message(par_merge_grid(ncptr2, griddedtr2$original, 15L)) + testthat::expect_message( + gridmerged2 <- par_merge_grid(ncptr2, griddedtr2$original, 15L) ) + testthat::expect_s4_class(gridmerged2, "SpatVector") + + griddedtr22 <- + par_make_gridset(ncptr2, + mode = "grid", + nx = 40L, ny = 20L, + padding = 1e4L) + testthat::expect_message( + gridmergedx <- par_merge_grid(ncptr2, griddedtr22$original, 10L, merge_max = 10L) + ) + }) diff --git a/tools/readme-source/README.Rmd b/tools/readme-source/README.Rmd index 37b1d31..60a9aa5 100644 --- a/tools/readme-source/README.Rmd +++ b/tools/readme-source/README.Rmd @@ -71,24 +71,24 @@ future::plan(future::multicore, workers = 4L) ```{r flowchart-mermaid, echo = FALSE, eval = (Sys.getenv("IN_GALLEY") == "")} mermaid_chart <- ' -graph LR - n72366978["Is the spatial resolution finer than 100 meters?"] - n21640044["Are there 100K+ features in the input vectors?"] - n38371203["Are there multiple rasters?"] +graph TD + n72366978["Spatial resolution <= 100 meters?"] + n21640044["100K+ features?"] + n38371203["Multiple rasters?"] n9291599["exact_extract with suitable max_cells_in_memory value"] - n84295645["Are they hierarchical?"] + n84295645["Hierarchical structure?"] n82902796["single thread processing"] - n76722667["Do they have the same extent and resolution?"] - n53137122["Is a single raster larger than your free memory space?"] - n34878990["Are your split_level have similar number of members?"] - n27787116["Are they spatially clustered?"] - n38458721["Do you have enough memory, say, more than the total disk space of the rasters?"] + n76722667["The same extent and resolution?"] + n53137122["Raster files larger than your free memory space?"] + n34878990["Split levels are in similar sizes?"] + n27787116["Spatially clustered?"] + n38458721["Free memory >= the total disk space of the rasters?"] n10339179["exact_extract with low max_cells_in_memory"] - n95964029["exact_extract with high max_cells_in_memory argument value"] + n95964029["exact_extract with high max_cells_in_memory"] n89847105["par_hierarchy"] - n94475834["par_make_gridset(..., mode = #quot;grid_quantile#quot;) or par_make_gridset_mode = #quot;grid_advanced#quot;)"] - n77415399["par_make_gridset(..., mode = #quot;grid#quot;)"] - n65357807["Stack rasters then process in the single thread"] + n94475834["par_make_gridset with mode = \'grid_quantile\' or mode = \'grid_advanced\'"] + n77415399["par_make_gridset with mode = \'grid\'"] + n65357807["Stack rasters then process with single thread"] n60994964["par_multirasters"] n64849552["par_grid"] n72366978 -->|Yes| n38371203 @@ -112,11 +112,11 @@ graph LR n94475834 --> n64849552 n77415399 --> n64849552 ' -DiagrammeR::mermaid(mermaid_chart, width = 2000, height = 1400) +DiagrammeR::mermaid(mermaid_chart, width = 2500, height = 600) ``` ## To run the examples -- RStudio: download and open this document then press "Run All Chunks Above", "Run All Chunks Below", or "Restart R and Run All Chunks", whichever it is appropriate. +- RStudio: download and open this document then press "Run All Chunks Above", "Run All Chunks Below", or "Restart R and Run All Chunks", whichever is appropriate. - Visual Studio Code (with R extension): download and open this document then press "Run Above" at the last code chunk. - If you prefer command line (i.e., in Unix-like operating systems), run: @@ -448,7 +448,6 @@ system.time( y = rd1 ) ) - ``` - We will compare the results from the single-thread and multi-thread calculation. @@ -464,8 +463,7 @@ all.equal(resj$distance.x, resj$distance.y) ## Why parallelization is slower than the ordinary function run? - Parallelization may underperform when the datasets are too small to take advantage of divide-and-compute approach, where parallelization overhead is involved. Overhead here refers to the required amount of computational resources for transferring objects to multiple processes. -- Since the demonstrations above use quite small datasets, the advantage of parallelization was not as noticeable as it was expected. Should a large amount of data (spatial/temporal resolution or number of files, for example) be processed, users could see the efficiency of this package. More illustrative and truly scaled examples will be provided in vignettes and manuscripts in the near future. - +- Since the demonstrations above use quite small datasets, the advantage of parallelization was not as noticeable as it was expected. Should a large amount of data (spatial/temporal resolution or number of files, for example) be processed, users could see the efficiency of this package. Please refer to a [vignette](https://kyle-messier.github.io/chopin/articles/v02_climate_examples.html) in this package for the demonstration of various climate/weather datasets. #### Last edited: March 22, 2024