Merge pull request #95 from NIEHS/0.8.0

test update
NIEHS · Aug 18, 2024 · 683e7f5 · 683e7f5
2 parents 18490e6 + a687e4f
commit 683e7f5
Show file tree

Hide file tree

Showing 7 changed files with 455 additions and 36 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: chopin
 Title: Computation of Spatial Data by Hierarchical and Objective Partitioning of Inputs for Parallel Processing
-Version: 0.7.8.20240814
+Version: 0.7.8.20240817
 Authors@R: c(
     person("Insang", "Song", , "geoissong@gmail.com", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0001-8732-3256")),

diff --git a/R/processing.R b/R/processing.R
@@ -127,7 +127,7 @@ kernelfunction <-
 #' @param id character(1). Name of unique identifier field.
 #' @param func character(1)/function. supported function names or functions
 #'   taking `x` and `w` in `exactextractr::exact_extract`
-#' @param extent numeric
+#' @param extent numeric. Passed to `.check_vector`.
 #' @param radius numeric(1).
 #' @param out_class character(1). "sf" or "terra"
 #' @param kernel character(1). Name of kernel functions [kernelfunction]
@@ -492,12 +492,12 @@ setMethod(
     y = NULL,
     id = NULL,
     func = "mean",
-    extent,
-    radius,
+    extent = NULL,
+    radius = NULL,
     out_class = "sf",
     kernel = NULL,
     kernel_func = stats::weighted.mean,
-    bandwidth,
+    bandwidth = NULL,
     max_cells = 3e+07,
     .standalone = TRUE,
     ...

diff --git a/README.Rmd b/README.Rmd
@@ -57,6 +57,18 @@ future::plan(future.mirai::mirai_multisession, workers = 4L)
 # the number of workers are up to users' choice
 ```
 
+
+### Function selection guide
+
+We provide two flowcharts to help users choose the right function for parallel processing. The raster-oriented flowchart is for users who want to start with raster data, and the vector-oriented flowchart is for users with large vector data.
+
+In **raster-oriented selection**, we suggest four factors to consider:
+
+- Number of raster files: for multiple files, `par_multirasters` is recommended. When there are multiple rasters that share the same extent and resolution, consider stacking the rasters into a multilayer `SpatRaster` object by calling `terra::rast(filenames)`.
+- Raster resolution: We suggest 100 meters as a threshold. Rasters with a resolution coarser than 100 meters and only a few layers are better handled by a direct call to `exactextractr::exact_extract()`.
+- Raster extent: Calling `exactextractr::exact_extract()` with a `SpatRaster` is often minimally affected by the raster extent.
+- Memory size: The `max_cells_in_memory` argument value of `exactextractr::exact_extract()`, the raster resolution, and the number of layers in the `SpatRaster` are multiplicatively related to memory usage.
+
+
 ```{r flowchart-mermaid-raster, echo = FALSE, eval = (Sys.getenv("IN_GALLEY") == "")}
 mermaid_chart_raster <-
 '
@@ -86,6 +98,11 @@ graph LR
 DiagrammeR::mermaid(mermaid_chart_raster, width = 1200, height = 400)
 ```
 
+For **vector-oriented selection**, we suggest three factors to consider:
+
+- Number of features: When the number of features is over 100,000, consider using `par_grid` or `par_hierarchy` to split the data into smaller chunks.
+- Hierarchical structure: If the data has a hierarchical structure, consider using `par_hierarchy` to parallelize the operation.
+- Data grouping: If the data needs to be grouped in similar sizes, consider using `par_pad_balanced` or `par_pad_grid` with `mode = "grid_quantile"`.
+
 ```{r flowchart-mermaid-vector, echo = FALSE, eval = (Sys.getenv("IN_GALLEY") == "")}
 mermaid_chart_vector <-
 '

diff --git a/README.md b/README.md
@@ -82,8 +82,37 @@ future::plan(future.mirai::mirai_multisession, workers = 4L)
 # the number of workers are up to users' choice
 ```
 
+### Function selection guide
+
+We provide two flowcharts to help users choose the right function for
+parallel processing. The raster-oriented flowchart is for users who want
+to start with raster data, and the vector-oriented flowchart is for
+users with large vector data.
+
+In **raster-oriented selection**, we suggest four factors to consider:
+
+-   Number of raster files: for multiple files, `par_multirasters` is
+    recommended. When there are multiple rasters that share the same
+    extent and resolution, consider stacking the rasters into a
+    multilayer `SpatRaster` object by calling `terra::rast(filenames)`.
+-   Raster resolution: We suggest 100 meters as a threshold. Rasters
+    with a resolution coarser than 100 meters and only a few layers are
+    better handled by a direct call to
+    `exactextractr::exact_extract()`.
+-   Raster extent: Calling `exactextractr::exact_extract()` with a
+    `SpatRaster` is often minimally affected by the raster extent.
+-   Memory size: The `max_cells_in_memory` argument value of
+    `exactextractr::exact_extract()`, the raster resolution, and the
+    number of layers in the `SpatRaster` are multiplicatively related
+    to memory usage.
+
 <img src="man/figures/README-flowchart-mermaid-raster-1.png" width="100%" />
 
+For **vector-oriented selection**, we suggest three factors to
+consider:
+
+-   Number of features: When the number of features is over 100,000,
+    consider using `par_grid` or `par_hierarchy` to split the data into
+    smaller chunks.
+-   Hierarchical structure: If the data has a hierarchical structure,
+    consider using `par_hierarchy` to parallelize the operation.
+-   Data grouping: If the data needs to be grouped in similar sizes,
+    consider using `par_pad_balanced` or `par_pad_grid` with
+    `mode = "grid_quantile"`.
+
 <img src="man/figures/README-flowchart-mermaid-vector-1.png" width="100%" />
 
 ## Installation
@@ -207,7 +236,7 @@ system.time(
 )
 #> Input is a character. Attempt to read it with terra::rast...
 #>    user  system elapsed 
-#>   7.205   0.206   7.669
+#>   7.115   0.227   7.537
 ```
 
 #### Generate regular grid computational regions
@@ -298,7 +327,7 @@ system.time(
 #> Input is a character. Attempt to read it with terra::rast...
 #> ℹ Task at CGRIDID: 4 is successfully dispatched.
 #>    user  system elapsed 
-#>   0.329   0.021   8.337
+#>   0.360   0.010   8.089
 
 ncpoints_srtm <-
   extract_at(
@@ -358,7 +387,7 @@ path_nchrchy <- file.path(wdir, "nc_hierarchy.gpkg")
 nc_data <- path_nchrchy
 nc_county <- sf::st_read(nc_data, layer = "county")
 #> Reading layer `county' from data source 
-#>   `/tmp/RtmpkigVUL/temp_libpath3cdb0b6b55ee11/chopin/extdata/nc_hierarchy.gpkg' 
+#>   `/tmp/RtmpPipkp9/temp_libpath2270313b1b582/chopin/extdata/nc_hierarchy.gpkg' 
 #>   using driver `GPKG'
 #> Simple feature collection with 100 features and 1 field
 #> Geometry type: POLYGON
@@ -367,7 +396,7 @@ nc_county <- sf::st_read(nc_data, layer = "county")
 #> Projected CRS: NAD83 / Conus Albers
 nc_tracts <- sf::st_read(nc_data, layer = "tracts")
 #> Reading layer `tracts' from data source 
-#>   `/tmp/RtmpkigVUL/temp_libpath3cdb0b6b55ee11/chopin/extdata/nc_hierarchy.gpkg' 
+#>   `/tmp/RtmpPipkp9/temp_libpath2270313b1b582/chopin/extdata/nc_hierarchy.gpkg' 
 #>   using driver `GPKG'
 #> Simple feature collection with 2672 features and 1 field
 #> Geometry type: MULTIPOLYGON
@@ -395,7 +424,7 @@ system.time(
 )
 #> Input is a character. Attempt to read it with terra::rast...
 #>    user  system elapsed 
-#>   0.910   0.003   0.916
+#>   0.897   0.003   0.904
 
 # hierarchical parallelization
 system.time(
@@ -513,7 +542,7 @@ system.time(
 #> Input is a character. Attempt to read it with terra::rast...ℹ Your input function at 37055 is dispatched.
 #> Input is a character. Attempt to read it with terra::rast...ℹ Your input function at 37047 is dispatched.
 #>    user  system elapsed 
-#>   0.258   0.073   2.005
+#>   0.249   0.077   2.016
 ```
 
 ### `par_multirasters`: parallelize over multiple rasters
@@ -540,9 +569,9 @@ terra::writeRaster(ncelev, file.path(tdir, "test5.tif"), overwrite = TRUE)
 # check if the raster files were exported as expected
 testfiles <- list.files(tdir, pattern = "*.tif$", full.names = TRUE)
 testfiles
-#> [1] "/tmp/Rtmpqu68xF/test1.tif" "/tmp/Rtmpqu68xF/test2.tif"
-#> [3] "/tmp/Rtmpqu68xF/test3.tif" "/tmp/Rtmpqu68xF/test4.tif"
-#> [5] "/tmp/Rtmpqu68xF/test5.tif"
+#> [1] "/tmp/RtmpxL0QL4/test1.tif" "/tmp/RtmpxL0QL4/test2.tif"
+#> [3] "/tmp/RtmpxL0QL4/test3.tif" "/tmp/RtmpxL0QL4/test4.tif"
+#> [5] "/tmp/RtmpxL0QL4/test5.tif"
 ```
 
 ``` r
@@ -559,32 +588,32 @@ system.time(
 )
 #> ℹ Input is not a character.
 #> Input is a character. Attempt to read it with terra::rast...
-#> ℹ Your input function at /tmp/Rtmpqu68xF/test1.tif is dispatched.
+#> ℹ Your input function at /tmp/RtmpxL0QL4/test1.tif is dispatched.
 #> 
 #> Input is a character. Attempt to read it with terra::rast...
-#> ℹ Your input function at /tmp/Rtmpqu68xF/test2.tif is dispatched.
+#> ℹ Your input function at /tmp/RtmpxL0QL4/test2.tif is dispatched.
 #> 
 #> Input is a character. Attempt to read it with terra::rast...
-#> ℹ Your input function at /tmp/Rtmpqu68xF/test3.tif is dispatched.
+#> ℹ Your input function at /tmp/RtmpxL0QL4/test3.tif is dispatched.
 #> 
 #> Input is a character. Attempt to read it with terra::rast...
-#> ℹ Your input function at /tmp/Rtmpqu68xF/test4.tif is dispatched.
+#> ℹ Your input function at /tmp/RtmpxL0QL4/test4.tif is dispatched.
 #> 
 #> Input is a character. Attempt to read it with terra::rast...
-#> ℹ Your input function at /tmp/Rtmpqu68xF/test5.tif is dispatched.
+#> ℹ Your input function at /tmp/RtmpxL0QL4/test5.tif is dispatched.
 #>    user  system elapsed 
-#>   1.713   0.147   2.747
+#>   1.714   0.139   2.718
 knitr::kable(head(res))
 ```
 
 |      mean | base_raster               |
 |----------:|:--------------------------|
-| 136.80203 | /tmp/Rtmpqu68xF/test1.tif |
-| 189.76170 | /tmp/Rtmpqu68xF/test1.tif |
-| 231.16968 | /tmp/Rtmpqu68xF/test1.tif |
-|  98.03845 | /tmp/Rtmpqu68xF/test1.tif |
-|  41.23463 | /tmp/Rtmpqu68xF/test1.tif |
-| 270.96933 | /tmp/Rtmpqu68xF/test1.tif |
+| 136.80203 | /tmp/RtmpxL0QL4/test1.tif |
+| 189.76170 | /tmp/RtmpxL0QL4/test1.tif |
+| 231.16968 | /tmp/RtmpxL0QL4/test1.tif |
+|  98.03845 | /tmp/RtmpxL0QL4/test1.tif |
+|  41.23463 | /tmp/RtmpxL0QL4/test1.tif |
+| 270.96933 | /tmp/RtmpxL0QL4/test1.tif |
 
 ``` r
 # remove temporary raster files
@@ -619,7 +648,7 @@ pnts <- sf::st_as_sf(pnts)
 pnts$pid <- sprintf("RPID-%04d", seq(1, 5000))
 rd1 <- sf::st_read(path_ncrd1)
 #> Reading layer `ncroads_first' from data source 
-#>   `/tmp/RtmpkigVUL/temp_libpath3cdb0b6b55ee11/chopin/extdata/ncroads_first.gpkg' 
+#>   `/tmp/RtmpPipkp9/temp_libpath2270313b1b582/chopin/extdata/ncroads_first.gpkg' 
 #>   using driver `GPKG'
 #> Simple feature collection with 620 features and 4 fields
 #> Geometry type: MULTILINESTRING
@@ -673,11 +702,11 @@ system.time(
   restr <- terra::nearest(x = terra::vect(pntst), y = terra::vect(rd1t))
 )
 #>    user  system elapsed 
-#>   0.603   0.000   0.604
+#>   0.602   0.000   0.603
 
 pnt_path <- file.path(tdir, "pntst.gpkg")
 sf::st_write(pntst, pnt_path)
-#> Writing layer `pntst' to data source `/tmp/Rtmpqu68xF/pntst.gpkg' using driver `GPKG'
+#> Writing layer `pntst' to data source `/tmp/RtmpxL0QL4/pntst.gpkg' using driver `GPKG'
 #> Writing 5000 features with 1 fields and geometry type Point.
 
 # we use four threads that were configured above
@@ -723,7 +752,7 @@ system.time(
 #> ℹ Input is a character. Trying to read with terra .
 #> ℹ Task at CGRIDID: 8 is successfully dispatched.
 #>    user  system elapsed 
-#>   0.085   0.000   0.534
+#>   0.079   0.005   0.448
 ```
 
 -   We will compare the results from the single-thread and multi-thread

diff --git a/codemeta.json b/codemeta.json
@@ -7,7 +7,7 @@
   "codeRepository": "https://github.com/NIEHS/chopin",
   "issueTracker": "https://github.com/NIEHS/chopin/issues",
   "license": "https://spdx.org/licenses/MIT",
-  "version": "0.7.8.20240814",
+  "version": "0.7.8.20240817",
   "programmingLanguage": {
     "@type": "ComputerLanguage",
     "name": "R",
@@ -368,7 +368,7 @@
     },
     "SystemRequirements": "NetCDF4"
   },
-  "fileSize": "27932.011KB",
+  "fileSize": "27933.593KB",
   "releaseNotes": "https://github.com/NIEHS/chopin/blob/master/NEWS.md",
   "readme": "https://github.com/NIEHS/chopin/blob/main/README.md",
   "contIntegration": ["https://github.com/NIEHS/chopin/actions", "https://github.com/NIEHS/chopin/actions/workflows/check-standard.yaml"],

diff --git a/tests/testthat/test-gridding.R b/tests/testthat/test-gridding.R
@@ -307,3 +307,55 @@ testthat::test_that("Quantile cut internal tests", {
   )
 
 })
+
+# par_split_list tests: sf grid, WKT grid, and plain-list inputs ####
+testthat::test_that("par_split_list returns correct output", {
+  withr::local_package("sf")
+  withr::local_package("terra")
+  withr::local_options(list(sf_use_s2 = FALSE))
+
+  ncpath <- system.file("shape/nc.shp", package = "sf")
+  nc <- read_sf(ncpath)
+  nc <- st_transform(nc, "EPSG:5070")
+  nc_comp_region <-
+    par_pad_grid(
+      nc,
+      mode = "grid",
+      nx = 4L, ny = 2L,
+      padding = 10000
+    )
+
+  ps <- par_split_list(nc_comp_region)
+
+  testthat::expect_true(is.list(ps))
+  testthat::expect_equal(length(ps), 8)
+
+  # WKT mode: same 4 x 2 grid with return_wkt = TRUE; $original is character
+  nc_comp_region_wkt <-
+    par_pad_grid(
+      nc,
+      mode = "grid",
+      nx = 4L, ny = 2L,
+      padding = 10000,
+      return_wkt = TRUE
+    )
+  psw <- par_split_list(nc_comp_region_wkt)
+
+  testthat::expect_true(is.list(psw))
+  testthat::expect_equal(length(psw), 8)
+  testthat::expect_true(is.character(psw[[1]]$original))
+
+
+  # Plain-list input: two five-row data.frames (not par_pad_grid output)
+  gridlist <- list(data.frame(x = c(1, 2, 3, 4, 5),
+                              y = c(1, 2, 3, 4, 5)),
+                   data.frame(x = c(8, 6, 4, 0, -1),
+                              y = c(7, 5, 3, 0, -1)))
+
+  # Split the plain list with the same function
+  result <- par_split_list(gridlist)
+
+  # Element 1 pairs row 1 of the first (original) and second (padded) data.frame
+  testthat::expect_equal(result[[1]]$original, data.frame(x = 1, y = 1))
+  testthat::expect_equal(result[[1]]$padded, data.frame(x = 8, y = 7))
+})