Skip to content

Commit

Permalink
Start nanoparquet format
Browse files Browse the repository at this point in the history
  • Loading branch information
wlandau committed Aug 1, 2024
1 parent 3c850ab commit 025c0c4
Show file tree
Hide file tree
Showing 24 changed files with 306 additions and 22 deletions.
10 changes: 10 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ S3method(print,tar_resources_feather)
S3method(print,tar_resources_fst)
S3method(print,tar_resources_future)
S3method(print,tar_resources_gcp)
S3method(print,tar_resources_nanoparquet)
S3method(print,tar_resources_network)
S3method(print,tar_resources_parquet)
S3method(print,tar_resources_qs)
Expand All @@ -34,12 +35,14 @@ S3method(resources_validate,tar_resources_feather)
S3method(resources_validate,tar_resources_fst)
S3method(resources_validate,tar_resources_future)
S3method(resources_validate,tar_resources_gcp)
S3method(resources_validate,tar_resources_nanoparquet)
S3method(resources_validate,tar_resources_network)
S3method(resources_validate,tar_resources_parquet)
S3method(resources_validate,tar_resources_qs)
S3method(resources_validate,tar_resources_url)
S3method(store_assert_format,default)
S3method(store_assert_format,tar_feather)
S3method(store_assert_format,tar_nanoparquet)
S3method(store_assert_format,tar_parquet)
S3method(store_assert_format,tar_store_file)
S3method(store_assert_format,tar_url)
Expand All @@ -54,6 +57,7 @@ S3method(store_assert_format_setting,fst_dt)
S3method(store_assert_format_setting,fst_tbl)
S3method(store_assert_format_setting,gcp_file)
S3method(store_assert_format_setting,keras)
S3method(store_assert_format_setting,nanoparquet)
S3method(store_assert_format_setting,parquet)
S3method(store_assert_format_setting,qs)
S3method(store_assert_format_setting,rds)
Expand All @@ -74,6 +78,7 @@ S3method(store_class_format,fst)
S3method(store_class_format,fst_dt)
S3method(store_class_format,fst_tbl)
S3method(store_class_format,keras)
S3method(store_class_format,nanoparquet)
S3method(store_class_format,parquet)
S3method(store_class_format,qs)
S3method(store_class_format,rds)
Expand All @@ -86,6 +91,7 @@ S3method(store_convert_object,tar_feather)
S3method(store_convert_object,tar_fst)
S3method(store_convert_object,tar_fst_dt)
S3method(store_convert_object,tar_fst_tbl)
S3method(store_convert_object,tar_nanoparquet)
S3method(store_convert_object,tar_parquet)
S3method(store_convert_object,tar_store_custom)
S3method(store_convert_object,tar_store_file)
Expand All @@ -112,6 +118,7 @@ S3method(store_get_packages,tar_fst_dt)
S3method(store_get_packages,tar_fst_tbl)
S3method(store_get_packages,tar_gcp)
S3method(store_get_packages,tar_keras)
S3method(store_get_packages,tar_nanoparquet)
S3method(store_get_packages,tar_parquet)
S3method(store_get_packages,tar_qs)
S3method(store_get_packages,tar_torch)
Expand Down Expand Up @@ -162,6 +169,7 @@ S3method(store_read_path,tar_fst)
S3method(store_read_path,tar_fst_dt)
S3method(store_read_path,tar_fst_tbl)
S3method(store_read_path,tar_keras)
S3method(store_read_path,tar_nanoparquet)
S3method(store_read_path,tar_parquet)
S3method(store_read_path,tar_qs)
S3method(store_read_path,tar_rds)
Expand Down Expand Up @@ -205,6 +213,7 @@ S3method(store_write_object,tar_url)
S3method(store_write_path,tar_feather)
S3method(store_write_path,tar_fst)
S3method(store_write_path,tar_keras)
S3method(store_write_path,tar_nanoparquet)
S3method(store_write_path,tar_parquet)
S3method(store_write_path,tar_qs)
S3method(store_write_path,tar_rds)
Expand Down Expand Up @@ -477,6 +486,7 @@ export(tar_resources_feather)
export(tar_resources_fst)
export(tar_resources_future)
export(tar_resources_gcp)
export(tar_resources_nanoparquet)
export(tar_resources_network)
export(tar_resources_parquet)
export(tar_resources_qs)
Expand Down
66 changes: 66 additions & 0 deletions R/class_nanoparquet.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#' @export
store_class_format.nanoparquet <- function(format) {
c("tar_nanoparquet", "tar_store")
}

#' @export
store_assert_format_setting.nanoparquet <- function(format) {
}

#' @export
store_read_path.tar_nanoparquet <- function(store, path) {
object <- getNamespace("nanoparquet")[["read_parquet"]](
file = path,
options = store_nanoparquet_options()
)
metadata <- getNamespace("nanoparquet")[["parquet_metadata"]](file = path)
classes <- metadata$file_meta_data$key_value_metadata[[1L]]
class(object) <- classes$value[classes$key == "class"]
object
}

#' @export
store_write_path.tar_nanoparquet <- function(store, object, path) {
metadata <- class(object)
names(metadata) <- rep("class", length(metadata))
getNamespace("nanoparquet")[["write_parquet"]](
x = object,
file = path,
metadata = metadata,
compression = store$resources$nanoparquet[["compression"]],
options = store_nanoparquet_options()
)
}

#' @export
store_assert_format.tar_nanoparquet <- function(store, object, name) { # nolint
msg <- paste(
"target",
name,
"has nanoparquet format, so it must be a data frame",
"which can be converted to a tibble."
)
tar_assert_inherits(
x = object %|||% data.frame(),
class = "data.frame",
msg = msg
)
}

#' @export
store_convert_object.tar_nanoparquet <- function(store, object) {
if_any(is.null(object), as.data.frame(NULL), object)
}

#' @export
store_get_packages.tar_nanoparquet <- function(store) {
"nanoparquet"
}

store_nanoparquet_options <- function() {
getNamespace("nanoparquet")[["parquet_options"]](
class = character(0L),
use_arrow_metadata = TRUE,
write_arrow_metadata = TRUE
)
}
29 changes: 29 additions & 0 deletions R/class_resources_nanoparquet.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
resources_nanoparquet_init <- function(
compression = "snappy"
) {
resources_nanoparquet_new(
compression = compression
)
}

resources_nanoparquet_new <- function(
compression = NULL
) {
force(compression)
enclass(environment(), c("tar_resources_nanoparquet", "tar_resources"))
}

#' @export
resources_validate.tar_resources_nanoparquet <- function(resources) {
tar_assert_scalar(resources$compression)
tar_assert_chr(resources$compression)
tar_assert_nzchar(resources$compression)
}

#' @export
print.tar_resources_nanoparquet <- function(x, ...) {
cat(
"<tar_resources_nanoparquet>\n ",
paste0(paste_list(as.list(x)), collapse = "\n ")
)
}
11 changes: 9 additions & 2 deletions R/tar_resources.R
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@
#' @param feather Output of function [tar_resources_feather()].
#' Non-default arguments to `arrow::read_feather()` and
#' `arrow::write_feather()` for `arrow`/feather-based storage formats.
#' Applies to all formats ending with the `"_feather"` suffix.
#' Applies to the `"feather"` storage format.
#' For details on formats, see the `format` argument of [tar_target()].
#' @param fst Output of function `tar_resources_fst()`.
#' Non-default arguments to `fst::read_fst()` and
Expand All @@ -75,6 +75,12 @@
#' See the cloud storage section of
#' <https://books.ropensci.org/targets/data.html>
#' for details for instructions.
#' @param nanoparquet Output of function `tar_resources_nanoparquet()`.
#' Non-default arguments to `nanoparquet::read_parquet()` and
#' `nanoparquet::write_parquet()` for `arrow`/parquet-based storage formats.
#' Applies to the `"nanoparquet"` storage format.
#' For details on formats, see the `format` argument of [tar_target()].
#' @param qs Output of function `tar_resources_qs()`.
#' @param network Output of function `tar_resources_network()`.
#' Settings to configure how to handle unreliable network connections
#' in the case of uploading, downloading, and checking data
Expand All @@ -88,7 +94,7 @@
#' @param parquet Output of function `tar_resources_parquet()`.
#' Non-default arguments to `arrow::read_parquet()` and
#' `arrow::write_parquet()` for `arrow`/parquet-based storage formats.
#' Applies to all formats ending with the `"_parquet"` suffix.
#' Applies to the `"parquet"` storage format.
#' For details on formats, see the `format` argument of [tar_target()].
#' @param qs Output of function `tar_resources_qs()`.
#' Non-default arguments to `qs::qread()` and
Expand Down Expand Up @@ -120,6 +126,7 @@ tar_resources <- function(
fst = tar_option_get("resources")$fst,
future = tar_option_get("resources")$future,
gcp = tar_option_get("resources")$gcp,
nanoparquet = tar_option_get("resources")$nanoparquet,
network = tar_option_get("resources")$network,
parquet = tar_option_get("resources")$parquet,
qs = tar_option_get("resources")$qs,
Expand Down
32 changes: 32 additions & 0 deletions R/tar_resources_nanoparquet.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#' @title Target resources: nanoparquet storage format
#' @export
#' @family resources
#' @description Create the nanoparquet argument of `tar_resources()`
#' to specify optional settings for nanoparquet data frame storage formats
#' powered by the `nanoparquet` R package.
#' See the `format` argument of [tar_target()] for details.
#' @inheritSection tar_resources Resources
#' @return Object of class `"tar_resources_nanoparquet"`, to be supplied
#' to the nanoparquet argument of `tar_resources()`.
#' @param compression Character of length 1, `compression`
#' argument of `nanoparquet::write_parquet()`. Defaults to `"snappy"`.
#' @examples
#' # Somewhere in you target script file (usually _targets.R):
#' tar_target(
#' name,
#' command(),
#' format = "nanoparquet",
#' resources = tar_resources(
#' nanoparquet = tar_resources_nanoparquet(compression = "zstd")
#' )
#' )
tar_resources_nanoparquet <- function(
compression = targets::tar_option_get("resources")$nanoparquet$compression
) {
compression <- compression %|||% "snappy"
out <- resources_nanoparquet_init(
compression = compression
)
resources_validate(out)
out
}
16 changes: 10 additions & 6 deletions R/tar_target.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,18 +28,22 @@
#' * `"qs"`: Uses `qs::qsave()` and `qs::qread()`. Should work for
#' most objects, much faster than `"rds"`. Optionally set the
#' preset for `qsave()` through `tar_resources()` and `tar_resources_qs()`.
#' * `"feather"`: Uses `arrow::write_feather()` and
#' `arrow::read_feather()` (version 2.0). Much faster than `"rds"`,
#' but the value must be a data frame. Optionally set
#' `compression` and `compression_level` in `arrow::write_feather()`
#' through `tar_resources()` and `tar_resources_feather()`.
#' Requires the `arrow` package (not installed by default).
#' * `"nanoparquet"`: Uses `nanoparquet::write_parquet()` and
#' `nanoparquet::read_parquet()`. Much faster than `"rds"`, but
#' the value must be a data frame and is converted into a `tibble`.
#' Requires the `nanoparquet` package (not installed by default).
#' * `"parquet"`: Uses `arrow::write_parquet()` and
#' `arrow::read_parquet()` (version 2.0). Much faster than `"rds"`,
#' but the value must be a data frame. Optionally set
#' `compression` and `compression_level` in `arrow::write_parquet()`
#' through `tar_resources()` and `tar_resources_parquet()`.
#' Requires the `arrow` package (not installed by default).
#' * `"feather"`: Uses `arrow::write_feather()` and
#' `arrow::read_feather()` (version 2.0). Much faster than `"rds"`,
#' but the value must be a data frame. Optionally set
#' `compression` and `compression_level` in `arrow::write_feather()`
#' through `tar_resources()` and `tar_resources_feather()`.
#' Requires the `arrow` package (not installed by default).
#' * `"fst"`: Uses `fst::write_fst()` and `fst::read_fst()`.
#' Much faster than `"rds"`, but the value must be
#' a data frame. Optionally set the compression level for
Expand Down
16 changes: 10 additions & 6 deletions man/tar_option_set.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 10 additions & 2 deletions man/tar_resources.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions man/tar_resources_aws.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions man/tar_resources_clustermq.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions man/tar_resources_crew.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions man/tar_resources_custom_format.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions man/tar_resources_feather.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 025c0c4

Please sign in to comment.