Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

177 import preprocessed hdf5 #181

Merged
merged 2 commits into from
Apr 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/R-CMD-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ jobs:
- {os: windows-latest, r: 'release'}
- {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'}
- {os: ubuntu-latest, r: 'release'}
- {os: ubuntu-latest, r: 'oldrel-1'}
# - {os: ubuntu-latest, r: 'oldrel-1'}

env:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
Expand Down
25 changes: 25 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,30 @@

# Birdflow 0.1.0.9057
2024-04-17

* `import_birdflow()` now works with preprocessed hdf5 files. See #177
* `preprocess_birdflow()` and `import_birdflow()` are now more consistent
with some dimension names:
* `bf$geom$ext` is now an unnamed vector in objects from either function.
Previously one was a named vector. The `ext(bf)` return object is unchanged.
* `distr`, `dynamic_mask`, `uci`, and `lci` now always have `NULL`
rownames and
`"t<timestep>"` column names, and the names
of those two dimensions are `"i"` and `"time"`.
* `import_birdflow()` no longer drops two metadata items that it was losing
before: `ebirdst_version` and `birdflowr_preprocess_version` these
will exist in old `.hdf5` files but not in old `.rds` files.
* `compare_lists()` (internal helper) return format changed slightly.
* `import_birdflow()` and `export_birdflow()` now work with sparse models.
Marginals from these models are converted to standard matrices before
writing so some of the benefits of sparsification are lost when writing to
hdf5 files - but compression will probably mitigate this somewhat. Sparse
Matrices are re-created when sparse models are re-imported.


# BirdFlowR 0.1.0.9056
2024-04-04

## Flux III
* `is_between()` now uses a `SparseArray::SparseArray()` for the logical array.
to reduce memory usage
Expand Down
12 changes: 12 additions & 0 deletions R/clean_hdf5_dataframe.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@

# This strips weird extra within column attributes from dataframes returned
# by h5read
# It's used by import_birdflow_v3.
clean_hdf5_dataframe <- function(df) {
if (!inherits(df, "data.frame"))
return(df)

for (i in seq_len(ncol(df)))
df[[i]] <- as.vector(df[[i]])
return(df)
}
7 changes: 3 additions & 4 deletions R/compare_list_item_names.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,13 @@ compare_list_item_names <- function(x, y, map = "x", differences) {
differences <- rbind(
differences,
data.frame(where = map,
differences = paste0("extra:",
paste0(lost, collapse = ", "))))
differences = paste0("extra:", lost)))

if (length(gained) != 0)
differences <- rbind(
differences,
data.frame(where = map,
differences = paste0("missing:",
paste0(gained, collapse = ", "))))
differences = paste0("missing:", gained)))
return(differences)
}

Expand Down
8 changes: 8 additions & 0 deletions R/export_birdflow.R
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,14 @@ export_birdflow <- function(bf, file = NULL,
return()
}

# Convert sparse matrices to standard (for hdf5 only)
if (has_marginals(bf) && bf$metadata$is_sparse) {
mn <- setdiff(names(bf$marginals), "index")
for (m in mn) {
bf$marginals[[m]] <- as.matrix(bf$marginals[[m]])
}
}

# Write HDF5
ns <- names(bf)
for (i in seq_along(ns)) {
Expand Down
237 changes: 154 additions & 83 deletions R/import_birdflow_v3.R
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ import_birdflow_v3 <- function(hdf5) {
"geom/crs",
"geom/mask",
"transitions",
"marginals",
"dates",
"distr",
"species",
Expand All @@ -72,8 +71,6 @@ import_birdflow_v3 <- function(hdf5) {
"metadata/has_marginals",
"metadata/has_transitions",
"metadata/has_distr",
"metadata/hyperparameters",
"metadata/loss_values",
"metadata/n_transitions",
"metadata/n_active",
"metadata/n_timesteps",
Expand All @@ -82,13 +79,26 @@ import_birdflow_v3 <- function(hdf5) {
"metadata/ebird_access_end_date",
"metadata/birdflow_preprocess_date",
"metadata/birdflow_model_date",
"metadata/is_sparse",
"marginals")
"metadata/is_sparse"
)

fit_model_items <- c(
"metadata/hyperparameters",
"metadata/loss_values",
"marginals"
)


# Check HDF5 for version consistency and missing contents
contents <- h5ls(hdf5)
contents <- paste0(contents$group, "/", contents$name)
contents <- gsub("^/*", "", contents)

is_fitted_model <- "marginals" %in% contents
if (is_fitted_model) {
expected_contents <- c(expected_contents, fit_model_items)
}

absent <- setdiff(expected_contents, contents)
extra <- setdiff(contents, expected_contents) # nolint: object_usage_linter

Expand All @@ -97,6 +107,7 @@ import_birdflow_v3 <- function(hdf5) {
paste(absent, collapse = "', '"), "'")
}


expected_version <- 3 # of HDF5 BirdFlow export
version <- as.vector(h5read(hdf5, "metadata/birdflow_version"))
if (version != expected_version) {
Expand Down Expand Up @@ -127,31 +138,34 @@ import_birdflow_v3 <- function(hdf5) {
}

# hyperparameters
hp <- h5read(hdf5, "metadata/hyperparameters")
# hdf5 seems to store logical as a factor or at least R reads them as such.
# The code below looks for factors that store logical values and
# explicitly converts them to logical
for (i in seq_along(hp)) {
a <- hp[[i]] # this hyper parameter
if (is.factor(a) && all(tolower(levels(a)) %in% c("true", "false"))) {
a <- as.logical(a)
if (is_fitted_model) {

hp <- h5read(hdf5, "metadata/hyperparameters")
# hdf5 seems to store logical as a factor or at least R reads them as such.
# The code below looks for factors that store logical values and
# explicitly converts them to logical
for (i in seq_along(hp)) {
a <- hp[[i]] # this hyper parameter
if (is.factor(a) && all(tolower(levels(a)) %in% c("true", "false"))) {
a <- as.logical(a)
}
if (inherits(a, "array")) {
a <- as.vector(a)
}
hp[[i]] <- a
}
if (inherits(a, "array")) {
a <- as.vector(a)
bf$metadata$hyperparameters <- hp

# loss values
lv <- as.data.frame(h5read(hdf5, "metadata/loss_values"))
for (i in seq_len(ncol(lv))) {
# IF R re-exports an imported hdf5 the loss values columns are each
# arrays. This returns them to standard data.frame columns
lv[[i]] <- as.vector(lv[[i]])
}
hp[[i]] <- a
}
bf$metadata$hyperparameters <- hp

# loss values
lv <- as.data.frame(h5read(hdf5, "metadata/loss_values"))
for (i in seq_len(ncol(lv))) {
# IF R re-exports an imported hdf5 the loss values columns are each
# arrays. This returns them to standard data.frame columns
lv[[i]] <- as.vector(lv[[i]])
}
bf$metadata$loss_values <- lv
bf$metadata$loss_values <- lv

} # end only for fitted models


# dates
Expand All @@ -162,77 +176,134 @@ import_birdflow_v3 <- function(hdf5) {
colnames(dates) <- gsub("^week_", "", colnames(dates))
bf$dates <- dates

# Save marginals into list
marg <- h5read(hdf5, "marginals", native = TRUE)
nt <- length(marg[!names(marg) == "index"])
bf$metadata$n_transitions <- nt
if (is.null(bf$metadata$timestep_padding))
bf$metadata$timestep_padding <- nchar(nt)
circular <- nt == length(unique(dates$date))
bf$marginals <- vector(mode = "list", length = nt)

# If the hdf5 has been re-exported from R than we just copy the marginals over
if ("index" %in% names(marg)) {
bf$marginals <- marg
} else {
# If this hdf5 was written by python then we need to copy and rename
# marginals
for (i in seq_len(nt)) {
python_label <- paste0("Week", i, "_to_", i + 1)
if (circular && i == nt) {
label <- paste0("M_", pad_timestep(i, bf), "-", pad_timestep(1, bf))
} else {
label <- paste0("M_", pad_timestep(i, bf), "-", pad_timestep(i + 1, bf))
if (is_fitted_model) {

# Save marginals into list
marg <- h5read(hdf5, "marginals", native = TRUE)
nt <- length(marg[!names(marg) == "index"])
bf$metadata$n_transitions <- nt
if (is.null(bf$metadata$timestep_padding))
bf$metadata$timestep_padding <- nchar(nt)
circular <- nt == length(unique(dates$date))
bf$marginals <- vector(mode = "list", length = nt)

# If the hdf5 has been re-exported from R, just copy the marginals over
if ("index" %in% names(marg)) {
bf$marginals <- marg
} else {
# If this hdf5 was written by python then we need to copy and rename
# marginals
for (i in seq_len(nt)) {
python_label <- paste0("Week", i, "_to_", i + 1)
if (circular && i == nt) {
label <- paste0("M_", pad_timestep(i, bf), "-", pad_timestep(1, bf))
} else {
label <- paste0("M_", pad_timestep(i, bf), "-",
pad_timestep(i + 1, bf))
}
bf$marginals[[i]] <- marg[[python_label]]
names(bf$marginals)[i] <- label
}
bf$marginals[[i]] <- marg[[python_label]]
names(bf$marginals)[i] <- label
bf$metadata$has_marginals <- TRUE
}
bf$metadata$has_marginals <- TRUE
}

# Save distributions
bf$distr <- h5read(hdf5, "distr", native = TRUE)

# Cleanup duplicated distribution, dynamic_mask row, and date added to
# input to force circular model fitting.
if (is_fitted_model) {
# Cleanup duplicated distribution, dynamic_mask row, and date added to
# input to force circular model fitting.

# Cleanup duplicated date row
sv <- duplicated(bf$dates$date)
if (any(sv)) {
bf$dates <- bf$dates[!sv, ]
}
bf$metadata$n_timesteps <- nrow(bf$dates)

# Delete duplicated distribution
d <- bf$distr
if (ncol(d) == n_timesteps(bf) + 1) {
if (!all(d[, 1] == d[, ncol(d)]))
stop("Expected extra distribution to match first distribution")
d <- d[, 1:(ncol(d) - 1)]
}

bf$distr <- d

# Delete duplicated dynamic mask row
dm <- bf$geom$dynamic_mask
if (ncol(dm) == n_timesteps(bf) + 1) {
if (!all(dm[, 1] == dm[, ncol(dm)]))
stop("Expected first and last dynamic mask columns to matrch in ",
"circular BirdFlow model")
dm <- dm[, 1:(ncol(dm) - 1)]
}

bf$geom$dynamic_mask <- dm

# Make and save marginal index
bf$marginals$index <- make_marginal_index(bf)

} # end fitted model only

# Cleanup duplicated date row
sv <- duplicated(bf$dates$date)
if (any(sv)) {
bf$dates <- bf$dates[!sv, ]
}
bf$metadata$n_timesteps <- nrow(bf$dates)

# Delete duplicated distribution
d <- bf$distr
if (ncol(d) == n_timesteps(bf) + 1) {
if (!all(d[, 1] == d[, ncol(d)]))
stop("Expected extra distribution to match first distribution")
d <- d[, 1:(ncol(d) - 1)]
}

### back compatibility code
# Restore distr and dynamic mask dimnames (lost in hdf5 write+read)
ts_col <- ifelse(bf$metadata$ebird_version_year < 2022,
"interval",
"timestep"
)
dimnames(d) <- list(i = NULL, time = paste0("t", bf$dates[[ts_col]]))
bf$distr <- d

# Delete duplicated dynamic mask row
dm <- bf$geom$dynamic_mask
if (ncol(dm) == n_timesteps(bf) + 1) {
if (!all(dm[, 1] == dm[, ncol(dm)]))
stop("Expected first and last dynamic mask columns to matrch in circular",
"BirdFlow model")
dm <- dm[, 1:(ncol(dm) - 1)]
) ### back compatibility code

# Set dimnames for distr and dynamic mask
dn <- list(i = NULL, time = paste0("t", bf$dates[[ts_col]]))
dimnames(bf$distr) <- dn
dimnames(bf$geom$dynamic_mask) <- dn

if (!is_fitted_model) {
# Need to import some stuff here
bf$marginals <- NULL
bf$distances <- as.numeric(h5read(hdf5, "distances"))
bf$uci <- h5read(hdf5, "uci", native = TRUE)
bf$lci <- h5read(hdf5, "lci", native = TRUE)
dimnames(bf$uci) <- dn
dimnames(bf$lci) <- dn
}

dimnames(dm) <- list(i = NULL, time = paste0("t", bf$dates[[ts_col]]))
bf$geom$dynamic_mask <- dm
# Convert sparse matricies (if present) back into sparse matrices
# (only relevant if reimporting a previously imported and exported model)
if (has_marginals(bf) && bf$metadata$is_sparse) {
mn <- setdiff(names(bf$marginals), "index")
for (m in mn) {
bf$marginals[[m]] <- Matrix::Matrix(bf$marginals[[m]], sparse = TRUE)
}

# Clean up metadata$sparse attriubtes and order problems

sparse <- bf$metadata$sparse

# Restore standard order to sparse if names are as expected
sparse_order <-
c("fix_stats", "method", "arguments", "stats",
"pct_zero", "pct_density_lost")
if (setequal(sparse_order, names(sparse))) {
sparse <- sparse[sparse_order]
}

# Remove extra attributes hidden in data frame columns and vectors
for (i in seq_along(bf$metadata$sparse)) {
if (inherits(sparse[[i]], "data.frame")) {
sparse[[i]] <- clean_hdf5_dataframe(sparse[[i]])
} else {
sparse[[i]] <- as.vector(sparse[[i]])
}
}
# Cleanup extra attributes in argument list
if ("arguments" %in% names(sparse))
sparse$arguments <- lapply(sparse$arguments, as.vector)

# Make and save marginal index
bf$marginals$index <- make_marginal_index(bf)
bf$metadata$sparse <- sparse
} # end if sparse

return(bf)
}
2 changes: 2 additions & 0 deletions R/new_birdflow.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,11 @@ new_BirdFlow <- function() {
ebird_version_year = NA_integer_,
ebird_release_year = NA_integer_,
ebird_access_end_date = NA,
ebirdst_version = NA,
birdflow_preprocess_date = NA,
birdflow_model_date = NA,
birdflow_version = 3,
birdflowr_preprocess_version = NA,
birdflowr_version =
as.character(utils::packageVersion("BirdFlowR")),
is_sparse = FALSE,
Expand Down
Loading
Loading