Skip to content

Commit

Permalink
Merge pull request #181 from birdflow-science/177-import-preprocessed…
Browse files Browse the repository at this point in the history
…-hdf5

177 import preprocessed hdf5
  • Loading branch information
ethanplunkett committed Apr 23, 2024
2 parents a42cf70 + a212c3e commit dddb54d
Show file tree
Hide file tree
Showing 16 changed files with 330 additions and 122 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/R-CMD-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ jobs:
- {os: windows-latest, r: 'release'}
- {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'}
- {os: ubuntu-latest, r: 'release'}
- {os: ubuntu-latest, r: 'oldrel-1'}
# - {os: ubuntu-latest, r: 'oldrel-1'}

env:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
Expand Down
25 changes: 25 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,30 @@

# Birdflow 0.1.0.9057
2024-04-17

* `import_birdflow()` now works with preprocessed hdf5 files. See #177
* `preprocess_birdflow()` and `import_birdflow()` are now more consistent
with some dimension names:
* `bf$geom$ext` is now an unnamed vector in objects from either function.
Previously one was a named vector. The `ext(bf)` return object is unchanged.
* `distr`, `dynamic_mask`, `uci`, and `lci` now always have `NULL`
rownames and
`"t<timestep>"` column names, and the names
of those two dimensions are `"i"` and `"time"`.
* `import_birdflow()` no longer drops two metadata items that it was losing
before: `ebirdst_version` and `birdflowr_preprocess_version` these
will exist in old `.hdf5` files but not in old `.rds` files.
* `compare_lists()` (internal helper) return format changed slightly.
* `import_birdflow()` and `export_birdflow()` now work with sparse models.
Marginals from these models are converted to standard matrices before
writing so some of the benefits of sparsification are lost when writing to
hdf5 files - but compression will probably mitigate this somewhat. Sparse
Matrices are re-created when sparse models are re-imported.


# BirdFlowR 0.1.0.9056
2024-04-04

## Flux III
* `is_between()` now uses a `SparseArray::SparseArray()` for the logical array.
to reduce memory usage
Expand Down
12 changes: 12 additions & 0 deletions R/clean_hdf5_dataframe.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@

# This strips weird extra within column attributes from dataframes returned
# by h5read
# It's used by import_birdflow_v3.
clean_hdf5_dataframe <- function(df) {
if (!inherits(df, "data.frame"))
return(df)

for (i in seq_len(ncol(df)))
df[[i]] <- as.vector(df[[i]])
return(df)
}
7 changes: 3 additions & 4 deletions R/compare_list_item_names.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,13 @@ compare_list_item_names <- function(x, y, map = "x", differences) {
differences <- rbind(
differences,
data.frame(where = map,
differences = paste0("extra:",
paste0(lost, collapse = ", "))))
differences = paste0("extra:", lost)))

if (length(gained) != 0)
differences <- rbind(
differences,
data.frame(where = map,
differences = paste0("missing:",
paste0(gained, collapse = ", "))))
differences = paste0("missing:", gained)))
return(differences)
}

Expand Down
8 changes: 8 additions & 0 deletions R/export_birdflow.R
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,14 @@ export_birdflow <- function(bf, file = NULL,
return()
}

# Convert sparse matrices to standard (for hdf5 only)
if (has_marginals(bf) && bf$metadata$is_sparse) {
mn <- setdiff(names(bf$marginals), "index")
for (m in mn) {
bf$marginals[[m]] <- as.matrix(bf$marginals[[m]])
}
}

# Write HDF5
ns <- names(bf)
for (i in seq_along(ns)) {
Expand Down
237 changes: 154 additions & 83 deletions R/import_birdflow_v3.R
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ import_birdflow_v3 <- function(hdf5) {
"geom/crs",
"geom/mask",
"transitions",
"marginals",
"dates",
"distr",
"species",
Expand All @@ -72,8 +71,6 @@ import_birdflow_v3 <- function(hdf5) {
"metadata/has_marginals",
"metadata/has_transitions",
"metadata/has_distr",
"metadata/hyperparameters",
"metadata/loss_values",
"metadata/n_transitions",
"metadata/n_active",
"metadata/n_timesteps",
Expand All @@ -82,13 +79,26 @@ import_birdflow_v3 <- function(hdf5) {
"metadata/ebird_access_end_date",
"metadata/birdflow_preprocess_date",
"metadata/birdflow_model_date",
"metadata/is_sparse",
"marginals")
"metadata/is_sparse"
)

fit_model_items <- c(
"metadata/hyperparameters",
"metadata/loss_values",
"marginals"
)


# Check HDF5 for version consistency and missing contents
contents <- h5ls(hdf5)
contents <- paste0(contents$group, "/", contents$name)
contents <- gsub("^/*", "", contents)

is_fitted_model <- "marginals" %in% contents
if (is_fitted_model) {
expected_contents <- c(expected_contents, fit_model_items)
}

absent <- setdiff(expected_contents, contents)
extra <- setdiff(contents, expected_contents) # nolint: object_usage_linter

Expand All @@ -97,6 +107,7 @@ import_birdflow_v3 <- function(hdf5) {
paste(absent, collapse = "', '"), "'")
}


expected_version <- 3 # of HDF5 BirdFlow export
version <- as.vector(h5read(hdf5, "metadata/birdflow_version"))
if (version != expected_version) {
Expand Down Expand Up @@ -127,31 +138,34 @@ import_birdflow_v3 <- function(hdf5) {
}

# hyperparameters
hp <- h5read(hdf5, "metadata/hyperparameters")
# hdf5 seems to store logical as a factor or at least R reads them as such.
# The code below looks for factors that store logical values and
# explicitly converts them to logical
for (i in seq_along(hp)) {
a <- hp[[i]] # this hyper parameter
if (is.factor(a) && all(tolower(levels(a)) %in% c("true", "false"))) {
a <- as.logical(a)
if (is_fitted_model) {

hp <- h5read(hdf5, "metadata/hyperparameters")
# hdf5 seems to store logical as a factor or at least R reads them as such.
# The code below looks for factors that store logical values and
# explicitly converts them to logical
for (i in seq_along(hp)) {
a <- hp[[i]] # this hyper parameter
if (is.factor(a) && all(tolower(levels(a)) %in% c("true", "false"))) {
a <- as.logical(a)
}
if (inherits(a, "array")) {
a <- as.vector(a)
}
hp[[i]] <- a
}
if (inherits(a, "array")) {
a <- as.vector(a)
bf$metadata$hyperparameters <- hp

# loss values
lv <- as.data.frame(h5read(hdf5, "metadata/loss_values"))
for (i in seq_len(ncol(lv))) {
# IF R re-exports an imported hdf5 the loss values columns are each
# arrays. This returns them to standard data.frame columns
lv[[i]] <- as.vector(lv[[i]])
}
hp[[i]] <- a
}
bf$metadata$hyperparameters <- hp

# loss values
lv <- as.data.frame(h5read(hdf5, "metadata/loss_values"))
for (i in seq_len(ncol(lv))) {
# IF R re-exports an imported hdf5 the loss values columns are each
# arrays. This returns them to standard data.frame columns
lv[[i]] <- as.vector(lv[[i]])
}
bf$metadata$loss_values <- lv
bf$metadata$loss_values <- lv

} # end only for fitted models


# dates
Expand All @@ -162,77 +176,134 @@ import_birdflow_v3 <- function(hdf5) {
colnames(dates) <- gsub("^week_", "", colnames(dates))
bf$dates <- dates

# Save marginals into list
marg <- h5read(hdf5, "marginals", native = TRUE)
nt <- length(marg[!names(marg) == "index"])
bf$metadata$n_transitions <- nt
if (is.null(bf$metadata$timestep_padding))
bf$metadata$timestep_padding <- nchar(nt)
circular <- nt == length(unique(dates$date))
bf$marginals <- vector(mode = "list", length = nt)

# If the hdf5 has been re-exported from R than we just copy the marginals over
if ("index" %in% names(marg)) {
bf$marginals <- marg
} else {
# If this hdf5 was written by python then we need to copy and rename
# marginals
for (i in seq_len(nt)) {
python_label <- paste0("Week", i, "_to_", i + 1)
if (circular && i == nt) {
label <- paste0("M_", pad_timestep(i, bf), "-", pad_timestep(1, bf))
} else {
label <- paste0("M_", pad_timestep(i, bf), "-", pad_timestep(i + 1, bf))
if (is_fitted_model) {

# Save marginals into list
marg <- h5read(hdf5, "marginals", native = TRUE)
nt <- length(marg[!names(marg) == "index"])
bf$metadata$n_transitions <- nt
if (is.null(bf$metadata$timestep_padding))
bf$metadata$timestep_padding <- nchar(nt)
circular <- nt == length(unique(dates$date))
bf$marginals <- vector(mode = "list", length = nt)

# If the hdf5 has been re-exported from R, just copy the marginals over
if ("index" %in% names(marg)) {
bf$marginals <- marg
} else {
# If this hdf5 was written by python then we need to copy and rename
# marginals
for (i in seq_len(nt)) {
python_label <- paste0("Week", i, "_to_", i + 1)
if (circular && i == nt) {
label <- paste0("M_", pad_timestep(i, bf), "-", pad_timestep(1, bf))
} else {
label <- paste0("M_", pad_timestep(i, bf), "-",
pad_timestep(i + 1, bf))
}
bf$marginals[[i]] <- marg[[python_label]]
names(bf$marginals)[i] <- label
}
bf$marginals[[i]] <- marg[[python_label]]
names(bf$marginals)[i] <- label
bf$metadata$has_marginals <- TRUE
}
bf$metadata$has_marginals <- TRUE
}

# Save distributions
bf$distr <- h5read(hdf5, "distr", native = TRUE)

# Cleanup duplicated distribution, dynamic_mask row, and date added to
# input to force circular model fitting.
if (is_fitted_model) {
# Cleanup duplicated distribution, dynamic_mask row, and date added to
# input to force circular model fitting.

# Cleanup duplicated date row
sv <- duplicated(bf$dates$date)
if (any(sv)) {
bf$dates <- bf$dates[!sv, ]
}
bf$metadata$n_timesteps <- nrow(bf$dates)

# Delete duplicated distribution
d <- bf$distr
if (ncol(d) == n_timesteps(bf) + 1) {
if (!all(d[, 1] == d[, ncol(d)]))
stop("Expected extra distribution to match first distribution")
d <- d[, 1:(ncol(d) - 1)]
}

bf$distr <- d

# Delete duplicated dynamic mask row
dm <- bf$geom$dynamic_mask
if (ncol(dm) == n_timesteps(bf) + 1) {
if (!all(dm[, 1] == dm[, ncol(dm)]))
stop("Expected first and last dynamic mask columns to matrch in ",
"circular BirdFlow model")
dm <- dm[, 1:(ncol(dm) - 1)]
}

bf$geom$dynamic_mask <- dm

# Make and save marginal index
bf$marginals$index <- make_marginal_index(bf)

} # end fitted model only

# Cleanup duplicated date row
sv <- duplicated(bf$dates$date)
if (any(sv)) {
bf$dates <- bf$dates[!sv, ]
}
bf$metadata$n_timesteps <- nrow(bf$dates)

# Delete duplicated distribution
d <- bf$distr
if (ncol(d) == n_timesteps(bf) + 1) {
if (!all(d[, 1] == d[, ncol(d)]))
stop("Expected extra distribution to match first distribution")
d <- d[, 1:(ncol(d) - 1)]
}

### back compatibility code
# Restore distr and dynamic mask dimnames (lost in hdf5 write+read)
ts_col <- ifelse(bf$metadata$ebird_version_year < 2022,
"interval",
"timestep"
)
dimnames(d) <- list(i = NULL, time = paste0("t", bf$dates[[ts_col]]))
bf$distr <- d

# Delete duplicated dynamic mask row
dm <- bf$geom$dynamic_mask
if (ncol(dm) == n_timesteps(bf) + 1) {
if (!all(dm[, 1] == dm[, ncol(dm)]))
stop("Expected first and last dynamic mask columns to matrch in circular",
"BirdFlow model")
dm <- dm[, 1:(ncol(dm) - 1)]
) ### back compatibility code

# Set dimnames for distr and dynamic mask
dn <- list(i = NULL, time = paste0("t", bf$dates[[ts_col]]))
dimnames(bf$distr) <- dn
dimnames(bf$geom$dynamic_mask) <- dn

if (!is_fitted_model) {
# Need to import some stuff here
bf$marginals <- NULL
bf$distances <- as.numeric(h5read(hdf5, "distances"))
bf$uci <- h5read(hdf5, "uci", native = TRUE)
bf$lci <- h5read(hdf5, "lci", native = TRUE)
dimnames(bf$uci) <- dn
dimnames(bf$lci) <- dn
}

dimnames(dm) <- list(i = NULL, time = paste0("t", bf$dates[[ts_col]]))
bf$geom$dynamic_mask <- dm
# Convert sparse matricies (if present) back into sparse matrices
# (only relevant if reimporting a previously imported and exported model)
if (has_marginals(bf) && bf$metadata$is_sparse) {
mn <- setdiff(names(bf$marginals), "index")
for (m in mn) {
bf$marginals[[m]] <- Matrix::Matrix(bf$marginals[[m]], sparse = TRUE)
}

# Clean up metadata$sparse attriubtes and order problems

sparse <- bf$metadata$sparse

# Restore standard order to sparse if names are as expected
sparse_order <-
c("fix_stats", "method", "arguments", "stats",
"pct_zero", "pct_density_lost")
if (setequal(sparse_order, names(sparse))) {
sparse <- sparse[sparse_order]
}

# Remove extra attributes hidden in data frame columns and vectors
for (i in seq_along(bf$metadata$sparse)) {
if (inherits(sparse[[i]], "data.frame")) {
sparse[[i]] <- clean_hdf5_dataframe(sparse[[i]])
} else {
sparse[[i]] <- as.vector(sparse[[i]])
}
}
# Cleanup extra attributes in argument list
if ("arguments" %in% names(sparse))
sparse$arguments <- lapply(sparse$arguments, as.vector)

# Make and save marginal index
bf$marginals$index <- make_marginal_index(bf)
bf$metadata$sparse <- sparse
} # end if sparse

return(bf)
}
2 changes: 2 additions & 0 deletions R/new_birdflow.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,11 @@ new_BirdFlow <- function() {
ebird_version_year = NA_integer_,
ebird_release_year = NA_integer_,
ebird_access_end_date = NA,
ebirdst_version = NA,
birdflow_preprocess_date = NA,
birdflow_model_date = NA,
birdflow_version = 3,
birdflowr_preprocess_version = NA,
birdflowr_version =
as.character(utils::packageVersion("BirdFlowR")),
is_sparse = FALSE,
Expand Down
Loading

0 comments on commit dddb54d

Please sign in to comment.