birdflow-science · ethanplunkett · Apr 23, 2024 · Apr 23, 2024 · Apr 23, 2024
diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
@@ -22,7 +22,7 @@ jobs:
           - {os: windows-latest, r: 'release'}
           - {os: ubuntu-latest,   r: 'devel', http-user-agent: 'release'}
           - {os: ubuntu-latest,   r: 'release'}
-          - {os: ubuntu-latest,   r: 'oldrel-1'}
+#          - {os: ubuntu-latest,   r: 'oldrel-1'}
 
     env:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,30 @@
+
+# Birdflow 0.1.0.9057
+2024-04-17
+
+* `import_birdflow()` now works with preprocessed hdf5 files. See #177
+* `preprocess_birdflow()` and `import_birdflow()` are now more consistent 
+  with some dimension names:
+    * `bf$geom$ext` is now an unnamed vector in objects from either function.
+    Previously one was a named vector. The `ext(bf)` return object is unchanged.
+    * `distr`, `dynamic_mask`, `uci`, and `lci` now always have `NULL` 
+    rownames and 
+    `"t<timestep>"` column names, and the names
+    of those two dimensions are `"i"` and `"time"`.
+    * `import_birdflow()` no longer drops two metadata items that it was losing
+    before: `ebirdst_version`  and `birdflowr_preprocess_version` these 
+    will exist in old `.hdf5` files but not in old `.rds` files.  
+* `compare_lists()` (internal helper) return format changed slightly.
+* `import_birdflow()` and `export_birdflow()` now work with sparse models.
+   Marginals from these models are converted to standard matrices before 
+   writing so some of the benefits of sparsification are lost when writing to
+   hdf5 files - but compression will probably mitigate this somewhat. Sparse
+   Matrices are re-created when sparse models are re-imported.
+
+
 # BirdFlowR 0.1.0.9056
 2024-04-04
+
 ## Flux III 
 * `is_between()` now uses a `SparseArray::SparseArray()` for the logical array.
   to reduce memory usage

diff --git a/R/clean_hdf5_dataframe.R b/R/clean_hdf5_dataframe.R
@@ -0,0 +1,12 @@
+
+# This strips weird extra within column attributes from dataframes returned
+# by h5read
+# It's used by import_birdflow_v3.
+clean_hdf5_dataframe <- function(df) {
+  if (!inherits(df, "data.frame"))
+    return(df)
+
+  for (i in seq_len(ncol(df)))
+    df[[i]] <- as.vector(df[[i]])
+ return(df)
+}
diff --git a/R/compare_list_item_names.R b/R/compare_list_item_names.R
@@ -34,14 +34,13 @@ compare_list_item_names <- function(x, y, map = "x", differences) {
       differences <- rbind(
         differences,
         data.frame(where = map,
-                   differences = paste0("extra:",
-                                        paste0(lost, collapse = ", "))))
+                   differences = paste0("extra:", lost)))
+
     if (length(gained) != 0)
       differences <- rbind(
         differences,
         data.frame(where = map,
-                   differences = paste0("missing:",
-                                        paste0(gained, collapse = ", "))))
+                   differences = paste0("missing:", gained)))
     return(differences)
   }
 

diff --git a/R/export_birdflow.R b/R/export_birdflow.R
@@ -58,6 +58,14 @@ export_birdflow <- function(bf, file = NULL,
     return()
   }
 
+  # Convert sparse matrices to standard (for hdf5 only)
+  if (has_marginals(bf) && bf$metadata$is_sparse) {
+    mn <- setdiff(names(bf$marginals), "index")
+    for (m in mn) {
+      bf$marginals[[m]] <- as.matrix(bf$marginals[[m]])
+    }
+  }
+
   # Write HDF5
   ns <- names(bf)
   for (i in seq_along(ns)) {

diff --git a/R/import_birdflow_v3.R b/R/import_birdflow_v3.R
@@ -49,7 +49,6 @@ import_birdflow_v3 <- function(hdf5) {
     "geom/crs",
     "geom/mask",
     "transitions",
-    "marginals",
     "dates",
     "distr",
     "species",
@@ -72,8 +71,6 @@ import_birdflow_v3 <- function(hdf5) {
     "metadata/has_marginals",
     "metadata/has_transitions",
     "metadata/has_distr",
-    "metadata/hyperparameters",
-    "metadata/loss_values",
     "metadata/n_transitions",
     "metadata/n_active",
     "metadata/n_timesteps",
@@ -82,13 +79,26 @@ import_birdflow_v3 <- function(hdf5) {
     "metadata/ebird_access_end_date",
     "metadata/birdflow_preprocess_date",
     "metadata/birdflow_model_date",
-    "metadata/is_sparse",
-    "marginals")
+    "metadata/is_sparse"
+  )
+
+  fit_model_items <- c(
+    "metadata/hyperparameters",
+    "metadata/loss_values",
+    "marginals"
+  )
+
 
   # Check HDF5 for version consistency and missing contents
   contents <- h5ls(hdf5)
   contents <- paste0(contents$group, "/", contents$name)
   contents <- gsub("^/*", "", contents)
+
+  is_fitted_model <- "marginals" %in% contents
+  if (is_fitted_model) {
+    expected_contents <- c(expected_contents, fit_model_items)
+  }
+
   absent <- setdiff(expected_contents, contents)
   extra <- setdiff(contents, expected_contents) # nolint: object_usage_linter
 
@@ -97,6 +107,7 @@ import_birdflow_v3 <- function(hdf5) {
          paste(absent, collapse = "', '"), "'")
   }
 
+
   expected_version <- 3  # of HDF5 BirdFlow export
   version <- as.vector(h5read(hdf5, "metadata/birdflow_version"))
   if (version != expected_version) {
@@ -127,31 +138,34 @@ import_birdflow_v3 <- function(hdf5) {
   }
 
   # hyperparameters
-  hp <- h5read(hdf5, "metadata/hyperparameters")
-  # hdf5 seems to store logical as a factor or at least R reads them as such.
-  # The code below looks for factors that store logical values and
-  # explicitly converts them to logical
-  for (i in seq_along(hp)) {
-    a <- hp[[i]] # this hyper parameter
-    if (is.factor(a) && all(tolower(levels(a)) %in% c("true", "false"))) {
-      a <- as.logical(a)
+  if (is_fitted_model) {
+
+    hp <- h5read(hdf5, "metadata/hyperparameters")
+    # hdf5 seems to store logical as a factor or at least R reads them as such.
+    # The code below looks for factors that store logical values and
+    # explicitly converts them to logical
+    for (i in seq_along(hp)) {
+      a <- hp[[i]] # this hyper parameter
+      if (is.factor(a) && all(tolower(levels(a)) %in% c("true", "false"))) {
+        a <- as.logical(a)
+      }
+      if (inherits(a, "array")) {
+        a <- as.vector(a)
+      }
+      hp[[i]] <- a
     }
-    if (inherits(a, "array")) {
-      a <- as.vector(a)
+    bf$metadata$hyperparameters <- hp
+
+    # loss values
+    lv <- as.data.frame(h5read(hdf5, "metadata/loss_values"))
+    for (i in seq_len(ncol(lv))) {
+      # IF R re-exports an imported hdf5 the loss values columns are each
+      # arrays.  This returns them to standard data.frame columns
+      lv[[i]] <- as.vector(lv[[i]])
     }
-    hp[[i]] <- a
-  }
-  bf$metadata$hyperparameters <- hp
-
-  # loss values
-  lv <- as.data.frame(h5read(hdf5, "metadata/loss_values"))
-  for (i in seq_len(ncol(lv))) {
-    # IF R re-exports an imported hdf5 the loss values columns are each
-    # arrays.  This returns them to standard data.frame columns
-    lv[[i]] <- as.vector(lv[[i]])
-  }
-  bf$metadata$loss_values <- lv
+    bf$metadata$loss_values <- lv
 
+  } # end only for fitted models
 
 
   # dates
@@ -162,77 +176,134 @@ import_birdflow_v3 <- function(hdf5) {
   colnames(dates) <- gsub("^week_", "", colnames(dates))
   bf$dates <- dates
 
-  # Save marginals into list
-  marg <- h5read(hdf5, "marginals", native = TRUE)
-  nt <- length(marg[!names(marg) == "index"])
-  bf$metadata$n_transitions <- nt
-  if (is.null(bf$metadata$timestep_padding))
-    bf$metadata$timestep_padding <- nchar(nt)
-  circular <- nt == length(unique(dates$date))
-  bf$marginals <- vector(mode = "list", length = nt)
-
-  # If the hdf5 has been re-exported from R than we just copy the marginals over
-  if ("index" %in% names(marg)) {
-    bf$marginals <- marg
-  } else {
-    # If this hdf5 was written by python then we need to copy and rename
-    # marginals
-    for (i in seq_len(nt)) {
-      python_label <- paste0("Week", i, "_to_", i + 1)
-      if (circular && i == nt) {
-        label <- paste0("M_", pad_timestep(i, bf), "-", pad_timestep(1, bf))
-      } else {
-        label <- paste0("M_", pad_timestep(i, bf), "-", pad_timestep(i + 1, bf))
+  if (is_fitted_model) {
+
+    # Save marginals into list
+    marg <- h5read(hdf5, "marginals", native = TRUE)
+    nt <- length(marg[!names(marg) == "index"])
+    bf$metadata$n_transitions <- nt
+    if (is.null(bf$metadata$timestep_padding))
+      bf$metadata$timestep_padding <- nchar(nt)
+    circular <- nt == length(unique(dates$date))
+    bf$marginals <- vector(mode = "list", length = nt)
+
+    # If the hdf5 has been re-exported from R, just copy the marginals over
+    if ("index" %in% names(marg)) {
+      bf$marginals <- marg
+    } else {
+      # If this hdf5 was written by python then we need to copy and rename
+      # marginals
+      for (i in seq_len(nt)) {
+        python_label <- paste0("Week", i, "_to_", i + 1)
+        if (circular && i == nt) {
+          label <- paste0("M_", pad_timestep(i, bf), "-", pad_timestep(1, bf))
+        } else {
+          label <- paste0("M_", pad_timestep(i, bf), "-",
+                          pad_timestep(i + 1, bf))
+        }
+        bf$marginals[[i]] <- marg[[python_label]]
+        names(bf$marginals)[i] <- label
       }
-      bf$marginals[[i]] <- marg[[python_label]]
-      names(bf$marginals)[i] <- label
+      bf$metadata$has_marginals <- TRUE
     }
-    bf$metadata$has_marginals <- TRUE
   }
 
   # Save distributions
   bf$distr <- h5read(hdf5, "distr", native = TRUE)
 
-  # Cleanup duplicated distribution, dynamic_mask row, and date added to
-  # input to force circular model fitting.
+  if (is_fitted_model) {
+    # Cleanup duplicated distribution, dynamic_mask row, and date added to
+    # input to force circular model fitting.
+
+    # Cleanup duplicated date row
+    sv <- duplicated(bf$dates$date)
+    if (any(sv)) {
+      bf$dates <- bf$dates[!sv, ]
+    }
+    bf$metadata$n_timesteps <- nrow(bf$dates)
+
+    # Delete duplicated distribution
+    d <- bf$distr
+    if (ncol(d) == n_timesteps(bf) + 1) {
+      if (!all(d[, 1] == d[, ncol(d)]))
+        stop("Expected extra distribution to match first distribution")
+      d <- d[, 1:(ncol(d) - 1)]
+    }
+
+    bf$distr <- d
+
+    # Delete duplicated dynamic mask row
+    dm <- bf$geom$dynamic_mask
+    if (ncol(dm) == n_timesteps(bf) + 1) {
+      if (!all(dm[, 1] == dm[, ncol(dm)]))
+        stop("Expected first and last dynamic mask columns to matrch in ",
+             "circular BirdFlow model")
+      dm <- dm[, 1:(ncol(dm) - 1)]
+    }
+
+    bf$geom$dynamic_mask <- dm
+
+    # Make and save marginal index
+    bf$marginals$index <- make_marginal_index(bf)
+
+  } # end fitted model only
 
-  # Cleanup duplicated date row
-  sv <- duplicated(bf$dates$date)
-  if (any(sv)) {
-    bf$dates <- bf$dates[!sv, ]
-  }
-  bf$metadata$n_timesteps <- nrow(bf$dates)
-
-  # Delete duplicated distribution
-  d <- bf$distr
-  if (ncol(d) == n_timesteps(bf) + 1) {
-    if (!all(d[, 1] == d[, ncol(d)]))
-      stop("Expected extra distribution to match first distribution")
-    d <- d[, 1:(ncol(d) - 1)]
-  }
 
-  ### back compatibility code
+  # Restore distr and dynamic mask dimnames (lost in hdf5 write+read)
   ts_col <- ifelse(bf$metadata$ebird_version_year < 2022,
                    "interval",
                    "timestep"
-  )
-  dimnames(d) <- list(i = NULL, time = paste0("t", bf$dates[[ts_col]]))
-  bf$distr <- d
-
-  # Delete duplicated dynamic mask row
-  dm <- bf$geom$dynamic_mask
-  if (ncol(dm) == n_timesteps(bf) + 1) {
-    if (!all(dm[, 1] == dm[, ncol(dm)]))
-      stop("Expected first and last dynamic mask columns to matrch in circular",
-           "BirdFlow model")
-    dm <- dm[, 1:(ncol(dm) - 1)]
+  )      ### back compatibility code
+
+  # Set dimnames for distr and dynamic mask
+  dn <- list(i = NULL, time = paste0("t", bf$dates[[ts_col]]))
+  dimnames(bf$distr) <- dn
+  dimnames(bf$geom$dynamic_mask) <- dn
+
+  if (!is_fitted_model) {
+    # Need to import some stuff here
+    bf$marginals <- NULL
+    bf$distances <- as.numeric(h5read(hdf5, "distances"))
+    bf$uci <- h5read(hdf5, "uci", native = TRUE)
+    bf$lci <- h5read(hdf5, "lci", native = TRUE)
+    dimnames(bf$uci) <- dn
+    dimnames(bf$lci) <- dn
   }
 
-  dimnames(dm) <- list(i = NULL, time = paste0("t", bf$dates[[ts_col]]))
-  bf$geom$dynamic_mask <- dm
+  # Convert sparse matricies (if present) back into sparse matrices
+  # (only relevant if reimporting a previously imported and exported model)
+  if (has_marginals(bf) && bf$metadata$is_sparse) {
+    mn <- setdiff(names(bf$marginals), "index")
+    for (m in mn) {
+      bf$marginals[[m]] <- Matrix::Matrix(bf$marginals[[m]], sparse = TRUE)
+    }
+
+    # Clean up metadata$sparse attriubtes and order problems
+
+    sparse <- bf$metadata$sparse
+
+    # Restore standard order to sparse if names are as expected
+    sparse_order <-
+      c("fix_stats", "method", "arguments", "stats",
+        "pct_zero", "pct_density_lost")
+    if (setequal(sparse_order, names(sparse))) {
+      sparse <- sparse[sparse_order]
+    }
+
+    # Remove extra attributes hidden in data frame columns and vectors
+    for (i in seq_along(bf$metadata$sparse)) {
+      if (inherits(sparse[[i]], "data.frame")) {
+        sparse[[i]] <- clean_hdf5_dataframe(sparse[[i]])
+      } else {
+        sparse[[i]] <- as.vector(sparse[[i]])
+       }
+    }
+    # Cleanup extra attributes in argument list
+    if ("arguments" %in% names(sparse))
+      sparse$arguments <- lapply(sparse$arguments, as.vector)
 
-  # Make and save marginal index
-  bf$marginals$index <- make_marginal_index(bf)
+    bf$metadata$sparse <- sparse
+  } # end if sparse
 
   return(bf)
 }
diff --git a/R/new_birdflow.R b/R/new_birdflow.R
@@ -45,9 +45,11 @@ new_BirdFlow <- function() {
                    ebird_version_year = NA_integer_,
                    ebird_release_year = NA_integer_,
                    ebird_access_end_date = NA,
+                   ebirdst_version = NA,
                    birdflow_preprocess_date = NA,
                    birdflow_model_date = NA,
                    birdflow_version = 3,
+                   birdflowr_preprocess_version = NA,
                    birdflowr_version =
                      as.character(utils::packageVersion("BirdFlowR")),
                    is_sparse = FALSE,