
Commit 245eaa3

Merge branch 'main' of sciome-bot-git:Spatiotemporal-Exposures-and-Toxicology/PrestoGP into build-workflow
sciome-bot committed Jan 12, 2024
2 parents 5355e24 + c060815 commit 245eaa3
Showing 26 changed files with 1,149 additions and 183 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
@@ -5,3 +5,4 @@
^\.Rproj\.user$
^doc$
^Meta$
^\.github/
1 change: 1 addition & 0 deletions .gitignore
@@ -31,6 +31,7 @@ manuscript/

# Output files from R CMD build
/*.tar.gz
*.dll

# Output files from R CMD check
/*.Rcheck/
15 changes: 11 additions & 4 deletions DESCRIPTION
@@ -36,15 +36,18 @@ Imports:
knitr,
dplyr,
glmnet,
rmarkdown,
markdown,
gtools,
geoR,
doParallel,
RANN,
stats,
methods,
foreach,
rlang,
methods
mvtnorm,
spam,
psych,
doParallel,
covr
License: GPL-3
Encoding: UTF-8
VignetteBuilder: knitr
@@ -62,3 +65,7 @@ Collate:
'Visualization.R'
'package.R'
'utils-tidy-eval.R'
Suggests:
roxygen2,
testthat (>= 3.0.0)
Config/testthat/edition: 3
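
A quick local-check sketch (not part of this commit) of how the new testthat and covr entries are typically exercised; it assumes devtools is installed, which the DESCRIPTION shown here does not declare:

# Assumes devtools is available locally; it is not declared in this DESCRIPTION.
library(devtools)
library(covr)
devtools::test()                 # run tests/testthat/ under testthat edition 3
cov <- covr::package_coverage()  # line coverage for the package (covr is now in Imports)
covr::report(cov)                # open an HTML coverage report
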
1 change: 1 addition & 0 deletions NAMESPACE
@@ -29,6 +29,7 @@ import(glmnet)
import(ncvreg)
import(readxl)
import(scoringRules)
import(covr)
importFrom(aod,wald.test)
importFrom(dplyr,"%>%")
importFrom(foreach,"%dopar%")
32 changes: 22 additions & 10 deletions R/Log_Likelihood.R
@@ -12,15 +12,18 @@
#'
#' @examples
#' @noRd
negloglik_vecchia_ST <- function(logparms, res, vecchia.approx, param.seq, scaling, nscale) {
negloglik_vecchia_ST <- function(logparms, res, vecchia.approx, param.seq,
scaling, nscale) {
parms <- unlog.params(logparms, param.seq, 1)
locs.scaled <- vecchia.approx$locsord
for (j in 1:nscale) {
locs.scaled[, scaling == j] <- locs.scaled[, scaling == j] /
parms[param.seq[2, 1] + j - 1]
}
vecchia.approx$locsord <- locs.scaled
-vecchia_likelihood(res, vecchia.approx, c(parms[1], 1, parms[param.seq[3, 1]]), parms[param.seq[4, 1]])
-vecchia_likelihood(res, vecchia.approx, c(parms[1], 1,
parms[param.seq[3, 1]]),
parms[param.seq[4, 1]])
}

#' negloglik_vecchia
@@ -39,7 +42,8 @@ negloglik_vecchia_ST <- function(logparms, res, vecchia.approx, param.seq, scali
#' @noRd
negloglik_vecchia <- function(logparms, res, vecchia.approx, param.seq) {
parms <- unlog.params(logparms, param.seq, 1)
-vecchia_likelihood(res, vecchia.approx, c(parms[1], parms[2], parms[3]), parms[4])
-vecchia_likelihood(res, vecchia.approx, c(parms[1], parms[2], parms[3]),
parms[4])
}
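
For context (not part of the diff), a hedged sketch of how a wrapped objective such as negloglik_vecchia is typically minimized over log-scale parameters with stats::optim(), mirroring the optim.method and optim.control defaults visible in prestogp_fit() below; logparams0, res, vecchia.approx, and param.seq are assumed to already exist:

# Sketch only: minimize the Vecchia negative log-likelihood over log-parameters.
opt <- stats::optim(
  par = logparams0, fn = negloglik_vecchia,
  res = res, vecchia.approx = vecchia.approx, param.seq = param.seq,
  method = "Nelder-Mead",
  control = list(trace = 0, reltol = 1e-3, maxit = 5000)
)
covparams.hat <- unlog.params(opt$par, param.seq, 1)  # back-transform to the natural scale
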

#' negloglik_full_ST
@@ -91,7 +95,8 @@ negloglik.full <- function(logparams, d, y, param.seq) {
params <- unlog.params(logparams, param.seq, 1)
# d <- fields::rdist(locs)
N <- nrow(d)
cov.mat <- params[1] * fields::Matern(d, range = params[2], smoothness = params[3]) +
cov.mat <- params[1] * fields::Matern(d, range = params[2],
smoothness = params[3]) +
params[4] * diag(N)
return(-1 * mvtnorm::dmvnorm(y, rep(0, N), cov.mat, log = TRUE))
}
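
In equation form (a restatement of the code above, not part of the diff): with \theta = (\sigma^2, r, \nu, \tau^2) recovered by unlog.params, negloglik.full returns the negative log-likelihood

\mathcal{L}(\theta) = -\log \mathcal{N}\!\left(y \mid 0,\ \Sigma(\theta)\right), \qquad \Sigma(\theta) = \sigma^2\, M_{\nu}(d / r) + \tau^2 I_N,

where M_{\nu} is the Matern correlation function (fields::Matern with unit marginal variance) evaluated on the distance matrix d.
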
@@ -128,7 +133,8 @@ mvnegloglik <- function(logparams, vecchia.approx, y, param.seq, P) {
##############################################################################
### Flexible Spatiotemporal Multivariate Matern Negative Loglikelihood Function ###########

mvnegloglik_ST <- function(logparams, vecchia.approx, y, param.seq, P, scaling, nscale) {
mvnegloglik_ST <- function(logparams, vecchia.approx, y, param.seq, P, scaling,
nscale) {
# Input-
# logparams: A numeric vector of length (4*P)+(4*choose(P,2)).
# To construct these parameters we unlist a list of the 7 covariance
@@ -236,11 +242,15 @@ create.cov.upper.flex <- function(P, marg.var, marg.range, marg.smooth,
j <- combs[iter, 2]

smoothness.mat[i, j] <- (marg.smooth[i] + marg.smooth[j]) / 2
range.mat[i, j] <- 1 / sqrt(((1 / marg.range[i])^2 + (1 / marg.range[j])^2) / 2)
range.mat[i, j] <- 1 / sqrt(((1 / marg.range[i])^2 +
(1 / marg.range[j])^2) / 2)

s1 <- sqrt(marg.var[i] * marg.var[j])
s2 <- ((1 / marg.range[i])^marg.smooth[i] * (1 / marg.range[j])^marg.smooth[j]) / ((1 / range.mat[i, j])^(2 * smoothness.mat[i, j]))
s3 <- gamma(smoothness.mat[i, j]) / (sqrt(gamma(marg.smooth[i])) * sqrt(gamma(marg.smooth[j])))
s2 <- ((1 / marg.range[i])^marg.smooth[i] *
(1 / marg.range[j])^marg.smooth[j]) /
((1 / range.mat[i, j])^(2 * smoothness.mat[i, j]))
s3 <- gamma(smoothness.mat[i, j]) / (sqrt(gamma(marg.smooth[i])) *
sqrt(gamma(marg.smooth[j])))
s4 <- R.corr[iter]
sig2.mat[i, j] <- s1 * s2 * s3 * s4
}
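
For reference (not part of the diff), the reformatted lines above implement the flexible multivariate Matern cross-covariance parameters. Writing a_k = 1 / marg.range[k] for the inverse ranges, \sigma_k^2 = marg.var[k], \nu_k = marg.smooth[k], and \rho_{ij} = R.corr[iter], the loop computes, for i \neq j,

\nu_{ij} = \frac{\nu_i + \nu_j}{2}, \qquad a_{ij}^2 = \frac{a_i^2 + a_j^2}{2}, \qquad \sigma_{ij}^2 = \rho_{ij}\, \sqrt{\sigma_i^2 \sigma_j^2}\; \frac{a_i^{\nu_i}\, a_j^{\nu_j}}{a_{ij}^{2\nu_{ij}}}\; \frac{\Gamma(\nu_{ij})}{\sqrt{\Gamma(\nu_i)\,\Gamma(\nu_j)}},

with range.mat[i, j] = 1 / a_{ij} and smoothness.mat[i, j] = \nu_{ij}.
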
Expand Down Expand Up @@ -282,10 +292,12 @@ cat.covariances <- function(locs.list, sig2, range, smoothness, nugget) {
# Calculate the covariance matrix - if/then based on its location in the super-matrix
N <- nrow(d)
if (i == j) { # To accommodate varying size outcomes - the nugget is not included on cross-covariances
cov.mat.ij <- sig2[i, j] * geoR::matern(d, phi = range[i, j], kappa = smoothness[i, j]) +
cov.mat.ij <- sig2[i, j] * geoR::matern(d, phi = range[i, j], kappa =
smoothness[i, j]) +
nugget[i, j] * diag(N)
} else {
cov.mat.ij <- sig2[i, j] * geoR::matern(d, phi = range[i, j], kappa = smoothness[i, j])
cov.mat.ij <- sig2[i, j] * geoR::matern(d, phi = range[i, j], kappa =
smoothness[i, j])
}


22 changes: 16 additions & 6 deletions R/PrestoGP_CreateU_Multivariate.R
@@ -66,7 +66,8 @@ max_min_ordering <- function(locs, dist_func) {
#' @param dist_func Any distance function with a signature of dist(query_location, locations_matrix)
#'
#' @return A vector containing the indices of the neighbors
knn_indices <- function(ordered_locs, query, n_neighbors, dist_func, dist_func_code) {
knn_indices <- function(ordered_locs, query, n_neighbors,
dist_func, dist_func_code) {
if (dist_func_code == "custom") {
dists <- dist_func(query, ordered_locs)
dists_order <- order(dists)
@@ -90,25 +91,34 @@ knn_indices <- function(ordered_locs, query, n_neighbors, dist_func, dist_func_c
#' @param dist_func Any distance function with a signature of dist(query_location, locations_matrix)
#'
#' @return A list containing two matrices, each with one row per location: an indices matrix with the indices of nearest neighbors for each location, and a distance matrix with the associated distances
sparseNN <- function(ordered_locs, n_neighbors, dist_func, dist_func_code, ordered_locs_pred = NULL) {
sparseNN <- function(ordered_locs, n_neighbors,
dist_func, dist_func_code, ordered_locs_pred = NULL) {
ee <- min(apply(ordered_locs, 2, stats::sd))
n <- nrow(ordered_locs)
ordered_locs <- ordered_locs + matrix(
ee * 1e-04 *
stats::rnorm(n * ncol(ordered_locs)),
n, ncol(ordered_locs)
)
indices_matrix <- matrix(data = NA, nrow = nrow(ordered_locs), ncol = n_neighbors)
distances_matrix <- matrix(data = NA, nrow = nrow(ordered_locs), ncol = n_neighbors)
indices_matrix <- matrix(data = NA, nrow = nrow(ordered_locs),
ncol = n_neighbors)
distances_matrix <- matrix(data = NA, nrow = nrow(ordered_locs),
ncol = n_neighbors)
for (row in 1:n_neighbors) {
# for the locations from 1 to n_neighbors, use the entire locs list to find the neighbors
nn <- knn_indices(ordered_locs[1:(n_neighbors + 1), , drop = FALSE][-row, , drop = FALSE], ordered_locs[row, , drop = FALSE], n_neighbors, dist_func, dist_func_code)
nn <- knn_indices(ordered_locs[1:
(n_neighbors + 1), , drop = FALSE][-row, ,
drop = FALSE],
ordered_locs[row, , drop = FALSE], n_neighbors,
dist_func, dist_func_code)
indices_matrix[row, 1:n_neighbors] <- nn$indices[1:n_neighbors]
distances_matrix[row, 1:n_neighbors] <- nn$distances[1:n_neighbors]
}
for (row in (n_neighbors + 1):nrow(ordered_locs)) {
# get the m nearest neighbors from the locs before this one in the max-min order
nn <- knn_indices(ordered_locs[1:(row - 1), , drop = FALSE], ordered_locs[row, , drop = FALSE], n_neighbors, dist_func, dist_func_code)
nn <- knn_indices(ordered_locs[1:(row - 1), , drop = FALSE],
ordered_locs[row, , drop = FALSE], n_neighbors,
dist_func, dist_func_code)
indices_matrix[row, 1:n_neighbors] <- nn$indices[1:n_neighbors]
distances_matrix[row, 1:n_neighbors] <- nn$distances[1:n_neighbors]
}
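
Not part of the diff: RANN (listed in Imports) provides the standard fast k-nearest-neighbor search; whether knn_indices() uses it on the non-custom distance path is not visible in the hunks shown above. A toy illustration of the same query-vs-locations lookup, with hypothetical data:

# Toy data only; locs and query are hypothetical.
library(RANN)
set.seed(42)
locs  <- matrix(runif(200), ncol = 2)            # 100 random 2-D locations
query <- locs[1, , drop = FALSE]                 # find neighbors of the first point
nn <- RANN::nn2(data = locs[-1, , drop = FALSE], query = query, k = 10)
nn$nn.idx    # indices (into locs[-1, ]) of the 10 nearest neighbors
nn$nn.dists  # the corresponding Euclidean distances
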
143 changes: 101 additions & 42 deletions R/PrestoGP_Model.R
@@ -70,7 +70,7 @@ setMethod("initialize", "PrestoGPModel", function(.Object, ...) {
setGeneric("show_theta", function(object, Y_names) standardGeneric("show_theta"))
setGeneric("prestogp_fit", function(model, Y, X, locs, scaling = NULL, apanasovich = FALSE, covparams = NULL, beta.hat = NULL, tol = 0.999999, max_iters = 100, verbose = FALSE, optim.method = "Nelder-Mead", optim.control = list(trace = 0, reltol = 1e-3, maxit = 5000), parallel = FALSE, foldid = NULL) standardGeneric("prestogp_fit"))
setGeneric("prestogp_predict", function(model, X = "matrix", locs = "matrix", m = "numeric", ordering.pred = c("obspred", "general"), pred.cond = c("independent", "general"), return.values = c("mean", "meanvar")) standardGeneric("prestogp_predict"))
setGeneric("calc_covparams", function(model, locs, Y) standardGeneric("calc_covparams"))
setGeneric("calc_covparams", function(model, locs, Y, covparams) standardGeneric("calc_covparams"))
setGeneric("specify", function(model, ...) standardGeneric("specify"))
setGeneric("compute_residuals", function(model, Y, Y.hat) standardGeneric("compute_residuals"))
setGeneric("transform_data", function(model, Y, X) standardGeneric("transform_data"))
@@ -81,6 +81,7 @@ setGeneric("scale_locs", function(model, locs) standardGeneric("scale_locs"))
setGeneric("theta_names", function(model) standardGeneric("theta_names"))
setGeneric("transform_covariance_parameters", function(model) standardGeneric("transform_covariance_parameters"))
setGeneric("check_input", function(model, Y, X, locs) standardGeneric("check_input"))
setGeneric("check_input_pred", function(model, X, locs) standardGeneric("check_input_pred"))

#' show
#'
@@ -205,22 +206,35 @@ setMethod(
optim.control = list(trace = 0, reltol = 1e-3, maxit = 5000),
parallel = FALSE, foldid = NULL) {
model <- check_input(model, Y, X, locs)
if (!is.double(beta.hat) && !is.null(beta.hat)) {
stop("The beta.hat parameter must be floating point number.")
if (!is.null(beta.hat)) {
if (!is.vector(beta.hat) | !is.numeric(beta.hat)) {
stop("beta.hat parameter must be a numeric vector")
}
if (length(beta.hat) != (ncol(model@X_train) + 1)) {
stop("Length of beta.hat must match the number of predictors")
}
beta.hat <- as.matrix(beta.hat)
}
if (!is.double(tol)) {
stop("The tol parameter must be floating point number.")
if (!is.numeric(tol)) {
stop("tol must be numeric")
}
if (is.null(scaling)) {
if (is.matrix(locs)) {
scaling <- rep(1, ncol(locs))
} else {
scaling <- rep(1, ncol(locs[[1]]))
}
if (length(tol) != 1) {
stop("tol must be a scalar")
}
nscale <- length(unique(scaling))
if (sum(sort(unique(scaling)) == 1:nscale) < nscale) {
stop("scaling must consist of sequential integers between 1 and ncol(locs)")
if (tol<=0 | tol>1) {
stop("tol must satisfy 0<tol<=1")
}
if (is.null(scaling)) {
scaling <- rep(1, ncol(model@locs_train[[1]]))
nscale <- 1
} else {
if (length(scaling) != ncol(model@locs_train[[1]])) {
stop("Length of scaling must equal ncol of locs")
}
nscale <- length(unique(scaling))
if (sum(sort(unique(scaling)) == 1:nscale) < nscale) {
stop("scaling must consist of sequential integers starting at 1")
}
}
if (is.null(apanasovich)) {
if (nscale == 1) {
@@ -235,14 +249,18 @@
model@scaling <- scaling
model@nscale <- nscale
model@apanasovich <- apanasovich
if (is.null(covparams)) {
model <- calc_covparams(model, locs, Y)
}
if (!is.vector(model@covparams)) {
stop("The covparams paramter must be a numeric vector.")
if (!is.null(covparams)) {
if (!is.vector(covparams) | !is.numeric(covparams)) {
stop("covparams must be a numeric vector")
}
}
model <- calc_covparams(model, locs, Y, covparams)
if (model@n_neighbors < model@min_m) {
stop(paste("M must be at least ", model@min_m, ".", sep = ""))
stop(paste("m must be at least ", model@min_m, sep = ""))
}
if (model@n_neighbors >= nrow(model@Y_train)) {
warning("Conditioning set size m chosen to be >=n. Changing to m=n-1")
model@n_neighbors <- nrow(model@Y_train) - 1
}

model <- specify(model)
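
Not part of the diff: concrete values of the scaling argument that the new checks above accept or reject, assuming a hypothetical three-column locs matrix (two spatial coordinates plus time):

scaling <- c(1, 1, 2)  # accepted: length equals ncol(locs); unique values are 1:nscale
scaling <- c(1, 1, 1)  # accepted: a single scale group (equivalent to the NULL default)
scaling <- c(1, 3, 3)  # rejected: unique values are not sequential integers starting at 1
scaling <- c(1, 2)     # rejected: length differs from ncol(locs)
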
@@ -278,7 +296,7 @@
model <- specify(model)
}
model <- transform_data(model, model@Y_train, model@X_train)
model <- estimate_betas(model, parallel, foldid)
model <- estimate_betas(model, parallel)
min.error <- compute_error(model)
### Check min-error against the previous error and tolerance
if (min.error < prev.error * tol) {
@@ -390,35 +408,76 @@ setMethod("compute_error", "PrestoGPModel", function(model) {
#' @param Y the dependent variable matrix
#'
#' @return a model with initial covariance parameters
setMethod("calc_covparams", "PrestoGPModel", function(model, locs, Y) {
setMethod("calc_covparams", "PrestoGPModel", function(model, locs, Y, covparams) {
if (!is.list(locs)) {
P <- 1
locs <- list(locs)
Y <- list(Y)
} else {
P <- length(locs)
}
col.vars <- rep(NA, P)
D.sample.bar <- rep(NA, model@nscale * P)
for (i in 1:P) {
col.vars[i] <- var(Y[[i]])
N <- length(Y[[i]])
# TODO find a better way to compute initial spatial range
for (j in 1:model@nscale) {
d.sample <- sample(1:N, max(2, ceiling(N / 50)), replace = FALSE)
D.sample <- rdist(locs[[i]][d.sample, model@scaling == j])
D.sample.bar[(i - 1) * model@nscale + j] <- mean(D.sample) / 4
}
pseq <- create.param.sequence(P, model@nscale)
if (is.null(covparams)) {
col.vars <- rep(NA, P)
D.sample.bar <- rep(NA, model@nscale * P)
for (i in 1:P) {
col.vars[i] <- var(Y[[i]])
N <- length(Y[[i]])
# TODO find a better way to compute initial spatial range
for (j in 1:model@nscale) {
d.sample <- sample(1:N, max(2, ceiling(N / 50)), replace = FALSE)
D.sample <- rdist(locs[[i]][d.sample, model@scaling == j])
D.sample.bar[(i - 1) * model@nscale + j] <- mean(D.sample) / 4
}
}
model@logparams <- create.initial.values.flex(
c(0.9 * col.vars), # marginal variance
D.sample.bar, # range
rep(0.5, P), # smoothness
c(.1 * col.vars), # nuggets
rep(0, choose(P, 2)),
P
)
} else {
if (P==1) {
if (length(covparams) != pseq[4,2]) {
stop("Incorrect number of parameters in covparams")
}
} else {
if (length(covparams) != pseq[5,2]) {
stop("Incorrect number of parameters in covparams")
}
}
init.var <- covparams[pseq[1,1]:pseq[1,2]]
init.range <- covparams[pseq[2,1]:pseq[2,2]]
init.smooth <- covparams[pseq[3,1]:pseq[3,2]]
init.nugget <- covparams[pseq[4,1]:pseq[4,2]]
if (P>1) {
init.corr <- covparams[pseq[5,1]:pseq[5,2]]
}
else {
init.corr <- 0
}
if (sum(init.var<=0)>0) {
stop("Initial variance estimates must be positive")
}
if (sum(init.range<=0)>0) {
stop("Initial range estimates must be positive")
}
if (sum(init.nugget<=0)>0) {
stop("Initial nugget estimates must be positive")
}
if (sum(init.smooth<=0)>0 | sum(init.smooth>=2.5)>0) {
stop("Initial smoothness estimates must be between 0 and 2.5")
}
if (sum(init.corr < -1)>0 | sum(init.corr > 1)>0) {
stop("Initial correlation estimates must be between -1 and 1")
}
model@logparams <- create.initial.values.flex(init.var, init.range,
init.smooth, init.nugget,
init.corr, P)
}
model@logparams <- create.initial.values.flex(
c(0.9 * col.vars), # marginal variance
D.sample.bar, # range
rep(0.5, P), # smoothness
c(.1 * col.vars), # nuggets
rep(0, choose(P, 2)),
P
)
model@param_sequence <- create.param.sequence(P, model@nscale)
model@param_sequence <- pseq
model <- transform_covariance_parameters(model)
invisible(model)
})
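
Not part of the commit: an illustration of the covparams layout implied by the checks in calc_covparams above, for P = 2 outcomes and a single scale group (nscale = 1). The block ordering follows the pseq indexing in the code; create.param.sequence() itself is not shown in this diff, so treat the example as a sketch:

# Hypothetical initial values for P = 2, nscale = 1 (9 values in total):
covparams <- c(1.0, 1.5,  # marginal variances, one per outcome (must be positive)
               0.3, 0.4,  # ranges, nscale * P of them (must be positive)
               0.5, 0.5,  # smoothnesses (must lie in (0, 2.5))
               0.1, 0.2,  # nuggets (must be positive)
               0.25)      # cross-correlations, choose(P, 2) of them (must lie in [-1, 1])
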
[The remaining 19 changed files are not shown.]
