NIEHS · kyle-messier · Apr 9, 2024 · Apr 5, 2024 · Apr 9, 2024
diff --git a/.Rbuildignore b/.Rbuildignore
diff --git a/.github/workflows/check-standard.yaml b/.github/workflows/check-standard.yaml
diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml
diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
diff --git a/.github/workflows/sanitizers.yaml b/.github/workflows/sanitizers.yaml
diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml
diff --git a/.gitignore b/.gitignore
diff --git a/.lintr b/.lintr
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: PrestoGP
 Type: Package
 Title: Penalized Regression for Spatio-Temporal Outcomes via Gaussian Processes 
-Version: 0.2.0.9026
+Version: 0.2.0.9029
 Authors@R: c(
     person(given = "Eric",
            family = "Bair",

diff --git a/Makefile b/Makefile
diff --git a/NAMESPACE b/NAMESPACE
diff --git a/R/Log_Likelihood.R b/R/Log_Likelihood.R
diff --git a/R/PrestoGP-package.R b/R/PrestoGP-package.R
diff --git a/R/PrestoGP_CreateU_Multivariate.R b/R/PrestoGP_CreateU_Multivariate.R
diff --git a/R/PrestoGP_Full.R b/R/PrestoGP_Full.R
diff --git a/R/PrestoGP_Model.R b/R/PrestoGP_Model.R
@@ -21,6 +21,8 @@ setOldClass("cv.glmnet")
 #' "super matrix" for multivariate models. See
 #' \code{\link[psych]{superMatrix}}.
 #' @slot Y_train A column matrix containing the original response values.
+#' @slot Y_bar A vector containing the means of each outcome.
+#' @slot Y_obs A logical vector used to track which values of Y are non-missing.
 #' @slot X_tilde The matrix of transformed predictors.
 #' @slot y_tilde The column matrix containing the transformed response values.
 #' @slot res numeric.
@@ -78,6 +80,8 @@ PrestoGPModel <- setClass("PrestoGPModel",
     linear_model = "cv.glmnet", # the linear model
     X_train = "matrix", # the original independent variable matrix
     Y_train = "matrix", # the original dependent variable matrix
+    Y_bar = "numeric", # the means of each outcome variable
+    Y_obs = "logical", # a logical vector indicating whether the ith observation was observed
     locs_train = "list", # the location / temporal matrix
     converged = "logical", # a logical variable that is true if the model fitting process has converged
     LL_Vecchia_krig = "numeric", # the value of the negative log likelihood function after optimization
@@ -115,7 +119,8 @@ setGeneric(
   "prestogp_fit",
   function(model, Y, X, locs, scaling = NULL, apanasovich = FALSE,
     covparams = NULL, beta.hat = NULL, tol = 0.999999, max_iters = 100,
-    verbose = FALSE, optim.method = "Nelder-Mead",
+    center.y = NULL, impute.y = FALSE, lod = NULL, verbose = FALSE,
+    optim.method = "Nelder-Mead",
     optim.control = list(trace = 0, reltol = 1e-3, maxit = 5000),
     family = c("gaussian", "binomial"), nfolds = 10, foldid = NULL,
     parallel = FALSE) {
@@ -214,11 +219,12 @@ setGeneric("compute_residuals", function(model, Y, Y.hat, family) standardGeneri
 setGeneric("transform_data", function(model, Y, X) standardGeneric("transform_data"))
 setGeneric("estimate_theta", function(model, locs, optim.control, method) standardGeneric("estimate_theta"))
 setGeneric("estimate_betas", function(model, family, nfolds, foldid, parallel) standardGeneric("estimate_betas"))
+setGeneric("impute_y", function(model) standardGeneric("impute_y"))
 setGeneric("compute_error", function(model, y, X) standardGeneric("compute_error"))
 setGeneric("scale_locs", function(model, locs) standardGeneric("scale_locs"))
 setGeneric("theta_names", function(model) standardGeneric("theta_names"))
 setGeneric("transform_covariance_parameters", function(model) standardGeneric("transform_covariance_parameters"))
-setGeneric("check_input", function(model, Y, X, locs) standardGeneric("check_input"))
+setGeneric("check_input", function(model, Y, X, locs, center.y, impute.y, lod) standardGeneric("check_input"))
 setGeneric("check_input_pred", function(model, X, locs) standardGeneric("check_input_pred"))
 
 #' show
@@ -329,6 +335,14 @@ setMethod(
 #' tol*previous_error (optional). Defaults to 0.999999.
 #' @param max_iters Maximum number of iterations for the model fitting
 #' procedure. Defaults to 100.
+#' @param center.y Should the Y's be mean centered before fitting the model?
+#' Defaults to TRUE for gaussian models and FALSE for binomial models.
+#' @param impute.y Should missing Y's be imputed? Defaults to FALSE.
+#' @param lod Limit of detection value(s). Any Y value less than lod is
+#' assumed to be missing when performing missing data imputation. Should be
+#' numeric for univariate models and a list for multivariate models, where
+#' each element of the list corresponds to an outcome. If not specified, it
+#' is assumed that no limit of detection exists. Ignored if impute.y is FALSE.
 #' @param verbose If TRUE, additional information about model fit will be
 #' printed. Defaults to FALSE.
 #' @param optim.method Optimization method to be used for the maximum
@@ -408,13 +422,33 @@ setMethod(
 setMethod(
   "prestogp_fit", "PrestoGPModel",
   function(model, Y, X, locs, scaling = NULL, apanasovich = NULL,
-    covparams = NULL, beta.hat = NULL, tol = 0.999999,
-    max_iters = 100, verbose = FALSE, optim.method = "Nelder-Mead",
+    covparams = NULL, beta.hat = NULL, tol = 0.999999, max_iters = 100,
+    center.y = NULL, impute.y = FALSE, lod = NULL, verbose = FALSE,
+    optim.method = "Nelder-Mead",
     optim.control = list(trace = 0, reltol = 1e-3, maxit = 5000),
     family = c("gaussian", "binomial"),
     nfolds = 10, foldid = NULL, parallel = FALSE) {
-    model <- check_input(model, Y, X, locs)
     family <- match.arg(family)
+    if (is.null(center.y)) {
+      if (family == "gaussian") {
+        center.y <- TRUE
+      } else {
+        center.y <- FALSE
+      }
+    }
+    model <- check_input(model, Y, X, locs, center.y, impute.y, lod)
+    if (is.null(lod)) {
+      lodv <- rep(Inf, nrow(model@Y_train))
+    } else {
+      if (is.numeric(lod)) {
+        lod <- list(lod)
+      }
+      lodv <- NULL
+      for (i in seq_along(lod)) {
+        lodv <- c(lodv, rep(lod[[i]] - model@Y_bar[i],
+            nrow(model@locs_train[[i]])))
+      }
+    }
     if (!is.null(beta.hat)) {
       if (!is.vector(beta.hat) | !is.numeric(beta.hat)) {
         stop("beta.hat parameter must be a numeric vector")
@@ -524,6 +558,9 @@ setMethod(
             s = "lambda.1se",
             type = "response")
         )
+        model <- impute_y(model)
+        alod <- !model@Y_obs & model@Y_train > lodv
+        model@Y_train[alod] <- lodv[alod]
         covparams.iter <- model@covparams
         Vecchia.SCAD.iter <- model@linear_model
       } else {
@@ -653,7 +690,7 @@ setMethod("calc_covparams", "PrestoGPModel", function(model, locs, Y, covparams)
     col.vars <- rep(NA, P)
     D.sample.bar <- rep(NA, model@nscale * P)
     for (i in 1:P) {
-      col.vars[i] <- var(Y[[i]])
+      col.vars[i] <- var(Y[[i]], na.rm = TRUE)
       N <- length(Y[[i]])
       # TODO find a better way to compute initial spatial range
       for (j in 1:model@nscale) {

diff --git a/R/PrestoGP_Multivariate_Vecchia.R b/R/PrestoGP_Multivariate_Vecchia.R
@@ -38,7 +38,9 @@ setMethod("prestogp_predict", "MultivariateVecchiaModel",
     ordering.pred <- match.arg(ordering.pred)
     pred.cond <- match.arg(pred.cond)
     return.values <- match.arg(return.values)
-    X <- check_input_pred(model, X, locs)
+    pred.list <- check_input_pred(model, X, locs)
+    X <- pred.list$X
+    Y_bar <- pred.list$Y_bar
     if (is.null(m)) { # m defaults to the value used for training
       m <- model@n_neighbors
     }
@@ -91,7 +93,7 @@ setMethod("prestogp_predict", "MultivariateVecchiaModel",
 
     # prediction function can return both mean and sds
     # returns a list with elements mu.pred,mu.obs,var.pred,var.obs,V.ord
-    Vec.mean <- pred$mu.pred + Vecchia.Pred # residual + mean trend
+    Vec.mean <- Y_bar + pred$mu.pred + Vecchia.Pred # residual + mean trend
     if (return.values == "mean") {
       return.list <- list(means = Vec.mean)
     } else {
@@ -112,7 +114,7 @@ setMethod("prestogp_predict", "MultivariateVecchiaModel",
   }
 )
 
-setMethod("check_input", "MultivariateVecchiaModel", function(model, Y, X, locs) {
+setMethod("check_input", "MultivariateVecchiaModel", function(model, Y, X, locs, center.y, impute.y, lod) {
   if (!is.list(locs)) {
     stop("locs must be a list for multivariate models")
   }
@@ -128,6 +130,14 @@ setMethod("check_input", "MultivariateVecchiaModel", function(model, Y, X, locs)
   if (length(locs) != length(X)) {
     stop("locs and X must have the same length")
   }
+  if (!is.null(lod)) {
+    if (!is.list(lod)) {
+      stop("lod must be a list for multivariate models")
+    }
+    if (length(locs) != length(lod)) {
+      stop("locs and lod must have the same length")
+    }
+  }
   for (i in seq_along(locs)) {
     if (!is.matrix(locs[[i]])) {
       stop("Each locs must be a matrix")
@@ -155,9 +165,44 @@ setMethod("check_input", "MultivariateVecchiaModel", function(model, Y, X, locs)
     if (nrow(Y[[i]]) != nrow(X[[i]])) {
       stop("Each Y must have the same number of rows as X")
     }
+    if (sum(is.na(X[[i]])) > 0) {
+      stop("X must not contain NA's")
+    }
+    if (sum(is.na(locs[[i]])) > 0) {
+      stop("locs must not contain NA's")
+    }
+    if (sum(is.na(Y[[i]])) > 0 & !impute.y) {
+      stop("Y contains NA's and impute.y is FALSE. Set impute.y=TRUE to impute missing Y's.")
+    }
+    if (!is.null(lod[[i]])) {
+      if (!is.numeric(lod[[i]])) {
+        stop("Each lod must be numeric")
+      }
+      if (length(lod[[i]]) != 1) {
+        stop("Each lod must have length 1")
+      }
+    }
+  }
+  Y_bar <- rep(NA, length(Y))
+  Y_obs <- NULL
+  for (i in seq_along(Y)) {
+    Y_obs <- c(Y_obs, !is.na(Y[[i]]))
+    if (!is.null(lod)) {
+      Y[[i]][is.na(Y[[i]])] <- lod[[i]] / 2
+    }
+    if (center.y) {
+      Y_bar[i] <- mean(Y[[i]], na.rm = TRUE)
+      Y[[i]] <- Y[[i]] - Y_bar[i]
+      Y[[i]][is.na(Y[[i]])] <- 0
+    } else {
+      Y[[i]][is.na(Y[[i]])] <- mean(Y[[i]], na.rm = TRUE)
+      Y_bar[i] <- 0
+    }
   }
+  model@Y_bar <- Y_bar
   model@locs_train <- locs
   model@Y_train <- as.matrix(unlist(Y))
+  model@Y_obs <- Y_obs
   if (length(X) == 1) {
     model@X_train <- X[[1]]
   } else {
@@ -195,13 +240,76 @@ setMethod("check_input_pred", "MultivariateVecchiaModel", function(model, X, loc
   }
   if (length(X) == 1) {
     X <- X[[1]]
+    Y_bar <- rep(model@Y_bar[1], nrow(X))
   } else {
+    Y_bar <- NULL
+    for (i in seq_along(X)) {
+      Y_bar <- c(Y_bar, rep(model@Y_bar[i], nrow(X[[i]])))
+    }
     X <- psych::superMatrix(X)
   }
   if (ncol(X) != ncol(model@X_train)) {
     stop("X and X_train must have the same number of predictors")
   }
-  return(X)
+  return(list(X = X, Y_bar = Y_bar))
+})
+
+setMethod("impute_y", "MultivariateVecchiaModel", function(model) {
+  if (sum(!model@Y_obs) > 0) {
+    Vecchia.Pred <- predict(model@linear_model,
+      newx = model@X_train[!model@Y_obs, ],
+      s = model@linear_model$lambda[model@lambda_1se_idx])
+    Vecchia.hat <- predict(model@linear_model,
+      newx = model@X_train[model@Y_obs, ],
+      s = model@linear_model$lambda[model@lambda_1se_idx])
+
+    # Test set prediction
+    res <- model@Y_train[model@Y_obs] - Vecchia.hat
+
+    locs.scaled <- scale_locs(model, model@locs_train)
+    all.obs <- model@Y_obs
+    locs.otr <- list()
+    locs.otst <- list()
+    for (i in seq_along(model@locs_train)) {
+      nl <- nrow(locs.scaled[[i]])
+      cur.obs <- all.obs[1:nl]
+      locs.otr[[i]] <- locs.scaled[[i]][cur.obs, ]
+      locs.otst[[i]] <- locs.scaled[[i]][!cur.obs, ]
+      all.obs <- all.obs[-(1:nl)]
+    }
+
+    if (model@apanasovich & (length(model@locs_train) > 1)) {
+      locs.nd <- eliminate_dupes(locs.otr, locs.otst)
+      locs.otr <- locs.nd$locs
+      locs.otst <- locs.nd$locs.pred
+    }
+    vec.approx.test <- vecchia_Mspecify(locs.otr, model@n_neighbors,
+      locs.list.pred = locs.otst,
+      ordering.pred = "obspred",
+      pred.cond = "independent"
+    )
+
+    ## carry out prediction
+    if (!model@apanasovich) {
+      params <- model@covparams
+      param.seq <- model@param_sequence
+      pred <- vecchia_Mprediction(res, vec.approx.test,
+        c(
+          params[1:param.seq[1, 2]],
+          rep(1, param.seq[2, 2] - param.seq[2, 1] + 1),
+          params[param.seq[3, 1]:param.seq[5, 2]]
+        ),
+        return.values = "mean"
+      )
+    } else {
+      pred <- vecchia_Mprediction(res, vec.approx.test, model@covparams,
+        return.values = "mean"
+      )
+    }
+
+    model@Y_train[!model@Y_obs] <- pred$mu.pred + Vecchia.Pred
+  }
+  invisible(model)
 })
 
 setMethod("specify", "MultivariateVecchiaModel", function(model) {

diff --git a/R/PrestoGP_Util_Functions.R b/R/PrestoGP_Util_Functions.R
diff --git a/R/PrestoGP_Vecchia.R b/R/PrestoGP_Vecchia.R
@@ -81,7 +81,8 @@ setMethod("prestogp_predict", "VecchiaModel",
 
     # prediction function can return both mean and sds
     # returns a list with elements mu.pred,mu.obs,var.pred,var.obs,V.ord
-    Vec.mean <- pred$mu.pred + Vecchia.Pred # residual + mean trend
+    # residual + mean trend
+    Vec.mean <- model@Y_bar + pred$mu.pred + Vecchia.Pred
     if (return.values == "mean") {
       return.list <- list(means = Vec.mean)
     } else {
@@ -94,7 +95,7 @@ setMethod("prestogp_predict", "VecchiaModel",
   }
 )
 
-setMethod("check_input", "VecchiaModel", function(model, Y, X, locs) {
+setMethod("check_input", "VecchiaModel", function(model, Y, X, locs, center.y, impute.y, lod) {
   if (!is.matrix(locs)) {
     stop("locs must be a matrix")
   }
@@ -116,6 +117,35 @@ setMethod("check_input", "VecchiaModel", function(model, Y, X, locs) {
   if (nrow(Y) != nrow(X)) {
     stop("Y must have the same number of rows as X")
   }
+  if (sum(is.na(X)) > 0) {
+    stop("X must not contain NA's")
+  }
+  if (sum(is.na(locs)) > 0) {
+    stop("locs must not contain NA's")
+  }
+  if (sum(is.na(Y)) > 0 & !impute.y) {
+    stop("Y contains NA's and impute.y is FALSE. Set impute.y=TRUE to impute missing Y's.")
+  }
+  if (!is.null(lod)) {
+    if (!is.numeric(lod)) {
+      stop("lod must be numeric")
+    }
+    if (length(lod) != 1) {
+      stop("lod must have length 1")
+    }
+  }
+  model@Y_obs <- !is.na(as.vector(Y))
+  if (!is.null(lod)) {
+    Y[!model@Y_obs] <- lod / 2
+  }
+  if (center.y) {
+    model@Y_bar <- mean(Y, na.rm = TRUE)
+    Y <- Y - model@Y_bar
+    Y[is.na(Y)] <- 0
+  } else {
+    model@Y_bar <- 0
+    Y[is.na(Y)] <- mean(Y, na.rm = TRUE)
+  }
   model@X_train <- X
   model@Y_train <- Y
   model@locs_train <- list(locs)
@@ -141,6 +171,47 @@ setMethod("check_input_pred", "VecchiaModel", function(model, X, locs) {
   invisible(model)
 })
 
+setMethod("impute_y", "VecchiaModel", function(model) {
+  if (sum(!model@Y_obs) > 0) {
+    Vecchia.Pred <- predict(model@linear_model,
+      newx = model@X_train[!model@Y_obs, ],
+      s = model@linear_model$lambda[model@lambda_1se_idx])
+    Vecchia.hat <- predict(model@linear_model,
+      newx = model@X_train[model@Y_obs, ],
+      s = model@linear_model$lambda[model@lambda_1se_idx])
+
+    # Test set prediction
+    res <- model@Y_train[model@Y_obs] - Vecchia.hat
+
+    locs.scaled <- scale_locs(model, model@locs_train)[[1]]
+    vec.approx.test <- vecchia_specify(locs.scaled[model@Y_obs, ],
+      model@n_neighbors,
+      locs.pred = locs.scaled[!model@Y_obs, ],
+      ordering.pred = "obspred", pred.cond = "independent")
+
+    ## carry out prediction
+    if (!model@apanasovich) {
+      pred <- vecchia_prediction(
+        res,
+        vec.approx.test,
+        c(model@covparams[1], 1, model@covparams[3]),
+        model@covparams[4],
+        return.values = "mean"
+      )
+    } else {
+      pred <- vecchia_prediction(
+        res,
+        vec.approx.test,
+        c(model@covparams[1], model@covparams[2], model@covparams[3]),
+        model@covparams[4], return.values = "mean"
+      )
+    }
+
+    model@Y_train[!model@Y_obs] <- pred$mu.pred + Vecchia.Pred
+  }
+  invisible(model)
+})
+
 #' specify
 #'
 #' Specify the conditioning set using m nearest neighbors.