Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement imputation beta #56

Merged
merged 2 commits into from
Apr 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file modified .Rbuildignore
100755 → 100644
Empty file.
Empty file modified .github/workflows/check-standard.yaml
100644 → 100755
Empty file.
Empty file modified .github/workflows/lint.yaml
100644 → 100755
Empty file.
Empty file modified .github/workflows/release.yaml
100644 → 100755
Empty file.
Empty file modified .github/workflows/sanitizers.yaml
100644 → 100755
Empty file.
Empty file modified .github/workflows/test-coverage.yaml
100644 → 100755
Empty file.
Empty file modified .gitignore
100755 → 100644
Empty file.
Empty file modified .lintr
100644 → 100755
Empty file.
2 changes: 1 addition & 1 deletion DESCRIPTION
100755 → 100644
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: PrestoGP
Type: Package
Title: Penalized Regression for Spatio-Temporal Outcomes via Gaussian Processes
Version: 0.2.0.9026
Version: 0.2.0.9029
Authors@R: c(
person(given = "Eric",
family = "Bair",
Expand Down
Empty file modified Makefile
100755 → 100644
Empty file.
Empty file modified NAMESPACE
100755 → 100644
Empty file.
Empty file modified R/Log_Likelihood.R
100755 → 100644
Empty file.
Empty file modified R/PrestoGP-package.R
100755 → 100644
Empty file.
Empty file modified R/PrestoGP_CreateU_Multivariate.R
100755 → 100644
Empty file.
Empty file modified R/PrestoGP_Full.R
100755 → 100644
Empty file.
49 changes: 43 additions & 6 deletions R/PrestoGP_Model.R
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ setOldClass("cv.glmnet")
#' "super matrix" for multivariate models. See
#' \code{\link[psych]{superMatrix}}.
#' @slot Y_train A column matrix containing the original response values.
#' @slot Y_bar A vector containing the means of each outcome.
#' @slot Y_obs A logical vector used to track which values of Y are non-missing.
#' @slot X_tilde The matrix of transformed predictors.
#' @slot y_tilde The column matrix containing the transformed response values.
#' @slot res numeric.
Expand Down Expand Up @@ -78,6 +80,8 @@ PrestoGPModel <- setClass("PrestoGPModel",
linear_model = "cv.glmnet", # the linear model
X_train = "matrix", # the original independent variable matrix
Y_train = "matrix", # the original dependent variable matrix
Y_bar = "numeric", # the means of each outcome variable
Y_obs = "logical", # a logical vector indicating whether the ith observation was observed
locs_train = "list", # the location / temporal matrix
converged = "logical", # a logical variable that is true if the model fitting process has converged
LL_Vecchia_krig = "numeric", # the value of the negative log likelihood function after optimization
Expand Down Expand Up @@ -115,7 +119,8 @@ setGeneric(
"prestogp_fit",
function(model, Y, X, locs, scaling = NULL, apanasovich = FALSE,
covparams = NULL, beta.hat = NULL, tol = 0.999999, max_iters = 100,
verbose = FALSE, optim.method = "Nelder-Mead",
center.y = NULL, impute.y = FALSE, lod = NULL, verbose = FALSE,
optim.method = "Nelder-Mead",
optim.control = list(trace = 0, reltol = 1e-3, maxit = 5000),
family = c("gaussian", "binomial"), nfolds = 10, foldid = NULL,
parallel = FALSE) {
Expand Down Expand Up @@ -214,11 +219,12 @@ setGeneric("compute_residuals", function(model, Y, Y.hat, family) standardGeneri
setGeneric("transform_data", function(model, Y, X) standardGeneric("transform_data"))
setGeneric("estimate_theta", function(model, locs, optim.control, method) standardGeneric("estimate_theta"))
setGeneric("estimate_betas", function(model, family, nfolds, foldid, parallel) standardGeneric("estimate_betas"))
setGeneric("impute_y", function(model) standardGeneric("impute_y"))
setGeneric("compute_error", function(model, y, X) standardGeneric("compute_error"))
setGeneric("scale_locs", function(model, locs) standardGeneric("scale_locs"))
setGeneric("theta_names", function(model) standardGeneric("theta_names"))
setGeneric("transform_covariance_parameters", function(model) standardGeneric("transform_covariance_parameters"))
setGeneric("check_input", function(model, Y, X, locs) standardGeneric("check_input"))
setGeneric("check_input", function(model, Y, X, locs, center.y, impute.y, lod) standardGeneric("check_input"))
setGeneric("check_input_pred", function(model, X, locs) standardGeneric("check_input_pred"))

#' show
Expand Down Expand Up @@ -329,6 +335,14 @@ setMethod(
#' tol*previous_error (optional). Defaults to 0.999999.
#' @param max_iters Maximum number of iterations for the model fitting
#' procedure. Defaults to 100.
#' @param center.y Should the Y's be mean centered before fitting the model?
#' Defaults to TRUE for gaussian models and FALSE for binomial models.
#' @param impute.y Should missing Y's be imputed? Defaults to FALSE.
#' @param lod Limit of detection value(s). Any Y value less than lod is
#' assumed to be missing when performing missing data imputation. Should be
#' numeric for univariate models and a list for multivariate models, where
#' each element of the list corresponds to an outcome. If not specified, it
#' is assumed that no limit of detection exists. Ignored if impute.y is FALSE.
#' @param verbose If TRUE, additional information about model fit will be
#' printed. Defaults to FALSE.
#' @param optim.method Optimization method to be used for the maximum
Expand Down Expand Up @@ -408,13 +422,33 @@ setMethod(
setMethod(
"prestogp_fit", "PrestoGPModel",
function(model, Y, X, locs, scaling = NULL, apanasovich = NULL,
covparams = NULL, beta.hat = NULL, tol = 0.999999,
max_iters = 100, verbose = FALSE, optim.method = "Nelder-Mead",
covparams = NULL, beta.hat = NULL, tol = 0.999999, max_iters = 100,
center.y = NULL, impute.y = FALSE, lod = NULL, verbose = FALSE,
optim.method = "Nelder-Mead",
optim.control = list(trace = 0, reltol = 1e-3, maxit = 5000),
family = c("gaussian", "binomial"),
nfolds = 10, foldid = NULL, parallel = FALSE) {
model <- check_input(model, Y, X, locs)
family <- match.arg(family)
if (is.null(center.y)) {
if (family == "gaussian") {
center.y <- TRUE
} else {
center.y <- FALSE
}
}
model <- check_input(model, Y, X, locs, center.y, impute.y, lod)
if (is.null(lod)) {
lodv <- rep(Inf, nrow(model@Y_train))
} else {
if (is.numeric(lod)) {
lod <- list(lod)
}
lodv <- NULL
for (i in seq_along(lod)) {
lodv <- c(lodv, rep(lod[[i]] - model@Y_bar[i],
nrow(model@locs_train[[i]])))
}
}
if (!is.null(beta.hat)) {
if (!is.vector(beta.hat) | !is.numeric(beta.hat)) {
stop("beta.hat parameter must be a numeric vector")
Expand Down Expand Up @@ -524,6 +558,9 @@ setMethod(
s = "lambda.1se",
type = "response")
)
model <- impute_y(model)
alod <- !model@Y_obs & model@Y_train > lodv
model@Y_train[alod] <- lodv[alod]
covparams.iter <- model@covparams
Vecchia.SCAD.iter <- model@linear_model
} else {
Expand Down Expand Up @@ -653,7 +690,7 @@ setMethod("calc_covparams", "PrestoGPModel", function(model, locs, Y, covparams)
col.vars <- rep(NA, P)
D.sample.bar <- rep(NA, model@nscale * P)
for (i in 1:P) {
col.vars[i] <- var(Y[[i]])
col.vars[i] <- var(Y[[i]], na.rm = TRUE)
N <- length(Y[[i]])
# TODO find a better way to compute initial spatial range
for (j in 1:model@nscale) {
Expand Down
116 changes: 112 additions & 4 deletions R/PrestoGP_Multivariate_Vecchia.R
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,9 @@ setMethod("prestogp_predict", "MultivariateVecchiaModel",
ordering.pred <- match.arg(ordering.pred)
pred.cond <- match.arg(pred.cond)
return.values <- match.arg(return.values)
X <- check_input_pred(model, X, locs)
pred.list <- check_input_pred(model, X, locs)
X <- pred.list$X
Y_bar <- pred.list$Y_bar
if (is.null(m)) { # m defaults to the value used for training
m <- model@n_neighbors
}
Expand Down Expand Up @@ -91,7 +93,7 @@ setMethod("prestogp_predict", "MultivariateVecchiaModel",

# prediction function can return both mean and sds
# returns a list with elements mu.pred,mu.obs,var.pred,var.obs,V.ord
Vec.mean <- pred$mu.pred + Vecchia.Pred # residual + mean trend
Vec.mean <- Y_bar + pred$mu.pred + Vecchia.Pred # residual + mean trend
if (return.values == "mean") {
return.list <- list(means = Vec.mean)
} else {
Expand All @@ -112,7 +114,7 @@ setMethod("prestogp_predict", "MultivariateVecchiaModel",
}
)

setMethod("check_input", "MultivariateVecchiaModel", function(model, Y, X, locs) {
setMethod("check_input", "MultivariateVecchiaModel", function(model, Y, X, locs, center.y, impute.y, lod) {
if (!is.list(locs)) {
stop("locs must be a list for multivariate models")
}
Expand All @@ -128,6 +130,14 @@ setMethod("check_input", "MultivariateVecchiaModel", function(model, Y, X, locs)
if (length(locs) != length(X)) {
stop("locs and X must have the same length")
}
if (!is.null(lod)) {
if (!is.list(lod)) {
stop("lod must be a list for multivariate models")
}
if (length(locs) != length(lod)) {
stop("locs and lod must have the same length")
}
}
for (i in seq_along(locs)) {
if (!is.matrix(locs[[i]])) {
stop("Each locs must be a matrix")
Expand Down Expand Up @@ -155,9 +165,44 @@ setMethod("check_input", "MultivariateVecchiaModel", function(model, Y, X, locs)
if (nrow(Y[[i]]) != nrow(X[[i]])) {
stop("Each Y must have the same number of rows as X")
}
if (sum(is.na(X[[i]])) > 0) {
stop("X must not contain NA's")
}
if (sum(is.na(locs[[i]])) > 0) {
stop("locs must not contain NA's")
}
if (sum(is.na(Y[[i]])) > 0 & !impute.y) {
stop("Y contains NA's and impute.y is FALSE. Set impute.y=TRUE to impute missing Y's.")
}
if (!is.null(lod[[i]])) {
if (!is.numeric(lod[[i]])) {
stop("Each lod must be numeric")
}
if (length(lod[[i]]) != 1) {
stop("Each lod must have length 1")
}
}
}
Y_bar <- rep(NA, length(Y))
Y_obs <- NULL
for (i in seq_along(Y)) {
Y_obs <- c(Y_obs, !is.na(Y[[i]]))
if (!is.null(lod)) {
Y[[i]][is.na(Y[[i]])] <- lod[[i]] / 2
}
if (center.y) {
Y_bar[i] <- mean(Y[[i]], na.rm = TRUE)
Y[[i]] <- Y[[i]] - Y_bar[i]
Y[[i]][is.na(Y[[i]])] <- 0
} else {
Y[[i]][is.na(Y[[i]])] <- mean(Y[[i]], na.rm = TRUE)
Y_bar[i] <- 0
}
}
model@Y_bar <- Y_bar
model@locs_train <- locs
model@Y_train <- as.matrix(unlist(Y))
model@Y_obs <- Y_obs
if (length(X) == 1) {
model@X_train <- X[[1]]
} else {
Expand Down Expand Up @@ -195,13 +240,76 @@ setMethod("check_input_pred", "MultivariateVecchiaModel", function(model, X, loc
}
if (length(X) == 1) {
X <- X[[1]]
Y_bar <- rep(model@Y_bar[1], nrow(X))
} else {
Y_bar <- NULL
for (i in seq_along(X)) {
Y_bar <- c(Y_bar, rep(model@Y_bar[i], nrow(X[[i]])))
}
X <- psych::superMatrix(X)
}
if (ncol(X) != ncol(model@X_train)) {
stop("X and X_train must have the same number of predictors")
}
return(X)
return(list(X = X, Y_bar = Y_bar))
})

setMethod("impute_y", "MultivariateVecchiaModel", function(model) {
if (sum(!model@Y_obs) > 0) {
Vecchia.Pred <- predict(model@linear_model,
newx = model@X_train[!model@Y_obs, ],
s = model@linear_model$lambda[model@lambda_1se_idx])
Vecchia.hat <- predict(model@linear_model,
newx = model@X_train[model@Y_obs, ],
s = model@linear_model$lambda[model@lambda_1se_idx])

# Test set prediction
res <- model@Y_train[model@Y_obs] - Vecchia.hat

locs.scaled <- scale_locs(model, model@locs_train)
all.obs <- model@Y_obs
locs.otr <- list()
locs.otst <- list()
for (i in seq_along(model@locs_train)) {
nl <- nrow(locs.scaled[[i]])
cur.obs <- all.obs[1:nl]
locs.otr[[i]] <- locs.scaled[[i]][cur.obs, ]
locs.otst[[i]] <- locs.scaled[[i]][!cur.obs, ]
all.obs <- all.obs[-(1:nl)]
}

if (model@apanasovich & (length(model@locs_train) > 1)) {
locs.nd <- eliminate_dupes(locs.otr, locs.otst)
locs.otr <- locs.nd$locs
locs.otst <- locs.nd$locs.pred
}
vec.approx.test <- vecchia_Mspecify(locs.otr, model@n_neighbors,
locs.list.pred = locs.otst,
ordering.pred = "obspred",
pred.cond = "independent"
)

## carry out prediction
if (!model@apanasovich) {
params <- model@covparams
param.seq <- model@param_sequence
pred <- vecchia_Mprediction(res, vec.approx.test,
c(
params[1:param.seq[1, 2]],
rep(1, param.seq[2, 2] - param.seq[2, 1] + 1),
params[param.seq[3, 1]:param.seq[5, 2]]
),
return.values = "mean"
)
} else {
pred <- vecchia_Mprediction(res, vec.approx.test, model@covparams,
return.values = "mean"
)
}

model@Y_train[!model@Y_obs] <- pred$mu.pred + Vecchia.Pred
}
invisible(model)
})

setMethod("specify", "MultivariateVecchiaModel", function(model) {
Expand Down
Empty file modified R/PrestoGP_Util_Functions.R
100755 → 100644
Empty file.
75 changes: 73 additions & 2 deletions R/PrestoGP_Vecchia.R
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,8 @@ setMethod("prestogp_predict", "VecchiaModel",

# prediction function can return both mean and sds
# returns a list with elements mu.pred,mu.obs,var.pred,var.obs,V.ord
Vec.mean <- pred$mu.pred + Vecchia.Pred # residual + mean trend
# residual + mean trend
Vec.mean <- model@Y_bar + pred$mu.pred + Vecchia.Pred
if (return.values == "mean") {
return.list <- list(means = Vec.mean)
} else {
Expand All @@ -94,7 +95,7 @@ setMethod("prestogp_predict", "VecchiaModel",
}
)

setMethod("check_input", "VecchiaModel", function(model, Y, X, locs) {
setMethod("check_input", "VecchiaModel", function(model, Y, X, locs, center.y, impute.y, lod) {
if (!is.matrix(locs)) {
stop("locs must be a matrix")
}
Expand All @@ -116,6 +117,35 @@ setMethod("check_input", "VecchiaModel", function(model, Y, X, locs) {
if (nrow(Y) != nrow(X)) {
stop("Y must have the same number of rows as X")
}
if (sum(is.na(X)) > 0) {
stop("X must not contain NA's")
}
if (sum(is.na(locs)) > 0) {
stop("locs must not contain NA's")
}
if (sum(is.na(Y)) > 0 & !impute.y) {
stop("Y contains NA's and impute.y is FALSE. Set impute.y=TRUE to impute missing Y's.")
}
if (!is.null(lod)) {
if (!is.numeric(lod)) {
stop("lod must be numeric")
}
if (length(lod) != 1) {
stop("lod must have length 1")
}
}
model@Y_obs <- !is.na(as.vector(Y))
if (!is.null(lod)) {
Y[!model@Y_obs] <- lod / 2
}
if (center.y) {
model@Y_bar <- mean(Y, na.rm = TRUE)
Y <- Y - model@Y_bar
Y[is.na(Y)] <- 0
} else {
model@Y_bar <- 0
Y[is.na(Y)] <- mean(Y, na.rm = TRUE)
}
model@X_train <- X
model@Y_train <- Y
model@locs_train <- list(locs)
Expand All @@ -141,6 +171,47 @@ setMethod("check_input_pred", "VecchiaModel", function(model, X, locs) {
invisible(model)
})

setMethod("impute_y", "VecchiaModel", function(model) {
if (sum(!model@Y_obs) > 0) {
Vecchia.Pred <- predict(model@linear_model,
newx = model@X_train[!model@Y_obs, ],
s = model@linear_model$lambda[model@lambda_1se_idx])
Vecchia.hat <- predict(model@linear_model,
newx = model@X_train[model@Y_obs, ],
s = model@linear_model$lambda[model@lambda_1se_idx])

# Test set prediction
res <- model@Y_train[model@Y_obs] - Vecchia.hat

locs.scaled <- scale_locs(model, model@locs_train)[[1]]
vec.approx.test <- vecchia_specify(locs.scaled[model@Y_obs, ],
model@n_neighbors,
locs.pred = locs.scaled[!model@Y_obs, ],
ordering.pred = "obspred", pred.cond = "independent")

## carry out prediction
if (!model@apanasovich) {
pred <- vecchia_prediction(
res,
vec.approx.test,
c(model@covparams[1], 1, model@covparams[3]),
model@covparams[4],
return.values = "mean"
)
} else {
pred <- vecchia_prediction(
res,
vec.approx.test,
c(model@covparams[1], model@covparams[2], model@covparams[3]),
model@covparams[4], return.values = "mean"
)
}

model@Y_train[!model@Y_obs] <- pred$mu.pred + Vecchia.Pred
}
invisible(model)
})

#' specify
#'
#' Specify the conditioning set using m nearest neighbors.
Expand Down
Loading
Loading