Skip to content

Commit

Permalink
update to 1.1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
ericward-noaa committed May 25, 2021
1 parent 037b6a9 commit d85e775
Show file tree
Hide file tree
Showing 95 changed files with 3,120 additions and 2,949 deletions.
2 changes: 2 additions & 0 deletions CRAN-RELEASE
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
This package was submitted to CRAN on 2021-05-18.
Once it is accepted, delete this file and tag the release (commit 037b6a9).
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: bayesdfa
Type: Package
Title: Bayesian Dynamic Factor Analysis (DFA) with 'Stan'
Version: 1.0.0
Version: 1.1.0
Authors@R: c(
person(c("Eric", "J."), "Ward", role = c("aut", "cre"),
email = "eric.ward@noaa.gov"),
Expand Down
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,7 @@
# bayesdfa 1.0.0

* Added constraint on diagonal of Z matrix to keep parameter estimates from 'flipping' within MCMC chains. Ensures convergence for problematic cases. This was present in 0.1.1, but later removed.

# bayesdfa 1.1.0

* Following 1.0.0, added a new argument, 'expansion_prior', to the fit_dfa() function that allows the user to toggle the constraint on or off. If not included (default = FALSE), there is no constraint on the Z diagonal, and post-hoc MCMC chain inverting resolves identifiability. If 'expansion_prior' = TRUE, then the positive constraint is applied, in combination with the expansion prior for trends and loadings.
5 changes: 2 additions & 3 deletions R/converge_rhat.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,8 @@
#' @export
#'
is_converged <- function(fitted_model,
threshold = 1.05,
parameters = c("sigma", "x", "Z")) {

threshold = 1.05,
parameters = c("sigma", "x", "Z")) {
Rhats <-
fitted_model$monitor[which(grepl(
paste(parameters, collapse = "|"),
Expand Down
172 changes: 88 additions & 84 deletions R/dfa_cv.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,79 +20,79 @@
#' \dontrun{
#' set.seed(42)
#' s <- sim_dfa(num_trends = 1, num_years = 20, num_ts = 3)
#' obs <- c(s$y_sim[1,], s$y_sim[2,], s$y_sim[3,])
#' long = data.frame("obs" = obs, "ts" = sort(rep(1:3,20)), "time" = rep(1:20,3))
#' m <- fit_dfa(y = long, iter = 50, chains = 1, data_shape="long", sample=FALSE)
#' obs <- c(s$y_sim[1, ], s$y_sim[2, ], s$y_sim[3, ])
#' long <- data.frame("obs" = obs, "ts" = sort(rep(1:3, 20)), "time" = rep(1:20, 3))
#' m <- fit_dfa(y = long, iter = 50, chains = 1, data_shape = "long", sample = FALSE)
#' # random folds
#' fit_cv = dfa_cv(m, cv_method="loocv", n_folds = 5, iter=50, chains=1)
#' fit_cv <- dfa_cv(m, cv_method = "loocv", n_folds = 5, iter = 50, chains = 1)
#'
#' # folds can also be passed in
#' fold_ids = sample(1:5, size=nrow(long), replace=TRUE)
#' m <- fit_dfa(y = long, iter = 50, chains = 1, data_shape="long", sample=FALSE)
#' fit_cv = dfa_cv(m, cv_method="loocv", n_folds = 5, iter=50, chains=1, fold_ids=fold_ids)
#' fold_ids <- sample(1:5, size = nrow(long), replace = TRUE)
#' m <- fit_dfa(y = long, iter = 50, chains = 1, data_shape = "long", sample = FALSE)
#' fit_cv <- dfa_cv(m, cv_method = "loocv", n_folds = 5, iter = 50, chains = 1, fold_ids = fold_ids)
#'
#' # do an example of leave-time-out cross validation where years are dropped
#' fold_ids = long$time
#' m <- fit_dfa(y = long, iter = 50, chains = 1, data_shape="long", sample=FALSE)
#' fit_cv = dfa_cv(m, cv_method="loocv", iter=100, chains=1, fold_ids = fold_ids)
#' fold_ids <- long$time
#' m <- fit_dfa(y = long, iter = 50, chains = 1, data_shape = "long", sample = FALSE)
#' fit_cv <- dfa_cv(m, cv_method = "loocv", iter = 100, chains = 1, fold_ids = fold_ids)
#'
#' # example with covariates and long format data
#' obs_covar = expand.grid("time"=1:20,"timeseries"=1:3,"covariate"=1:2)
#' obs_covar$value=rnorm(nrow(obs_covar),0,0.1)
#' obs <- c(s$y_sim[1,], s$y_sim[2,], s$y_sim[3,])
#' m <- fit_dfa(y = long, iter = 50, chains = 1, obs_covar=obs_covar,data_shape="long", sample=FALSE)
#' fit_cv = dfa_cv(m, cv_method="loocv", n_folds = 5, iter=50, chains=1)
#' obs_covar <- expand.grid("time" = 1:20, "timeseries" = 1:3, "covariate" = 1:2)
#' obs_covar$value <- rnorm(nrow(obs_covar), 0, 0.1)
#' obs <- c(s$y_sim[1, ], s$y_sim[2, ], s$y_sim[3, ])
#' m <- fit_dfa(y = long, iter = 50, chains = 1, obs_covar = obs_covar, data_shape = "long", sample = FALSE)
#' fit_cv <- dfa_cv(m, cv_method = "loocv", n_folds = 5, iter = 50, chains = 1)
#' }
#'
dfa_cv <- function(stanfit,
cv_method = c("loocv","lfocv"),
fold_ids = NULL,
n_folds = 10,
iter = 2000,
chains = 4,
thin = 1,
...) {

cv_method <- match.arg(cv_method, c("loocv","lfocv"))
if(is.null(fold_ids)) {
cv_method = c("loocv", "lfocv"),
fold_ids = NULL,
n_folds = 10,
iter = 2000,
chains = 4,
thin = 1,
...) {
cv_method <- match.arg(cv_method, c("loocv", "lfocv"))
if (is.null(fold_ids)) {
warning("the vector fold_ids containing fold ids is null, so random folds are being used")
fold_ids <- sample(1:n_folds, nrow(stanfit$orig_data), replace=TRUE)
fold_ids <- sample(1:n_folds, nrow(stanfit$orig_data), replace = TRUE)
}
if(length(fold_ids) != nrow(stanfit$orig_data)) {
if (length(fold_ids) != nrow(stanfit$orig_data)) {
stop("The length of the vector fold_ids needs to be the same as the number of rows in the long format dataframe")
}
if(stanfit$shape!="long") {
if (stanfit$shape != "long") {
stop("Error, please reshape the data into long format")
}

if(!is.null(fold_ids)) n_folds = max(fold_ids)
if (!is.null(fold_ids)) n_folds <- max(fold_ids)
y <- stanfit$orig_data
y$time = y$time - min(y$time) + 1
y$time <- y$time - min(y$time) + 1

# loop over the folds, re-fitting the dfa model each time with the folds held out
log_lik <- matrix(0, nrow=ceiling(iter/(2*thin))*chains, ncol=n_folds)
for(f in 1:n_folds) {
log_lik <- matrix(0, nrow = ceiling(iter / (2 * thin)) * chains, ncol = n_folds)
for (f in 1:n_folds) {

# fit model holding out each time slice. subset observed data and covar
y_train <- y
y_train[which(fold_ids == f),"obs"] <- NA
y_test <- y[which(fold_ids == f),]
obs_covar_train=NULL
if(length(stanfit$sampling_args$data$obs_covar_value) > 0) {
stanfit$obs_covar$time_timeseries <- paste(stanfit$obs_covar$time,stanfit$obs_covar$timeseries)
y_train$time_timeseries <- paste(y_train$time,y_train$ts)
y_test$time_timeseries <- paste(y_test$time,y_test$ts)
obs_covar_train <- stanfit$obs_covar[which(stanfit$obs_covar$time_timeseries %in% y_train$time_timeseries[which(fold_ids!=f)]),1:4]
obs_covar_test <- stanfit$obs_covar[which(stanfit$obs_covar$time_timeseries %in% y_test$time_timeseries),1:4]
y_train[which(fold_ids == f), "obs"] <- NA
y_test <- y[which(fold_ids == f), ]
obs_covar_train <- NULL
if (length(stanfit$sampling_args$data$obs_covar_value) > 0) {
stanfit$obs_covar$time_timeseries <- paste(stanfit$obs_covar$time, stanfit$obs_covar$timeseries)
y_train$time_timeseries <- paste(y_train$time, y_train$ts)
y_test$time_timeseries <- paste(y_test$time, y_test$ts)
obs_covar_train <- stanfit$obs_covar[which(stanfit$obs_covar$time_timeseries %in% y_train$time_timeseries[which(fold_ids != f)]), 1:4]
obs_covar_test <- stanfit$obs_covar[which(stanfit$obs_covar$time_timeseries %in% y_test$time_timeseries), 1:4]
}
pro_covar_train=NULL
if(length(stanfit$sampling_args$data$pro_covar_value) > 0) {
pro_covar_train <- stanfit$pro_covar[which(fold_ids != f),]
pro_covar_test <- stanfit$pro_covar[which(fold_ids == f),]
pro_covar_train <- NULL
if (length(stanfit$sampling_args$data$pro_covar_value) > 0) {
pro_covar_train <- stanfit$pro_covar[which(fold_ids != f), ]
pro_covar_test <- stanfit$pro_covar[which(fold_ids == f), ]
}

# fit the new model
fit.mod <- fit_dfa(y = y_train,
fit.mod <- fit_dfa(
y = y_train,
num_trends = stanfit$sampling_args$data$K,
varIndx = stanfit$sampling_args$data$varIndx,
zscore = stanfit$zscore,
Expand All @@ -103,7 +103,7 @@ dfa_cv <- function(stanfit,
nu_fixed = stanfit$sampling_args$data$nu_fixed,
est_correlation = stanfit$sampling_args$data$est_cor,
estimate_nu = stanfit$sampling_args$data$estimate_nu,
estimate_trend_ar = ifelse(stanfit$sampling_args$data$est_phi==1, TRUE, FALSE),
estimate_trend_ar = ifelse(stanfit$sampling_args$data$est_phi == 1, TRUE, FALSE),
estimate_trend_ma = ifelse(stanfit$sampling_args$data$est_theta == 1, TRUE, FALSE),
estimate_process_sigma = ifelse(stanfit$sampling_args$data$est_sigma_process == 1, TRUE, FALSE),
equal_process_sigma = ifelse(stanfit$sampling_args$data$n_sigma_process == 1, TRUE, FALSE),
Expand All @@ -114,63 +114,67 @@ dfa_cv <- function(stanfit,
z_bound = stanfit$z_bound,
z_model = stanfit$z_model,
trend_model = stanfit$trend_model,
verbose = FALSE)
verbose = FALSE
)

# extract posterior parameters for the training set
pars <- rstan::extract(fit.mod$model)
r <- rotate_trends(fit.mod)
# loop over each iterations (mcmc sample)
for(j in 1:nrow(log_lik)) {
for (j in 1:nrow(log_lik)) {

# determine if covariates are included
obs_covar_offset = rep(0, nrow(y_test))
if(is.null(obs_covar_train) & is.null(pro_covar_train)) {
#pred <- pars$Z[j,,] %*% matrix(pars$x[j,,],nrow=stanfit$sampling_args$data$K)
pred <- r$Z_rot[j,,] %*% matrix(r$trends[j,,],nrow=stanfit$sampling_args$data$K)
obs_covar_offset <- rep(0, nrow(y_test))
if (is.null(obs_covar_train) & is.null(pro_covar_train)) {
# pred <- pars$Z[j,,] %*% matrix(pars$x[j,,],nrow=stanfit$sampling_args$data$K)
pred <- r$Z_rot[j, , ] %*% matrix(r$trends[j, , ], nrow = stanfit$sampling_args$data$K)
# subset predictions corresponding to observations
pred <- pred[cbind(y_test$ts,y_test$time)]
#pred = pars$Z[j,,] %*% matrix(pars$xstar[j,,],ncol=1)
pred <- pred[cbind(y_test$ts, y_test$time)]
# pred = pars$Z[j,,] %*% matrix(pars$xstar[j,,],ncol=1)
}
if(!is.null(obs_covar_train) & is.null(pro_covar_train)) {
#pred = pars$Z[j,,] %*% matrix(pars$xstar[j,,],ncol=1) + pars$b_obs[j,,] * obs_covar_test$value
#pred <- pars$Z[j,,] %*% matrix(pars$x[j,,],nrow=stanfit$sampling_args$data$K)
pred <- r$Z_rot[j,,] %*% matrix(r$trends[j,,],nrow=stanfit$sampling_args$data$K)
pred <- pred[cbind(y_test$ts,y_test$time)]
for(i in 1:max(obs_covar_test$covariate)) {
if (!is.null(obs_covar_train) & is.null(pro_covar_train)) {
# pred = pars$Z[j,,] %*% matrix(pars$xstar[j,,],ncol=1) + pars$b_obs[j,,] * obs_covar_test$value
# pred <- pars$Z[j,,] %*% matrix(pars$x[j,,],nrow=stanfit$sampling_args$data$K)
pred <- r$Z_rot[j, , ] %*% matrix(r$trends[j, , ], nrow = stanfit$sampling_args$data$K)
pred <- pred[cbind(y_test$ts, y_test$time)]
for (i in 1:max(obs_covar_test$covariate)) {
indx <- which(obs_covar_test$covariate == i)
pred <- pred + pars$b_obs[j,i,obs_covar_test$timeseries[indx]] * obs_covar_test$value[indx]
pred <- pred + pars$b_obs[j, i, obs_covar_test$timeseries[indx]] * obs_covar_test$value[indx]
}
}

log_lik[j,f] <- sum(dnorm(x = y_test$obs,
log_lik[j, f] <- sum(dnorm(
x = y_test$obs,
mean = pred,
sd = pars$sigma[j,stanfit$sampling_args$data$varIndx], log=TRUE), na.rm=T)
#log_lik[j,k] = sum(dnorm(x = ytest, mean = pred, sd = pars$sigma[j,varIndx], log=TRUE), na.rm=T)
sd = pars$sigma[j, stanfit$sampling_args$data$varIndx], log = TRUE
), na.rm = T)
# log_lik[j,k] = sum(dnorm(x = ytest, mean = pred, sd = pars$sigma[j,varIndx], log=TRUE), na.rm=T)
}

# Predictions now vary based on how the cross validation is done, and whether covariates used
#if(cv_method == "loocv") {
#}
#if(cv_method == "lfocv") {
# for(j in 1:nrow(log_lik)) {
# # loop over iterations
# if(is.null(obs_covar) & is.null(pro_covar)) {
# pred = pars$Z[j,,] %*% matrix(pars$xstar[j,,],ncol=1)
# }
# if(!is.null(obs_covar) & is.null(pro_covar)) {
# pred = pars$Z[j,,] %*% matrix(pars$xstar[j,,],ncol=1) + pars$b_obs[j,,] * covar_test$value
# }
# log_lik[j,k] = sum(dnorm(x = ytest, mean = pred, sd = pars$sigma[j,varIndx], log=TRUE), na.rm=T)
# }
#}

# if(cv_method == "loocv") {
# }
# if(cv_method == "lfocv") {
# for(j in 1:nrow(log_lik)) {
# # loop over iterations
# if(is.null(obs_covar) & is.null(pro_covar)) {
# pred = pars$Z[j,,] %*% matrix(pars$xstar[j,,],ncol=1)
# }
# if(!is.null(obs_covar) & is.null(pro_covar)) {
# pred = pars$Z[j,,] %*% matrix(pars$xstar[j,,],ncol=1) + pars$b_obs[j,,] * covar_test$value
# }
# log_lik[j,k] = sum(dnorm(x = ytest, mean = pred, sd = pars$sigma[j,varIndx], log=TRUE), na.rm=T)
# }
# }
}

elpds <- apply(log_lik,2,log_sum_exp)
elpd <- list("log_lik"=log_lik,
elpds <- apply(log_lik, 2, log_sum_exp)
elpd <- list(
"log_lik" = log_lik,
"elpds" = elpds,
"elpd_kfold"=sum(elpds),
"se_elpd_kfold" = sqrt(length(elpds) * var(elpds)))
"elpd_kfold" = sum(elpds),
"se_elpd_kfold" = sqrt(length(elpds) * var(elpds))
)
return(elpd)
}

Expand Down
33 changes: 17 additions & 16 deletions R/dfa_fitted.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@
dfa_fitted <- function(modelfit, conf_level = 0.95, names = NULL) {

# pred and Y have same dimensions if data is wide
pred = predicted(modelfit)
n_mcmc = dim(pred)[1]
n_chains = dim(pred)[2]
n_years = dim(pred)[3]
n_ts = dim(pred)[4]
pred <- predicted(modelfit)
n_mcmc <- dim(pred)[1]
n_chains <- dim(pred)[2]
n_years <- dim(pred)[3]
n_ts <- dim(pred)[4]

# this is the same for both data types
df_pred <- data.frame(
Expand All @@ -33,31 +33,33 @@ dfa_fitted <- function(modelfit, conf_level = 0.95, names = NULL) {
"upper" = c(t(apply(pred, c(3, 4), quantile, (1 - conf_level) / 2)))
)

if(modelfit$shape == "wide") {
if (modelfit$shape == "wide") {
df_obs <- data.frame(
"ID" = rep(seq_len(n_ts), n_years),
"time" = sort(rep(seq_len(n_years), n_ts)),
"y" = c(modelfit$orig_data))
"ID" = rep(seq_len(n_ts), n_years),
"time" = sort(rep(seq_len(n_years), n_ts)),
"y" = c(modelfit$orig_data)
)
} else {
df_obs <- data.frame(
"ID" = modelfit$orig_data[["ts"]],
"time" = modelfit$orig_data[["time"]],
"y" = modelfit$orig_data[["obs"]])
"y" = modelfit$orig_data[["obs"]]
)
}
df_obs$time <- df_obs$time - min(df_obs$time) + 1

# standardize
for(i in seq_len(n_ts)) {
indx = which(df_obs[["ID"]] == i)
df_obs[indx,"y"] = scale(df_obs[indx,"y" ], center = TRUE, scale = TRUE)
for (i in seq_len(n_ts)) {
indx <- which(df_obs[["ID"]] == i)
df_obs[indx, "y"] <- scale(df_obs[indx, "y"], center = TRUE, scale = TRUE)
}

df_obs <- df_obs[order(df_obs$ID, df_obs$time), ]
df_pred <- df_pred[order(df_pred$ID, df_pred$time), ]

if (!is.null(names)) {
if(length(names) != n_ts) {
warning("bayesdfa: Length of 'names' should match number of time series. Ignoring 'names'.")
if (length(names) != n_ts) {
warning("bayesdfa: Length of 'names' should match number of time series. Ignoring 'names'.")
} else {
df_pred$ID <- names[df_pred$ID]
df_obs$ID <- names[df_obs$ID]
Expand All @@ -66,5 +68,4 @@ dfa_fitted <- function(modelfit, conf_level = 0.95, names = NULL) {

df <- merge(df_obs, df_pred, by = c("ID", "time"), sort = FALSE)
return(df)

}
9 changes: 4 additions & 5 deletions R/dfa_loadings.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,18 +23,17 @@
#' r <- rotate_trends(m)
#' loadings <- dfa_loadings(r, summary = TRUE)
#' loadings <- dfa_loadings(r, summary = FALSE)

dfa_loadings <- function(rotated_modelfit,
names = NULL,
summary = TRUE,
conf_level = 0.95) {

v <- reshape2::melt(rotated_modelfit$Z_rot,
varnames = c("iter", "name", "trend"), value.name = "loading")
varnames = c("iter", "name", "trend"), value.name = "loading"
)
v$draw <- as.numeric(gsub("_chain.*$", "", v$iter))
v$chain <- as.numeric(gsub("^[0-9]+_chain:", "", v$iter))
v$iter <- NULL
v <- v[ , c("chain", "draw", "name", "trend", "loading")]
v <- v[, c("chain", "draw", "name", "trend", "loading")]

v$trend <- paste0("Trend ", v$trend)
v$trend <- as.factor(v$trend)
Expand All @@ -52,7 +51,7 @@ dfa_loadings <- function(rotated_modelfit,
v <- as.data.frame(dplyr::ungroup(v))
out <- v

if(summary) {
if (summary) {
vsum <- dplyr::group_by(v, .data$name, .data$trend)
vsum <- dplyr::summarize(vsum,
median = median(.data$loading),
Expand Down
4 changes: 2 additions & 2 deletions R/dfa_trends.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
#' r <- rotate_trends(m)
#' trends <- dfa_trends(r)
dfa_trends <- function(rotated_modelfit, years = NULL) {

rotated <- rotated_modelfit
n_ts <- dim(rotated$Z_rot)[2]
n_trends <- dim(rotated$Z_rot)[3]
Expand All @@ -28,7 +27,8 @@ dfa_trends <- function(rotated_modelfit, years = NULL) {
trend_number = paste0("Trend ", sort(rep(seq_len(n_trends), n_years))),
estimate = c(t(rotated$trends_mean)),
lower = c(t(rotated$trends_lower)),
upper = c(t(rotated$trends_upper)))
upper = c(t(rotated$trends_upper))
)

return(df)
}
Loading

0 comments on commit d85e775

Please sign in to comment.