From 85807f0ef83f1b53d9dd0e771200896f40d52447 Mon Sep 17 00:00:00 2001 From: "Steven Paul Sanderson II, MPH" Date: Mon, 23 May 2022 13:25:32 -0400 Subject: [PATCH] Fixes #189 --- DESCRIPTION | 2 +- NAMESPACE | 1 + NEWS.md | 1 + R/empirical-tidy-bootstrap.R | 95 ++++++++++++++++++++++++++++++++++++ _pkgdown.yml | 4 ++ man/tidy_bootstrap.Rd | 50 +++++++++++++++++++ man/tidyeval.Rd | 30 ++++++++---- 7 files changed, 172 insertions(+), 11 deletions(-) create mode 100644 R/empirical-tidy-bootstrap.R create mode 100644 man/tidy_bootstrap.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 56696df5..adde8a4d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -13,7 +13,7 @@ Description: License: MIT + file LICENSE Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.1.2 +RoxygenNote: 7.2.0 URL: https://github.com/spsanderson/TidyDensity BugReports: https://github.com/spsanderson/TidyDensity/issues Imports: diff --git a/NAMESPACE b/NAMESPACE index d16bc68c..7618b47c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -15,6 +15,7 @@ export(td_scale_fill_colorblind) export(tidy_autoplot) export(tidy_beta) export(tidy_binomial) +export(tidy_bootstrap) export(tidy_burr) export(tidy_cauchy) export(tidy_chisquare) diff --git a/NEWS.md b/NEWS.md index 558d761e..2cea6e11 100644 --- a/NEWS.md +++ b/NEWS.md @@ -7,6 +7,7 @@ None 1. Fix #181 - Add functions `color_blind()` `td_scale_fill_colorblind()` and `td_scale_color_colorblind()` 2. Fix #187 - Add functions `ci_lo()` and `ci_hi()` +3. Fix #189 - Add function `tidy_bootstrap()` ## Minor Fixes and Improvements 1. Fix #176 - Update `_autoplot` functions to include cumulative mean MCMC chart diff --git a/R/empirical-tidy-bootstrap.R b/R/empirical-tidy-bootstrap.R new file mode 100644 index 00000000..ba82959a --- /dev/null +++ b/R/empirical-tidy-bootstrap.R @@ -0,0 +1,95 @@ +#' Bootstrap Empirical Data +#' +#' @family Bootstrap +#' +#' @author Steven P. Sanderson II, MPH +#' +#' @details This function will take in a numeric input vector and produce a tibble +#' of bootstrapped values in a list. The table that is output will have two columns: +#' `sim_number` and `bootstrap_samples` +#' +#' The `sim_number` corresponds to how many times you want the data to be resampled, +#' and the `bootstrap_samples` column contains a list of the boostrapped resampled +#' data. +#' +#' @description Takes an input vector of numeric data and produces a bootstrapped +#' nested tibble by simulation number. +#' +#' @param .x The vector of data being passed to the function. Must be a numeric +#' vector. +#' @param .num_sims The default is 2000, can be set to anything desired. A warning +#' will pass to the console if the value is less than 2000. +#' @param .proportion How much of the original data do you want to pass through +#' to the sampling function. The default is 0.80 (80%) +#' @param .distribution_type This can either be 'continuous' or 'discrete' +#' +#' @examples +#' x <- mtcars$mpg +#' tidy_bootstrap(x) +#' +#' @return +#' A nested tibble +#' +#' @export +#' + +tidy_bootstrap <- function(.x, .num_sims = 2000, .proportion = 0.8, + .distribution_type = "continuous"){ + + # Tidyeval ---- + x_term <- as.numeric(.x) + n <- length(x_term) + dist_type <- tolower(as.character(.distribution_type)) + num_sims <- as.integer(.num_sims) + size <- as.numeric(.proportion) + + # Checks ---- + if (!is.vector(x_term)) { + rlang::abort( + message = "You must pass a vector as the .x argument to this function.", + use_cli_format = TRUE + ) + } + + if (size < 0 | size > 1){ + rlang::abort( + message = "The '.proportion' parameter must be between 0 and 1 inclusive.", + use_cli_format = TRUE + ) + } + + if (!dist_type %in% c("continuous","discrete")){ + rlang::abort( + message = "You must choose either 'continuous' or 'discrete'.", + use_cli_format = TRUE + ) + } + + if (num_sims < 2000){ + rlang::warn( + message = "Setting '.num_sims' to less than 2000 means that results can be + potentially unstable. Consider setting to 2000 or more.", + use_cli_format = TRUE + ) + } + + # Data ---- + df <- dplyr::tibble(sim_number = as.factor(1:num_sims)) %>% + dplyr::group_by(sim_number) %>% + dplyr::mutate(bootstrap_samples = list( + sample(x = x_term, size = floor(size * n) ,replace = TRUE) + ) + ) %>% + dplyr::ungroup() + + # Attach descriptive attributes to tibble + attr(df, "distribution_family_type") <- dist_type + attr(df, ".x") <- .x + attr(df, ".num_sims") <- .num_sims + attr(df, "tibble_type") <- "tidy_bootstrap" + attr(df, "dist_with_params") <- "Empirical" + + # Return ---- + return(df) + +} diff --git a/_pkgdown.yml b/_pkgdown.yml index 03df8aed..c939521c 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -37,6 +37,10 @@ reference: desc: Function for creating mixture model data contents: - has_concept("Mixture Data") + - title: Bootstrap + desc: Functions for bootstrapping data + contents: + - has_concept("Bootstrap") - title: Parameter Estimation Functions desc: Functions that help to estimate parameters from raw data. contents: diff --git a/man/tidy_bootstrap.Rd b/man/tidy_bootstrap.Rd new file mode 100644 index 00000000..2fbe95e4 --- /dev/null +++ b/man/tidy_bootstrap.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/empirical-tidy-bootstrap.R +\name{tidy_bootstrap} +\alias{tidy_bootstrap} +\title{Bootstrap Empirical Data} +\usage{ +tidy_bootstrap( + .x, + .num_sims = 2000, + .proportion = 0.8, + .distribution_type = "continuous" +) +} +\arguments{ +\item{.x}{The vector of data being passed to the function. Must be a numeric +vector.} + +\item{.num_sims}{The default is 2000, can be set to anything desired. A warning +will pass to the console if the value is less than 2000.} + +\item{.proportion}{How much of the original data do you want to pass through +to the sampling function. The default is 0.80 (80\%)} + +\item{.distribution_type}{This can either be 'continuous' or 'discrete'} +} +\value{ +A nested tibble +} +\description{ +Takes an input vector of numeric data and produces a bootstrapped +nested tibble by simulation number. +} +\details{ +This function will take in a numeric input vector and produce a tibble +of bootstrapped values in a list. The table that is output will have two columns: +\code{sim_number} and \code{bootstrap_samples} + +The \code{sim_number} corresponds to how many times you want the data to be resampled, +and the \code{bootstrap_samples} column contains a list of the boostrapped resampled +data. +} +\examples{ +x <- mtcars$mpg +tidy_bootstrap(x) + +} +\author{ +Steven P. Sanderson II, MPH +} +\concept{Bootstrap} diff --git a/man/tidyeval.Rd b/man/tidyeval.Rd index 1fde2d29..f773abf2 100644 --- a/man/tidyeval.Rd +++ b/man/tidyeval.Rd @@ -24,18 +24,22 @@ operators which you should not have to use in simple cases. The curly-curly operator \verb{\{\{} allows you to tunnel data-variables passed from function arguments inside other tidy eval functions. \verb{\{\{} is designed for individual arguments. To pass multiple -arguments contained in dots, use \code{...} in the normal way.\preformatted{my_function <- function(data, var, ...) \{ +arguments contained in dots, use \code{...} in the normal way. + +\if{html}{\out{
}}\preformatted{my_function <- function(data, var, ...) \{ data \%>\% group_by(...) \%>\% summarise(mean = mean(\{\{ var \}\})) \} -} +}\if{html}{\out{
}} \item \code{\link[=enquo]{enquo()}} and \code{\link[=enquos]{enquos()}} delay the execution of one or several function arguments. The former returns a single expression, the latter returns a list of expressions. Once defused, expressions will no longer evaluate on their own. They must be injected back into an evaluation context with \verb{!!} (for a single expression) and -\verb{!!!} (for a list of expressions).\preformatted{my_function <- function(data, var, ...) \{ +\verb{!!!} (for a list of expressions). + +\if{html}{\out{
}}\preformatted{my_function <- function(data, var, ...) \{ # Defuse var <- enquo(var) dots <- enquos(...) @@ -45,7 +49,7 @@ into an evaluation context with \verb{!!} (for a single expression) and group_by(!!!dots) \%>\% summarise(mean = mean(!!var)) \} -} +}\if{html}{\out{
}} In this simple case, the code is equivalent to the usage of \verb{\{\{} and \code{...} above. Defusing with \code{enquo()} or \code{enquos()} is only @@ -53,30 +57,36 @@ needed in more complex cases, for instance if you need to inspect or modify the expressions in some way. \item The \code{.data} pronoun is an object that represents the current slice of data. If you have a variable name in a string, use the -\code{.data} pronoun to subset that variable with \code{[[}.\preformatted{my_var <- "disp" +\code{.data} pronoun to subset that variable with \code{[[}. + +\if{html}{\out{
}}\preformatted{my_var <- "disp" mtcars \%>\% summarise(mean = mean(.data[[my_var]])) -} +}\if{html}{\out{
}} \item Another tidy eval operator is \verb{:=}. It makes it possible to use glue and curly-curly syntax on the LHS of \code{=}. For technical reasons, the R language doesn't support complex expressions on -the left of \code{=}, so we use \verb{:=} as a workaround.\preformatted{my_function <- function(data, var, suffix = "foo") \{ +the left of \code{=}, so we use \verb{:=} as a workaround. + +\if{html}{\out{
}}\preformatted{my_function <- function(data, var, suffix = "foo") \{ # Use `\{\{` to tunnel function arguments and the usual glue # operator `\{` to interpolate plain strings. data \%>\% summarise("\{\{ var \}\}_mean_\{suffix\}" := mean(\{\{ var \}\})) \} -} +}\if{html}{\out{
}} \item Many tidy eval functions like \code{dplyr::mutate()} or \code{dplyr::summarise()} give an automatic name to unnamed inputs. If you need to create the same sort of automatic names by yourself, use \code{as_label()}. For instance, the glue-tunnelling syntax above -can be reproduced manually with:\preformatted{my_function <- function(data, var, suffix = "foo") \{ +can be reproduced manually with: + +\if{html}{\out{
}}\preformatted{my_function <- function(data, var, suffix = "foo") \{ var <- enquo(var) prefix <- as_label(var) data \%>\% summarise("\{prefix\}_mean_\{suffix\}" := mean(!!var)) \} -} +}\if{html}{\out{
}} Expressions defused with \code{enquo()} (or tunnelled with \verb{\{\{}) need not be simple column names, they can be arbitrarily complex.