Add `.num_sims` parameter to `tidy_empirical()` #188

spsanderson · 2022-05-18T16:07:25Z

Adding the .num_sims parameter to the tidy_empirical() will allow bootstrapping of the values, this keeps it inline with the rest of the tidy_ distribution functions.

Check for downstream effects, as they are yet unknown.

Function:

tidy_empirical <- function(.x, .num_sims = 1, .distribution_type = "continuous") {
  x_term <- .x
  n <- length(x_term)
  dist_type <- tolower(as.character(.distribution_type))
  num_sims = as.integer(.num_sims)
  
  if (!is.vector(x_term)) {
    rlang::abort("You must pass a vector as the .x argument to this function.")
  }
  
  if (!dist_type %in% c("continuous","discrete")){
    rlang::abort("You must choose either 'continuous' or 'discrete'.")
  }
  
  ## New P
  e <- stats::ecdf(x_term)
  
  df <- dplyr::tibble(sim_number = as.factor(1:num_sims)) %>%
    dplyr::group_by(sim_number) %>%
    dplyr::mutate(x = list(1:n)) %>%
    dplyr::mutate(y = ifelse(
      num_sims == 1,
      list(x_term),
      list(sample(x_term, replace = TRUE))
    )) %>%
    dplyr::mutate(d = list(density(unlist(y), n = n)[c("x", "y")] %>%
                             purrr::set_names("dx", "dy") %>%
                             dplyr::as_tibble())) %>%
    dplyr::mutate(p = list(e(unlist(y)))) %>%
    dplyr::mutate(q = NA) %>%
    tidyr::unnest(cols = c(x, y, d, p, q)) %>%
    dplyr::ungroup()
  
  q_vec <- df %>%
    dplyr::select(sim_number, y) %>%
    dplyr::group_by(sim_number) %>%
    dplyr::mutate(
      q = rep(
        stats::quantile(y, probs = seq(0, 1, 1 / (n - 1)), type = 1), 
        1
      )
    ) %>%
    dplyr::ungroup() %>%
    dplyr::select(q)
  
  df <- df %>%
    dplyr::mutate(q = q_vec$q)
  
  # Attach descriptive attributes to tibble
  attr(df, "distribution_family_type") <- dist_type
  attr(df, ".x") <- .x
  attr(df, ".n") <- n
  attr(df, ".num_sims") <- num_sims
  attr(df, "tibble_type") <- "tidy_empirical"
  attr(df, "dist_with_params") <- "Empirical"
  
  # Return ----
  return(df)
}

Example:

> x <- mtcars$mpg
> tidy_empirical(x, .num_sims = 1)
# A tibble: 32 x 7
   sim_number     x     y    dx       dy     p     q
   <fct>      <int> <dbl> <dbl>    <dbl> <dbl> <dbl>
 1 1              1  21    2.97 0.000114 0.625  10.4
 2 1              2  21    4.21 0.000455 0.625  10.4
 3 1              3  22.8  5.44 0.00142  0.781  13.3
 4 1              4  21.4  6.68 0.00355  0.688  14.3
 5 1              5  18.7  7.92 0.00721  0.469  14.7
 6 1              6  18.1  9.16 0.0124   0.438  15  
 7 1              7  14.3 10.4  0.0192   0.125  15.2
 8 1              8  24.4 11.6  0.0281   0.812  15.2
 9 1              9  22.8 12.9  0.0395   0.781  15.5
10 1             10  19.2 14.1  0.0516   0.531  15.8
# ... with 22 more rows
> 
> tidy_empirical(x, .num_sims = 10)
# A tibble: 320 x 7
   sim_number     x     y    dx       dy     p     q
   <fct>      <int> <dbl> <dbl>    <dbl> <dbl> <dbl>
 1 1              1  19.2  11.2 0.000283 0.531  15  
 2 1              2  18.1  12.1 0.00187  0.438  15  
 3 1              3  33.9  13.0 0.00801  1      15.5
 4 1              4  21    13.8 0.0224   0.625  15.8
 5 1              5  21.4  14.7 0.0416   0.688  15.8
 6 1              6  33.9  15.5 0.0531   1      17.3
 7 1              7  18.7  16.4 0.0516   0.469  18.1
 8 1              8  21    17.2 0.0476   0.625  18.7
 9 1              9  15.8  18.1 0.0508   0.312  19.2
10 1             10  21.4  18.9 0.0627   0.688  19.2
# ... with 310 more rows
> 
> tidy_empirical(x, .num_sims = 10) %>%
+   tidy_distribution_summary_tbl(sim_number)
# A tibble: 10 x 11
   sim_number mean_val median_val std_val min_val max_val skewness kurtosis range   iqr
   <fct>         <dbl>      <dbl>   <dbl>   <dbl>   <dbl>    <dbl>    <dbl> <dbl> <dbl>
 1 1              20.2       20.4    5.09    10.4    32.4    0.423     2.84  22    7   
 2 2              20.4       19.4    6.32    10.4    33.9    0.743     2.72  23.5  7.6 
 3 3              19.8       18.1    6.15    10.4    32.4    0.876     2.86  22    6.02
 4 4              20.9       18.4    6.66    14.3    33.9    1.00      2.55  19.6  5.75
 5 5              19.1       18.0    5.82    10.4    33.9    0.664     2.90  23.5  7.6 
 6 6              18.0       16.8    3.98    10.4    30.4    0.889     4.26  20    5.8 
 7 7              19.6       18.4    5.74    10.4    32.4    0.868     3.02  22    6.32
 8 8              18.8       16.8    5.99    10.4    33.9    0.738     2.89  23.5  8.1 
 9 9              18.2       18.4    4.43    10.4    30.4    0.448     3.65  20    5.8 
10 10             17.9       16.8    6.26    10.4    32.4    0.973     3.48  22    5.3 
# ... with 1 more variable: variance <dbl>

The text was updated successfully, but these errors were encountered:

Fixes #188

spsanderson added the enhancement New feature or request label May 18, 2022

spsanderson added this to the TidyDensity v1.2.0 milestone May 18, 2022

spsanderson self-assigned this May 18, 2022

spsanderson closed this as completed in 01bcdf3 May 20, 2022

spsanderson added a commit that referenced this issue May 20, 2022

Merge pull request #193 from spsanderson/development

fb7b54c

Fixes #188

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Add `.num_sims` parameter to `tidy_empirical()` #188

Add `.num_sims` parameter to `tidy_empirical()` #188

spsanderson commented May 18, 2022 •

edited

Loading

Add .num_sims parameter to tidy_empirical() #188

Add .num_sims parameter to tidy_empirical() #188

Comments

spsanderson commented May 18, 2022 • edited Loading

Add `.num_sims` parameter to `tidy_empirical()` #188

Add `.num_sims` parameter to `tidy_empirical()` #188

spsanderson commented May 18, 2022 •

edited

Loading