Pareto1, also need param_estimate and stats_tbl #479

spsanderson · 2024-05-03T15:41:46Z

Param Estimates

Function:

#' Estimate Pareto Parameters
#'
#' @family Parameter Estimation
#' @family Pareto
#'
#' @author Steven P. Sanderson II, MPH
#'
#' @details This function will attempt to estimate the Pareto shape and
#' minimum (scale) parameters given some vector of values.
#'
#' The input is validated before it is coerced with `as.numeric()`: `.x` must
#' be numeric (factors, characters and other types are rejected), must not
#' contain missing values, must contain at least two distinct values and all
#' values must be strictly positive. Any violation raises an error via
#' `rlang::abort()`.
#'
#' @description The function will return a list output by default, and if the
#' parameter `.auto_gen_empirical` is set to `TRUE` then the empirical data
#' given to the parameter `.x` will be run through the `tidy_empirical()`
#' function and combined with the estimated Pareto data.
#'
#' Two different methods of shape parameters are supplied:
#' -  LSE: least squares fit of `log(1 - F)` on `log(x)`, where `F` are the
#'    plotting positions from `stats::ppoints()`.
#' -  MLE: the minimum is the sample minimum and the shape is
#'    `n / sum(log(x / min(x)))`.
#'
#' @param .x The numeric vector of data to be passed to the function.
#' @param .auto_gen_empirical This is a boolean value of TRUE/FALSE with default
#' set to TRUE. This will automatically create the `tidy_empirical()` output
#' for the `.x` parameter and use the `tidy_combine_distributions()`. The user
#' can then plot out the data using `$combined_data_tbl` from the function
#' output.
#'
#' @examples
#' library(dplyr)
#' library(ggplot2)
#'
#' x <- mtcars[["mpg"]]
#' output <- util_pareto1_param_estimate(x)
#'
#' output$parameter_tbl
#'
#' output$combined_data_tbl |>
#'   tidy_combined_autoplot()
#'
#' set.seed(123)
#' t <- tidy_pareto1(.n = 100, .shape = 1.5, .min = 1)[["y"]]
#' util_pareto1_param_estimate(t)$parameter_tbl
#'
#' @return
#' A list. It always contains `parameter_tbl`, a tibble with one row per
#' estimation method (LSE and MLE). When `.auto_gen_empirical = TRUE` it also
#' contains `combined_data_tbl`.
#'
#' @name util_pareto1_param_estimate
NULL

#' @export
#' @rdname util_pareto1_param_estimate

util_pareto1_param_estimate <- function(.x, .auto_gen_empirical = TRUE) {

  # Checks ----
  # Validate the raw input *before* coercion: as.numeric() strips attributes,
  # so a factor would otherwise silently turn into its integer codes and a
  # type check on the coerced vector could never fail.
  if (!is.numeric(.x)) {
    rlang::abort(
      message = "'.x' must be a numeric vector.",
      use_cli_format = TRUE
    )
  }

  # Tidyeval ----
  x_term <- as.numeric(.x)
  n <- length(x_term)

  # Missing values would make the comparisons below evaluate to NA and crash
  # `if ()` with an unhelpful message, so reject them explicitly.
  if (anyNA(x_term)) {
    rlang::abort(
      message = "'.x' must not contain missing values.",
      use_cli_format = TRUE
    )
  }

  unique_terms <- length(unique(x_term))

  if (n < 2 || any(x_term <= 0) || unique_terms < 2) {
    rlang::abort(
      message = "'.x' must contain at least two non-missing distinct values. All values of '.x' must be positive.",
      use_cli_format = TRUE
    )
  }

  # Summaries are only computed once the input is known to be valid
  minx <- min(x_term)
  maxx <- max(x_term)

  # Get params ----
  # LSE: regress log(1 - F) on log(x); the slope is -shape and the intercept
  # is shape * log(min), hence min = exp(intercept / shape).
  ppc <- 0.375
  fhat <- stats::ppoints(n, a = ppc)
  lse_coef <- stats::lm(log(1 - fhat) ~ log(sort(x_term)))$coefficients
  lse_shape <- -lse_coef[[2]]
  lse_min <- exp(lse_coef[[1]] / lse_shape)

  # MLE: closed form estimates
  mle_min <- minx
  mle_shape <- n / sum(log(x_term / mle_min))

  # Parameter tibble ----
  ret <- dplyr::tibble(
    dist_type = rep("Pareto", 2),
    samp_size = rep(n, 2),
    min = rep(minx, 2),
    max = rep(maxx, 2),
    method = c("LSE", "MLE"),
    est_shape = c(lse_shape, mle_shape),
    est_min = c(lse_min, mle_min)
  )

  attr(ret, "tibble_type") <- "parameter_estimation"
  attr(ret, "family") <- "pareto"
  attr(ret, "x_term") <- .x
  attr(ret, "n") <- n

  # Return ----
  if (.auto_gen_empirical) {
    # Empirical data plus one simulated Pareto sample per estimation method
    te <- tidy_empirical(.x = x_term)
    td_lse <- tidy_pareto1(
      .n = n,
      .shape = round(lse_shape, 3),
      .min = round(lse_min, 3)
    )
    td_mle <- tidy_pareto1(
      .n = n,
      .shape = round(mle_shape, 3),
      .min = round(mle_min, 3)
    )
    combined_tbl <- tidy_combine_distributions(te, td_lse, td_mle)

    output <- list(
      combined_data_tbl = combined_tbl,
      parameter_tbl     = ret
    )
  } else {
    output <- list(
      parameter_tbl = ret
    )
  }

  return(output)
}

Example:

> x <- mtcars[["mpg"]]
> output <- util_pareto1_param_estimate(x)
> 
> output$parameter_tbl
# A tibble: 2 × 7
  dist_type samp_size   min   max method est_shape est_min
  <chr>         <int> <dbl> <dbl> <chr>      <dbl>   <dbl>
1 Pareto           32  10.4  33.9 LSE         2.86    13.7
2 Pareto           32  10.4  33.9 MLE         1.62    10.4
> 
> output$combined_data_tbl |>
+   tidy_combined_autoplot()
> set.seed(123)
> t <- tidy_pareto1(.n = 100, .shape = 1.5, .min = 1)[["y"]]
> util_pareto1_param_estimate(t)$parameter_tbl
# A tibble: 2 × 7
  dist_type samp_size   min   max method est_shape est_min
  <chr>         <int> <dbl> <dbl> <chr>      <dbl>   <dbl>
1 Pareto          100  1.00  137. LSE         1.36   0.936
2 Pareto          100  1.00  137. MLE         1.52   1.00

AIC Function

Function:

#' Calculate Akaike Information Criterion (AIC) for Pareto Distribution
#'
#' This function calculates the Akaike Information Criterion (AIC) for a Pareto
#' distribution fitted to the provided data.
#'
#' @family Utility
#' @family Pareto
#' @author Steven P. Sanderson II, MPH
#'
#' @description
#' This function estimates the shape and minimum parameters of a
#' single-parameter Pareto distribution from the provided data using maximum
#' likelihood estimation, and then calculates the AIC value based on the
#' fitted distribution.
#'
#' @param .x A numeric vector containing the data to be fitted to a Pareto
#' distribution.
#'
#' @details
#' This function fits a Pareto distribution to the provided data by minimising
#' the negative log-likelihood built from `actuar::dpareto1()`. Then, it
#' calculates the AIC value as `2 * k - 2 * logLik` with `k = 2` parameters
#' (shape and min).
#'
#' Initial parameter estimates: The function uses the MLE row of
#' `util_pareto1_param_estimate()` as starting points for the shape and min
#' parameters. The estimate is requested with `.auto_gen_empirical = FALSE`,
#' so no empirical or simulated data is generated while computing the AIC.
#'
#' Optimization method: The function uses `stats::optim()` with its default
#' settings. You might explore different optimization methods within optim for
#' potentially better performance.
#'
#' Goodness-of-fit: While AIC is a useful metric for model comparison, it's
#' recommended to also assess the goodness-of-fit of the chosen model using
#' visualization and other statistical tests.
#'
#' @examples
#' # Example 1: Calculate AIC for a sample dataset
#' set.seed(123)
#' x <- tidy_pareto1()$y
#' util_pareto_aic(x)
#'
#' @return
#' The AIC value calculated based on the fitted Pareto distribution to the
#' provided data.
#'
#' @name util_pareto_aic
NULL

#' @export
#' @rdname util_pareto_aic
util_pareto_aic <- function(.x) {
  # Tidyeval
  x <- as.numeric(.x)

  # Negative log-likelihood function for Pareto distribution;
  # par = c(shape, min)
  neg_log_lik_pareto <- function(par, data) {
    -sum(actuar::dpareto1(data, shape = par[1], min = par[2], log = TRUE))
  }

  # Get initial parameter estimates: closed-form MLE.
  # .auto_gen_empirical = FALSE skips building the empirical/simulated
  # tibbles, which are not needed here.
  pe <- TidyDensity::util_pareto1_param_estimate(
    x,
    .auto_gen_empirical = FALSE
  )$parameter_tbl
  pe <- pe[pe$method == "MLE", ]

  # Fit Pareto distribution using optim
  fit_pareto <- stats::optim(
    c(pe$est_shape, pe$est_min),
    neg_log_lik_pareto,
    data = x
  )

  # Extract log-likelihood and number of parameters
  logLik_pareto <- -fit_pareto$value
  k_pareto <- 2 # Number of parameters for Pareto distribution (shape and min)

  # Calculate AIC
  AIC_pareto <- 2 * k_pareto - 2 * logLik_pareto

  # Return AIC
  return(AIC_pareto)
}

Example:

> set.seed(123)
> x <- TidyDensity::tidy_pareto1()$y
> util_pareto_aic(x)
[1] 185.0364

Stats Tibble

Function:

#' Distribution Statistics for Pareto1 Distribution
#'
#' @family Pareto
#' @family Distribution Statistics
#'
#' @details This function will take in a tibble and returns the statistics
#' of the given type of `tidy_` distribution. It is required that data be
#' passed from a `tidy_` distribution function.
#'
#' The theoretical statistics are computed from the `.shape` and `.min`
#' attributes of the input:
#' -  mean: `shape * min / (shape - 1)`, `Inf` when `shape <= 1`
#' -  standard deviation: `Inf` when `shape <= 2`
#' -  coefficient of variation (sd / mean): `1 / sqrt(shape * (shape - 2))`,
#'    `Inf` when `shape <= 2`
#' -  skewness: `"undefined"` when `shape <= 3`
#' -  kurtosis (excess): `"undefined"` when `shape <= 4`
#'
#' Because of the `"undefined"` marker, the `skewness` and `kurtosis` columns
#' are character when the statistic does not exist and numeric otherwise.
#'
#' @description Returns distribution statistics in a tibble.
#'
#' @param .data The data being passed from a `tidy_` distribution function.
#'
#' @examples
#' library(dplyr)
#'
#' tidy_pareto1() |>
#'   util_pareto1_stats_tbl() |>
#'   glimpse()
#'
#' @return
#' A tibble
#'
#' @name util_pareto1_stats_tbl
NULL
#' @export
#' @rdname util_pareto1_stats_tbl
util_pareto1_stats_tbl <- function(.data) {

  # Immediate check for tidy_ distribution function
  if (!"tibble_type" %in% names(attributes(.data))) {
    rlang::abort(
      message = "You must pass data from the 'tidy_dist' function.",
      use_cli_format = TRUE
    )
  }

  if (attributes(.data)$tibble_type != "tidy_pareto_single_parameter") {
    rlang::abort(
      message = "You must use 'tidy_pareto1()'",
      use_cli_format = TRUE
    )
  }

  # Data
  data_tbl <- dplyr::as_tibble(.data)

  atb <- attributes(data_tbl)
  xm <- atb$.min
  alpha <- atb$.shape

  # Theoretical moments ----
  stat_mean <- ifelse(alpha <= 1, Inf, (alpha * xm) / (alpha - 1))
  stat_mode <- xm

  # The variance is only finite for alpha > 2. Guarding on alpha <= 1 would
  # take the square root of a negative number for 1 < alpha < 2.
  stat_sd <- ifelse(
    alpha <= 2,
    Inf,
    sqrt((alpha * xm^2) / ((alpha - 1)^2 * (alpha - 2)))
  )

  # Coefficient of variation = sd / mean, which simplifies to
  # 1 / sqrt(alpha * (alpha - 2)) and does not depend on xm.
  stat_coef_var <- ifelse(
    alpha <= 2,
    Inf,
    1 / sqrt(alpha * (alpha - 2))
  )

  stat_skewness <- ifelse(
    alpha <= 3,
    "undefined",
    (2 * (1 + alpha)) / (alpha - 3) * sqrt((alpha - 2) / alpha)
  )
  stat_kurtosis <- ifelse(
    alpha <= 4,
    "undefined",
    (6 * (alpha^3 + alpha^2 - 6 * alpha - 2)) /
      (alpha * (alpha - 3) * (alpha - 4))
  )

  # Data Tibble
  ret <- dplyr::tibble(
    tidy_function = atb$tibble_type,
    function_call = atb$dist_with_params,
    distribution = "Pareto1",
    distribution_type = "Continuous",
    points = atb$.n,
    simulations = atb$.num_sims,
    mean = stat_mean,
    mode_lower = stat_mode,
    range = paste0(xm, " to Inf"),
    std_dv = stat_sd,
    coeff_var = stat_coef_var,
    skewness = stat_skewness,
    kurtosis = stat_kurtosis,
    computed_std_skew = tidy_skewness_vec(data_tbl$y),
    computed_std_kurt = tidy_kurtosis_vec(data_tbl$y),
    ci_lo = ci_lo(data_tbl$y),
    ci_hi = ci_hi(data_tbl$y)
  )

  # Return
  return(ret)
}

Example:

> tidy_pareto1(.min = 2, .shape = 5) |>
+   util_pareto1_stats_tbl() |>
+   glimpse()
Rows: 1
Columns: 17
$ tidy_function     <chr> "tidy_pareto_single_parameter"
$ function_call     <chr> "Single Param Pareto c(5, 2)"
$ distribution      <chr> "Pareto1"
$ distribution_type <chr> "Continuous"
$ points            <dbl> 50
$ simulations       <dbl> 1
$ mean              <dbl> 2.5
$ mode_lower        <dbl> 2
$ range             <chr> "2 to Inf"
$ std_dv            <dbl> 0.6454972
$ coeff_var         <dbl> 0.3227486
$ skewness          <dbl> 4.64758
$ kurtosis          <dbl> 70.8
$ computed_std_skew <dbl> 1.908189
$ computed_std_kurt <dbl> 8.727513
$ ci_lo             <dbl> 2.006481
$ ci_hi             <dbl> 3.193813

spsanderson mentioned this issue May 3, 2024

New AIC/Param Estimate/Stats Tbl functions #467

Closed

15 tasks

spsanderson self-assigned this May 14, 2024

spsanderson added the enhancement New feature or request label May 14, 2024

spsanderson added this to the TidyDensity 1.4.1 milestone May 14, 2024

spsanderson closed this as completed in 2e22e38 May 15, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Pareto1, also need param_estimate and stats_tbl #479

Pareto1, also need param_estimate and stats_tbl #479

spsanderson commented May 3, 2024 •

edited

Loading

Pareto1, also need param_estimate and stats_tbl #479

Pareto1, also need param_estimate and stats_tbl #479

Comments

spsanderson commented May 3, 2024 • edited Loading

Param Estimates

AIC Function

Stats Tibble

spsanderson commented May 3, 2024 •

edited

Loading