Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generalized Pareto, also need param_estimate and stats_tbl #474

Closed
Tracked by #467
spsanderson opened this issue May 3, 2024 · 0 comments
Closed
Tracked by #467

Generalized Pareto, also need param_estimate and stats_tbl #474

spsanderson opened this issue May 3, 2024 · 0 comments
Assignees
Labels
enhancement New feature or request

Comments

@spsanderson
Copy link
Owner

spsanderson commented May 3, 2024

Param Estimate

Function:

#' Estimate Generalized Pareto Parameters
#'
#' @family Parameter Estimation
#' @family Generalized Pareto
#' 
#' @author Steven P. Sanderson II, MPH
#'
#' @details This function will attempt to estimate the generalized Pareto shape1, shape2, and rate
#' parameters given some vector of values.
#'
#' @description The function will return a list output by default, and if the parameter
#' `.auto_gen_empirical` is set to `TRUE` then the empirical data given to the
#' parameter `.x` will be run through the `tidy_empirical()` function and combined
#' with the estimated generalized Pareto data.
#'
#' @param .x The vector of data to be passed to the function.
#' @param .auto_gen_empirical This is a boolean value of TRUE/FALSE with default
#' set to TRUE. This will automatically create the `tidy_empirical()` output
#' for the `.x` parameter and use the `tidy_combine_distributions()`. The user
#' can then plot out the data using `$combined_data_tbl` from the function output.
#'
#' @examples
#' library(dplyr)
#' library(ggplot2)
#'
#' set.seed(123)
#' x <- tidy_generalized_pareto(100, .shape1 = 1, .shape2 = 2, .scale = 3)[["y"]]
#' output <- util_generalized_pareto_param_estimate(x)
#'
#' output$parameter_tbl
#'
#' output$combined_data_tbl %>%
#'   tidy_combined_autoplot()
#'
#' @return
#' A tibble/list
#' 
#' @name util_generalized_pareto_param_estimate
NULL

#' @export
#' @rdname util_generalized_pareto_param_estimate

util_generalized_pareto_param_estimate <- function(.x, .auto_gen_empirical = TRUE) {
  
  # Tidyeval ----
  x_term <- as.numeric(.x)
  n <- length(x_term)
  
  # Checks ----
  if (!is.vector(x_term, mode = "numeric") || is.factor(x_term)) {
    rlang::abort(
      message = "'.x' must be a numeric vector.",
      use_cli_format = TRUE
    )
  }
  
  if (n < 2 || any(x_term <= 0)) {
    rlang::abort(
      message = "'.x' must contain at least two non-missing distinct values. All values of '.x' must be positive.",
      use_cli_format = TRUE
    )
  }
  
  # Negative log-likelihood function for generalized Pareto distribution
  genpareto_lik <- function(params, data) {
    shape1 <- params[1]
    shape2 <- params[2]
    scale <- params[3]
    -sum(actuar::dgenpareto(data, shape1 = shape1, shape2 = shape2, scale = scale, log = TRUE))
  }
  
  # Initial parameter guesses
  initial_params <- c(shape1 = 1, shape2 = 1, scale = 1)
  
  # Optimize to minimize the negative log-likelihood
  opt_result <- optim(
    par = initial_params,
    fn = genpareto_lik,
    data = x_term,
    method = "L-BFGS-B",
    lower = c(1e-5, 1e-5, 1e-5)
  )
  
  shape1 <- opt_result$par[1]
  shape2 <- opt_result$par[2]
  scale <- opt_result$par[3]
  rate <- 1 / scale
  
  # Return Tibble ----
  if (.auto_gen_empirical) {
    te <- tidy_empirical(.x = x_term)
    td <- tidy_generalized_pareto(
      .n = n, 
      .shape1 = round(shape1, 3), 
      .shape2 = round(shape2, 3), 
      .rate = round(rate, 3)
      )
    combined_tbl <- tidy_combine_distributions(te, td)
  }
  
  ret <- dplyr::tibble(
    dist_type = "Generalized Pareto",
    samp_size = n,
    min = min(x_term),
    max = max(x_term),
    mean = mean(x_term),
    shape1 = shape1,
    shape2 = shape2,
    rate = rate,
    scale = scale
  )
  
  # Return ----
  attr(ret, "tibble_type") <- "parameter_estimation"
  attr(ret, "family") <- "generalized_pareto"
  attr(ret, "x_term") <- .x
  attr(ret, "n") <- n
  
  if (.auto_gen_empirical) {
    output <- list(
      combined_data_tbl = combined_tbl,
      parameter_tbl     = ret
    )
  } else {
    output <- list(
      parameter_tbl = ret
    )
  }
  
  return(output)
}

Example:

> set.seed(123)
> x <- tidy_generalized_pareto(100, .shape1 = 1, .shape2 = 2, .scale = 3)[["y"]]
> output <- util_generalized_pareto_param_estimate(x)
> 
> output$parameter_tbl
# A tibble: 1 × 9
  dist_type          samp_size   min   max  mean shape1 shape2  rate scale
  <chr>                  <int> <dbl> <dbl> <dbl>  <dbl>  <dbl> <dbl> <dbl>
1 Generalized Pareto       100 0.688  412.  21.5   1.08   5.47  1.01 0.993
> 
> output$combined_data_tbl %>%
+   tidy_combined_autoplot()

image

Stats Tibble

Function:

#' Distribution Statistics
#'
#' @family Generalized Pareto
#' @family Distribution Statistics
#'
#' @author Steven P. Sanderson II, MPH
#'
#' @details This function will take in a tibble and return the statistics
#' of the given type of `tidy_` distribution. It is required that data be
#' passed from a `tidy_` distribution function.
#'
#' @description Returns distribution statistics in a tibble.
#'
#' @param .data The data being passed from a `tidy_` distribution function.
#'
#' @examples
#' library(dplyr)
#'
#' tidy_generalized_pareto() |>
#'   util_generalized_pareto_stats_tbl() |>
#'   glimpse()
#'
#' @return
#' A tibble
#'
#' @name util_generalized_pareto_stats_tbl
NULL
#' @export
#' @rdname util_generalized_pareto_stats_tbl
util_generalized_pareto_stats_tbl <- function(.data) {
  
  # Immediate check for tidy_ distribution function
  if (!"tibble_type" %in% names(attributes(.data))) {
    rlang::abort(
      message = "You must pass data from the 'tidy_dist' function.",
      use_cli_format = TRUE
    )
  }
  
  if (attributes(.data)$tibble_type != "tidy_generalized_pareto") {
    rlang::abort(
      message = "You must use 'tidy_generalized_pareto()'",
      use_cli_format = TRUE
    )
  }
  
  # Data
  data_tbl <- dplyr::as_tibble(.data)
  
  atb <- attributes(data_tbl)
  shape1 <- atb$.shape1
  shape2 <- atb$.shape2
  rate <- atb$.rate
  scale <- 1 / rate
  
  stat_mean <- ifelse(shape1 <= 1, Inf, scale * shape1 / (shape1 - 1))
  stat_mode <- scale * (shape1 - 1) / shape1
  stat_median <- scale * actuar::qgenpareto(0.5, shape1 = shape1, shape2 = shape2, scale = scale)
  stat_var <- ifelse(shape1 <= 2, Inf, (scale^2 * shape1) / ((shape1 - 1)^2 * (shape1 - 2)))
  stat_skewness <- ifelse(shape1 <= 3, "undefined", 2 * (1 + shape1) / (shape1 - 3) * sqrt((shape1 - 2) / shape1))
  stat_kurtosis <- ifelse(shape1 <= 4, "undefined", 6 * (shape1^3 + shape1^2 - 6 * shape1 - 2) / (shape1 * (shape1 - 3) * (shape1 - 4)))
  
  # Data Tibble
  ret <- dplyr::tibble(
    tidy_function = atb$tibble_type,
    function_call = atb$dist_with_params,
    distribution = dist_type_extractor(atb$tibble_type),
    distribution_type = atb$distribution_family_type,
    points = atb$.n,
    simulations = atb$.num_sims,
    mean = stat_mean,
    mode = stat_mode,
    median = stat_median,
    coeff_var = sqrt(stat_var) / stat_mean,
    skewness = stat_skewness,
    kurtosis = stat_kurtosis,
    computed_std_skew = tidy_skewness_vec(data_tbl$y),
    computed_std_kurt = tidy_kurtosis_vec(data_tbl$y),
    ci_lo = ci_lo(data_tbl$y),
    ci_hi = ci_hi(data_tbl$y)
  )
  
  # Return
  return(ret)
}

Example:

> tidy_generalized_pareto() |>
+   util_generalized_pareto_stats_tbl() |>
+   glimpse()
Rows: 1
Columns: 16
$ tidy_function     <chr> "tidy_generalized_pareto"
$ function_call     <chr> "Generalized Pareto c(1, 1, 1, 1)"
$ distribution      <chr> "Generalized Pareto"
$ distribution_type <chr> "continuous"
$ points            <dbl> 50
$ simulations       <dbl> 1
$ mean              <dbl> Inf
$ mode              <dbl> 0
$ median            <dbl> 1
$ coeff_var         <dbl> NaN
$ skewness          <chr> "undefined"
$ kurtosis          <chr> "undefined"
$ computed_std_skew <dbl> 5.314218
$ computed_std_kurt <dbl> 32.13978
$ ci_lo             <dbl> 0.02209331
$ ci_hi             <dbl> 55.2022

AIC

Function:

#' Calculate Akaike Information Criterion (AIC) for Generalized Pareto Distribution
#'
#' This function calculates the Akaike Information Criterion (AIC) for a generalized Pareto distribution fitted to the provided data.
#'
#' @family Utility
#' 
#' @author Steven P. Sanderson II, MPH
#'
#' @description
#' This function estimates the shape1, shape2, and rate parameters of a generalized Pareto distribution
#' from the provided data using maximum likelihood estimation,
#' and then calculates the AIC value based on the fitted distribution.
#'
#' @param .x A numeric vector containing the data to be fitted to a generalized Pareto distribution.
#'
#' @details
#' This function fits a generalized Pareto distribution to the provided data using maximum
#' likelihood estimation. It estimates the shape1, shape2, and rate parameters
#' of the generalized Pareto distribution using maximum likelihood estimation. Then, it
#' calculates the AIC value based on the fitted distribution.
#'
#' Initial parameter estimates: The function uses the method of moments estimates
#' as starting points for the shape1, shape2, and rate parameters of the generalized Pareto distribution.
#'
#' Optimization method: The function uses the optim function for optimization.
#' You might explore different optimization methods within optim for potentially
#' better performance.
#'
#' Goodness-of-fit: While AIC is a useful metric for model comparison, it's
#' recommended to also assess the goodness-of-fit of the chosen model using
#' visualization and other statistical tests.
#'
#' @examples
#' # Example 1: Calculate AIC for a sample dataset
#' set.seed(123)
#' x <- actuar::rgenpareto(100, shape1 = 1, shape2 = 2, scale = 3)
#' util_generalized_pareto_aic(x)
#'
#' @return
#' The AIC value calculated based on the fitted generalized Pareto distribution to the provided data.
#'
#' @name util_generalized_pareto_aic
NULL

#' @export
#' @rdname util_generalized_pareto_aic
util_generalized_pareto_aic <- function(.x) {
  # Tidyeval
  x <- as.numeric(.x)
  
  # Negative log-likelihood function for generalized Pareto distribution
  neg_log_lik_genpareto <- function(par, data) {
    shape1 <- par[1]
    shape2 <- par[2]
    scale <- par[3]
    -sum(actuar::dgenpareto(data, shape1 = shape1, shape2 = shape2, scale = scale, log = TRUE))
  }
  
  # Initial parameter estimates
  pe <- TidyDensity::util_generalized_pareto_param_estimate(x)$parameter_tbl
  shape1_init <- pe$shape1
  shape2_init <- pe$shape2
  scale_init <- pe$scale
  initial_params <- c(shape1 = shape1_init, shape2 = shape2_init, scale = scale_init)
  
  # Fit generalized Pareto distribution using optim
  fit_genpareto <- stats::optim(
    par = initial_params,
    fn = neg_log_lik_genpareto,
    data = x,
    method = "L-BFGS-B",
    lower = c(1e-5, 1e-5, 1e-5)
  )
  
  # Extract log-likelihood and number of parameters
  logLik_genpareto <- -fit_genpareto$value
  k_genpareto <- 3 # Number of parameters for generalized Pareto distribution (shape1, shape2, and scale)
  
  # Calculate AIC
  AIC_genpareto <- 2 * k_genpareto - 2 * logLik_genpareto
  
  # Return AIC
  return(AIC_genpareto)
}

Example:

> set.seed(123)
> x <- actuar::rgenpareto(100, shape1 = 1, shape2 = 2, scale = 3)
> util_generalized_pareto_aic(x)
[1] 742.2865
@spsanderson spsanderson self-assigned this May 15, 2024
@spsanderson spsanderson added the enhancement New feature or request label May 15, 2024
@spsanderson spsanderson added this to the TidyDensity 1.4.1 milestone May 15, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
enhancement New feature or request
Development

No branches or pull requests

1 participant