Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Zero Truncated Negative Binomial #470

Closed
Tracked by #467
spsanderson opened this issue May 3, 2024 · 0 comments
Closed
Tracked by #467

Zero Truncated Negative Binomial #470

spsanderson opened this issue May 3, 2024 · 0 comments
Assignees
Labels
enhancement New feature or request

Comments

@spsanderson
Copy link
Owner

spsanderson commented May 3, 2024

Parameter Estimate Function

Function:

#' Estimate Zero Truncated Negative Binomial Parameters
#'
#' @family Parameter Estimation
#' @family Binomial
#' @family Zero Truncated Negative Distribution
#'
#' @author Steven P. Sanderson II, MPH
#'
#' @details This function will attempt to estimate the zero truncated negative 
#' binomial size and prob parameters given some vector of values.
#'
#' @description The function will return a list output by default, and  if the parameter
#' `.auto_gen_empirical` is set to `TRUE` then the empirical data given to the
#' parameter `.x` will be run through the `tidy_empirical()` function and combined
#' with the estimated negative binomial data.
#'
#' One method of estimating the parameters is done via:
#' -  MLE via \code{\link[stats]{optim}} function.
#'
#' @param .x The vector of data to be passed to the function.
#' @param .auto_gen_empirical This is a boolean value of TRUE/FALSE with default
#' set to TRUE. This will automatically create the `tidy_empirical()` output
#' for the `.x` parameter and use the `tidy_combine_distributions()`. The user
#' can then plot out the data using `$combined_data_tbl` from the function output.
#'
#' @examples
#' library(dplyr)
#' library(ggplot2)
#' library(actuar)
#' 
#' x <- as.integer(mtcars$mpg)
#' output <- util_ztn_binomial_param_estimate(x)
#'
#' output$parameter_tbl
#'
#' output$combined_data_tbl |>
#'   tidy_combined_autoplot()
#'
#' set.seed(123)
#' t <- rztnbinom(100, 10, .1)
#' util_ztn_binomial_param_estimate(t)$parameter_tbl
#'
#' @return
#' A tibble/list
#'
#' @export
#'

util_ztn_binomial_param_estimate <- function(.x, .auto_gen_empirical = TRUE) {
  
  # Check if actuar library is installed
  if (!requireNamespace("actuar", quietly = TRUE)) {
    stop("The 'actuar' package is needed for this function. Please install it with: install.packages('actuar')")
  }
  
  # Tidyeval ----
  x_term <- as.numeric(.x)
  sum_x <- sum(x_term, na.rm = TRUE)
  minx <- min(x_term)
  maxx <- max(x_term)
  m <- mean(x_term, na.rm = TRUE)
  n <- length(x_term)
  
  # Negative log-likelihood function for optimization
  nll_func <- function(params) {
    size <- params[1]
    prob <- params[2]
    -sum(actuar::dztnbinom(x_term, size = size, prob = prob, log = TRUE))
  }
  
  # Initial parameter guesses 
  initial_params <- c(size = 1, prob = 0.5)  # Adjust based on your data
  
  # Optimization using optim()
  optim_result <- optim(initial_params, nll_func) |>
    suppressWarnings()
  
  # Extract estimated parameters
  mle_size <- optim_result$par[1]
  mle_prob <- optim_result$par[2]
  
  # Create output tibble
  ret <- tibble::tibble(
    dist_type = "Zero-Truncated Negative Binomial",
    samp_size = n,
    min = minx,
    max = maxx,
    mean = m,
    method = "MLE_Optim",
    size = mle_size,
    prob = mle_prob
  )
  
  # Attach attributes
  attr(ret, "tibble_type") <- "parameter_estimation"
  attr(ret, "family") <- "zero_truncated_negative_binomial"
  attr(ret, "x_term") <- .x
  attr(ret, "n") <- n
  
  if (.auto_gen_empirical) {
    # Generate empirical data
    # Assuming tidy_empirical and tidy_combine_distributions functions exist
    te <- tidy_empirical(.x = x_term)
    td <- tidy_zero_truncated_negative_binomial(
      .n = n,
      .size = round(mle_size, 3),
      .prob = round(mle_prob, 3)
    )
    combined_tbl <- tidy_combine_distributions(te, td)
    
    output <- list(
      combined_data_tbl = combined_tbl,
      parameter_tbl = ret
    )
  } else {
    output <- list(
      parameter_tbl = ret
    )
  }
  
  return(output)
}

Example:

> set.seed(123)
> x <- rztnbinom(100, 10, .1)
> util_ztn_binomial_param_estimate(x)
$combined_data_tbl
# A tibble: 200 × 8
   sim_number     x     y      dx         dy     p     q dist_type
   <fct>      <int> <int>   <dbl>      <dbl> <dbl> <dbl> <fct>    
 1 1              1    71 -9.10   0.00000433  0.3     22 Empirical
 2 1              2   112 -6.85   0.00000810  0.79    41 Empirical
 3 1              3    80 -4.59   0.0000145   0.4     45 Empirical
 4 1              4   126 -2.34   0.0000248   0.87    46 Empirical
 5 1              5   141 -0.0820 0.0000403   0.95    46 Empirical
 6 1              6    46  2.17   0.0000626   0.05    53 Empirical
 7 1              7    89  4.43   0.0000931   0.55    54 Empirical
 8 1              8   128  6.68   0.000132    0.91    55 Empirical
 9 1              9    91  8.94   0.000180    0.58    55 Empirical
10 1             10    84 11.2    0.000235    0.51    55 Empirical
# ℹ 190 more rows
# ℹ Use `print(n = ...)` to see more rows

$parameter_tbl
# A tibble: 1 × 8
  dist_type                        samp_size   min   max  mean method     size  prob
  <chr>                                <int> <dbl> <dbl> <dbl> <chr>     <dbl> <dbl>
1 Zero-Truncated Negative Binomial       100    22   183  89.6 MLE_Optim  10.7 0.107

AIC

Function:

#' Calculate Akaike Information Criterion (AIC) for Zero-Truncated Negative Binomial Distribution
#'
#' This function calculates the Akaike Information Criterion (AIC) for a 
#' zero-truncated negative binomial (ZTNB) distribution fitted to the provided data.
#'
#' @family Utility
#' @author Steven P. Sanderson II, MPH
#'
#' @description
#' This function estimates the parameters (`size` and `prob`) of a ZTNB
#' distribution from the provided data using maximum likelihood estimation 
#' (via the `optim()` function), and then calculates the AIC value based on the 
#' fitted distribution. 
#'
#' @param .x A numeric vector containing the data (non-zero counts) to be 
#'   fitted to a ZTNB distribution.
#'
#' @details
#' **Initial parameter estimates:** The choice of initial values for `size` 
#' and `prob` can impact the convergence of the optimization. Consider using 
#' prior knowledge or method of moments estimates to obtain reasonable starting 
#' values. 
#'
#' **Optimization method:** The default optimization method used is 
#' "Nelder-Mead". You might explore other optimization methods available in 
#' `optim()` for potentially better performance or different constraint 
#' requirements.
#'
#' **Data requirements:** The input data `.x` should consist of non-zero counts, 
#' as the ZTNB distribution does not include zero values. 
#'
#' **Goodness-of-fit:** While AIC is a useful metric for model comparison, it's 
#' recommended to also assess the goodness-of-fit of the chosen ZTNB model using
#' visualization (e.g., probability plots, histograms) and other statistical 
#' tests (e.g., chi-square goodness-of-fit test) to ensure it adequately 
#' describes the data.
#'
#' @examples
#' library(actuar)
#' 
#' # Example data
#' set.seed(123)
#' x <- actuar::rztnbinom(30, size = 2, prob = 0.4)
#' 
#' # Calculate AIC
#' util_rztnbinom_aic(x)
#'
#' @return The AIC value calculated based on the fitted ZTNB distribution to 
#'   the provided data.
#'
#' @name util_rztnbinom_aic
NULL

#' @export
#' @rdname util_rztnbinom_aic

util_rztnbinom_aic <- function(.x) {
  # Check if actuar library is installed
  if (!requireNamespace("actuar", quietly = TRUE)) {
    stop("The 'actuar' package is needed for this function. Please install it with: install.packages('actuar')")
  }
  
  # Tidyeval
  x <- as.numeric(.x)
  
  # Get parameters
  pe <- util_ztn_binomial_param_estimate(x)$parameter_tbl
  
  # Negative log-likelihood function for zero-truncated negative binomial distribution
  neg_log_lik_rztnbinom <- function(par, data) {
    size <- par[1]
    prob <- par[2]
    -sum(actuar::dztnbinom(data, size = size, prob = prob, log = TRUE))
  }
  
  # Fit zero-truncated negative binomial distribution to data
  fit_rztnbinom <- optim(
    par = c(size = round(pe$size, 3), prob = round(pe$prob, 3)), 
    fn = neg_log_lik_rztnbinom, 
    data = x
  ) |>
    suppressWarnings()
  
  # Extract log-likelihood and number of parameters
  logLik_rztnbinom <- -fit_rztnbinom$value
  k_rztnbinom <- 2  # Number of parameters (size and prob)
  
  # Calculate AIC
  AIC_rztnbinom <- 2 * k_rztnbinom - 2 * logLik_rztnbinom
  
  # Return AIC value
  return(AIC_rztnbinom)
}

Example:

set.seed(123)
x <- actuar::rztnbinom(30, size = 2, prob = 0.4)
> util_rztnbinom_aic(x)
[1] 140.8286

> fitdist(x, "ztnbinom", start = list(size = 1, prob = 0.5))$aic
[1] 140.8286

Stats Tibble

Function:

#' Distribution Statistics for Zero-Truncated Negative Binomial
#'
#' @family Binomial
#' @family Negative Binomial
#' @family Distribution Statistics
#'
#' @author Steven P. Sanderson II, MPH
#'
#' @details This function computes statistics for a zero-truncated negative 
#' binomial distribution.
#'
#' @description Computes distribution statistics for a zero-truncated negative 
#' binomial distribution.
#'
#' @param .data The data from a zero-truncated negative binomial distribution.
#' 
#' @examples
#' library(dplyr)
#' 
#' tidy_zero_truncated_negative_binomial(.size = 1, .prob = 0.1) |>
#'  util_zero_truncated_negative_binomial_stats_tbl() |>
#'  glimpse()
#' 
#'
#' @return A tibble with distribution statistics.
#' 
#' @name util_zero_truncated_negative_binomial_stats_tbl
NULL

#' @export
#' @rdname util_zero_truncated_negative_binomial_stats_tbl

util_zero_truncated_negative_binomial_stats_tbl <- function(.data) {
  
  # Immediate check for tidy_ distribution function
  if (!"tibble_type" %in% names(attributes(.data))) {
    rlang::abort(
      message = "You must pass data from the 'tidy_dist' function.",
      use_cli_format = TRUE
    )
  }
  
  if (attributes(.data)$tibble_type != "tidy_zero_truncated_negative_binomial") {
    rlang::abort(
      message = "You must use 'tidy_zero_truncated_negative_binomial()'",
      use_cli_format = TRUE
    )
  }
  
  # Extract parameters from data
  data_tbl <- dplyr::as_tibble(.data)
  atb <- attributes(data_tbl)
  r <- atb$.size
  p <- atb$.prob
  
  # Compute statistics
  mean_val <- (p * r) / (1 - p)
  mode_val <- ifelse(r > 1, floor((p * (r - 1)) / (1 - p)), 0)
  var_val <- (p * r) / ((1 - p)^2)
  sd_val <- sqrt(var_val)
  skewness_val <- (1 + p) / sqrt(p * r)
  kurtosis_val <- 6 / r + ((1 - p)^2) / (p * r)
  
  # Create tibble of distribution statistics
  ret <- dplyr::tibble(
    tidy_function = atb$tibble_type,
    function_call = atb$dist_with_params,
    distribution = dist_type_extractor(atb$tibble_type),
    distribution_type = atb$distribution_family_type,
    points = atb$.n,
    simulations = atb$.num_sims,
    mean = mean_val,
    mode_lower = mode_val,
    range = "1 to Inf",
    std_dv = sd_val,
    coeff_var = var_val / mean_val,
    skewness = skewness_val,
    kurtosis = kurtosis_val,
    computed_std_skew = tidy_skewness_vec(data_tbl$y),
    computed_std_kurt = tidy_kurtosis_vec(data_tbl$y),
    ci_lo = ci_lo(data_tbl$y),
    ci_hi = ci_hi(data_tbl$y)
  )
  
  # Return the tibble with distribution statistics
  return(ret)
}

Example:

> tidy_zero_truncated_negative_binomial(.size = 1, .prob = 0.1) |>
+  util_zero_truncated_negative_binomial_stats_tbl() |>
+  glimpse()
Rows: 1
Columns: 17
$ tidy_function     <chr> "tidy_zero_truncated_negative_binomial"
$ function_call     <chr> "Zero Truncated Negative Binomial c(1, 0.1)"
$ distribution      <chr> "Zero Truncated Negative Binomial"
$ distribution_type <chr> "discrete"
$ points            <dbl> 50
$ simulations       <dbl> 1
$ mean              <dbl> 0.1111111
$ mode_lower        <dbl> 0
$ range             <chr> "1 to Inf"
$ std_dv            <dbl> 0.3513642
$ coeff_var         <dbl> 1.111111
$ skewness          <dbl> 3.478505
$ kurtosis          <dbl> 14.1
$ computed_std_skew <dbl> 2.07891
$ computed_std_kurt <dbl> 7.03885
$ ci_lo             <dbl> 1
$ ci_hi             <dbl> 44.1
@spsanderson spsanderson self-assigned this May 3, 2024
@spsanderson spsanderson added the enhancement New feature or request label May 3, 2024
@spsanderson spsanderson added this to the TidyDensity 1.4.1 milestone May 3, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
enhancement New feature or request
Development

No branches or pull requests

1 participant