Commit
Merge branch 'random-survival-forest'
derrynknife committed Aug 18, 2023
2 parents f3ed2be + 6d05d3b commit 84bf5a1
Showing 6 changed files with 593 additions and 311 deletions.
174 changes: 121 additions & 53 deletions surpyval/recurrence/parametric/hpp.py
@@ -1,6 +1,7 @@
import numpy as np
import numpy_indexed as npi
from scipy.optimize import minimize
from autograd import hessian, jacobian
from autograd import numpy as anp
from scipy.optimize import root
from scipy.special import gammaln

from surpyval.recurrence.parametric.parametric_recurrence import (
@@ -29,53 +30,112 @@ def inv_cif(self, cif, rate):
return cif / rate

@classmethod
def create_negll_func(cls, x, i, c, n):
def negll_func(rate):
ll = 0

for item in set(i):
mask_item = i == item
x_item = np.atleast_1d(x[mask_item])
c_item = np.atleast_1d(c[mask_item])
n_item = np.atleast_1d(n[mask_item])
x_prev = 0

for j in range(0, len(x_item)):
if x_item.ndim == 1:
x_j = x_item[j]
else:
x_j = x_item[j][0]
x_jr = x_item[j][1]
if c_item[j] == 0:
ll += np.log(rate) - (rate * x_j) + (rate * x_prev)
x_prev = x_j
elif c_item[j] == 1:
ll += rate * (x_prev - x_j)
x_prev = x_j
elif c_item[j] == 2:
delta_x = x_jr - x_j
ll += (
n_item[j] * np.log(rate * delta_x)
- gammaln(n_item[j] + 1)
- (rate * delta_x)
)
x_prev = x_jr
elif c_item[j] == -1:
ll += (
n_item[j] * np.log(rate * x_j)
- gammaln(n_item[j] + 1)
- (rate * x_j)
)
x_prev = x_j

return -ll
def create_negll_func(cls, data):
x, c, n = data.x, data.c, data.n
x_prev = data.find_x_previous()

has_observed = 0 in c
has_right_censoring = 1 in c
has_left_censoring = -1 in c
has_interval_censoring = x.ndim == 2

x_l = x if x.ndim == 1 else x[:, 0]
x_r = x[:, 1] if x.ndim == 2 else None
x_prev_r = x_prev[:, 1] if x_prev.ndim == 2 else x_prev

# This code splits each observation type, if present, into its own
# array. This avoids having to simplify the log-likelihood function
# to account for the different types of observations.

# Further, by precomputing the sums of the needed arrays, we avoid
# doing array sums inside the log-likelihood function. This is
# faster, especially for large datasets.

# The trade-off is that the log-likelihood function takes longer to
# create but is faster to evaluate.

# In conclusion, this is a ridiculous optimisation that is probably
# not worth the effort that went into it.
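# For reference, the terms assembled below implement the HPP
# log-likelihood (with rate = exp(log_rate)):
#   observed event at x_j:     log(rate) - rate * (x_j - x_prev)
#   right censored at x_j:     -rate * (x_j - x_prev)
#   left censored, n events:   n * log(rate * x_j) - rate * x_j - log(n!)
#   interval of width dx, n:   n * log(rate * dx) - rate * dx - log(n!)
# where log(n!) is computed as gammaln(n + 1).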
if has_observed:
observed_mask = c == 0
x_o = x_l[observed_mask]
x_prev_o = x_prev_r[observed_mask]
len_observed = len(x_o)
observed_time = (x_prev_o - x_o).sum()
else:
len_observed = 0.0
observed_time = 0.0

if has_left_censoring:
left_mask = c == -1
x_left = x_l[left_mask]
n_left = n[left_mask]
log_xl = np.log(x_left)
n_log_x_left = n_left * log_xl
n_log_x_left_sum = n_log_x_left.sum()
x_left_sum = x_left.sum()
n_left_sum = n_left.sum()
n_l_factorial = gammaln(n_left + 1)
n_l_factorial_sum = n_l_factorial.sum()
else:
n_log_x_left_sum = 0.0
x_left_sum = 0.0
n_left_sum = 0.0
n_l_factorial_sum = 0.0

if has_right_censoring:
right_mask = c == 1
x_right = x_l[right_mask]
x_right_prev = x_prev_r[right_mask]
right_censored_time = (x_right_prev - x_right).sum()
else:
right_censored_time = 0.0

if has_interval_censoring:
interval_mask = c == 2
x_i_l = x_l[interval_mask]
x_i_r = x_r[interval_mask]
delta_xi = x_i_r - x_i_l

x_interval_sum = delta_xi.sum()

n_interval = n[c == 2]
n_interval_sum = n_interval.sum()

n_log_x_interval_sum = (n_interval * np.log(delta_xi)).sum()

n_i_factorial = gammaln(n_interval + 1)
n_i_factorial_sum = n_i_factorial.sum()
else:
x_interval_sum = 0.0
n_interval_sum = 0.0
n_log_x_interval_sum = 0.0
n_i_factorial_sum = 0.0

def negll_func(log_rate):
rate = anp.exp(log_rate)
ll = len_observed * log_rate + rate * observed_time
ll += rate * right_censored_time
ll += (
log_rate * n_left_sum
+ n_log_x_left_sum
- rate * x_left_sum
- n_l_factorial_sum
)
ll += (
log_rate * n_interval_sum
+ n_log_x_interval_sum
- rate * x_interval_sum
- n_i_factorial_sum
)

return -ll[0]

return negll_func

@classmethod
def fit(cls, x, i=None, c=None, n=None, init=None):
data = handle_xicn(x, i, c, n, as_recurrent_data=True)

def fit_from_recurrent_data(cls, data, init=None):
out = ParametricRecurrenceModel()
out.dist = cls()
out.data = data
@@ -85,14 +145,22 @@ def fit(cls, x, i=None, c=None, n=None, init=None):
out.support = (0.0, np.inf)
out.name = "Homogeneous Poisson Process"

init = (data.n[data.c == 0]).sum() / npi.group_by(data.i).max(data.x)[
1
].sum()
neg_ll = cls.create_negll_func(data.x, data.i, data.c, data.n)
res = minimize(
neg_ll, [init], method="Nelder-Mead", bounds=((0, None),)
)
neg_ll = cls.create_negll_func(data)
jac = jacobian(neg_ll)
hess = hessian(neg_ll)
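# The MLE is found by solving grad(neg_ll) = 0: scipy's root is given
# the autograd jacobian of neg_ll as the function to zero and the
# hessian as that function's jacobian, which amounts to Newton's
# method on the score equation. Working in log(rate) keeps the rate
# positive without needing bounds.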

if init is None:
init = [0.0]
else:
init = np.atleast_1d(np.log(init))

res = root(jac, init, jac=hess)
out.res = res
out.params = res.x
out.params = np.exp(res.x)

return out

@classmethod
def fit(cls, x, i=None, c=None, n=None, init=None):
data = handle_xicn(x, i, c, n, as_recurrent_data=True)
return cls.fit_from_recurrent_data(data, init=init)
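For context, the refactor above replaces the per-item Python loop and Nelder-Mead minimisation with precomputed sufficient statistics and a Newton solve in log(rate). A minimal usage sketch follows; the class name, import path, and data are assumptions for illustration, not taken from this diff:

import numpy as np

# Hypothetical import path; the diff only shows hpp.py's contents.
from surpyval.recurrence.parametric.hpp import HomogeneousPoissonProcess

# Two items; c = 0 marks an observed event, c = 1 a right-censored time.
x = np.array([1.2, 2.5, 4.1, 0.8, 3.0])
i = np.array([1, 1, 1, 2, 2])
c = np.array([0, 0, 1, 0, 1])
n = np.ones_like(x, dtype=int)

model = HomogeneousPoissonProcess.fit(x, i=i, c=c, n=n)
print(model.params)  # fitted rate, i.e. exp of the solved log-rate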
161 changes: 94 additions & 67 deletions surpyval/recurrence/parametric/nhpp.py
@@ -46,107 +46,134 @@ def __init__(
else:
self.parameter_initialiser = parameter_initialiser

def create_negll_func(self, x, i, c, n):
def create_negll_func(self, data):
x, c, n = data.x, data.c, data.n
# Previous event time for each observation
x_prev = data.find_x_previous()

has_interval_censoring = 2 in c
has_observed = 0 in c
has_left_censoring = -1 in c
has_right_censoring = 1 in c

x_l = x if x.ndim == 1 else x[:, 0]
x_r = x[:, 1] if x.ndim == 2 else None

x_prev_l = x_prev if x_prev.ndim == 1 else x_prev[:, 0]
x_prev_r = x_prev[:, 1] if x_prev.ndim == 2 else None

x_o = x_l[c == 0] if has_observed else np.array([])
if has_interval_censoring:
x_o_prev = x_prev_r[c == 0] if has_observed else np.array([])
else:
x_o_prev = x_prev_l[c == 0] if has_observed else np.array([])

x_right = x_l[c == 1] if has_right_censoring else np.array([])
if has_interval_censoring:
x_right_prev = (
x_prev_r[c == 1] if has_right_censoring else np.array([])
)
else:
x_right_prev = (
x_prev_l[c == 1] if has_right_censoring else np.array([])
)

x_left = x_l[c == -1] if has_left_censoring else np.array([])
n_left = n[c == -1] if has_left_censoring else np.array([])

x_i_l = x_l[c == 2] if has_interval_censoring else np.array([])
x_i_r = x_r[c == 2] if has_interval_censoring else np.array([])
n_i = n[c == 2] if has_interval_censoring else np.array([])

# Using the empty arrays avoids the need for if statements in the
# likelihood function. It also means the likelihood function will
# not encounter any invalid values, since the log of 0 is never
# taken.
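# For reference, each observation type contributes to the NHPP
# log-likelihood as follows, with Lambda = cif and lambda = iif:
#   observed at x_j:          log lambda(x_j) - [Lambda(x_j) - Lambda(x_prev)]
#   right censored at x_j:    -[Lambda(x_j) - Lambda(x_prev)]
#   left censored, n events:  n * log Lambda(x_j) - Lambda(x_j) - log(n!)
#   interval, n events:       n * log dL - dL - log(n!),
#                             where dL = Lambda(x_r) - Lambda(x_l)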

def negll_func(params):
ll = 0

for item in set(i):
mask_item = i == item
x_item = np.atleast_1d(x[mask_item])
c_item = np.atleast_1d(c[mask_item])
n_item = np.atleast_1d(n[mask_item])
x_prev = 0

for j in range(0, len(x_item)):
if x_item.ndim == 1:
x_j = x_item[j]
else:
x_j = x_item[j][0]
x_jr = x_item[j][1]

if c_item[j] == 0:
ll += (
np.log(self.iif(x_j, *params))
- self.cif(x_j, *params)
+ self.cif(x_prev, *params)
)
x_prev = x_j
elif c_item[j] == 1:
ll += self.cif(x_prev, *params) - self.cif(
x_j, *params
)
x_prev = x_j
elif c_item[j] == 2:
delta_cif = self.cif(x_jr, *params) - self.cif(
x_j, *params
)
ll += (
n_item[j] * np.log(delta_cif)
- (delta_cif)
- gammaln(n_item[j] + 1)
)
x_prev = x_jr
elif c_item[j] == -1:
delta_cif = self.cif(x_j, *params)
ll += (
n_item[j] * np.log(delta_cif)
- (delta_cif)
- gammaln(n_item[j] + 1)
)
x_prev = x_j
# ll of directly observed
ll = (
np.log(self.iif(x_o, *params))
+ self.cif(x_o_prev, *params)
- self.cif(x_o, *params)
).sum()

# ll of right censored
ll += (
self.cif(x_right_prev, *params) - self.cif(x_right, *params)
).sum()

# ll of left censored
left_delta_cif = self.cif(x_left, *params)
ll += (
n_left * np.log(left_delta_cif)
- (left_delta_cif)
- gammaln(n_left + 1)
).sum()

# ll of interval censored
interval_delta_cif = self.cif(x_i_r, *params) - self.cif(
x_i_l, *params
)
ll += (
n_i * np.log(interval_delta_cif)
- (interval_delta_cif)
- gammaln(n_i + 1)
).sum()

return -ll

return negll_func

def fit(self, x, i=None, c=None, n=None, how="MLE", init=None):
"""
Format for counting processes...
Thinking x, i, n.
x = obvious
i is the identity.
n is counts ... maybe
"""

def fit_from_recurrent_data(self, data, how="MLE", init=None):
if init is None:
param_init = self.parameter_initialiser(x)
param_init = self.parameter_initialiser(data.x)
else:
param_init = np.array(init)

model = ParametricRecurrenceModel()

data = handle_xicn(x, i, c, n, as_recurrent_data=True)

x_unique, r, d = data.to_xrd()

mcf_hat = np.cumsum(d / r)
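# mcf_hat is the nonparametric estimate of the mean cumulative
# function: the number of events d over the number at risk r at each
# unique time, accumulated. The least-squares fit of the CIF to this
# curve below supplies the parameters for how="MSE" and the initial
# guess for how="MLE".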

def fun(params):
return np.sum((self.cif(x_unique, *params) - mcf_hat) ** 2)

res = minimize(fun, param_init, bounds=self.bounds)
model.mcf_hat = mcf_hat
param_init = res.x

if how == "MSE":
params = res.x

elif how == "MLE":
ll_func = self.create_negll_func(data.x, data.i, data.c, data.n)
ll_func = self.create_negll_func(data)
res = minimize(
ll_func, param_init, method="Nelder-Mead", bounds=self.bounds
ll_func,
param_init,
method="Nelder-Mead",
bounds=self.bounds,
)
params = res.x
model._neg_ll = ll_func(params)

model = ParametricRecurrenceModel()
model.mcf_hat = mcf_hat
model.res = res
model.params = params
model.data = data
model.dist = self
model.how = how
return model

def fit(self, x, i=None, c=None, n=None, how="MLE", init=None):
"""
Format for counting processes...
Thinking x, i, n.
x = obvious
i is the identity.
n is counts ... maybe
"""
data = handle_xicn(x, i, c, n, as_recurrent_data=True)
return self.fit_from_recurrent_data(data, how, init)

def from_params(self, params):
model = ParametricRecurrenceModel()
model.params = params

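Taken together, both classes now expose a thin fit wrapper around fit_from_recurrent_data, so a recurrent data object built once can be fitted directly. A sketch of that flow; the handle_xicn import path and the data are illustrative assumptions:

import numpy as np

# Import paths below are assumptions; the diff does not show them.
from surpyval.utils.recurrent_utils import handle_xicn
from surpyval.recurrence.parametric.hpp import HomogeneousPoissonProcess

x = np.array([5.0, 9.0, 14.0, 20.0])
i = np.array([1, 1, 1, 1])
c = np.array([0, 0, 0, 1])  # three observed events, then right censored
n = np.array([1, 1, 1, 1])

data = handle_xicn(x, i, c, n, as_recurrent_data=True)
model = HomogeneousPoissonProcess.fit_from_recurrent_data(data)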