Commit
Merge branch 'random-survival-forest'
derrynknife committed Aug 18, 2023
2 parents f3ed2be + 6d05d3b commit 84bf5a1
Showing 6 changed files with 593 additions and 311 deletions.
174 changes: 121 additions & 53 deletions surpyval/recurrence/parametric/hpp.py
@@ -1,6 +1,7 @@
import numpy as np
import numpy_indexed as npi
from scipy.optimize import minimize
from autograd import hessian, jacobian
from autograd import numpy as anp
from scipy.optimize import root
from scipy.special import gammaln

from surpyval.recurrence.parametric.parametric_recurrence import (
@@ -29,53 +30,112 @@ def inv_cif(self, cif, rate):
return cif / rate

@classmethod
def create_negll_func(cls, x, i, c, n):
def negll_func(rate):
ll = 0

for item in set(i):
mask_item = i == item
x_item = np.atleast_1d(x[mask_item])
c_item = np.atleast_1d(c[mask_item])
n_item = np.atleast_1d(n[mask_item])
x_prev = 0

for j in range(0, len(x_item)):
if x_item.ndim == 1:
x_j = x_item[j]
else:
x_j = x_item[j][0]
x_jr = x_item[j][1]
if c_item[j] == 0:
ll += np.log(rate) - (rate * x_j) + (rate * x_prev)
x_prev = x_j
elif c_item[j] == 1:
ll += rate * (x_prev - x_j)
x_prev = x_j
elif c_item[j] == 2:
delta_x = x_jr - x_j
ll += (
n_item[j] * np.log(rate * delta_x)
- gammaln(n_item[j] + 1)
- (rate * delta_x)
)
x_prev = x_jr
elif c_item[j] == -1:
ll += (
n_item[j] * np.log(rate * x_j)
- gammaln(n_item[j] + 1)
- (rate * x_j)
)
x_prev = x_j

return -ll
def create_negll_func(cls, data):
x, c, n = data.x, data.c, data.n
x_prev = data.find_x_previous()

has_observed = 0 in c
has_right_censoring = 1 in c
has_left_censoring = -1 in c
has_interval_censoring = x.ndim == 2

x_l = x if x.ndim == 1 else x[:, 0]
x_r = x[:, 1] if x.ndim == 2 else None
x_prev_r = x_prev[:, 1] if x_prev.ndim == 2 else x_prev

# This code splits each observation type, if present, into its own
# array. This avoids having to simplify the log-likelihood function
# to account for the different types of observations.

# Further, by precomputing the sums of the needed arrays, we avoid
# doing array sums inside the log-likelihood function. This is
# faster, especially for large datasets.

# The trade-off is that the log-likelihood function takes longer to
# create but is faster to evaluate.

# In conclusion, this is a ridiculous optimisation that is probably
# not worth the effort that went into it.
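# For reference, the terms assembled below implement the HPP
# log-likelihood (with rate = exp(log_rate)):
#   observed event at x_j:     log(rate) - rate * (x_j - x_prev)
#   right censored at x_j:     -rate * (x_j - x_prev)
#   left censored, n events:   n * log(rate * x_j) - rate * x_j - log(n!)
#   interval of width dx, n:   n * log(rate * dx) - rate * dx - log(n!)
# where log(n!) is computed as gammaln(n + 1).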
if has_observed:
observed_mask = c == 0
x_o = x_l[observed_mask]
x_prev_o = x_prev_r[observed_mask]
len_observed = len(x_o)
observed_time = (x_prev_o - x_o).sum()
else:
len_observed = 0.0
observed_time = 0.0

if has_left_censoring:
left_mask = c == -1
x_left = x_l[left_mask]
n_left = n[left_mask]
log_xl = np.log(x_left)
n_log_x_left = n_left * log_xl
n_log_x_left_sum = n_log_x_left.sum()
x_left_sum = x_left.sum()
n_left_sum = n_left.sum()
n_l_factorial = gammaln(n_left + 1)
n_l_factorial_sum = n_l_factorial.sum()
else:
n_log_x_left_sum = 0.0
x_left_sum = 0.0
n_left_sum = 0.0
n_l_factorial_sum = 0.0

if has_right_censoring:
right_mask = c == 1
x_right = x_l[right_mask]
x_right_prev = x_prev_r[right_mask]
right_censored_time = (x_right_prev - x_right).sum()
else:
right_censored_time = 0.0

if has_interval_censoring:
interval_mask = c == 2
x_i_l = x_l[interval_mask]
x_i_r = x_r[interval_mask]
delta_xi = x_i_r - x_i_l

x_interval_sum = delta_xi.sum()

n_interval = n[c == 2]
n_interval_sum = n_interval.sum()

n_log_x_interval_sum = (n_interval * np.log(delta_xi)).sum()

n_i_factorial = gammaln(n_interval + 1)
n_i_factorial_sum = n_i_factorial.sum()
else:
x_interval_sum = 0.0
n_interval_sum = 0.0
n_log_x_interval_sum = 0.0
n_i_factorial_sum = 0.0

def negll_func(log_rate):
rate = anp.exp(log_rate)
ll = len_observed * log_rate + rate * observed_time
ll += rate * right_censored_time
ll += (
log_rate * n_left_sum
+ n_log_x_left_sum
- rate * x_left_sum
- n_l_factorial_sum
)
ll += (
log_rate * n_interval_sum
+ n_log_x_interval_sum
- rate * x_interval_sum
- n_i_factorial_sum
)

return -ll[0]

return negll_func

@classmethod
def fit(cls, x, i=None, c=None, n=None, init=None):
data = handle_xicn(x, i, c, n, as_recurrent_data=True)

def fit_from_recurrent_data(cls, data, init=None):
out = ParametricRecurrenceModel()
out.dist = cls()
out.data = data
@@ -85,14 +145,22 @@ def fit(cls, x, i=None, c=None, n=None, init=None):
out.support = (0.0, np.inf)
out.name = "Homogeneous Poisson Process"

init = (data.n[data.c == 0]).sum() / npi.group_by(data.i).max(data.x)[
1
].sum()
neg_ll = cls.create_negll_func(data.x, data.i, data.c, data.n)
res = minimize(
neg_ll, [init], method="Nelder-Mead", bounds=((0, None),)
)
neg_ll = cls.create_negll_func(data)
jac = jacobian(neg_ll)
hess = hessian(neg_ll)
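# The MLE is found by solving grad(neg_ll) = 0: scipy's root is given
# the autograd jacobian of neg_ll as the function to zero and the
# hessian as that function's jacobian, which amounts to Newton's
# method on the score equation. Working in log(rate) keeps the rate
# positive without needing bounds.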

if init is None:
init = [0.0]
else:
init = np.atleast_1d(np.log(init))

res = root(jac, init, jac=hess)
out.res = res
out.params = res.x
out.params = np.exp(res.x)

return out

@classmethod
def fit(cls, x, i=None, c=None, n=None, init=None):
data = handle_xicn(x, i, c, n, as_recurrent_data=True)
return cls.fit_from_recurrent_data(data, init=init)
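For context, the refactor above replaces the per-item Python loop and Nelder-Mead minimisation with precomputed sufficient statistics and a Newton solve in log(rate). A minimal usage sketch follows; the class name, import path, and data are assumptions for illustration, not taken from this diff:

import numpy as np

# Hypothetical import path; the diff only shows hpp.py's contents.
from surpyval.recurrence.parametric.hpp import HomogeneousPoissonProcess

# Two items; c = 0 marks an observed event, c = 1 a right-censored time.
x = np.array([1.2, 2.5, 4.1, 0.8, 3.0])
i = np.array([1, 1, 1, 2, 2])
c = np.array([0, 0, 1, 0, 1])
n = np.ones_like(x, dtype=int)

model = HomogeneousPoissonProcess.fit(x, i=i, c=c, n=n)
print(model.params)  # fitted rate, i.e. exp of the solved log-rate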
161 changes: 94 additions & 67 deletions surpyval/recurrence/parametric/nhpp.py
@@ -46,107 +46,134 @@ def __init__(
else:
self.parameter_initialiser = parameter_initialiser

def create_negll_func(self, x, i, c, n):
def create_negll_func(self, data):
x, c, n = data.x, data.c, data.n
# Previous event time for each observation
x_prev = data.find_x_previous()

has_interval_censoring = 2 in c
has_observed = 0 in c
has_left_censoring = -1 in c
has_right_censoring = 1 in c

x_l = x if x.ndim == 1 else x[:, 0]
x_r = x[:, 1] if x.ndim == 2 else None

x_prev_l = x_prev if x_prev.ndim == 1 else x_prev[:, 0]
x_prev_r = x_prev[:, 1] if x_prev.ndim == 2 else None

x_o = x_l[c == 0] if has_observed else np.array([])
if has_interval_censoring:
x_o_prev = x_prev_r[c == 0] if has_observed else np.array([])
else:
x_o_prev = x_prev_l[c == 0] if has_observed else np.array([])

x_right = x_l[c == 1] if has_right_censoring else np.array([])
if has_interval_censoring:
x_right_prev = (
x_prev_r[c == 1] if has_right_censoring else np.array([])
)
else:
x_right_prev = (
x_prev_l[c == 1] if has_right_censoring else np.array([])
)

x_left = x_l[c == -1] if has_left_censoring else np.array([])
n_left = n[c == -1] if has_left_censoring else np.array([])

x_i_l = x_l[c == 2] if has_interval_censoring else np.array([])
x_i_r = x_r[c == 2] if has_interval_censoring else np.array([])
n_i = n[c == 2] if has_interval_censoring else np.array([])

# Using the empty arrays avoids the need for if statements in the
# likelihood function. It also means the likelihood function will
# not encounter any invalid values, since the log of 0 is never
# taken.
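# For reference, each observation type contributes to the NHPP
# log-likelihood as follows, with Lambda = cif and lambda = iif:
#   observed at x_j:          log lambda(x_j) - [Lambda(x_j) - Lambda(x_prev)]
#   right censored at x_j:    -[Lambda(x_j) - Lambda(x_prev)]
#   left censored, n events:  n * log Lambda(x_j) - Lambda(x_j) - log(n!)
#   interval, n events:       n * log dL - dL - log(n!),
#                             where dL = Lambda(x_r) - Lambda(x_l)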

def negll_func(params):
ll = 0

for item in set(i):
mask_item = i == item
x_item = np.atleast_1d(x[mask_item])
c_item = np.atleast_1d(c[mask_item])
n_item = np.atleast_1d(n[mask_item])
x_prev = 0

for j in range(0, len(x_item)):
if x_item.ndim == 1:
x_j = x_item[j]
else:
x_j = x_item[j][0]
x_jr = x_item[j][1]

if c_item[j] == 0:
ll += (
np.log(self.iif(x_j, *params))
- self.cif(x_j, *params)
+ self.cif(x_prev, *params)
)
x_prev = x_j
elif c_item[j] == 1:
ll += self.cif(x_prev, *params) - self.cif(
x_j, *params
)
x_prev = x_j
elif c_item[j] == 2:
delta_cif = self.cif(x_jr, *params) - self.cif(
x_j, *params
)
ll += (
n_item[j] * np.log(delta_cif)
- (delta_cif)
- gammaln(n_item[j] + 1)
)
x_prev = x_jr
elif c_item[j] == -1:
delta_cif = self.cif(x_j, *params)
ll += (
n_item[j] * np.log(delta_cif)
- (delta_cif)
- gammaln(n_item[j] + 1)
)
x_prev = x_j
# ll of directly observed
ll = (
np.log(self.iif(x_o, *params))
+ self.cif(x_o_prev, *params)
- self.cif(x_o, *params)
).sum()

# ll of right censored
ll += (
self.cif(x_right_prev, *params) - self.cif(x_right, *params)
).sum()

# ll of left censored
left_delta_cif = self.cif(x_left, *params)
ll += (
n_left * np.log(left_delta_cif)
- (left_delta_cif)
- gammaln(n_left + 1)
).sum()

# ll of interval censored
interval_delta_cif = self.cif(x_i_r, *params) - self.cif(
x_i_l, *params
)
ll += (
n_i * np.log(interval_delta_cif)
- (interval_delta_cif)
- gammaln(n_i + 1)
).sum()

return -ll

return negll_func

def fit(self, x, i=None, c=None, n=None, how="MLE", init=None):
"""
Format for counting processes...
Thinking x, i, n.
x = obvious
i is the identity.
n is counts ... maybe
"""

def fit_from_recurrent_data(self, data, how="MLE", init=None):
if init is None:
param_init = self.parameter_initialiser(x)
param_init = self.parameter_initialiser(data.x)
else:
param_init = np.array(init)

model = ParametricRecurrenceModel()

data = handle_xicn(x, i, c, n, as_recurrent_data=True)

x_unique, r, d = data.to_xrd()

mcf_hat = np.cumsum(d / r)
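# mcf_hat is the nonparametric estimate of the mean cumulative
# function: the number of events d over the number at risk r at each
# unique time, accumulated. The least-squares fit of the CIF to this
# curve below supplies the parameters for how="MSE" and the initial
# guess for how="MLE".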

def fun(params):
return np.sum((self.cif(x_unique, *params) - mcf_hat) ** 2)

res = minimize(fun, param_init, bounds=self.bounds)
model.mcf_hat = mcf_hat
param_init = res.x

if how == "MSE":
params = res.x

elif how == "MLE":
ll_func = self.create_negll_func(data.x, data.i, data.c, data.n)
ll_func = self.create_negll_func(data)
res = minimize(
ll_func, param_init, method="Nelder-Mead", bounds=self.bounds
ll_func,
param_init,
method="Nelder-Mead",
bounds=self.bounds,
)
params = res.x
model._neg_ll = ll_func(params)

model = ParametricRecurrenceModel()
model.mcf_hat = mcf_hat
model.res = res
model.params = params
model.data = data
model.dist = self
model.how = how
return model

def fit(self, x, i=None, c=None, n=None, how="MLE", init=None):
"""
Format for counting processes...
Thinking x, i, n.
x = obvious
i is the identity.
n is counts ... maybe
"""
data = handle_xicn(x, i, c, n, as_recurrent_data=True)
return self.fit_from_recurrent_data(data, how, init)

def from_params(self, params):
model = ParametricRecurrenceModel()
model.params = params

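Taken together, both classes now expose a thin fit wrapper around fit_from_recurrent_data, so a recurrent data object built once can be fitted directly. A sketch of that flow; the handle_xicn import path and the data are illustrative assumptions:

import numpy as np

# Import paths below are assumptions; the diff does not show them.
from surpyval.utils.recurrent_utils import handle_xicn
from surpyval.recurrence.parametric.hpp import HomogeneousPoissonProcess

x = np.array([5.0, 9.0, 14.0, 20.0])
i = np.array([1, 1, 1, 1])
c = np.array([0, 0, 0, 1])  # three observed events, then right censored
n = np.array([1, 1, 1, 1])

data = handle_xicn(x, i, c, n, as_recurrent_data=True)
model = HomogeneousPoissonProcess.fit_from_recurrent_data(data)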