Implements Bootstrap Configuration
Adds extended configurability of Bootstrapping methods.
Updated README.md.
Updated test cases.
OliverHennhoefer committed May 3, 2024
1 parent 991aa8a commit e516d58
Showing 10 changed files with 290 additions and 135 deletions.
81 changes: 67 additions & 14 deletions README.md
@@ -4,12 +4,9 @@ Tired of *alarm fatigue*?

**unquad** enables **conformal anomaly detection** for [*PyOD*](https://pyod.readthedocs.io/en/latest/) detectors.

**unquad** is a wrapper applicable for most [*PyOD*](https://pyod.readthedocs.io/en/latest/) detectors for
**unquad** is a wrapper applicable for most [*PyOD*](https://pyod.readthedocs.io/en/latest/) detectors (see [Supported Estimators](#supported-estimators)) for
**uncertainty-quantified anomaly detection** based on one-class classification and the principles of **conformal inference**.

* Wraps most '[*PyOD*](https://pyod.readthedocs.io/en/latest/)' anomaly detectors (see [Supported Estimators](#supported-estimators)).
* Fits and calibrates given estimator to control the (marginal) **False Discovery Rate** (FDR).

[![License](https://img.shields.io/badge/License-BSD_3--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause)
[![contributions welcome](https://img.shields.io/badge/contributions-welcome-brightgreen.svg?style=flat)](https://github.com/dwyl/esta/issues)
[![HitCount](https://hits.dwyl.com/OliverHennhoefer/unquad.svg?style=flat-square&show=unique)](http://hits.dwyl.com/OliverHennhoefer/unquad)
@@ -26,14 +23,14 @@ given point predictor or classifier, CAD aims to control the [*false discovery r

***CAD translates anomaly scores into statistical p-values by comparing the anomaly scores observed on test data to a retained set of calibration
scores previously obtained on normal data during model training*** (see [*One-Class Classification*](https://en.wikipedia.org/wiki/One-class_classification#:~:text=In%20machine%20learning%2C%20one%2Dclass,of%20one%2Dclass%20classifiers%20where)).
The larger the discrepancy between *normal* scores and observed test scores, the lower the obtained (and statistically valid) p-value.
The larger the discrepancy between *normal* scores and observed test scores, the lower the obtained (**statistically valid**) p-value.
The p-values, instead of the usual anomaly estimates, allow for FDR-control by statistical procedures like *Benjamini-Hochberg*.
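
For intuition, both steps can be sketched in a few lines of plain NumPy. This is a generic illustration of the principle, not unquad's internal implementation, and the helper names (`conformal_p_values`, `benjamini_hochberg`) are made up for the example:

```python
import numpy as np

def conformal_p_values(calib_scores, test_scores):
    """Conformal p-value: share of calibration scores at least as extreme as the test score."""
    calib = np.asarray(calib_scores, dtype=float)
    n = calib.size
    return np.array(
        [(1 + np.sum(calib >= s)) / (n + 1) for s in np.asarray(test_scores, dtype=float)]
    )

def benjamini_hochberg(p_values, alpha=0.1):
    """Return a boolean mask of discoveries under Benjamini-Hochberg at level alpha."""
    p = np.asarray(p_values, dtype=float)
    m = p.size
    order = np.argsort(p)
    below = p[order] <= alpha * np.arange(1, m + 1) / m
    reject = np.zeros(m, dtype=bool)
    if below.any():
        k = np.max(np.nonzero(below)[0]) + 1  # largest i with p_(i) <= alpha * i / m
        reject[order[:k]] = True
    return reject

# Hypothetical anomaly scores (higher = more anomalous): calibration scores from
# normal data, test scores containing a few shifted (anomalous) observations.
rng = np.random.default_rng(1)
calib = rng.normal(size=1_000)
test = np.concatenate([rng.normal(size=95), rng.normal(loc=4.0, size=5)])

discoveries = benjamini_hochberg(conformal_p_values(calib, test), alpha=0.1)
print(discoveries.sum())  # number of flagged test instances
```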

### Assumption
CAD assumes ***exchangeability*** of training and future test data. *Exchangeability* is closely related to the statistical
notion of *independent and identically distributed random variables* (*IID*). IID implies both independence <ins>and</ins>
exchangeability. Exchangeability defines a joint probability distribution that remains the same under permutations
of the variables. With that, exchangability is a very practicable as it is a weaker assumption than IID.
of the variables. With that, exchangeability is a very practicable assumption as it is *weaker* than IID.
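
As a small self-contained illustration (a simulation sketch, not part of unquad): for exchangeable scores, the rank of a test score among the calibration scores is uniformly distributed, which is precisely the property that makes the conformal p-values above valid.

```python
import numpy as np

rng = np.random.default_rng(0)
n_cal, trials = 99, 20_000

ranks = []
for _ in range(trials):
    scores = rng.exponential(size=n_cal + 1)  # exchangeable: all scores share one distribution
    calib, test = scores[:n_cal], scores[-1]
    ranks.append(int(np.sum(calib >= test)))  # rank of the test score among calibration scores

# Each of the n_cal + 1 possible ranks occurs with (roughly) equal frequency 1 / (n_cal + 1).
freq = np.bincount(ranks, minlength=n_cal + 1) / trials
print(round(freq.min(), 4), round(freq.max(), 4), round(1 / (n_cal + 1), 4))
```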

### Limitations
Since CAD controls the FDR by adjustment procedures in the context of **multiple testing**, trained conformal detectors currently
@@ -47,16 +44,16 @@ covariate shift. Currently, this kind of online detector is not implemented. It
pip install unquad
```

### Usage
### Usage: Split-Conformal

```python
from pyod.models.iforest import IForest # Isolation Forest (sklearn-based)
from pyod.utils import generate_data # Example Data (PyOD built-in)
from pyod.models.iforest import IForest
from pyod.utils import generate_data

from unquad.estimator.conformal import ConformalEstimator # Model Wrapper
from unquad.enums.adjustment import Adjustment # Multiple Testing Adjustments
from unquad.enums.method import Method # Conformal Methods
from unquad.evaluation.metrics import false_discovery_rate, statistical_power # Evaluation Metrics
from unquad.estimator.conformal import ConformalEstimator
from unquad.enums.adjustment import Adjustment
from unquad.enums.method import Method
from unquad.evaluation.metrics import false_discovery_rate, statistical_power

x_train, x_test, y_train, y_test = generate_data(
n_train=1_000,
@@ -72,7 +69,7 @@ ce = ConformalEstimator(
detector=IForest(behaviour="new"),
method=Method.CV_PLUS,
adjustment=Adjustment.BENJAMINI_HOCHBERG,
alpha=0.1, # FDR
alpha=0.2, # nominal FDR level
random_state=1,
split=10,
)
@@ -84,6 +81,62 @@ print(false_discovery_rate(y=y_test, y_hat=estimates)) # Empirical FDR
print(statistical_power(y=y_test, y_hat=estimates)) # Empirical Power
```

```bash
Training: 100%|██████████| 10/10 [00:01<00:00, 8.16it/s]
Inference: 100%|██████████| 10/10 [00:00<00:00, 220.63it/s]
```

Output:
```python
0.194 # Empirical FDR
0.806 # Empirical Power
```
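
The two reported numbers follow the usual definitions of empirical FDR and power. A minimal sketch of these metrics (assuming binary labels with `1` marking an anomaly; this mirrors the intent of `unquad.evaluation.metrics`, not necessarily its exact implementation):

```python
import numpy as np

def empirical_fdr(y, y_hat):
    """False discoveries among all flagged instances (0 if nothing is flagged)."""
    y, y_hat = np.asarray(y), np.asarray(y_hat)
    flagged = y_hat == 1
    return float(np.sum(flagged & (y == 0)) / max(int(np.sum(flagged)), 1))

def empirical_power(y, y_hat):
    """Share of true anomalies that were flagged."""
    y, y_hat = np.asarray(y), np.asarray(y_hat)
    return float(np.sum((y_hat == 1) & (y == 1)) / max(int(np.sum(y == 1)), 1))
```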

### Usage: Jackknife+-after-Bootstrap

```python
from pyod.models.iforest import IForest
from pyod.utils import generate_data

from unquad.estimator.conformal import ConformalEstimator
from unquad.enums.adjustment import Adjustment
from unquad.estimator.bootstrap.bootstrap_config import BootstrapConfiguration
from unquad.enums.method import Method
from unquad.evaluation.metrics import false_discovery_rate, statistical_power

x_train, x_test, y_train, y_test = generate_data(
n_train=1_000,
n_test=1_000,
n_features=10,
contamination=0.1,
random_state=1,
)

x_train = x_train[y_train == 0] # Normal Instances (One-Class Classification)

bc = BootstrapConfiguration(n=1_000, b=40, m=0.95)

ce = ConformalEstimator(
detector=IForest(behaviour="new"),
method=Method.JACKKNIFE_PLUS_AFTER_BOOTSTRAP,
adjustment=Adjustment.BENJAMINI_HOCHBERG,
alpha=0.1, # nominal FDR level
bootstrap_config=bc,
random_state=1,
)

ce.fit(x_train) # Model Fit/Calibration
estimates = ce.predict(x_test, raw=False)

print(false_discovery_rate(y=y_test, y_hat=estimates)) # Empirical FDR
print(statistical_power(y=y_test, y_hat=estimates)) # Empirical Power
```

```bash
Training: 100%|██████████| 40/40 [00:04<00:00, 8.13it/s]
Inference: 100%|██████████| 40/40 [00:00<00:00, 231.63it/s]
```

Output:
```python
0.099 # Empirical FDR
```
4 changes: 2 additions & 2 deletions examples/knn.py
@@ -20,11 +20,11 @@

ce = ConformalEstimator(
detector=KNN(),
method=Method.CV,
method=Method.SPLIT_CONFORMAL,
adjustment=Adjustment.BENJAMINI_HOCHBERG,
alpha=0.1,
random_state=2,
split=100,
split=300,
)

ce.fit(x_train)
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -4,8 +4,8 @@ build-backend = "hatchling.build"

[project]
name = "unquad"
version = "0.0.1"
description = "Uncertainty quantified anomaly detection for 'PyOD'-detectors based on conformal inference."
version = "0.0.2"
description = "Conformal anomaly detection for 'PyOD'-detectors."
authors = [
{ name = "Oliver Hennhoefer", email = "oliver.hennhoefer@mail.de" },
]
@@ -3,6 +3,7 @@
from pyod.models.iforest import IForest

from unquad.enums.adjustment import Adjustment
from unquad.estimator.bootstrap.bootstrap_config import BootstrapConfiguration
from unquad.estimator.conformal import ConformalEstimator
from unquad.enums.method import Method
from unquad.evaluation.metrics import false_discovery_rate, statistical_power
@@ -12,17 +13,17 @@ class TestConformalEstimatorsIonosphere(unittest.TestCase):

df = pd.read_csv("./test_data/fraud.zip", compression="zip")
outliers = df.loc[df.Class == 1]
inliers = df.loc[df.Class == 0]
normal = df.loc[df.Class == 0]

n_inlier = len(inliers)
n_train = n_inlier // 2
n_normal = len(normal)
n_train = n_normal // 2

x_train = inliers.head(n_train)
x_train = normal.head(n_train)
x_train = x_train.drop(["Class"], axis=1)

x_test = pd.concat(
[
inliers.tail((n_inlier - n_train)).sample(frac=0.05, random_state=1),
normal.tail((n_normal - n_train)).sample(frac=0.05, random_state=1),
outliers,
],
axis=0,
@@ -31,9 +32,6 @@ class TestConformalEstimatorsIonosphere(unittest.TestCase):
x_test = x_test.drop(["Class"], axis=1)

def test_split_conformal(self):
"""
Split-Conformal Estimator.
"""

ce = ConformalEstimator(
detector=IForest(behaviour="new"),
@@ -55,9 +53,6 @@ def test_split_conformal(self):
self.assertEqual(power, 0.875)

def test_cv(self):
"""
CV Estimator.
"""

ce = ConformalEstimator(
detector=IForest(behaviour="new"),
@@ -79,18 +74,16 @@ def test_cv(self):
self.assertEqual(power, 0.869)

def test_jackknife_after_bootstrap(self):
"""
Jackknife-after-Bootstrap Estimator.
"""

bc = BootstrapConfiguration(n=1_000, b=30, m=0.975)

ce = ConformalEstimator(
detector=IForest(behaviour="new"),
method=Method.JACKKNIFE_AFTER_BOOTSTRAP,
adjustment=Adjustment.BENJAMINI_HOCHBERG,
alpha=0.1,
random_state=1,
split=30,
bootstrap=0.975,
bootstrap_config=bc,
silent=True,
)

@@ -104,18 +97,16 @@ def test_jackknife_after_bootstrap(self):
self.assertEqual(power, 0.871)

def test_jackknife_plus_after_bootstrap(self):
"""
Jackknife+-after-Bootstrap Estimator.
"""

bc = BootstrapConfiguration(n=1_000, b=30, m=0.975)

ce = ConformalEstimator(
detector=IForest(behaviour="new"),
method=Method.JACKKNIFE_PLUS_AFTER_BOOTSTRAP,
adjustment=Adjustment.BENJAMINI_HOCHBERG,
alpha=0.10,
random_state=1,
split=30,
bootstrap=0.975,
bootstrap_config=bc,
silent=True,
)

@@ -127,3 +118,7 @@ def test_jackknife_plus_after_bootstrap(self):

self.assertEqual(fdr, 0.154)
self.assertEqual(power, 0.846)


if __name__ == "__main__":
unittest.main()
@@ -3,6 +3,7 @@
from pyod.models.iforest import IForest

from unquad.enums.adjustment import Adjustment
from unquad.estimator.bootstrap.bootstrap_config import BootstrapConfiguration
from unquad.estimator.conformal import ConformalEstimator
from unquad.enums.method import Method
from unquad.evaluation.metrics import false_discovery_rate, statistical_power
@@ -25,9 +26,6 @@ class TestConformalEstimatorsIonosphere(unittest.TestCase):
x_test = x_test.drop(["Class"], axis=1)

def test_split_conformal(self):
"""
Split-Conformal Estimator.
"""

ce = ConformalEstimator(
detector=IForest(behaviour="new"),
@@ -48,9 +46,6 @@ def test_split_conformal(self):
self.assertEqual(power, 0.809)

def test_cv(self):
"""
CV Estimator.
"""

ce = ConformalEstimator(
detector=IForest(behaviour="new"),
@@ -72,9 +67,6 @@ def test_cv(self):
self.assertEqual(power, 0.958)

def test_cv_plus(self):
"""
CV+ Estimator.
"""

ce = ConformalEstimator(
detector=IForest(behaviour="new"),
@@ -96,9 +88,6 @@ def test_cv_plus(self):
self.assertEqual(power, 1.0)

def test_jackknife(self):
"""
Jackknife Estimator.
"""

ce = ConformalEstimator(
detector=IForest(behaviour="new"),
@@ -119,9 +108,6 @@ def test_jackknife(self):
self.assertEqual(power, 0.872)

def test_jackknife_plus(self):
"""
Jackknife+ Estimator.
"""

ce = ConformalEstimator(
detector=IForest(behaviour="new"),
@@ -142,18 +128,16 @@ def test_jackknife_plus(self):
self.assertEqual(power, 0.835)

def test_jackknife_after_bootstrap(self):
"""
Jackknife-after-Bootstrap Estimator.
"""

bc = BootstrapConfiguration(n=1_000, b=50, m=0.75)

ce = ConformalEstimator(
detector=IForest(behaviour="new"),
method=Method.JACKKNIFE_AFTER_BOOTSTRAP,
adjustment=Adjustment.BENJAMINI_HOCHBERG,
alpha=0.2,
bootstrap_config=bc,
random_state=1,
split=50,
bootstrap=0.75,
silent=True,
)

@@ -167,18 +151,16 @@ def test_jackknife_after_bootstrap(self):
self.assertEqual(power, 0.947)

def test_jackknife_plus_after_bootstrap(self):
"""
Jackknife+-after-Bootstrap Estimator.
"""

bc = BootstrapConfiguration(n=1_000, b=50, m=0.75)

ce = ConformalEstimator(
detector=IForest(behaviour="new"),
method=Method.JACKKNIFE_PLUS_AFTER_BOOTSTRAP,
adjustment=Adjustment.BENJAMINI_HOCHBERG,
alpha=0.2,
random_state=1,
split=50,
bootstrap=0.75,
bootstrap_config=bc,
silent=True,
)

@@ -190,3 +172,7 @@ def test_jackknife_plus_after_bootstrap(self):

self.assertEqual(fdr, 0.03)
self.assertEqual(power, 0.97)


if __name__ == "__main__":
unittest.main()
26 changes: 26 additions & 0 deletions tests/unit/test_bootstrap_configuration.py
@@ -0,0 +1,26 @@
import unittest

from unquad.estimator.bootstrap.bootstrap_config import BootstrapConfiguration


class TestEvaluationMetrics(unittest.TestCase):

def test_bootstrap_configuration_with_n_b_m(self):
bc_nbm = BootstrapConfiguration(n=1_000, b=20, m=0.5)
self.assertEqual(bc_nbm._c, 10_000)

def test_bootstrap_configuration_with_n_b_c(self):
bc_nbc = BootstrapConfiguration(n=1_000, b=20, c=10_000)
self.assertEqual(bc_nbc._m, 0.5)

def test_bootstrap_configuration_with_n_m_c(self):
bc_nmc = BootstrapConfiguration(n=1_000, m=0.5, c=10_000)
self.assertEqual(bc_nmc.b, 20)

def test_bootstrap_configuration_with_all(self):
with self.assertRaises(ValueError):
BootstrapConfiguration(n=1_000, b=20, m=0.5, c=10_000)


if __name__ == "__main__":
unittest.main()
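
Taken together, these cases indicate that `BootstrapConfiguration` expects exactly three of the four parameters `n`, `b`, `m` and `c` and derives the remaining one. The equivalences below are inferred from the assertions above, not from documented API behaviour:

```python
from unquad.estimator.bootstrap.bootstrap_config import BootstrapConfiguration

# Equivalent configurations according to the unit tests added in this commit:
bc_from_m = BootstrapConfiguration(n=1_000, b=20, m=0.5)      # resolves c = 10_000
bc_from_c = BootstrapConfiguration(n=1_000, b=20, c=10_000)   # resolves m = 0.5
bc_from_b = BootstrapConfiguration(n=1_000, m=0.5, c=10_000)  # resolves b = 20

# Supplying all four parameters is rejected:
# BootstrapConfiguration(n=1_000, b=20, m=0.5, c=10_000)  # raises ValueError
```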