Implements Bootstrap Configuration
Adds extended configurability of Bootstrapping methods.
Updated README.md.
Updated test cases.
OliverHennhoefer committed May 3, 2024
1 parent 991aa8a commit e516d58
Showing 10 changed files with 290 additions and 135 deletions.
81 changes: 67 additions & 14 deletions README.md
@@ -4,12 +4,9 @@ Tired of *alarm fatigue*?

**unquad** enables **conformal anomaly detection** for [*PyOD*](https://pyod.readthedocs.io/en/latest/) detectors.

**unquad** is a wrapper applicable for most [*PyOD*](https://pyod.readthedocs.io/en/latest/) detectors for
**unquad** is a wrapper applicable for most [*PyOD*](https://pyod.readthedocs.io/en/latest/) detectors (see [Supported Estimators](#supported-estimators)) for
**uncertainty-quantified anomaly detection** based on one-class classification and the principles of **conformal inference**.

* Wraps most '[*PyOD*](https://pyod.readthedocs.io/en/latest/)' anomaly detectors (see [Supported Estimators](#supported-estimators)).
* Fits and calibrates given estimator to control the (marginal) **False Discovery Rate** (FDR).

[![License](https://img.shields.io/badge/License-BSD_3--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause)
[![contributions welcome](https://img.shields.io/badge/contributions-welcome-brightgreen.svg?style=flat)](https://github.com/dwyl/esta/issues)
[![HitCount](https://hits.dwyl.com/OliverHennhoefer/unquad.svg?style=flat-square&show=unique)](http://hits.dwyl.com/OliverHennhoefer/unquad)
@@ -26,14 +23,14 @@ given point predictor or classifier, CAD aims to control the [*false discovery r

***CAD translates anomaly scores into statistical p-values by comparing the anomaly scores observed on test data to a retained set of calibration
scores previously obtained on normal data during model training*** (see [*One-Class Classification*](https://en.wikipedia.org/wiki/One-class_classification#:~:text=In%20machine%20learning%2C%20one%2Dclass,of%20one%2Dclass%20classifiers%20where)).
The larger the discrepancy between *normal* scores and observed test scores, the lower the obtained (and statistically valid) p-value.
The larger the discrepancy between *normal* scores and observed test scores, the lower the obtained (**statistically valid**) p-value.
The p-values, instead of the usual anomaly estimates, allow for FDR-control by statistical procedures like *Benjamini-Hochberg*.
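
For intuition, both steps can be sketched in a few lines of plain NumPy. This is a generic illustration of the principle, not unquad's internal implementation, and the helper names (`conformal_p_values`, `benjamini_hochberg`) are made up for the example:

```python
import numpy as np

def conformal_p_values(calib_scores, test_scores):
    """Conformal p-value: share of calibration scores at least as extreme as the test score."""
    calib = np.asarray(calib_scores, dtype=float)
    n = calib.size
    return np.array(
        [(1 + np.sum(calib >= s)) / (n + 1) for s in np.asarray(test_scores, dtype=float)]
    )

def benjamini_hochberg(p_values, alpha=0.1):
    """Return a boolean mask of discoveries under Benjamini-Hochberg at level alpha."""
    p = np.asarray(p_values, dtype=float)
    m = p.size
    order = np.argsort(p)
    below = p[order] <= alpha * np.arange(1, m + 1) / m
    reject = np.zeros(m, dtype=bool)
    if below.any():
        k = np.max(np.nonzero(below)[0]) + 1  # largest i with p_(i) <= alpha * i / m
        reject[order[:k]] = True
    return reject

# Hypothetical anomaly scores (higher = more anomalous): calibration scores from
# normal data, test scores containing a few shifted (anomalous) observations.
rng = np.random.default_rng(1)
calib = rng.normal(size=1_000)
test = np.concatenate([rng.normal(size=95), rng.normal(loc=4.0, size=5)])

discoveries = benjamini_hochberg(conformal_p_values(calib, test), alpha=0.1)
print(discoveries.sum())  # number of flagged test instances
```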

### Assumption
CAD assumes ***exchangeability*** of training and future test data. *Exchangeability* is closely related to the statistical
notion of *independent and identically distributed random variables* (*IID*). IID implies both independence <ins>and</ins>
exchangeability. Exchangeability defines a joint probability distribution that remains the same under permutations
of the variables. With that, exchangability is a very practicable as it is a weaker assumption than IID.
of the variables. With that, exchangeability is a very practicable assumption as it is *weaker* than IID.
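
As a small self-contained illustration (a simulation sketch, not part of unquad): for exchangeable scores, the rank of a test score among the calibration scores is uniformly distributed, which is precisely the property that makes the conformal p-values above valid.

```python
import numpy as np

rng = np.random.default_rng(0)
n_cal, trials = 99, 20_000

ranks = []
for _ in range(trials):
    scores = rng.exponential(size=n_cal + 1)  # exchangeable: all scores share one distribution
    calib, test = scores[:n_cal], scores[-1]
    ranks.append(int(np.sum(calib >= test)))  # rank of the test score among calibration scores

# Each of the n_cal + 1 possible ranks occurs with (roughly) equal frequency 1 / (n_cal + 1).
freq = np.bincount(ranks, minlength=n_cal + 1) / trials
print(round(freq.min(), 4), round(freq.max(), 4), round(1 / (n_cal + 1), 4))
```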

### Limitations
Since CAD controls the FDR by adjustment procedures in the context of **multiple testing**, trained conformal detectors currently
@@ -47,16 +44,16 @@ covariate shift. Currently, this kind of online detector is not implemented. It
pip install unquad
```

### Usage
### Usage: Split-Conformal

```python
from pyod.models.iforest import IForest # Isolation Forest (sklearn-based)
from pyod.utils import generate_data # Example Data (PyOD built-in)
from pyod.models.iforest import IForest
from pyod.utils import generate_data

from unquad.estimator.conformal import ConformalEstimator # Model Wrapper
from unquad.enums.adjustment import Adjustment # Multiple Testing Adjustments
from unquad.enums.method import Method # Conformal Methods
from unquad.evaluation.metrics import false_discovery_rate, statistical_power # Evaluation Metrics
from unquad.estimator.conformal import ConformalEstimator
from unquad.enums.adjustment import Adjustment
from unquad.enums.method import Method
from unquad.evaluation.metrics import false_discovery_rate, statistical_power

x_train, x_test, y_train, y_test = generate_data(
n_train=1_000,
@@ -72,7 +69,7 @@ ce = ConformalEstimator(
detector=IForest(behaviour="new"),
method=Method.CV_PLUS,
adjustment=Adjustment.BENJAMINI_HOCHBERG,
alpha=0.1, # FDR
alpha=0.2, # nominal FDR level
random_state=1,
split=10,
)
@@ -84,6 +81,62 @@ print(false_discovery_rate(y=y_test, y_hat=estimates)) # Empirical FDR
print(statistical_power(y=y_test, y_hat=estimates)) # Empirical Power
```

```bash
Training: 100%|██████████| 10/10 [00:01<00:00, 8.16it/s]
Inference: 100%|██████████| 10/10 [00:00<00:00, 220.63it/s]
```

Output:
```python
0.194 # Empirical FDR
0.806 # Empirical Power
```
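
The two reported numbers follow the usual definitions of empirical FDR and power. A minimal sketch of these metrics (assuming binary labels with `1` marking an anomaly; this mirrors the intent of `unquad.evaluation.metrics`, not necessarily its exact implementation):

```python
import numpy as np

def empirical_fdr(y, y_hat):
    """False discoveries among all flagged instances (0 if nothing is flagged)."""
    y, y_hat = np.asarray(y), np.asarray(y_hat)
    flagged = y_hat == 1
    return float(np.sum(flagged & (y == 0)) / max(int(np.sum(flagged)), 1))

def empirical_power(y, y_hat):
    """Share of true anomalies that were flagged."""
    y, y_hat = np.asarray(y), np.asarray(y_hat)
    return float(np.sum((y_hat == 1) & (y == 1)) / max(int(np.sum(y == 1)), 1))
```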

### Usage: Jackknife+-after-Bootstrap

```python
from pyod.models.iforest import IForest
from pyod.utils import generate_data

from unquad.estimator.conformal import ConformalEstimator
from unquad.enums.adjustment import Adjustment
from unquad.estimator.bootstrap.bootstrap_config import BootstrapConfiguration
from unquad.enums.method import Method
from unquad.evaluation.metrics import false_discovery_rate, statistical_power

x_train, x_test, y_train, y_test = generate_data(
n_train=1_000,
n_test=1_000,
n_features=10,
contamination=0.1,
random_state=1,
)

x_train = x_train[y_train == 0] # Normal Instances (One-Class Classification)

bc = BootstrapConfiguration(n=1_000, b=40, m=0.95)

ce = ConformalEstimator(
detector=IForest(behaviour="new"),
method=Method.JACKKNIFE_PLUS_AFTER_BOOTSTRAP,
adjustment=Adjustment.BENJAMINI_HOCHBERG,
alpha=0.1, # nominal FDR level
bootstrap_config=bc,
random_state=1,
)

ce.fit(x_train) # Model Fit/Calibration
estimates = ce.predict(x_test, raw=False)

print(false_discovery_rate(y=y_test, y_hat=estimates)) # Empirical FDR
print(statistical_power(y=y_test, y_hat=estimates)) # Empirical Power
```

```bash
Training: 100%|██████████| 40/40 [00:04<00:00, 8.13it/s]
Inference: 100%|██████████| 40/40 [00:00<00:00, 231.63it/s]
```

Output:
```python
0.099 # Empirical FDR
```
4 changes: 2 additions & 2 deletions examples/knn.py
@@ -20,11 +20,11 @@

ce = ConformalEstimator(
detector=KNN(),
method=Method.CV,
method=Method.SPLIT_CONFORMAL,
adjustment=Adjustment.BENJAMINI_HOCHBERG,
alpha=0.1,
random_state=2,
split=100,
split=300,
)

ce.fit(x_train)
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -4,8 +4,8 @@ build-backend = "hatchling.build"

[project]
name = "unquad"
version = "0.0.1"
description = "Uncertainty quantified anomaly detection for 'PyOD'-detectors based on conformal inference."
version = "0.0.2"
description = "Conformal anomaly detection for 'PyOD'-detectors."
authors = [
{ name = "Oliver Hennhoefer", email = "oliver.hennhoefer@mail.de" },
]
@@ -3,6 +3,7 @@
from pyod.models.iforest import IForest

from unquad.enums.adjustment import Adjustment
from unquad.estimator.bootstrap.bootstrap_config import BootstrapConfiguration
from unquad.estimator.conformal import ConformalEstimator
from unquad.enums.method import Method
from unquad.evaluation.metrics import false_discovery_rate, statistical_power
@@ -12,17 +13,17 @@ class TestConformalEstimatorsIonosphere(unittest.TestCase):

df = pd.read_csv("./test_data/fraud.zip", compression="zip")
outliers = df.loc[df.Class == 1]
inliers = df.loc[df.Class == 0]
normal = df.loc[df.Class == 0]

n_inlier = len(inliers)
n_train = n_inlier // 2
n_normal = len(normal)
n_train = n_normal // 2

x_train = inliers.head(n_train)
x_train = normal.head(n_train)
x_train = x_train.drop(["Class"], axis=1)

x_test = pd.concat(
[
inliers.tail((n_inlier - n_train)).sample(frac=0.05, random_state=1),
normal.tail((n_normal - n_train)).sample(frac=0.05, random_state=1),
outliers,
],
axis=0,
@@ -31,9 +32,6 @@ class TestConformalEstimatorsIonosphere(unittest.TestCase):
x_test = x_test.drop(["Class"], axis=1)

def test_split_conformal(self):
"""
Split-Conformal Estimator.
"""

ce = ConformalEstimator(
detector=IForest(behaviour="new"),
@@ -55,9 +53,6 @@ def test_split_conformal(self):
self.assertEqual(power, 0.875)

def test_cv(self):
"""
CV Estimator.
"""

ce = ConformalEstimator(
detector=IForest(behaviour="new"),
@@ -79,18 +74,16 @@ def test_cv(self):
self.assertEqual(power, 0.869)

def test_jackknife_after_bootstrap(self):
"""
Jackknife-after-Bootstrap Estimator.
"""

bc = BootstrapConfiguration(n=1_000, b=30, m=0.975)

ce = ConformalEstimator(
detector=IForest(behaviour="new"),
method=Method.JACKKNIFE_AFTER_BOOTSTRAP,
adjustment=Adjustment.BENJAMINI_HOCHBERG,
alpha=0.1,
random_state=1,
split=30,
bootstrap=0.975,
bootstrap_config=bc,
silent=True,
)

@@ -104,18 +97,16 @@ def test_jackknife_after_bootstrap(self):
self.assertEqual(power, 0.871)

def test_jackknife_plus_after_bootstrap(self):
"""
Jackknife+-after-Bootstrap Estimator.
"""

bc = BootstrapConfiguration(n=1_000, b=30, m=0.975)

ce = ConformalEstimator(
detector=IForest(behaviour="new"),
method=Method.JACKKNIFE_PLUS_AFTER_BOOTSTRAP,
adjustment=Adjustment.BENJAMINI_HOCHBERG,
alpha=0.10,
random_state=1,
split=30,
bootstrap=0.975,
bootstrap_config=bc,
silent=True,
)

@@ -127,3 +118,7 @@ def test_jackknife_plus_after_bootstrap(self):

self.assertEqual(fdr, 0.154)
self.assertEqual(power, 0.846)


if __name__ == "__main__":
unittest.main()
@@ -3,6 +3,7 @@
from pyod.models.iforest import IForest

from unquad.enums.adjustment import Adjustment
from unquad.estimator.bootstrap.bootstrap_config import BootstrapConfiguration
from unquad.estimator.conformal import ConformalEstimator
from unquad.enums.method import Method
from unquad.evaluation.metrics import false_discovery_rate, statistical_power
@@ -25,9 +26,6 @@ class TestConformalEstimatorsIonosphere(unittest.TestCase):
x_test = x_test.drop(["Class"], axis=1)

def test_split_conformal(self):
"""
Split-Conformal Estimator.
"""

ce = ConformalEstimator(
detector=IForest(behaviour="new"),
@@ -48,9 +46,6 @@ def test_split_conformal(self):
self.assertEqual(power, 0.809)

def test_cv(self):
"""
CV Estimator.
"""

ce = ConformalEstimator(
detector=IForest(behaviour="new"),
@@ -72,9 +67,6 @@ def test_cv(self):
self.assertEqual(power, 0.958)

def test_cv_plus(self):
"""
CV+ Estimator.
"""

ce = ConformalEstimator(
detector=IForest(behaviour="new"),
@@ -96,9 +88,6 @@ def test_cv_plus(self):
self.assertEqual(power, 1.0)

def test_jackknife(self):
"""
Jackknife Estimator.
"""

ce = ConformalEstimator(
detector=IForest(behaviour="new"),
@@ -119,9 +108,6 @@ def test_jackknife(self):
self.assertEqual(power, 0.872)

def test_jackknife_plus(self):
"""
Jackknife+ Estimator.
"""

ce = ConformalEstimator(
detector=IForest(behaviour="new"),
@@ -142,18 +128,16 @@ def test_jackknife_plus(self):
self.assertEqual(power, 0.835)

def test_jackknife_after_bootstrap(self):
"""
Jackknife-after-Bootstrap Estimator.
"""

bc = BootstrapConfiguration(n=1_000, b=50, m=0.75)

ce = ConformalEstimator(
detector=IForest(behaviour="new"),
method=Method.JACKKNIFE_AFTER_BOOTSTRAP,
adjustment=Adjustment.BENJAMINI_HOCHBERG,
alpha=0.2,
bootstrap_config=bc,
random_state=1,
split=50,
bootstrap=0.75,
silent=True,
)

@@ -167,18 +151,16 @@ def test_jackknife_after_bootstrap(self):
self.assertEqual(power, 0.947)

def test_jackknife_plus_after_bootstrap(self):
"""
Jackknife+-after-Bootstrap Estimator.
"""

bc = BootstrapConfiguration(n=1_000, b=50, m=0.75)

ce = ConformalEstimator(
detector=IForest(behaviour="new"),
method=Method.JACKKNIFE_PLUS_AFTER_BOOTSTRAP,
adjustment=Adjustment.BENJAMINI_HOCHBERG,
alpha=0.2,
random_state=1,
split=50,
bootstrap=0.75,
bootstrap_config=bc,
silent=True,
)

@@ -190,3 +172,7 @@ def test_jackknife_plus_after_bootstrap(self):

self.assertEqual(fdr, 0.03)
self.assertEqual(power, 0.97)


if __name__ == "__main__":
unittest.main()
26 changes: 26 additions & 0 deletions tests/unit/test_bootstrap_configuration.py
@@ -0,0 +1,26 @@
import unittest

from unquad.estimator.bootstrap.bootstrap_config import BootstrapConfiguration


class TestEvaluationMetrics(unittest.TestCase):

def test_bootstrap_configuration_with_n_b_m(self):
bc_nbm = BootstrapConfiguration(n=1_000, b=20, m=0.5)
self.assertEqual(bc_nbm._c, 10_000)

def test_bootstrap_configuration_with_n_b_c(self):
bc_nbc = BootstrapConfiguration(n=1_000, b=20, c=10_000)
self.assertEqual(bc_nbc._m, 0.5)

def test_bootstrap_configuration_with_n_m_c(self):
bc_nmc = BootstrapConfiguration(n=1_000, m=0.5, c=10_000)
self.assertEqual(bc_nmc.b, 20)

def test_bootstrap_configuration_with_all(self):
with self.assertRaises(ValueError):
BootstrapConfiguration(n=1_000, b=20, m=0.5, c=10_000)


if __name__ == "__main__":
unittest.main()
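
Taken together, these cases indicate that `BootstrapConfiguration` expects exactly three of the four parameters `n`, `b`, `m` and `c` and derives the remaining one. The equivalences below are inferred from the assertions above, not from documented API behaviour:

```python
from unquad.estimator.bootstrap.bootstrap_config import BootstrapConfiguration

# Equivalent configurations according to the unit tests added in this commit:
bc_from_m = BootstrapConfiguration(n=1_000, b=20, m=0.5)      # resolves c = 10_000
bc_from_c = BootstrapConfiguration(n=1_000, b=20, c=10_000)   # resolves m = 0.5
bc_from_b = BootstrapConfiguration(n=1_000, m=0.5, c=10_000)  # resolves b = 20

# Supplying all four parameters is rejected:
# BootstrapConfiguration(n=1_000, b=20, m=0.5, c=10_000)  # raises ValueError
```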