Commit f263524

Merge pull request #27 from koaning/v0.1.4

V0.1.4

koaning committed Dec 15, 2021
2 parents 46ba333 + b5414c5 commit f263524

Showing 13 changed files with 98 additions and 58 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -85,9 +85,9 @@ The library implemented many "reasons" for doubt.
- `WrongPredictionReason`: assign doubt when a model cannot predict the listed label
- `ShortConfidenceReason`: assign doubt when the correct label gains too little confidence
- `LongConfidenceReason`: assign doubt when a wrong label gains too much confidence
- `MarginConfidenceReason`: assign doubt when there's a large difference between the top two classes
- `DisagreeReason`: assign doubt when two models disagree on a prediction
- `CleanlabReason`: assign doubt according to [cleanlab](https://github.com/cleanlab/cleanlab)
- `MarginConfidenceReason`: assign doubt when there's a small difference between the top two class confidences
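
Downstream, these reasons are combined in a `DoubtEnsemble`, which scores and sorts the dataset by accumulated doubt. A minimal sketch, assuming the `DoubtEnsemble(**reasons)` / `get_predicates(X, y)` API shown in this repo's quickstart:

```python
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

from doubtlab.ensemble import DoubtEnsemble
from doubtlab.reason import ProbaReason, WrongPredictionReason

# Train any scikit-learn classifier on the dataset you want to audit.
X, y = load_iris(return_X_y=True)
model = LogisticRegression(max_iter=1_000).fit(X, y)

# Each keyword becomes a named "reason" column in the output.
ensemble = DoubtEnsemble(
    proba=ProbaReason(model=model),
    wrong_pred=WrongPredictionReason(model=model),
)

# DataFrame of per-example predicates, sorted so the most
# doubt-worthy rows come first.
predicates = ensemble.get_predicates(X, y)
```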

### Regression Reasons

2 changes: 1 addition & 1 deletion docs/quickstart/index.md
@@ -162,7 +162,7 @@ of reasons that this library supports.
- `WrongPredictionReason`: assign doubt when a model cannot predict the listed label
- `ShortConfidenceReason`: assign doubt when the correct label gains too little confidence
- `LongConfidenceReason`: assign doubt when a wrong label gains too much confidence
- `MarginConfidenceReason`: assign doubt when there's a large difference between the top two classes
- `MarginConfidenceReason`: assign doubt when there's a small difference between the top two class confidences
- `DisagreeReason`: assign doubt when two models disagree on a prediction
- `CleanlabReason`: assign doubt according to [cleanlab](https://github.com/cleanlab/cleanlab)
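
Each reason also exposes a static helper, renamed to `from_proba`/`from_pred` in this changeset, so doubt can be computed from precomputed predictions without re-running a model. A small sketch of the corrected `MarginConfidenceReason` behaviour, mirroring the docstring example in `doubtlab/reason.py`:

```python
import numpy as np
from doubtlab.reason import MarginConfidenceReason

# Probabilities computed elsewhere, e.g. cached from an expensive model.
probas = np.array([[0.9, 0.1, 0.0], [0.5, 0.4, 0.1]])

# Doubt fires when the gap between the top two confidences is *small*:
# row 1: 0.9 - 0.1 = 0.8 (no doubt), row 2: 0.5 - 0.4 = 0.1 < 0.3 (doubt).
predicate = MarginConfidenceReason.from_proba(probas, threshold=0.3)
assert np.all(predicate == np.array([0.0, 1.0]))
```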

92 changes: 54 additions & 38 deletions doubtlab/reason.py
@@ -93,12 +93,13 @@ def __call__(self, X, y=None):
class ShannonEntropyReason:
"""
Assign doubt when the normalized Shannon entropy is too high, see
https://math.stackexchange.com/questions/395121/how-entropy-scales-with-sample-size
[here](https://math.stackexchange.com/questions/395121/how-entropy-scales-with-sample-size)
for a discussion.
Arguments:
model: scikit-learn classifier
threshold: confidence threshold for doubt assignment
smoothing: constant value added to probas to prevent taking the log of zero
Usage:
@@ -119,21 +119,36 @@ class ShannonEntropyReason:
```
"""

def __init__(self, model, threshold=0.5):
def __init__(self, model, threshold=0.5, smoothing=1e-5):
self.model = model
self.threshold = threshold
self.smoothing = smoothing

def __call__(self, X, y):
probas = self.model.predict_proba(X)
log_probas = self.model.predict_log_proba(X) / np.log(len(self.model.classes_))
entropies = -(probas * log_probas).sum(axis=1)
return np.where(entropies > self.threshold, entropies, 0)
return self.from_proba(
probas, threshold=self.threshold, smoothing=self.smoothing
)

@staticmethod
def from_proba(proba, n_classes, threshold=0.5):
"""Outputs a reason array from a prediction array, skipping the need for a model."""
entropies = -(proba * np.log(proba) / np.log(n_classes)).sum(axis=1)
return np.where(entropies > threshold, entropies, 0)
def from_proba(proba, threshold=0.5, smoothing=1e-5):
"""
Outputs a reason array from a prediction array, skipping the need for a model.
Usage:
```python
import numpy as np
from doubtlab.reason import ShannonEntropyReason
probas = np.array([[0.9, 0.1, 0.0], [0.5, 0.4, 0.1]])
predicate = ShannonEntropyReason.from_proba(probas, threshold=0.8)
assert np.all(predicate == np.array([0.0, 1.0]))
```
"""
probas = proba + smoothing
entropies = -(probas * np.log(probas) / np.log(probas.shape[1])).sum(axis=1)
return (entropies > threshold).astype(np.float16)
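
As a standalone check of the math above: dividing the Shannon entropy by `log(n_classes)` scales it to `[0, 1]`, so a uniform distribution scores ~1.0 for any number of classes, while the smoothing constant keeps `0 * log(0)` from producing `nan`. A minimal sketch mirroring `from_proba`:

```python
import numpy as np

def normalized_entropy(proba, smoothing=1e-5):
    """Shannon entropy scaled to [0, 1] by dividing by log(n_classes)."""
    p = proba + smoothing  # avoids log(0) -> -inf and 0 * -inf -> nan
    return -(p * np.log(p) / np.log(p.shape[1])).sum(axis=1)

probas = np.array([[1.0, 0.0, 0.0],    # fully confident
                   [1/3, 1/3, 1/3]])   # maximally uncertain
print(normalized_entropy(probas))      # approx [0.0002, 1.0]
```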


class WrongPredictionReason:
@@ -170,7 +186,7 @@ def __call__(self, X, y):
return self.from_predict(preds, y)

@staticmethod
def from_predict(preds, y):
def from_predict(pred, y):
"""
Outputs a reason array from a prediction array, skipping the need for a model.
@@ -186,7 +202,7 @@ def from_predict(preds, y):
assert np.all(predicate == np.array([0.0, 1.0]))
```
"""
return (preds != y).astype(np.float16)
return (pred != y).astype(np.float16)


class LongConfidenceReason:
@@ -221,7 +237,7 @@ def __init__(self, model, threshold=0.2):
self.threshold = threshold

@staticmethod
def from_probas(probas, y, classes, threshold):
def from_proba(proba, y, classes, threshold):
"""
Outputs a reason array from a proba array, skipping the need for a model.
@@ -231,24 +247,24 @@ def from_probas(probas, y, classes, threshold):
import numpy as np
from doubtlab.reason import LongConfidenceReason
probas = np.array([[0.9, 0.1], [0.5, 0.5]])
y = np.array([1, 0])
probas = np.array([[0.9, 0.1], [0.5, 0.5], [0.2, 0.8]])
y = np.array([0, 1, 0])
classes = np.array([0, 1])
threshold = 0.4
predicate = LongConfidenceReason.from_probas(preds, y, classes, threshold)
assert np.all(predicate == np.array([0.0, 1.0]))
predicate = LongConfidenceReason.from_proba(probas, y, classes, threshold)
assert np.all(predicate == np.array([0.0, 1.0, 1.0]))
```
"""
values = []
for i, proba in enumerate(probas):
for i, proba in enumerate(proba):
proba_dict = {classes[j]: v for j, v in enumerate(proba) if j != y[i]}
values.append(max(proba_dict.values()))
confidences = np.array(values)
return (confidences > threshold).astype(np.float16)

def __call__(self, X, y):
probas = self.model.predict_proba(X)
return self.from_probas(probas, y, self.model.classes_, self.threshold)
return self.from_proba(probas, y, self.model.classes_, self.threshold)
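
A design note on the loop above: per row, it takes the highest confidence assigned to any *wrong* label. A vectorized equivalent (an illustration, not the library's implementation) masks the true-label column instead:

```python
import numpy as np

probas = np.array([[0.9, 0.1], [0.5, 0.5], [0.2, 0.8]])
y = np.array([0, 1, 0])
classes = np.array([0, 1])  # assumed sorted, as in sklearn's classes_

# Column index of each row's true label, then mask it out so the
# row-wise max only sees the wrong labels.
col_of_y = np.searchsorted(classes, y)
masked = probas.copy()
masked[np.arange(len(y)), col_of_y] = -np.inf
confidences = masked.max(axis=1)
assert np.all((confidences > 0.4) == np.array([False, True, True]))
```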


class MarginConfidenceReason:
@@ -285,7 +301,7 @@ def __init__(self, model, threshold=0.2):
self.threshold = threshold

@staticmethod
def from_probas(probas, threshold=0.2):
def from_proba(proba, threshold=0.2):
"""
Outputs a reason array from a proba array, skipping the need for a model.
@@ -296,17 +312,17 @@ def from_probas(probas, threshold=0.2):
from doubtlab.reason import MarginConfidenceReason
probas = np.array([[0.9, 0.1, 0.0], [0.5, 0.4, 0.1]])
predicate = MarginConfidenceReason.from_probas(probas, threshold=0.3)
predicate = MarginConfidenceReason.from_proba(probas, threshold=0.3)
assert np.all(predicate == np.array([0.0, 1.0]))
```
"""
sorted = np.sort(probas, axis=1)
sorted = np.sort(proba, axis=1)
margin = sorted[:, -1] - sorted[:, -2]
return (margin < threshold).astype(np.float16)

def __call__(self, X, y):
probas = self.model.predict_proba(X)
return self.from_probas(probas, self.threshold)
return self.from_proba(probas, self.threshold)


class ShortConfidenceReason:
@@ -341,7 +357,7 @@ def __init__(self, model, threshold=0.2):
self.threshold = threshold

@staticmethod
def from_probas(probas, y, classes, threshold=0.2):
def from_proba(proba, y, classes, threshold=0.2):
"""
Outputs a reason array from a proba array, skipping the need for a model.
@@ -351,23 +367,24 @@ def from_probas(probas, y, classes, threshold=0.2):
import numpy as np
from doubtlab.reason import ShortConfidenceReason
probas = np.array([[0.9, 0.1], [0.5, 0.5]])
y = np.array([0, 1])
probas = np.array([[0.9, 0.1], [0.5, 0.5], [0.3, 0.7]])
y = np.array([0, 1, 0])
classes = np.array([0, 1])
threshold = 0.6
predicate = ShortConfidenceReason.from_probas(probas, y, classes, threshold)
assert np.all(predicate == np.array([0.0, 1.0]))
threshold = 0.4
predicate = ShortConfidenceReason.from_proba(probas, y, classes, threshold)
assert np.all(predicate == np.array([0.0, 0.0, 1.0]))
```
"""
values = []
for i, p in enumerate(probas):
for i, p in enumerate(proba):
proba_dict = {classes[j]: v for j, v in enumerate(p)}
values.append(proba_dict[y[i]])
confidences = np.array(values)
return (confidences < threshold).astype(np.float16)

def __call__(self, X, y):
probas = self.model.predict_proba(X)
return self.from_probas(probas, y, self.model.classes_, self.threshold)
return self.from_proba(probas, y, self.model.classes_, self.threshold)


class DisagreeReason:
@@ -405,13 +422,14 @@ def __init__(self, model1, model2):
self.model2 = model2

@staticmethod
def from_pred(preds1, preds2):
def from_pred(pred1, pred2):
"""
Outputs a reason array from two pred arrays, skipping the need for a model.
Usage:
```python
import numpy as np
from doubtlab.reason import DisagreeReason
pred1 = [0, 1, 2]
@@ -420,7 +438,7 @@ def from_pred(preds1, preds2):
assert np.all(predicate == np.array([0.0, 0.0, 1.0]))
```
"""
return (np.array(preds1) != np.array(preds2)).astype(np.float16)
return (np.array(pred1) != np.array(pred2)).astype(np.float16)

def __call__(self, X, y):
pred1 = self.model1.predict(X)
@@ -568,7 +586,7 @@ def __init__(self, model, sorted_index_method="normalized_margin", min_doubt=0.5
self.min_doubt = min_doubt

@staticmethod
def from_probas(probas, y, min_doubt=0.5, sorted_index_method="normalized_margin"):
def from_proba(proba, y, min_doubt=0.5, sorted_index_method="normalized_margin"):
"""
Outputs a reason array from a proba array, skipping the need for a model.
@@ -580,12 +598,10 @@ def from_probas(probas, y, min_doubt=0.5, sorted_index_method="normalized_margin
probas = np.array([[0.9, 0.1], [0.5, 0.5]])
y = np.array([0, 1])
classes = np.array([0, 1])
threshold = 0.4
predicate = CleanlabReason.from_probas(probas, y, classes, threshold)
predicate = CleanlabReason.from_proba(probas, y)
```
"""
ordered_label_errors = get_noise_indices(y, probas, sorted_index_method)
ordered_label_errors = get_noise_indices(y, proba, sorted_index_method)
result = np.zeros_like(y)
conf_arr = np.linspace(1, min_doubt, result.shape[0])
for idx, _ in zip(ordered_label_errors, conf_arr):
@@ -594,4 +610,4 @@

def __call__(self, X, y):
probas = self.model.predict_proba(X)
return self.from_probas(probas, y, self.min_doubt, self.sorted_index_method)
return self.from_proba(probas, y, self.min_doubt, self.sorted_index_method)
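
One note on `CleanlabReason.from_proba`: cleanlab's `get_noise_indices` returns suspected label errors ordered most-suspicious-first, and `np.linspace(1, min_doubt, n)` turns that ordering into a linearly decaying doubt score. The diff truncates the loop body, so the assignment in this sketch is a plausible reading rather than the verbatim implementation:

```python
import numpy as np

# Hypothetical cleanlab output: row indices of suspected label
# errors, most suspicious first.
ordered_label_errors = [4, 1, 7]

n, min_doubt = 10, 0.5
result = np.zeros(n)
conf_arr = np.linspace(1, min_doubt, n)  # 1.0, 0.944, ..., 0.5

# The i-th most suspicious row receives the i-th highest doubt score.
for idx, conf in zip(ordered_label_errors, conf_arr):
    result[idx] = conf

print(result)  # nonzero doubt only at rows 4, 1 and 7
```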
2 changes: 1 addition & 1 deletion setup.py
@@ -31,7 +31,7 @@

setup(
name="doubtlab",
version="0.1.3",
version="0.1.4",
author="Vincent D. Warmerdam",
packages=find_packages(exclude=["notebooks", "docs"]),
description="Don't Blindly Trust Your Labels",
23 changes: 17 additions & 6 deletions tests/test_docs.py
@@ -1,5 +1,5 @@
import pytest
from mktestdocs import check_docstring, check_md_file
from mktestdocs import check_docstring, check_md_file, get_codeblock_members

from doubtlab.reason import (
ProbaReason,
@@ -13,10 +13,11 @@
AbsoluteDifferenceReason,
RelativeDifferenceReason,
CleanlabReason,
ShannonEntropyReason,
)
from doubtlab.ensemble import DoubtEnsemble

all_reasons = [
all_objects = [
ProbaReason,
RandomReason,
OutlierReason,
@@ -28,14 +29,24 @@
AbsoluteDifferenceReason,
RelativeDifferenceReason,
CleanlabReason,
ShannonEntropyReason,
DoubtEnsemble,
]


@pytest.mark.parametrize(
"func", all_reasons + [DoubtEnsemble], ids=lambda d: d.__name__
)
def flatten(items):
"""Flattens a list"""
return [item for sublist in items for item in sublist]


# This way we ensure that each item in `all_members` points to a method
# that could have a docstring.
all_members = flatten([get_codeblock_members(o) for o in all_objects])


@pytest.mark.parametrize("func", all_members, ids=lambda d: d.__qualname__)
def test_function_docstrings(func):
"""Test the docstring code of some functions."""
"""Test the python example in each method in each object."""
check_docstring(obj=func)
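
`get_codeblock_members` collects every method whose docstring carries a fenced example, so each `Usage:` block above runs as its own test. The `check_md_file` import suggests the hidden remainder of this file also exercises the markdown docs; a hedged sketch of what such a test could look like (the file list here is an assumption):

```python
import pathlib
import pytest
from mktestdocs import check_md_file

# Assumed set of markdown files to check; the real list may differ.
paths = ["README.md"] + sorted(str(p) for p in pathlib.Path("docs").glob("**/*.md"))

@pytest.mark.parametrize("fpath", paths, ids=str)
def test_md_files(fpath):
    """Run every python code block found in the markdown file."""
    check_md_file(fpath=fpath)
```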


4 changes: 1 addition & 3 deletions tests/test_reason/test_cleanlab.py
@@ -6,7 +6,5 @@ def test_longconf_proba():
"""Test from_probas on a obvious example."""
probas = np.array([[0.9, 0.1], [0.5, 0.5]])
y = np.array([0, 1])
classes = np.array([0, 1])
threshold = 0.4
predicate = CleanlabReason.from_probas(probas, y, classes, threshold)
predicate = CleanlabReason.from_proba(proba=probas, y=y)
assert predicate.dtype == np.float16
2 changes: 1 addition & 1 deletion tests/test_reason/test_disagree.py
@@ -8,5 +8,5 @@ def test_short_conf_probas():
"""
pred1 = [0, 1, 2]
pred2 = [0, 1, 1]
predicate = DisagreeReason.from_pred(pred1, pred2)
predicate = DisagreeReason.from_pred(pred1=pred1, pred2=pred2)
assert np.all(predicate == np.array([0.0, 0.0, 1.0]))
11 changes: 11 additions & 0 deletions tests/test_reason/test_entropy.py
@@ -0,0 +1,11 @@
import numpy as np
from doubtlab.reason import ShannonEntropyReason


def test_entropy_from_proba():
"""
Test `from_proba` on an obvious example.
"""
probas = np.array([[0.9, 0.1, 0.0], [0.5, 0.4, 0.1]])
predicate = ShannonEntropyReason.from_proba(probas, threshold=0.8)
assert np.all(predicate == np.array([0.0, 1.0]))
4 changes: 3 additions & 1 deletion tests/test_reason/test_longconfreason.py
@@ -8,5 +8,7 @@ def test_longconf_proba():
y = np.array([0, 1])
classes = np.array([0, 1])
threshold = 0.4
predicate = LongConfidenceReason.from_probas(probas, y, classes, threshold)
predicate = LongConfidenceReason.from_proba(
proba=probas, y=y, classes=classes, threshold=threshold
)
assert np.all(predicate == np.array([0.0, 1.0]))
4 changes: 2 additions & 2 deletions tests/test_reason/test_margin.py
@@ -12,12 +12,12 @@ def test_margin_confidence_margin():
model.fit(X, y)

probas = np.eye(3)
reason = MarginConfidenceReason.from_probas(probas)
reason = MarginConfidenceReason.from_proba(proba=probas)
assert all([r == 0.0 for r in reason])


def test_margin_simple_example():
"""Test on a obvious example."""
probas = np.array([[0.9, 0.1, 0.0], [0.5, 0.4, 0.1]])
predicate = MarginConfidenceReason.from_probas(probas, threshold=0.3)
predicate = MarginConfidenceReason.from_proba(proba=probas, threshold=0.3)
assert np.all(predicate == np.array([0.0, 1.0]))
4 changes: 2 additions & 2 deletions tests/test_reason/test_probareason.py
@@ -5,12 +5,12 @@
def test_from_proba():
"""Ensure internal `from_proba` method handles obvious example"""
probas = np.array([[0.9, 0.1], [0.5, 0.5]])
predicate = ProbaReason.from_proba(probas, max_proba=0.5)
predicate = ProbaReason.from_proba(proba=probas, max_proba=0.5)
assert np.all(predicate == np.array([0.0, 1.0]))


def test_from_proba_max_proba():
"""Ensure internal `from_proba` method handles another obvious example"""
probas = np.array([[0.9, 0.1], [0.5, 0.5]])
predicate = ProbaReason.from_proba(probas, max_proba=0.3)
predicate = ProbaReason.from_proba(proba=probas, max_proba=0.3)
assert np.all(predicate == np.array([0.0, 0.0]))
4 changes: 3 additions & 1 deletion tests/test_reason/test_shortconfreason.py
@@ -10,5 +10,7 @@ def test_short_conf_probas():
y = np.array([0, 1])
classes = np.array([0, 1])
threshold = 0.6
predicate = ShortConfidenceReason.from_probas(probas, y, classes, threshold)
predicate = ShortConfidenceReason.from_proba(
proba=probas, y=y, classes=classes, threshold=threshold
)
assert np.all(predicate == np.array([0.0, 1.0]))
