Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[add:lib] Add support for dependent feature warping #135

Merged
merged 5 commits into from
Jan 6, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,12 +58,12 @@ The following algorithms provided within Sequentia support the use of multivaria

### Classification algorithms

- [x] Hidden Markov Models (via [`hmmlearn`](https://github.com/hmmlearn/hmmlearn))<br/><em>Learning with the Baum-Welch algorithm [[1]](#references)</em>
- [x] Hidden Markov Models (via [`hmmlearn`](https://github.com/hmmlearn/hmmlearn))<br/><em>Learning with the Baum-Welch algorithm</em> [[1]](#references)
- [x] Gaussian Mixture Model emissions
- [x] Linear, left-right and ergodic topologies
- [x] Dynamic Time Warping k-Nearest Neighbors (via [`dtaidistance`](https://github.com/wannesm/dtaidistance))
- [x] Sakoe–Chiba band global warping constraint
- [x] Feature-independent warping (DTWI)
- [x] Dependent and independent feature warping (DTWD & DTWI)
- [x] Custom distance-weighted predictions
- [x] Multi-processed predictions

Expand Down
24 changes: 19 additions & 5 deletions lib/sequentia/classifiers/knn/knn_classifier.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import warnings, tqdm, tqdm.auto, numpy as np, types, pickle, marshal
from joblib import Parallel, delayed
from multiprocessing import cpu_count
from dtaidistance import dtw
from dtaidistance import dtw, dtw_ndim
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from ...internals import _Validator
Expand Down Expand Up @@ -60,6 +60,9 @@ class KNNClassifier:

pip install -vvv --upgrade --no-cache-dir --force-reinstall dtaidistance

independent: bool
Whether or not to allow features to be warped independently from each other. See `here <https://www.cs.ucr.edu/~eamonn/Multi-Dimensional_DTW_Journal.pdf>`_ for a good overview of both approaches.

random_state: numpy.random.RandomState, int, optional
A random state object or seed for reproducible randomness.

Expand All @@ -84,7 +87,7 @@ class KNNClassifier:
The complete set of possible classes/labels.
"""

def __init__(self, k, classes, weighting='uniform', window=1., use_c=False, random_state=None):
def __init__(self, k, classes, weighting='uniform', window=1., use_c=False, independent=False, random_state=None):
self._val = _Validator()
self._k = self._val.restricted_integer(
k, lambda x: x > 0, desc='number of neighbors', expected='greater than zero')
Expand Down Expand Up @@ -116,6 +119,9 @@ def __init__(self, k, classes, weighting='uniform', window=1., use_c=False, rand
warnings.warn('DTAIDistance C library not available – using Python implementation', ImportWarning)
self._use_c = False

self._independent = self._val.boolean(independent, 'independent')
self._dtw = self._dtwi if independent else self._dtwd

def fit(self, X, y):
"""Fits the classifier by adding labeled training observation sequences.

Expand Down Expand Up @@ -238,6 +244,7 @@ def save(self, path):
'weighting': marshal.dumps((self._weighting.__code__, self._weighting.__name__)),
'window': self._window,
'use_c': self._use_c,
'independent': self._independent,
'random_state': self._random_state,
'X': self._X,
'y': self._y,
Expand All @@ -262,7 +269,7 @@ def load(cls, path):
data = pickle.load(file)

# Check deserialized object dictionary and keys
keys = set(('k', 'classes', 'weighting', 'window', 'use_c', 'random_state', 'X', 'y', 'n_features'))
keys = set(('k', 'classes', 'weighting', 'window', 'use_c', 'independent', 'random_state', 'X', 'y', 'n_features'))
if not isinstance(data, dict):
raise TypeError('Expected deserialized object to be a dictionary - make sure the object was serialized with the save() function')
else:
Expand All @@ -280,6 +287,7 @@ def load(cls, path):
weighting=weighting,
window=data['window'],
use_c=data['use_c'],
independent=data['independent'],
random_state=data['random_state']
)

Expand All @@ -293,11 +301,16 @@ def _dtw_1d(self, a, b, window): # Requires fit
"""Computes the DTW distance between two univariate sequences."""
return dtw.distance(a, b, use_c=self._use_c, window=window)

def _dtw(self, A, B): # Requires fit
"""Computes the multivariate DTW distance as the sum of the pairwise per-feature DTW distances."""
def _dtwi(self, A, B): # Requires fit
    """Independent-feature multivariate DTW distance (DTWI).

    Each feature column is warped on its own: the total distance is the
    sum of the per-feature univariate DTW distances between A and B.

    Parameters
    ----------
    A, B: numpy.ndarray (T, n_features)
        Observation sequences to compare.

    Returns
    -------
    distance: float
        Sum of the per-feature DTW distances.
    """
    # Sakoe-Chiba band width scales with the longer sequence (at least 1).
    band = max(1, int(self._window * max(len(A), len(B))))
    per_feature = [self._dtw_1d(A[:, f], B[:, f], window=band) for f in range(self._n_features)]
    return np.sum(per_feature)

def _dtwd(self, A, B): # Requires fit
    """Dependent-feature multivariate DTW distance (DTWD).

    A single warping path is computed over all features at once, using a
    multivariate local distance measure, so the features are warped
    dependently rather than one at a time.

    Parameters
    ----------
    A, B: numpy.ndarray (T, n_features)
        Observation sequences to compare.

    Returns
    -------
    distance: float
        Multivariate DTW distance between A and B.
    """
    # Sakoe-Chiba band width scales with the longer sequence (at least 1).
    longest = max(len(A), len(B))
    band = max(1, int(self._window * longest))
    return dtw_ndim.distance(A, B, use_c=self._use_c, window=band)

def _argmax(self, a):
"""Same as numpy.argmax but returns all occurrences of the maximum, and is O(n) instead of O(2n).
From: https://stackoverflow.com/a/58652335
Expand Down Expand Up @@ -394,6 +407,7 @@ def __repr__(self):
('k', repr(self._k)),
('window', repr(self._window)),
('use_c', repr(self._use_c)),
('independent', repr(self._independent)),
('classes', repr(list(self._encoder.classes_)))
]
try:
Expand Down
45 changes: 36 additions & 9 deletions lib/test/lib/classifiers/knn/test_knn_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@
'k=1': KNNClassifier(k=1, classes=classes, random_state=rng),
'k=2': KNNClassifier(k=2, classes=classes, random_state=rng),
'k=3': KNNClassifier(k=3, classes=classes, random_state=rng),
'weighted': KNNClassifier(k=3, classes=classes, weighting=(lambda x: np.exp(-x)), random_state=rng)
'weighted': KNNClassifier(k=3, classes=classes, weighting=(lambda x: np.exp(-x)), random_state=rng),
'independent': KNNClassifier(k=1, classes=classes, independent=True, random_state=rng)
}

for _, clf in clfs.items():
Expand Down Expand Up @@ -96,6 +97,18 @@ def test_predict_single_weighted_no_verbose(capsys):
assert 'Calculating distances' not in capsys.readouterr().err
assert prediction == 'c1'

def test_predict_single_independent_verbose(capsys):
    """Verbosely predict a single observation sequence with independent warping"""
    # verbose=True should emit the distance-calculation progress bar on stderr.
    prediction = clfs['independent'].predict(x, verbose=True)
    captured = capsys.readouterr()
    assert 'Calculating distances' in captured.err
    assert prediction == 'c1'

def test_predict_single_independent_no_verbose(capsys):
    """Silently predict a single observation sequence with independent warping"""
    # Renamed from test_predict_single_k1_no_verbose: the old name tested the
    # 'independent' classifier (see docstring/body) and duplicated the k=1
    # no-verbose test's name, so pytest silently shadowed one of the two.
    prediction = clfs['independent'].predict(x, verbose=False)
    # verbose=False must suppress the distance-calculation progress bar.
    assert 'Calculating distances' not in capsys.readouterr().err
    assert prediction == 'c1'

def test_predict_multiple_k1_verbose(capsys):
"""Verbosely predict multiple observation sequences (k=1)"""
predictions = clfs['k=1'].predict(X, verbose=True)
Expand Down Expand Up @@ -124,25 +137,37 @@ def test_predict_multiple_k3_verbose(capsys):
"""Verbosely predict multiple observation sequences (k=3)"""
predictions = clfs['k=3'].predict(X, verbose=True)
assert 'Classifying examples' in capsys.readouterr().err
assert list(predictions) == ['c1', 'c1', 'c1', 'c0', 'c0', 'c0']
assert list(predictions) == ['c1', 'c1', 'c1', 'c1', 'c0', 'c1']

def test_predict_multiple_k3_no_verbose(capsys):
"""Silently predict multiple observation sequences (k=3)"""
predictions = clfs['k=3'].predict(X, verbose=False)
assert 'Classifying examples' not in capsys.readouterr().err
assert list(predictions) == ['c1', 'c1', 'c1', 'c0', 'c0', 'c0']
assert list(predictions) == ['c1', 'c1', 'c1', 'c1', 'c0', 'c1']

def test_predict_multiple_weighted_verbose(capsys):
"""Verbosely predict multiple observation sequences (weighted)"""
predictions = clfs['weighted'].predict(X, verbose=True)
assert 'Classifying examples' in capsys.readouterr().err
assert list(predictions) == ['c1', 'c1', 'c0', 'c0', 'c0', 'c1']
assert list(predictions) == ['c1', 'c1', 'c0', 'c1', 'c0', 'c1']

def test_predict_multiple_weighted_no_verbose(capsys):
"""Silently predict multiple observation sequences (weighted)"""
predictions = clfs['weighted'].predict(X, verbose=False)
assert 'Classifying examples' not in capsys.readouterr().err
assert list(predictions) == ['c1', 'c1', 'c0', 'c0', 'c0', 'c1']
assert list(predictions) == ['c1', 'c1', 'c0', 'c1', 'c0', 'c1']

def test_predict_multiple_independent_verbose(capsys):
    """Verbosely predict multiple observation sequences with independent warping"""
    # verbose=True should emit the classification progress bar on stderr.
    predictions = clfs['independent'].predict(X, verbose=True)
    captured = capsys.readouterr()
    assert 'Classifying examples' in captured.err
    expected = ['c1', 'c1', 'c0', 'c1', 'c1', 'c0']
    assert list(predictions) == expected

def test_predict_multiple_independent_no_verbose(capsys):
    """Silently predict multiple observation sequences with independent warping"""
    predictions = clfs['independent'].predict(X, verbose=False)
    # verbose=False must suppress the classification progress bar.
    captured = capsys.readouterr()
    assert 'Classifying examples' not in captured.err
    expected = ['c1', 'c1', 'c0', 'c1', 'c1', 'c0']
    assert list(predictions) == expected

def test_predict_single():
"""Predict a single observation sequence and don't return the original labels"""
Expand All @@ -157,12 +182,12 @@ def test_predict_single_original_labels():
def test_predict_multiple():
"""Predict multiple observation sequences and don't return the original labels"""
predictions = clfs['k=3'].predict(X, verbose=False, original_labels=False)
assert list(predictions) == [1, 1, 1, 0, 0, 0]
assert list(predictions) == [1, 1, 1, 1, 0, 1]

def test_predict_multiple_original_labels():
"""Predict multiple observation sequences and return the original labels"""
predictions = clfs['k=3'].predict(X, verbose=False, original_labels=True)
assert list(predictions) == ['c1', 'c1', 'c1', 'c0', 'c0', 'c0']
assert list(predictions) == ['c1', 'c1', 'c1', 'c1', 'c0', 'c1']

# ======================== #
# KNNClassifier.evaluate() #
Expand All @@ -173,8 +198,8 @@ def test_evaluate():
acc, cm = clfs['k=3'].evaluate(X, y)
assert acc == 0.5
assert_equal(cm, np.array([
[1, 1, 0, 0, 0],
[2, 2, 0, 0, 0],
[0, 2, 0, 0, 0],
[1, 3, 0, 0, 0],
[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0]
Expand Down Expand Up @@ -249,6 +274,7 @@ def test_load_valid_no_weighting():
assert list(clf._encoder.classes_) == classes
assert clf._window == 1.
assert clf._use_c == False
assert clf._independent == False
assert deepcopy(clf._random_state).normal() == deepcopy(rng).normal()
assert_all_equal(clf._X, X)
assert_equal(clf._y, clf._encoder.transform(y))
Expand All @@ -271,6 +297,7 @@ def test_load_valid_weighting():
assert list(clf._encoder.classes_) == classes
assert clf._window == 1.
assert clf._use_c == False
assert clf._independent == False
assert deepcopy(clf._random_state).normal() == deepcopy(rng).normal()
assert_all_equal(clf._X, X)
assert_equal(clf._y, clf._encoder.transform(y))
Expand Down
Loading