From 7436067f4a198e8cbbd6cc01643930c133f319bc Mon Sep 17 00:00:00 2001 From: Edwin Onuonga Date: Thu, 7 Jan 2021 00:54:50 +0400 Subject: [PATCH 1/3] Add option for DTWD --- .../classifiers/knn/knn_classifier.py | 24 +++++-- .../Pen-Tip Trajectories (Example).ipynb | 72 +++++++++---------- 2 files changed, 55 insertions(+), 41 deletions(-) diff --git a/lib/sequentia/classifiers/knn/knn_classifier.py b/lib/sequentia/classifiers/knn/knn_classifier.py index cac511c0..cc4cf934 100644 --- a/lib/sequentia/classifiers/knn/knn_classifier.py +++ b/lib/sequentia/classifiers/knn/knn_classifier.py @@ -1,7 +1,7 @@ import warnings, tqdm, tqdm.auto, numpy as np, types, pickle, marshal from joblib import Parallel, delayed from multiprocessing import cpu_count -from dtaidistance import dtw +from dtaidistance import dtw, dtw_ndim from sklearn.metrics import confusion_matrix from sklearn.preprocessing import LabelEncoder from ...internals import _Validator @@ -60,6 +60,9 @@ class KNNClassifier: pip install -vvv --upgrade --no-cache-dir --force-reinstall dtaidistance + independent: bool + Whether or not to allow features to be warped independently from each other. See `here <https://www.cs.ucr.edu/~eamonn/Multi-Dimensional_DTW_Journal.pdf>`_ for a good overview of both approaches. + random_state: numpy.random.RandomState, int, optional A random state object or seed for reproducible randomness. @@ -84,7 +87,7 @@ class KNNClassifier: The complete set of possible classes/labels. 
""" - def __init__(self, k, classes, weighting='uniform', window=1., use_c=False, random_state=None): + def __init__(self, k, classes, weighting='uniform', window=1., use_c=False, independent=False, random_state=None): self._val = _Validator() self._k = self._val.restricted_integer( k, lambda x: x > 0, desc='number of neighbors', expected='greater than zero') @@ -116,6 +119,9 @@ def __init__(self, k, classes, weighting='uniform', window=1., use_c=False, rand warnings.warn('DTAIDistance C library not available – using Python implementation', ImportWarning) self._use_c = False + self._independent = self._val.boolean(independent, 'independent') + self._dtw = self._dtwi if independent else self._dtwd + def fit(self, X, y): """Fits the classifier by adding labeled training observation sequences. @@ -238,6 +244,7 @@ def save(self, path): 'weighting': marshal.dumps((self._weighting.__code__, self._weighting.__name__)), 'window': self._window, 'use_c': self._use_c, + 'independent': self._independent, 'random_state': self._random_state, 'X': self._X, 'y': self._y, @@ -262,7 +269,7 @@ def load(cls, path): data = pickle.load(file) # Check deserialized object dictionary and keys - keys = set(('k', 'classes', 'weighting', 'window', 'use_c', 'random_state', 'X', 'y', 'n_features')) + keys = set(('k', 'classes', 'weighting', 'window', 'use_c', 'independent', 'random_state', 'X', 'y', 'n_features')) if not isinstance(data, dict): raise TypeError('Expected deserialized object to be a dictionary - make sure the object was serialized with the save() function') else: @@ -280,6 +287,7 @@ def load(cls, path): weighting=weighting, window=data['window'], use_c=data['use_c'], + independent=data['independent'], random_state=data['random_state'] ) @@ -293,11 +301,16 @@ def _dtw_1d(self, a, b, window): # Requires fit """Computes the DTW distance between two univariate sequences.""" return dtw.distance(a, b, use_c=self._use_c, window=window) - def _dtw(self, A, B): # Requires fit - """Computes 
the multivariate DTW distance as the sum of the pairwise per-feature DTW distances.""" + def _dtwi(self, A, B): # Requires fit + """Computes the multivariate DTW distance as the sum of the pairwise per-feature DTW distances, allowing each feature to be warped independently.""" window = max(1, int(self._window * max(len(A), len(B)))) return np.sum([self._dtw_1d(A[:, i], B[:, i], window=window) for i in range(self._n_features)]) + def _dtwd(self, A, B): # Requires fit + """Computes the multivariate DTW distance so that the warping of the features depends on each other, by modifying the local distance measure.""" + window = max(1, int(self._window * max(len(A), len(B)))) + return dtw_ndim.distance(A, B, use_c=self._use_c, window=window) + def _argmax(self, a): """Same as numpy.argmax but returns all occurrences of the maximum, and is O(n) instead of O(2n). From: https://stackoverflow.com/a/58652335 @@ -394,6 +407,7 @@ def __repr__(self): ('k', repr(self._k)), ('window', repr(self._window)), ('use_c', repr(self._use_c)), + ('independent', repr(self._independent)), ('classes', repr(list(self._encoder.classes_))) ] try: diff --git a/notebooks/Pen-Tip Trajectories (Example).ipynb b/notebooks/Pen-Tip Trajectories (Example).ipynb index 5e8aadb7..19e97d15 100644 --- a/notebooks/Pen-Tip Trajectories (Example).ipynb +++ b/notebooks/Pen-Tip Trajectories (Example).ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -56,7 +56,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -93,7 +93,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -120,7 +120,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -142,7 +142,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": 
{}, "outputs": [ { @@ -184,7 +184,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -218,7 +218,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -263,7 +263,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -293,7 +293,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -323,7 +323,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -344,7 +344,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -383,7 +383,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -403,13 +403,13 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "0f3bf4349086417dbba1fba33f5fff2a", + "model_id": "9036fa23214f4727b2ad661a19a549c4", "version_major": 2, "version_minor": 0 }, @@ -426,7 +426,7 @@ "'w'" ] }, - "execution_count": 15, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -438,13 +438,13 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "2be0f98157f84e43af99b6895902c651", + "model_id": "1a3610f874c4400cab88d38292c49468", "version_major": 2, "version_minor": 0 }, @@ -461,8 +461,8 @@ "text": [ "w c d e a e b h s v c y w e v v w v v b o e l c d c p n h p y p m h d a y d b n m m a g o g c n l y\n", "\n", - "CPU times: user 5.16 s, sys: 229 ms, total: 5.39 s\n", - "Wall time: 5.43 s\n" + "CPU times: user 1.75 s, sys: 108 ms, total: 1.86 s\n", + "Wall 
time: 2.2 s\n" ] } ], @@ -482,7 +482,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -491,8 +491,8 @@ "text": [ "w c d e a e b h s v c y w e v v w v v b o e l c d c p n h p y p m h d a y d b n m m a g o g c n l y\n", "\n", - "CPU times: user 705 ms, sys: 85.8 ms, total: 791 ms\n", - "Wall time: 5.1 s\n" + "CPU times: user 699 ms, sys: 80.5 ms, total: 779 ms\n", + "Wall time: 3.73 s\n" ] } ], @@ -512,7 +512,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 16, "metadata": { "scrolled": true }, @@ -521,8 +521,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 576 ms, sys: 20 ms, total: 596 ms\n", - "Wall time: 40.4 s\n" + "CPU times: user 542 ms, sys: 17.1 ms, total: 559 ms\n", + "Wall time: 21.9 s\n" ] } ], @@ -533,12 +533,12 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 17, "metadata": {}, "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -552,7 +552,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Accuracy: 97.03%\n" + "Accuracy: 97.73%\n" ] } ], @@ -568,7 +568,7 @@ "\n", "While the fast C compiled functions in the [`dtaidistance`](https://github.com/wannesm/dtaidistance) package (along with the multiprocessing capabilities of Sequentia's `KNNClassifier`) help to speed up classification **a lot**, the practical use of $k$-NN becomes more limited as the dataset grows larger. \n", "\n", - "In this case, since our dataset is relatively small, classifying all test examples was completed in $\\approx40s$, which is even faster than the HMM classifier that we show below. " + "In this case, since our dataset is relatively small, classifying all test examples was completed in $\\approx22s$, which is even faster than the HMM classifier that we show below. " ] }, { @@ -599,13 +599,13 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "6daa1c40652d4c539f9fda73cebb3076", + "model_id": "0fbffb1d3bbc44b5b6356b54a89a61b2", "version_major": 2, "version_minor": 0 }, @@ -638,7 +638,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -648,15 +648,15 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 3min 20s, sys: 16.2 s, total: 3min 36s\n", - "Wall time: 2min 2s\n" + "CPU times: user 3min 33s, sys: 18 s, total: 3min 51s\n", + "Wall time: 2min 34s\n" ] } ], @@ -667,7 +667,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 21, "metadata": {}, "outputs": [ { From 33ee5fdba909721bad04bbc2a6b216e5a8b2d52b Mon Sep 17 00:00:00 2001 From: Edwin Onuonga Date: Thu, 7 Jan 2021 01:13:14 +0400 Subject: [PATCH 2/3] Add tests for independent/dependent warping 
--- .../classifiers/knn/test_knn_classifier.py | 45 +++++++++++++++---- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/lib/test/lib/classifiers/knn/test_knn_classifier.py b/lib/test/lib/classifiers/knn/test_knn_classifier.py index 43057888..a1a9c987 100644 --- a/lib/test/lib/classifiers/knn/test_knn_classifier.py +++ b/lib/test/lib/classifiers/knn/test_knn_classifier.py @@ -20,7 +20,8 @@ 'k=1': KNNClassifier(k=1, classes=classes, random_state=rng), 'k=2': KNNClassifier(k=2, classes=classes, random_state=rng), 'k=3': KNNClassifier(k=3, classes=classes, random_state=rng), - 'weighted': KNNClassifier(k=3, classes=classes, weighting=(lambda x: np.exp(-x)), random_state=rng) + 'weighted': KNNClassifier(k=3, classes=classes, weighting=(lambda x: np.exp(-x)), random_state=rng), + 'independent': KNNClassifier(k=1, classes=classes, independent=True, random_state=rng) } for _, clf in clfs.items(): @@ -96,6 +97,18 @@ def test_predict_single_weighted_no_verbose(capsys): assert 'Calculating distances' not in capsys.readouterr().err assert prediction == 'c1' +def test_predict_single_independent_verbose(capsys): + """Verbosely predict a single observation sequence with independent warping""" + prediction = clfs['independent'].predict(x, verbose=True) + assert 'Calculating distances' in capsys.readouterr().err + assert prediction == 'c1' + +def test_predict_single_independent_no_verbose(capsys): + """Silently predict a single observation sequence with independent warping""" + prediction = clfs['independent'].predict(x, verbose=False) + assert 'Calculating distances' not in capsys.readouterr().err + assert prediction == 'c1' + def test_predict_multiple_k1_verbose(capsys): """Verbosely predict multiple observation sequences (k=1)""" predictions = clfs['k=1'].predict(X, verbose=True) @@ -124,25 +137,37 @@ def test_predict_multiple_k3_verbose(capsys): """Verbosely predict multiple observation sequences (k=3)""" predictions = clfs['k=3'].predict(X, verbose=True) assert 
'Classifying examples' in capsys.readouterr().err - assert list(predictions) == ['c1', 'c1', 'c1', 'c0', 'c0', 'c0'] + assert list(predictions) == ['c1', 'c1', 'c1', 'c1', 'c0', 'c1'] def test_predict_multiple_k3_no_verbose(capsys): """Silently predict multiple observation sequences (k=3)""" predictions = clfs['k=3'].predict(X, verbose=False) assert 'Classifying examples' not in capsys.readouterr().err - assert list(predictions) == ['c1', 'c1', 'c1', 'c0', 'c0', 'c0'] + assert list(predictions) == ['c1', 'c1', 'c1', 'c1', 'c0', 'c1'] def test_predict_multiple_weighted_verbose(capsys): """Verbosely predict multiple observation sequences (weighted)""" predictions = clfs['weighted'].predict(X, verbose=True) assert 'Classifying examples' in capsys.readouterr().err - assert list(predictions) == ['c1', 'c1', 'c0', 'c0', 'c0', 'c1'] + assert list(predictions) == ['c1', 'c1', 'c0', 'c1', 'c0', 'c1'] def test_predict_multiple_weighted_no_verbose(capsys): """Silently predict multiple observation sequences (weighted)""" predictions = clfs['weighted'].predict(X, verbose=False) assert 'Classifying examples' not in capsys.readouterr().err - assert list(predictions) == ['c1', 'c1', 'c0', 'c0', 'c0', 'c1'] + assert list(predictions) == ['c1', 'c1', 'c0', 'c1', 'c0', 'c1'] + +def test_predict_multiple_independent_verbose(capsys): + """Verbosely predict multiple observation sequences with independent warping""" + predictions = clfs['independent'].predict(X, verbose=True) + assert 'Classifying examples' in capsys.readouterr().err + assert list(predictions) == ['c1', 'c1', 'c0', 'c1', 'c1', 'c0'] + +def test_predict_multiple_independent_no_verbose(capsys): + """Silently predict multiple observation sequences with independent warping""" + predictions = clfs['independent'].predict(X, verbose=False) + assert 'Classifying examples' not in capsys.readouterr().err + assert list(predictions) == ['c1', 'c1', 'c0', 'c1', 'c1', 'c0'] def test_predict_single(): """Predict a single observation 
sequence and don't return the original labels""" @@ -157,12 +182,12 @@ def test_predict_single_original_labels(): def test_predict_multiple(): """Predict multiple observation sequences and don't return the original labels""" predictions = clfs['k=3'].predict(X, verbose=False, original_labels=False) - assert list(predictions) == [1, 1, 1, 0, 0, 0] + assert list(predictions) == [1, 1, 1, 1, 0, 1] def test_predict_multiple_original_labels(): """Predict multiple observation sequences and return the original labels""" predictions = clfs['k=3'].predict(X, verbose=False, original_labels=True) - assert list(predictions) == ['c1', 'c1', 'c1', 'c0', 'c0', 'c0'] + assert list(predictions) == ['c1', 'c1', 'c1', 'c1', 'c0', 'c1'] # ======================== # # KNNClassifier.evaluate() # @@ -173,8 +198,8 @@ def test_evaluate(): acc, cm = clfs['k=3'].evaluate(X, y) assert acc == 0.5 assert_equal(cm, np.array([ - [1, 1, 0, 0, 0], - [2, 2, 0, 0, 0], + [0, 2, 0, 0, 0], + [1, 3, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0] @@ -249,6 +274,7 @@ def test_load_valid_no_weighting(): assert list(clf._encoder.classes_) == classes assert clf._window == 1. assert clf._use_c == False + assert clf._independent == False assert deepcopy(clf._random_state).normal() == deepcopy(rng).normal() assert_all_equal(clf._X, X) assert_equal(clf._y, clf._encoder.transform(y)) @@ -271,6 +297,7 @@ def test_load_valid_weighting(): assert list(clf._encoder.classes_) == classes assert clf._window == 1. 
assert clf._use_c == False + assert clf._independent == False assert deepcopy(clf._random_state).normal() == deepcopy(rng).normal() assert_all_equal(clf._X, X) assert_equal(clf._y, clf._encoder.transform(y)) From bf7c9027b3be8eb4fdcec6363a092a40c95d8cbd Mon Sep 17 00:00:00 2001 From: Edwin Onuonga Date: Thu, 7 Jan 2021 01:19:03 +0400 Subject: [PATCH 3/3] Add DTWD+DTWI to readme --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 935fcb0b..7372c7c2 100644 --- a/README.md +++ b/README.md @@ -58,12 +58,12 @@ The following algorithms provided within Sequentia support the use of multivaria ### Classification algorithms -- [x] Hidden Markov Models (via [`hmmlearn`](https://github.com/hmmlearn/hmmlearn))
Learning with the Baum-Welch algorithm [[1]](#references) +- [x] Hidden Markov Models (via [`hmmlearn`](https://github.com/hmmlearn/hmmlearn))
Learning with the Baum-Welch algorithm [[1]](#references) - [x] Gaussian Mixture Model emissions - [x] Linear, left-right and ergodic topologies - [x] Dynamic Time Warping k-Nearest Neighbors (via [`dtaidistance`](https://github.com/wannesm/dtaidistance)) - [x] Sakoe–Chiba band global warping constraint - - [x] Feature-independent warping (DTWI) + - [x] Dependent and independent feature warping (DTWD & DTWI) - [x] Custom distance-weighted predictions - [x] Multi-processed predictions