From ce3e404ff21b599de1688ce5cdba91aa4f729d69 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Sat, 1 Dec 2018 00:06:59 -0800 Subject: [PATCH 01/83] in progress updates --- dask_ml/model_selection/_search.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index e38ae384c..9d3006b7e 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -11,6 +11,9 @@ import packaging.version from dask.base import tokenize from dask.delayed import delayed + +from dask.distributed import Client, as_completed #TODO Make this optional + from dask.utils import derived_from from sklearn import model_selection from sklearn.base import BaseEstimator, MetaEstimatorMixin, clone, is_classifier @@ -1194,7 +1197,27 @@ def fit(self, X, y=None, groups=None, **fit_params): if scheduler is dask.threaded.get and n_jobs == 1: scheduler = dask.local.get_sync - out = scheduler(dsk, keys, num_workers=n_jobs) + if isinstance(scheduler.__self__, dask.distributed.Client): + out = [] + + # FIXME: Hack to pull out fit-score futures how should we do this cleaner? + keys_2 = [keys[0]] + [k for k in dsk.keys() if 'fit-score' in k[0]] + + # FIXME: Ignore the last 4 items in the graph as they are not needed right now + dsk_2 = {k: dsk[k] for k in list(dsk.keys())[0:-4]} + + futures = scheduler(dsk_2, keys_2, num_workers=n_jobs, sync=False) + # TODO: should we get batches of futures instead of getting them one at a time? + for future, result in as_completed(futures, with_results=True): + if future.status == 'finished': + out.append(result) + future.cancel() + # FIXME: Hack to break out of loop. How to do it cleaner? + if len(out) == len(futures)-1: + break + # TODO: Now that we have the results of the random search cv. we need to continue the graph. + else: + out = scheduler(dsk, keys, num_workers=n_jobs) results = handle_deprecated_train_score(out[0], self.return_train_score) self.cv_results_ = results From 6b3688a26f21c878ee73c5eaf7538c8113547cd5 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Sat, 1 Dec 2018 23:53:33 -0800 Subject: [PATCH 02/83] clean up a bit more --- dask_ml/model_selection/_search.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index 9d3006b7e..ed86cc971 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -1198,26 +1198,23 @@ def fit(self, X, y=None, groups=None, **fit_params): scheduler = dask.local.get_sync if isinstance(scheduler.__self__, dask.distributed.Client): - out = [] + cv_results_key = [k for k, v in dsk.items() if 'cv-results' in k][0] + score_keys = dsk[cv_results_key][1] + futures = scheduler(dsk, score_keys, num_workers=n_jobs, sync=False) - # FIXME: Hack to pull out fit-score futures how should we do this cleaner? - keys_2 = [keys[0]] + [k for k in dsk.keys() if 'fit-score' in k[0]] - - # FIXME: Ignore the last 4 items in the graph as they are not needed right now - dsk_2 = {k: dsk[k] for k in list(dsk.keys())[0:-4]} - - futures = scheduler(dsk_2, keys_2, num_workers=n_jobs, sync=False) - # TODO: should we get batches of futures instead of getting them one at a time? + scores = [] for future, result in as_completed(futures, with_results=True): if future.status == 'finished': - out.append(result) + scores.append(result) future.cancel() - # FIXME: Hack to break out of loop. How to do it cleaner? - if len(out) == len(futures)-1: + if len(scores) == len(score_keys): break - # TODO: Now that we have the results of the random search cv. we need to continue the graph. - else: - out = scheduler(dsk, keys, num_workers=n_jobs) + + tmp_cv_results = list(dsk[cv_results_key]) + tmp_cv_results[1] = scores + dsk[cv_results_key] = tuple(tmp_cv_results) + + out = scheduler(dsk, keys, num_workers=n_jobs) results = handle_deprecated_train_score(out[0], self.return_train_score) self.cv_results_ = results From 9a8585dd2a328ab4e63ae43eb7d4d13a657bdba2 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Sun, 2 Dec 2018 00:08:08 -0800 Subject: [PATCH 03/83] note on futures possibly returning out of order --- dask_ml/model_selection/_search.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index ed86cc971..9321647e8 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -1169,12 +1169,12 @@ def fit(self, X, y=None, groups=None, **fit_params): raise ValueError( "error_score must be the string 'raise' or a" " numeric value." ) - + candidate_params = list(self._get_param_iterator()) dsk, keys, n_splits = build_graph( estimator, self.cv, self.scorer_, - list(self._get_param_iterator()), + candidate_params, X, y, groups, @@ -1203,13 +1203,14 @@ def fit(self, X, y=None, groups=None, **fit_params): futures = scheduler(dsk, score_keys, num_workers=n_jobs, sync=False) scores = [] + # Fixme: Have to be careful here, this can return out of order, how to ensure we keep params and results in sync? for future, result in as_completed(futures, with_results=True): if future.status == 'finished': scores.append(result) future.cancel() if len(scores) == len(score_keys): break - + tmp_cv_results = list(dsk[cv_results_key]) tmp_cv_results[1] = scores dsk[cv_results_key] = tuple(tmp_cv_results) From 6e4123e4544c02f52184b4a0df5f0c4234b89d88 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Sun, 2 Dec 2018 10:58:49 -0800 Subject: [PATCH 04/83] ensure scores from futures line up with parameters --- dask_ml/model_selection/_search.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index 9321647e8..1c1a841f1 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -1202,15 +1202,16 @@ def fit(self, X, y=None, groups=None, **fit_params): score_keys = dsk[cv_results_key][1] futures = scheduler(dsk, score_keys, num_workers=n_jobs, sync=False) - scores = [] - # Fixme: Have to be careful here, this can return out of order, how to ensure we keep params and results in sync? + score_map = {} for future, result in as_completed(futures, with_results=True): if future.status == 'finished': - scores.append(result) + score_map[future.key] = result future.cancel() - if len(scores) == len(score_keys): + if len(score_map) == len(score_keys): break + # Make sure the returned scores is in the same order as the score_keys + scores = [score_map[k] for k in score_keys] tmp_cv_results = list(dsk[cv_results_key]) tmp_cv_results[1] = scores dsk[cv_results_key] = tuple(tmp_cv_results) From a506d5867c3c6b75b63617f5b87a151e4f6f2970 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Sun, 2 Dec 2018 11:01:40 -0800 Subject: [PATCH 05/83] update comment about score order --- dask_ml/model_selection/_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index 1c1a841f1..f0fba6c97 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -1210,7 +1210,7 @@ def fit(self, X, y=None, groups=None, **fit_params): if len(score_map) == len(score_keys): break - # Make sure the returned scores is in the same order as the score_keys + # Sort scores by score_keys so parameters line up scores = [score_map[k] for k in score_keys] tmp_cv_results = list(dsk[cv_results_key]) tmp_cv_results[1] = scores From 1fa69e30879ab893f6748f76e55f598d8420c679 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Sun, 2 Dec 2018 11:03:11 -0800 Subject: [PATCH 06/83] revert candidate_param change given we are not using it anymore --- dask_ml/model_selection/_search.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index f0fba6c97..c472dc134 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -1169,12 +1169,12 @@ def fit(self, X, y=None, groups=None, **fit_params): raise ValueError( "error_score must be the string 'raise' or a" " numeric value." ) - candidate_params = list(self._get_param_iterator()) + dsk, keys, n_splits = build_graph( estimator, self.cv, self.scorer_, - candidate_params, + list(self._get_param_iterator()), X, y, groups, From 578c5f8ad03b2a5cb83a10a666246277b1f95b84 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Sun, 2 Dec 2018 11:06:50 -0800 Subject: [PATCH 07/83] loop over keys instead of items --- dask_ml/model_selection/_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index c472dc134..2750d7929 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -1198,7 +1198,7 @@ def fit(self, X, y=None, groups=None, **fit_params): scheduler = dask.local.get_sync if isinstance(scheduler.__self__, dask.distributed.Client): - cv_results_key = [k for k, v in dsk.items() if 'cv-results' in k][0] + cv_results_key = [k for k in dsk.keys() if 'cv-results' in k][0] score_keys = dsk[cv_results_key][1] futures = scheduler(dsk, score_keys, num_workers=n_jobs, sync=False) From 12892faf8210e5b2d81eee27ec81596abc4be63a Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Sun, 2 Dec 2018 11:12:03 -0800 Subject: [PATCH 08/83] take first value using iteration instead of index --- dask_ml/model_selection/_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index 2750d7929..19a2bd978 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -1198,7 +1198,7 @@ def fit(self, X, y=None, groups=None, **fit_params): scheduler = dask.local.get_sync if isinstance(scheduler.__self__, dask.distributed.Client): - cv_results_key = [k for k in dsk.keys() if 'cv-results' in k][0] + cv_results_key = next(k for k in dsk.keys() if 'cv-results' in k) score_keys = dsk[cv_results_key][1] futures = scheduler(dsk, score_keys, num_workers=n_jobs, sync=False) From 2c1d005304318a0428b5c690bebe37d5ac7530ee Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Sun, 2 Dec 2018 13:55:42 -0800 Subject: [PATCH 09/83] optional import for distributed --- dask_ml/model_selection/_search.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index 19a2bd978..8d5fe4c27 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -12,7 +12,10 @@ from dask.base import tokenize from dask.delayed import delayed -from dask.distributed import Client, as_completed #TODO Make this optional +try: + import dask.distributed +except ImportError: + dask.distributed = None from dask.utils import derived_from from sklearn import model_selection @@ -1197,13 +1200,13 @@ def fit(self, X, y=None, groups=None, **fit_params): if scheduler is dask.threaded.get and n_jobs == 1: scheduler = dask.local.get_sync - if isinstance(scheduler.__self__, dask.distributed.Client): + if isinstance(getattr(scheduler, '__self__', None), dask.distributed.Client): cv_results_key = next(k for k in dsk.keys() if 'cv-results' in k) score_keys = dsk[cv_results_key][1] futures = scheduler(dsk, score_keys, num_workers=n_jobs, sync=False) score_map = {} - for future, result in as_completed(futures, with_results=True): + for future, result in dask.distributed.as_completed(futures, with_results=True): if future.status == 'finished': score_map[future.key] = result future.cancel() From c9d3a757335ad5e6d0177e44136862cc8ae2dfc2 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Sun, 2 Dec 2018 14:40:42 -0800 Subject: [PATCH 10/83] add distributed scheduler to param test --- tests/model_selection/dask_searchcv/test_model_selection.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/model_selection/dask_searchcv/test_model_selection.py b/tests/model_selection/dask_searchcv/test_model_selection.py index 2066e7231..c4f1ec94d 100644 --- a/tests/model_selection/dask_searchcv/test_model_selection.py +++ b/tests/model_selection/dask_searchcv/test_model_selection.py @@ -13,6 +13,7 @@ from dask.base import tokenize from dask.callbacks import Callback from dask.delayed import delayed +from dask.distributed import Client from dask.utils import tmpdir from sklearn.datasets import load_iris, make_classification from sklearn.decomposition import PCA @@ -769,6 +770,7 @@ def test_normalize_n_jobs(): ("synchronous", 4), ("sync", 4), ("multiprocessing", 4), + (Client(), 4), pytest.param(dask.get, 4, marks=[pytest.mark.filterwarnings("ignore")]), ], ) From e81f9e5eb9f2c8b914ce97eb21e2fffe19346a1a Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Sun, 2 Dec 2018 14:59:47 -0800 Subject: [PATCH 11/83] put client first and close it so it doesn't leak into other tests. is there a better way? --- .../model_selection/dask_searchcv/test_model_selection.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/model_selection/dask_searchcv/test_model_selection.py b/tests/model_selection/dask_searchcv/test_model_selection.py index c4f1ec94d..57388b4ae 100644 --- a/tests/model_selection/dask_searchcv/test_model_selection.py +++ b/tests/model_selection/dask_searchcv/test_model_selection.py @@ -760,17 +760,17 @@ def test_normalize_n_jobs(): with pytest.raises(TypeError): _normalize_n_jobs("not an integer") - +from dask.distributed import Client, LocalCluster @pytest.mark.parametrize( "scheduler,n_jobs", [ + (Client(), 4), (None, 4), ("threading", 4), ("threading", 1), ("synchronous", 4), ("sync", 4), ("multiprocessing", 4), - (Client(), 4), pytest.param(dask.get, 4, marks=[pytest.mark.filterwarnings("ignore")]), ], ) @@ -785,6 +785,9 @@ def test_scheduler_param(scheduler, n_jobs): ) gs.fit(X, y) + if hasattr(scheduler, 'close'): + scheduler.close() + @pytest.mark.skipif("not has_distributed") def test_scheduler_param_distributed(loop): From 000059d69d9244a435971e42dee7db162a4ea271 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Sun, 2 Dec 2018 15:10:31 -0800 Subject: [PATCH 12/83] remove duplicate import --- .../dask_searchcv/test_model_selection.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/tests/model_selection/dask_searchcv/test_model_selection.py b/tests/model_selection/dask_searchcv/test_model_selection.py index 57388b4ae..22bdab20b 100644 --- a/tests/model_selection/dask_searchcv/test_model_selection.py +++ b/tests/model_selection/dask_searchcv/test_model_selection.py @@ -760,7 +760,6 @@ def test_normalize_n_jobs(): with pytest.raises(TypeError): _normalize_n_jobs("not an integer") -from dask.distributed import Client, LocalCluster @pytest.mark.parametrize( "scheduler,n_jobs", [ @@ -776,14 +775,16 @@ def test_normalize_n_jobs(): ) def test_scheduler_param(scheduler, n_jobs): X, y = make_classification(n_samples=100, n_features=10, random_state=0) - gs = dcv.GridSearchCV( - MockClassifier(), - {"foo_param": [0, 1, 2]}, - cv=3, - scheduler=scheduler, - n_jobs=n_jobs, - ) - gs.fit(X, y) + + with scheduler() as (s, [_, _]): + gs = dcv.GridSearchCV( + MockClassifier(), + {"foo_param": [0, 1, 2]}, + cv=3, + scheduler=s, + n_jobs=n_jobs, + ) + gs.fit(X, y) if hasattr(scheduler, 'close'): scheduler.close() From 6c83b916e7f7d9796b58e2605d2f37223cfc71e9 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Sun, 2 Dec 2018 15:13:05 -0800 Subject: [PATCH 13/83] remove test as it already existed --- tests/model_selection/dask_searchcv/test_model_selection.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/model_selection/dask_searchcv/test_model_selection.py b/tests/model_selection/dask_searchcv/test_model_selection.py index 22bdab20b..e8624912b 100644 --- a/tests/model_selection/dask_searchcv/test_model_selection.py +++ b/tests/model_selection/dask_searchcv/test_model_selection.py @@ -13,7 +13,6 @@ from dask.base import tokenize from dask.callbacks import Callback from dask.delayed import delayed -from dask.distributed import Client from dask.utils import tmpdir from sklearn.datasets import load_iris, make_classification from sklearn.decomposition import PCA @@ -763,7 +762,6 @@ def test_normalize_n_jobs(): @pytest.mark.parametrize( "scheduler,n_jobs", [ - (Client(), 4), (None, 4), ("threading", 4), ("threading", 1), @@ -786,10 +784,6 @@ def test_scheduler_param(scheduler, n_jobs): ) gs.fit(X, y) - if hasattr(scheduler, 'close'): - scheduler.close() - - @pytest.mark.skipif("not has_distributed") def test_scheduler_param_distributed(loop): X, y = make_classification(n_samples=100, n_features=10, random_state=0) From ee34b1265aa17a0d18c1ac1cfd5ed87e8eb6f862 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Sun, 2 Dec 2018 15:14:08 -0800 Subject: [PATCH 14/83] actually revert tests --- .../dask_searchcv/test_model_selection.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/model_selection/dask_searchcv/test_model_selection.py b/tests/model_selection/dask_searchcv/test_model_selection.py index e8624912b..2066e7231 100644 --- a/tests/model_selection/dask_searchcv/test_model_selection.py +++ b/tests/model_selection/dask_searchcv/test_model_selection.py @@ -759,6 +759,7 @@ def test_normalize_n_jobs(): with pytest.raises(TypeError): _normalize_n_jobs("not an integer") + @pytest.mark.parametrize( "scheduler,n_jobs", [ @@ -773,16 +774,15 @@ def test_normalize_n_jobs(): ) def test_scheduler_param(scheduler, n_jobs): X, y = make_classification(n_samples=100, n_features=10, random_state=0) + gs = dcv.GridSearchCV( + MockClassifier(), + {"foo_param": [0, 1, 2]}, + cv=3, + scheduler=scheduler, + n_jobs=n_jobs, + ) + gs.fit(X, y) - with scheduler() as (s, [_, _]): - gs = dcv.GridSearchCV( - MockClassifier(), - {"foo_param": [0, 1, 2]}, - cv=3, - scheduler=s, - n_jobs=n_jobs, - ) - gs.fit(X, y) @pytest.mark.skipif("not has_distributed") def test_scheduler_param_distributed(loop): From 2d0627ca4f3661012ce93b764fc2f37cf1da487a Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Sun, 2 Dec 2018 15:21:22 -0800 Subject: [PATCH 15/83] ensure dask distributed exists before we check for the client --- dask_ml/model_selection/_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index 8d5fe4c27..603de1b3f 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -1200,7 +1200,7 @@ def fit(self, X, y=None, groups=None, **fit_params): if scheduler is dask.threaded.get and n_jobs == 1: scheduler = dask.local.get_sync - if isinstance(getattr(scheduler, '__self__', None), dask.distributed.Client): + if dask.distributed and isinstance(getattr(scheduler, '__self__', None), dask.distributed.Client): cv_results_key = next(k for k in dsk.keys() if 'cv-results' in k) score_keys = dsk[cv_results_key][1] futures = scheduler(dsk, score_keys, num_workers=n_jobs, sync=False) From fa77f197001afeccf93764d1a320103c74c6d0d9 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Sun, 2 Dec 2018 19:39:24 -0800 Subject: [PATCH 16/83] in progress tests for distributed as_completed cv --- .../dask_searchcv/test_model_selection.py | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/tests/model_selection/dask_searchcv/test_model_selection.py b/tests/model_selection/dask_searchcv/test_model_selection.py index 2066e7231..62ce1882c 100644 --- a/tests/model_selection/dask_searchcv/test_model_selection.py +++ b/tests/model_selection/dask_searchcv/test_model_selection.py @@ -797,6 +797,39 @@ def f(dask_scheduler): assert client.run_on_scheduler(f) # some work happened on cluster +from sklearn.base import BaseEstimator +import os.path + + +class TestAsCompletedEstimator(BaseEstimator): + def __init__(self, kill_file=None): + self.kill_file = kill_file + + def fit(self): + if self.kill_file: + if not os.path.isfile(self.kill_file): + self.kill_file.write('done') + # kill worker + + return 1 + + def transform(self): + pass + +@pytest.mark.skipif("not has_distributed") +def test_gather_as_completed_distributed(loop, tmpdir): + X, y = make_classification(n_samples=100, n_features=10, random_state=0) + with cluster() as (s, [a, b]): + with Client(s["address"], loop=loop) as client: + kill_file = tmpdir.join('is_killed.txt') + est = TestAsCompletedEstimator(kill_file) + est.fit() + est.fit() + + tmpdir.remove() + + + def test_cv_multiplemetrics(): X, y = make_classification(random_state=0) From a0ad0d68d63503c45e089b4f6b5dd2e3052fe5da Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Sun, 2 Dec 2018 23:43:12 -0800 Subject: [PATCH 17/83] trying to figure out a way to test this --- .../dask_searchcv/test_model_selection.py | 41 +++++++++++++------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/tests/model_selection/dask_searchcv/test_model_selection.py b/tests/model_selection/dask_searchcv/test_model_selection.py index 62ce1882c..a3f173201 100644 --- a/tests/model_selection/dask_searchcv/test_model_selection.py +++ b/tests/model_selection/dask_searchcv/test_model_selection.py @@ -797,20 +797,36 @@ def f(dask_scheduler): assert client.run_on_scheduler(f) # some work happened on cluster +from time import sleep from sklearn.base import BaseEstimator import os.path - +from dask.distributed import get_client +import sys class TestAsCompletedEstimator(BaseEstimator): - def __init__(self, kill_file=None): - self.kill_file = kill_file + def __init__(self, i=None, out_path=None, num_cv=None, scheduler=None, loop=None): + self.i = i + self.num_cv = num_cv + self.out_path = out_path + self.scheduler = scheduler + self.loop = loop + + def fit(self, X, y): + out_file = self.out_path.join(f'{self.i}.txt') + + if self.i == (self.num_cv-1): + files = os.listdir(self.out_path) + while len(files) < (self.num_cv-1): + files = os.listdir(self.out_path) + sleep(0.05) + c = get_client() + t = 1 + + out_file.write('done') - def fit(self): - if self.kill_file: - if not os.path.isfile(self.kill_file): - self.kill_file.write('done') - # kill worker + return 1 + def score(self, X, y): return 1 def transform(self): @@ -818,13 +834,14 @@ def transform(self): @pytest.mark.skipif("not has_distributed") def test_gather_as_completed_distributed(loop, tmpdir): + num_cv = 3 + ids = list(range(0, num_cv)) + X, y = make_classification(n_samples=100, n_features=10, random_state=0) with cluster() as (s, [a, b]): with Client(s["address"], loop=loop) as client: - kill_file = tmpdir.join('is_killed.txt') - est = TestAsCompletedEstimator(kill_file) - est.fit() - est.fit() + gs = dcv.GridSearchCV(TestAsCompletedEstimator(out_path=tmpdir, num_cv=num_cv), {"i": ids}, cv=3) + gs.fit(X, y) tmpdir.remove() From 184e7281d8a13df35bb8e378f28a21d5db8d0053 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Tue, 4 Dec 2018 22:51:06 -0800 Subject: [PATCH 18/83] split up graphs --- dask_ml/model_selection/_search.py | 362 ++++++++++++++++------------- 1 file changed, 195 insertions(+), 167 deletions(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index 603de1b3f..5685d0997 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -66,14 +66,13 @@ except ImportError: # pragma: no cover from toolz import get, pluck - __all__ = ["GridSearchCV", "RandomizedSearchCV"] - if SK_VERSION <= packaging.version.parse("0.21.dev0"): _RETURN_TRAIN_SCORE_DEFAULT = "warn" + def handle_deprecated_train_score(results, return_train_score): if return_train_score == "warn": results = DeprecationDict(results) @@ -92,6 +91,7 @@ def handle_deprecated_train_score(results, return_train_score): else: _RETURN_TRAIN_SCORE_DEFAULT = False + def handle_deprecated_train_score(results, return_train_score): return results @@ -108,23 +108,20 @@ def __call__(self, est): return self.token if c == 0 else self.token + str(c) -def build_graph( - estimator, - cv, - scorer, - candidate_params, - X, - y=None, - groups=None, - fit_params=None, - iid=True, - refit=True, - error_score="raise", - return_train_score=_RETURN_TRAIN_SCORE_DEFAULT, - cache_cv=True, - multimetric=False, +def build_cv_graph( + estimator, + cv, + scorer, + candidate_params, + X, + y=None, + groups=None, + fit_params=None, + iid=True, + error_score="raise", + return_train_score=_RETURN_TRAIN_SCORE_DEFAULT, + cache_cv=True, ): - X, y, groups = to_indexable(X, y, groups) cv = check_cv(cv, y, is_classifier(estimator)) # "pairwise" estimators require a different graph for CV splitting @@ -183,17 +180,36 @@ def build_graph( return_train_score, ) + return dsk, scores, n_splits, main_token, X_name, y_name, weights + + +def build_result_graph( + dsk, + main_token, + estimator, + X_name, + y_name, + fit_params, + n_splits, + error_score, + scorer, + candidate_params, + scores, + weights, + refit, + multimetric +): + cv_results = "cv-results-" + main_token - candidate_params_name = "cv-parameters-" + main_token - dsk[candidate_params_name] = (decompress_params, fields, params) if multimetric: metrics = list(scorer.keys()) else: metrics = None + dsk[cv_results] = ( create_cv_results, scores, - candidate_params_name, + candidate_params, n_splits, error_score, weights, @@ -208,8 +224,9 @@ def build_graph( scorer = "score" best_params = "best-params-" + main_token - dsk[best_params] = (get_best_params, candidate_params_name, cv_results, scorer) + dsk[best_params] = (get_best_params, candidate_params, cv_results, scorer) best_estimator = "best-estimator-" + main_token + if fit_params: fit_params = ( dict, @@ -225,8 +242,7 @@ def build_graph( ) keys.append(best_estimator) - return dsk, keys, n_splits - + return dsk, keys def normalize_params(params): """Take a list of dictionaries, and tokenize/normalize.""" @@ -266,20 +282,20 @@ def _group_fit_params(steps, fit_params): def do_fit_and_score( - dsk, - main_token, - est, - cv, - fields, - tokens, - params, - X, - y, - fit_params, - n_splits, - error_score, - scorer, - return_train_score, + dsk, + main_token, + est, + cv, + fields, + tokens, + params, + X, + y, + fit_params, + n_splits, + error_score, + scorer, + return_train_score, ): if not isinstance(est, Pipeline): # Fitting and scoring can all be done as a single task @@ -372,18 +388,18 @@ def do_fit_and_score( def do_fit( - dsk, - next_token, - est, - cv, - fields, - tokens, - params, - Xs, - ys, - fit_params, - n_splits, - error_score, + dsk, + next_token, + est, + cv, + fields, + tokens, + params, + Xs, + ys, + fit_params, + n_splits, + error_score, ): if isinstance(est, Pipeline) and params is not None: return _do_pipeline( @@ -442,18 +458,18 @@ def do_fit( def do_fit_transform( - dsk, - next_token, - est, - cv, - fields, - tokens, - params, - Xs, - ys, - fit_params, - n_splits, - error_score, + dsk, + next_token, + est, + cv, + fields, + tokens, + params, + Xs, + ys, + fit_params, + n_splits, + error_score, ): if isinstance(est, Pipeline) and params is not None: return _do_pipeline( @@ -560,24 +576,24 @@ def new_group(): def _do_fit_step( - dsk, - next_token, - step, - cv, - fields, - tokens, - params, - Xs, - ys, - fit_params, - n_splits, - error_score, - step_fields_lk, - fit_params_lk, - field_to_index, - step_name, - none_passthrough, - is_transform, + dsk, + next_token, + step, + cv, + fields, + tokens, + params, + Xs, + ys, + fit_params, + n_splits, + error_score, + step_fields_lk, + fit_params_lk, + field_to_index, + step_name, + none_passthrough, + is_transform, ): sub_fields, sub_inds = map(list, unzip(step_fields_lk[step_name], 2)) sub_fit_params = fit_params_lk[step_name] @@ -699,19 +715,19 @@ def _do_fit_step( def _do_pipeline( - dsk, - next_token, - est, - cv, - fields, - tokens, - params, - Xs, - ys, - fit_params, - n_splits, - error_score, - is_transform, + dsk, + next_token, + est, + cv, + fields, + tokens, + params, + Xs, + ys, + fit_params, + n_splits, + error_score, + is_transform, ): if "steps" in fields: raise NotImplementedError("Setting Pipeline.steps in a gridsearch") @@ -792,18 +808,18 @@ def _do_n_samples(dsk, token, Xs, n_splits): def _do_featureunion( - dsk, - next_token, - est, - cv, - fields, - tokens, - params, - Xs, - ys, - fit_params, - n_splits, - error_score, + dsk, + next_token, + est, + cv, + fields, + tokens, + params, + Xs, + ys, + fit_params, + n_splits, + error_score, ): if "transformer_list" in fields: raise NotImplementedError( @@ -874,7 +890,7 @@ def _do_featureunion( m = 0 seen = {} for steps, Xs, wt, (w, wl), nsamp in zip( - zip(*fit_steps), zip(*tr_Xs), weight_tokens, weights, n_samples + zip(*fit_steps), zip(*tr_Xs), weight_tokens, weights, n_samples ): if (steps, wt) in seen: out_append(seen[steps, wt]) @@ -953,7 +969,7 @@ def compute_n_splits(cv, X, y=None, groups=None): return cv.get_n_splits(X, None, None) elif isinstance(cv, (LeaveOneGroupOut, LeavePGroupsOut)) and not is_dask_collection( - groups + groups ): # Only `groups` is referenced for these classes return cv.get_n_splits(None, None, groups) @@ -1010,17 +1026,17 @@ class DaskBaseSearchCV(BaseEstimator, MetaEstimatorMixin): """Base class for hyper parameter search with cross-validation.""" def __init__( - self, - estimator, - scoring=None, - iid=True, - refit=True, - cv=None, - error_score="raise", - return_train_score=_RETURN_TRAIN_SCORE_DEFAULT, - scheduler=None, - n_jobs=-1, - cache_cv=True, + self, + estimator, + scoring=None, + iid=True, + refit=True, + cv=None, + error_score="raise", + return_train_score=_RETURN_TRAIN_SCORE_DEFAULT, + scheduler=None, + n_jobs=-1, + cache_cv=True, ): self.scoring = scoring self.estimator = estimator @@ -1149,10 +1165,10 @@ def fit(self, X, y=None, groups=None, **fit_params): if self.multimetric_: if self.refit is not False and ( - not isinstance(self.refit, str) - or - # This will work for both dict / list (tuple) - self.refit not in scorer + not isinstance(self.refit, str) + or + # This will work for both dict / list (tuple) + self.refit not in scorer ): raise ValueError( "For multi-metric scoring, the parameter " @@ -1173,22 +1189,22 @@ def fit(self, X, y=None, groups=None, **fit_params): "error_score must be the string 'raise' or a" " numeric value." ) - dsk, keys, n_splits = build_graph( + candidate_params = list(self._get_param_iterator()) + + dsk, keys, n_splits, main_token, X_name, y_name, weights = build_cv_graph( estimator, self.cv, self.scorer_, - list(self._get_param_iterator()), + candidate_params, X, - y, - groups, - fit_params, - iid=self.iid, - refit=self.refit, + y=y, + groups=groups, + fit_params=fit_params, error_score=error_score, return_train_score=self.return_train_score, - cache_cv=self.cache_cv, - multimetric=multimetric, + cache_cv=self.cache_cv ) + self.dask_graph_ = dsk self.n_splits_ = n_splits @@ -1201,26 +1217,39 @@ def fit(self, X, y=None, groups=None, **fit_params): scheduler = dask.local.get_sync if dask.distributed and isinstance(getattr(scheduler, '__self__', None), dask.distributed.Client): - cv_results_key = next(k for k in dsk.keys() if 'cv-results' in k) - score_keys = dsk[cv_results_key][1] - futures = scheduler(dsk, score_keys, num_workers=n_jobs, sync=False) + futures = scheduler(dsk, keys, num_workers=n_jobs, sync=False) score_map = {} for future, result in dask.distributed.as_completed(futures, with_results=True): if future.status == 'finished': score_map[future.key] = result future.cancel() - if len(score_map) == len(score_keys): + if len(score_map) == len(keys): break # Sort scores by score_keys so parameters line up - scores = [score_map[k] for k in score_keys] - tmp_cv_results = list(dsk[cv_results_key]) - tmp_cv_results[1] = scores - dsk[cv_results_key] = tuple(tmp_cv_results) + scores = [score_map[k] for k in keys] + else: + scores = scheduler(dsk, keys, num_workers=n_jobs) - out = scheduler(dsk, keys, num_workers=n_jobs) + dsk, keys = build_result_graph( + dsk, + main_token, + estimator, + X_name, + y_name, + fit_params, + n_splits, + error_score, + self.scorer_, + candidate_params, + scores, + weights, + self.refit, + multimetric + ) + out = scheduler(dsk, keys, num_workers=n_jobs) results = handle_deprecated_train_score(out[0], self.return_train_score) self.cv_results_ = results @@ -1492,18 +1521,18 @@ class GridSearchCV(StaticDaskSearchMixin, DaskBaseSearchCV): ) def __init__( - self, - estimator, - param_grid, - scoring=None, - iid=True, - refit=True, - cv=None, - error_score="raise", - return_train_score=_RETURN_TRAIN_SCORE_DEFAULT, - scheduler=None, - n_jobs=-1, - cache_cv=True, + self, + estimator, + param_grid, + scoring=None, + iid=True, + refit=True, + cv=None, + error_score="raise", + return_train_score=_RETURN_TRAIN_SCORE_DEFAULT, + scheduler=None, + n_jobs=-1, + cache_cv=True, ): super(GridSearchCV, self).__init__( estimator=estimator, @@ -1595,22 +1624,21 @@ class RandomizedSearchCV(StaticDaskSearchMixin, DaskBaseSearchCV): ) def __init__( - self, - estimator, - param_distributions, - n_iter=10, - random_state=None, - scoring=None, - iid=True, - refit=True, - cv=None, - error_score="raise", - return_train_score=_RETURN_TRAIN_SCORE_DEFAULT, - scheduler=None, - n_jobs=-1, - cache_cv=True, + self, + estimator, + param_distributions, + n_iter=10, + random_state=None, + scoring=None, + iid=True, + refit=True, + cv=None, + error_score="raise", + return_train_score=_RETURN_TRAIN_SCORE_DEFAULT, + scheduler=None, + n_jobs=-1, + cache_cv=True, ): - super(RandomizedSearchCV, self).__init__( estimator=estimator, scoring=scoring, From 406f6be151ceaeda130c7d76a152ab49fdeeab43 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Tue, 4 Dec 2018 23:04:25 -0800 Subject: [PATCH 19/83] Change module import and dask detection. --- dask_ml/model_selection/_search.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index 39d156afa..de203bad3 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -12,11 +12,6 @@ from dask.base import tokenize from dask.delayed import delayed -try: - import dask.distributed -except ImportError: - dask.distributed = None - from dask.utils import derived_from from sklearn import model_selection from sklearn.base import BaseEstimator, MetaEstimatorMixin, clone, is_classifier @@ -1165,9 +1160,10 @@ def fit(self, X, y=None, groups=None, **fit_params): if self.multimetric_: if self.refit is not False and ( - not isinstance(self.refit, str) - # This will work for both dict / list (tuple) - or self.refit not in scorer + not isinstance(self.refit, str) + or + # This will work for both dict / list (tuple) + self.refit not in scorer ): raise ValueError( "For multi-metric scoring, the parameter " @@ -1215,9 +1211,8 @@ def fit(self, X, y=None, groups=None, **fit_params): if scheduler is dask.threaded.get and n_jobs == 1: scheduler = dask.local.get_sync - if dask.distributed and isinstance(getattr(scheduler, '__self__', None), dask.distributed.Client): + if 'Client' in type(getattr(scheduler, '__self__', None)).__name__: futures = scheduler(dsk, keys, num_workers=n_jobs, sync=False) - score_map = {} for future, result in dask.distributed.as_completed(futures, with_results=True): if future.status == 'finished': From a587fa2a6960f9decbf5cbaaabd08624c786cd6f Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Tue, 4 Dec 2018 23:11:02 -0800 Subject: [PATCH 20/83] Formatting to match master. --- dask_ml/model_selection/_search.py | 275 ++++++++++++++--------------- 1 file changed, 137 insertions(+), 138 deletions(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index de203bad3..7d84ec002 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -11,7 +11,6 @@ import packaging.version from dask.base import tokenize from dask.delayed import delayed - from dask.utils import derived_from from sklearn import model_selection from sklearn.base import BaseEstimator, MetaEstimatorMixin, clone, is_classifier @@ -61,13 +60,14 @@ except ImportError: # pragma: no cover from toolz import get, pluck + __all__ = ["GridSearchCV", "RandomizedSearchCV"] + if SK_VERSION <= packaging.version.parse("0.21.dev0"): _RETURN_TRAIN_SCORE_DEFAULT = "warn" - def handle_deprecated_train_score(results, return_train_score): if return_train_score == "warn": results = DeprecationDict(results) @@ -86,7 +86,6 @@ def handle_deprecated_train_score(results, return_train_score): else: _RETURN_TRAIN_SCORE_DEFAULT = False - def handle_deprecated_train_score(results, return_train_score): return results @@ -104,18 +103,18 @@ def __call__(self, est): def build_cv_graph( - estimator, - cv, - scorer, - candidate_params, - X, - y=None, - groups=None, - fit_params=None, - iid=True, - error_score="raise", - return_train_score=_RETURN_TRAIN_SCORE_DEFAULT, - cache_cv=True, + estimator, + cv, + scorer, + candidate_params, + X, + y=None, + groups=None, + fit_params=None, + iid=True, + error_score="raise", + return_train_score=_RETURN_TRAIN_SCORE_DEFAULT, + cache_cv=True, ): X, y, groups = to_indexable(X, y, groups) cv = check_cv(cv, y, is_classifier(estimator)) @@ -179,20 +178,20 @@ def build_cv_graph( def build_result_graph( - dsk, - main_token, - estimator, - X_name, - y_name, - fit_params, - n_splits, - error_score, - scorer, - candidate_params, - scores, - weights, - refit, - multimetric + dsk, + main_token, + estimator, + X_name, + y_name, + fit_params, + n_splits, + error_score, + scorer, + candidate_params, + scores, + weights, + refit, + multimetric ): cv_results = "cv-results-" + main_token @@ -277,20 +276,20 @@ def _group_fit_params(steps, fit_params): def do_fit_and_score( - dsk, - main_token, - est, - cv, - fields, - tokens, - params, - X, - y, - fit_params, - n_splits, - error_score, - scorer, - return_train_score, + dsk, + main_token, + est, + cv, + fields, + tokens, + params, + X, + y, + fit_params, + n_splits, + error_score, + scorer, + return_train_score, ): if not isinstance(est, Pipeline): # Fitting and scoring can all be done as a single task @@ -383,18 +382,18 @@ def do_fit_and_score( def do_fit( - dsk, - next_token, - est, - cv, - fields, - tokens, - params, - Xs, - ys, - fit_params, - n_splits, - error_score, + dsk, + next_token, + est, + cv, + fields, + tokens, + params, + Xs, + ys, + fit_params, + n_splits, + error_score, ): if isinstance(est, Pipeline) and params is not None: return _do_pipeline( @@ -453,18 +452,18 @@ def do_fit( def do_fit_transform( - dsk, - next_token, - est, - cv, - fields, - tokens, - params, - Xs, - ys, - fit_params, - n_splits, - error_score, + dsk, + next_token, + est, + cv, + fields, + tokens, + params, + Xs, + ys, + fit_params, + n_splits, + error_score, ): if isinstance(est, Pipeline) and params is not None: return _do_pipeline( @@ -571,24 +570,24 @@ def new_group(): def _do_fit_step( - dsk, - next_token, - step, - cv, - fields, - tokens, - params, - Xs, - ys, - fit_params, - n_splits, - error_score, - step_fields_lk, - fit_params_lk, - field_to_index, - step_name, - none_passthrough, - is_transform, + dsk, + next_token, + step, + cv, + fields, + tokens, + params, + Xs, + ys, + fit_params, + n_splits, + error_score, + step_fields_lk, + fit_params_lk, + field_to_index, + step_name, + none_passthrough, + is_transform, ): sub_fields, sub_inds = map(list, unzip(step_fields_lk[step_name], 2)) sub_fit_params = fit_params_lk[step_name] @@ -710,19 +709,19 @@ def _do_fit_step( def _do_pipeline( - dsk, - next_token, - est, - cv, - fields, - tokens, - params, - Xs, - ys, - fit_params, - n_splits, - error_score, - is_transform, + dsk, + next_token, + est, + cv, + fields, + tokens, + params, + Xs, + ys, + fit_params, + n_splits, + error_score, + is_transform, ): if "steps" in fields: raise NotImplementedError("Setting Pipeline.steps in a gridsearch") @@ -803,18 +802,18 @@ def _do_n_samples(dsk, token, Xs, n_splits): def _do_featureunion( - dsk, - next_token, - est, - cv, - fields, - tokens, - params, - Xs, - ys, - fit_params, - n_splits, - error_score, + dsk, + next_token, + est, + cv, + fields, + tokens, + params, + Xs, + ys, + fit_params, + n_splits, + error_score, ): if "transformer_list" in fields: raise NotImplementedError( @@ -885,7 +884,7 @@ def _do_featureunion( m = 0 seen = {} for steps, Xs, wt, (w, wl), nsamp in zip( - zip(*fit_steps), zip(*tr_Xs), weight_tokens, weights, n_samples + zip(*fit_steps), zip(*tr_Xs), weight_tokens, weights, n_samples ): if (steps, wt) in seen: out_append(seen[steps, wt]) @@ -964,7 +963,7 @@ def compute_n_splits(cv, X, y=None, groups=None): return cv.get_n_splits(X, None, None) elif isinstance(cv, (LeaveOneGroupOut, LeavePGroupsOut)) and not is_dask_collection( - groups + groups ): # Only `groups` is referenced for these classes return cv.get_n_splits(None, None, groups) @@ -1515,18 +1514,18 @@ class GridSearchCV(StaticDaskSearchMixin, DaskBaseSearchCV): ) def __init__( - self, - estimator, - param_grid, - scoring=None, - iid=True, - refit=True, - cv=None, - error_score="raise", - return_train_score=_RETURN_TRAIN_SCORE_DEFAULT, - scheduler=None, - n_jobs=-1, - cache_cv=True, + self, + estimator, + param_grid, + scoring=None, + iid=True, + refit=True, + cv=None, + error_score="raise", + return_train_score=_RETURN_TRAIN_SCORE_DEFAULT, + scheduler=None, + n_jobs=-1, + cache_cv=True, ): super(GridSearchCV, self).__init__( estimator=estimator, @@ -1618,20 +1617,20 @@ class RandomizedSearchCV(StaticDaskSearchMixin, DaskBaseSearchCV): ) def __init__( - self, - estimator, - param_distributions, - n_iter=10, - random_state=None, - scoring=None, - iid=True, - refit=True, - cv=None, - error_score="raise", - return_train_score=_RETURN_TRAIN_SCORE_DEFAULT, - scheduler=None, - n_jobs=-1, - cache_cv=True, + self, + estimator, + param_distributions, + n_iter=10, + random_state=None, + scoring=None, + iid=True, + refit=True, + cv=None, + error_score="raise", + return_train_score=_RETURN_TRAIN_SCORE_DEFAULT, + scheduler=None, + n_jobs=-1, + cache_cv=True, ): super(RandomizedSearchCV, self).__init__( estimator=estimator, From 66f245e248cdbd8f5dba5ff4567d7ec06dfab7f8 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Tue, 4 Dec 2018 23:15:59 -0800 Subject: [PATCH 21/83] more formatting. --- dask_ml/model_selection/_search.py | 35 ++++++++++++++---------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index 7d84ec002..158e386aa 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -116,6 +116,7 @@ def build_cv_graph( return_train_score=_RETURN_TRAIN_SCORE_DEFAULT, cache_cv=True, ): + X, y, groups = to_indexable(X, y, groups) cv = check_cv(cv, y, is_classifier(estimator)) # "pairwise" estimators require a different graph for CV splitting @@ -199,7 +200,6 @@ def build_result_graph( metrics = list(scorer.keys()) else: metrics = None - dsk[cv_results] = ( create_cv_results, scores, @@ -220,7 +220,6 @@ def build_result_graph( best_params = "best-params-" + main_token dsk[best_params] = (get_best_params, candidate_params, cv_results, scorer) best_estimator = "best-estimator-" + main_token - if fit_params: fit_params = ( dict, @@ -238,6 +237,7 @@ def build_result_graph( return dsk, keys + def normalize_params(params): """Take a list of dictionaries, and tokenize/normalize.""" # Collect a set of all fields @@ -1020,17 +1020,17 @@ class DaskBaseSearchCV(BaseEstimator, MetaEstimatorMixin): """Base class for hyper parameter search with cross-validation.""" def __init__( - self, - estimator, - scoring=None, - iid=True, - refit=True, - cv=None, - error_score="raise", - return_train_score=_RETURN_TRAIN_SCORE_DEFAULT, - scheduler=None, - n_jobs=-1, - cache_cv=True, + self, + estimator, + scoring=None, + iid=True, + refit=True, + cv=None, + error_score="raise", + return_train_score=_RETURN_TRAIN_SCORE_DEFAULT, + scheduler=None, + n_jobs=-1, + cache_cv=True, ): self.scoring = scoring self.estimator = estimator @@ -1159,10 +1159,9 @@ def fit(self, X, y=None, groups=None, **fit_params): if self.multimetric_: if self.refit is not False and ( - not isinstance(self.refit, str) - or - # This will work for both dict / list (tuple) - self.refit not in scorer + not isinstance(self.refit, str) + # This will work for both dict / list (tuple) + or self.refit not in scorer ): raise ValueError( "For multi-metric scoring, the parameter " @@ -1184,7 +1183,6 @@ def fit(self, X, y=None, groups=None, **fit_params): ) candidate_params = list(self._get_param_iterator()) - dsk, keys, n_splits, main_token, X_name, y_name, weights = build_cv_graph( estimator, self.cv, @@ -1198,7 +1196,6 @@ def fit(self, X, y=None, groups=None, **fit_params): return_train_score=self.return_train_score, cache_cv=self.cache_cv ) - self.dask_graph_ = dsk self.n_splits_ = n_splits From cfdc7a8ab45aaa1c64abdad4cc43004ff0278f00 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Wed, 5 Dec 2018 00:33:29 -0800 Subject: [PATCH 22/83] fix sklearn tests and comment out as completed tests for now. --- dask_ml/model_selection/_search.py | 5 +++-- tests/model_selection/dask_searchcv/test_model_selection.py | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index 158e386aa..40ba48368 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -175,7 +175,7 @@ def build_cv_graph( return_train_score, ) - return dsk, scores, n_splits, main_token, X_name, y_name, weights + return dsk, scores, n_splits, main_token, X_name, y_name, weights, fit_params def build_result_graph( @@ -1183,7 +1183,7 @@ def fit(self, X, y=None, groups=None, **fit_params): ) candidate_params = list(self._get_param_iterator()) - dsk, keys, n_splits, main_token, X_name, y_name, weights = build_cv_graph( + dsk, keys, n_splits, main_token, X_name, y_name, weights, fit_params = build_cv_graph( estimator, self.cv, self.scorer_, @@ -1192,6 +1192,7 @@ def fit(self, X, y=None, groups=None, **fit_params): y=y, groups=groups, fit_params=fit_params, + iid=self.iid, error_score=error_score, return_train_score=self.return_train_score, cache_cv=self.cache_cv diff --git a/tests/model_selection/dask_searchcv/test_model_selection.py b/tests/model_selection/dask_searchcv/test_model_selection.py index a3f173201..58bbc3bcb 100644 --- a/tests/model_selection/dask_searchcv/test_model_selection.py +++ b/tests/model_selection/dask_searchcv/test_model_selection.py @@ -832,6 +832,7 @@ def score(self, X, y): def transform(self): pass +""" @pytest.mark.skipif("not has_distributed") def test_gather_as_completed_distributed(loop, tmpdir): num_cv = 3 @@ -844,8 +845,7 @@ def test_gather_as_completed_distributed(loop, tmpdir): gs.fit(X, y) tmpdir.remove() - - +""" def test_cv_multiplemetrics(): From 0871f3508f35a6bea10798bcfe72f57f2add12bd Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Wed, 5 Dec 2018 00:47:02 -0800 Subject: [PATCH 23/83] move dask graph and n_splits to the proper place. --- dask_ml/model_selection/_search.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index 40ba48368..aa7d14b1c 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -1197,8 +1197,6 @@ def fit(self, X, y=None, groups=None, **fit_params): return_train_score=self.return_train_score, cache_cv=self.cache_cv ) - self.dask_graph_ = dsk - self.n_splits_ = n_splits n_jobs = _normalize_n_jobs(self.n_jobs) scheduler = dask.base.get_scheduler(scheduler=self.scheduler) @@ -1240,6 +1238,8 @@ def fit(self, X, y=None, groups=None, **fit_params): multimetric ) + self.dask_graph_ = dsk + self.n_splits_ = n_splits out = scheduler(dsk, keys, num_workers=n_jobs) results = handle_deprecated_train_score(out[0], self.return_train_score) self.cv_results_ = results From 61ea9dc2562890213ad7c58e0b9679dc3b0ce009 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Wed, 5 Dec 2018 00:48:03 -0800 Subject: [PATCH 24/83] move dask_graph and n_splits next to results given they are all modifying private attributes. --- dask_ml/model_selection/_search.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index aa7d14b1c..9d8163ecf 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -1238,10 +1238,10 @@ def fit(self, X, y=None, groups=None, **fit_params): multimetric ) - self.dask_graph_ = dsk - self.n_splits_ = n_splits out = scheduler(dsk, keys, num_workers=n_jobs) results = handle_deprecated_train_score(out[0], self.return_train_score) + self.dask_graph_ = dsk + self.n_splits_ = n_splits self.cv_results_ = results if self.refit: From 111d77aaf70457913c3ccc7fd8cf1242f76d1a60 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Wed, 5 Dec 2018 01:13:28 -0800 Subject: [PATCH 25/83] add back try/catch for distributed model --- dask_ml/model_selection/_search.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index 9d8163ecf..a0108c8a8 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -42,7 +42,6 @@ cv_extract_params, cv_n_samples, cv_split, - decompress_params, feature_union, feature_union_concat, fit, @@ -60,6 +59,10 @@ except ImportError: # pragma: no cover from toolz import get, pluck +try: + from dask.distributed import as_completed +except ImportError: + pass __all__ = ["GridSearchCV", "RandomizedSearchCV"] @@ -1208,14 +1211,7 @@ def fit(self, X, y=None, groups=None, **fit_params): if 'Client' in type(getattr(scheduler, '__self__', None)).__name__: futures = scheduler(dsk, keys, num_workers=n_jobs, sync=False) - score_map = {} - for future, result in dask.distributed.as_completed(futures, with_results=True): - if future.status == 'finished': - score_map[future.key] = result - future.cancel() - if len(score_map) == len(keys): - break - + score_map = {f.key: res for f, res in as_completed(futures, with_results=True)} # Sort scores by score_keys so parameters line up scores = [score_map[k] for k in keys] else: From 1dea13217772dfb4d281af1f80e8bdc514444b7b Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Wed, 5 Dec 2018 03:03:00 -0800 Subject: [PATCH 26/83] scores map dict comprehension with batches. --- dask_ml/model_selection/_search.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index a0108c8a8..678dd0cb9 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -1211,12 +1211,18 @@ def fit(self, X, y=None, groups=None, **fit_params): if 'Client' in type(getattr(scheduler, '__self__', None)).__name__: futures = scheduler(dsk, keys, num_workers=n_jobs, sync=False) - score_map = {f.key: res for f, res in as_completed(futures, with_results=True)} - # Sort scores by score_keys so parameters line up - scores = [score_map[k] for k in keys] + scores_map = { + f.key: res + for batch in as_completed(futures, with_results=True).batches() + for f, res in batch + } + scores = [scores_map[k] for k in keys] else: scores = scheduler(dsk, keys, num_workers=n_jobs) + for key in keys: + dsk.pop(key) + dsk, keys = build_result_graph( dsk, main_token, From d6d15ca1e9d2b45c8c56b3b5fca387d25a1e8bee Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Fri, 7 Dec 2018 21:15:57 -0800 Subject: [PATCH 27/83] only remove keys that have been processed / exist. This also fixes the cache test --- dask_ml/model_selection/_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index 678dd0cb9..7b38064b9 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -1221,7 +1221,7 @@ def fit(self, X, y=None, groups=None, **fit_params): scores = scheduler(dsk, keys, num_workers=n_jobs) for key in keys: - dsk.pop(key) + dsk.pop(key, None) dsk, keys = build_result_graph( dsk, From df861703f84845cd31e06483a0ef13edfddbcd28 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Fri, 7 Dec 2018 21:56:49 -0800 Subject: [PATCH 28/83] fix linting --- dask_ml/model_selection/_search.py | 332 ++++++++++++++--------------- 1 file changed, 166 insertions(+), 166 deletions(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index 7b38064b9..dc1f041f2 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -66,11 +66,11 @@ __all__ = ["GridSearchCV", "RandomizedSearchCV"] - if SK_VERSION <= packaging.version.parse("0.21.dev0"): _RETURN_TRAIN_SCORE_DEFAULT = "warn" + def handle_deprecated_train_score(results, return_train_score): if return_train_score == "warn": results = DeprecationDict(results) @@ -89,6 +89,7 @@ def handle_deprecated_train_score(results, return_train_score): else: _RETURN_TRAIN_SCORE_DEFAULT = False + def handle_deprecated_train_score(results, return_train_score): return results @@ -106,20 +107,19 @@ def __call__(self, est): def build_cv_graph( - estimator, - cv, - scorer, - candidate_params, - X, - y=None, - groups=None, - fit_params=None, - iid=True, - error_score="raise", - return_train_score=_RETURN_TRAIN_SCORE_DEFAULT, - cache_cv=True, + estimator, + cv, + scorer, + candidate_params, + X, + y=None, + groups=None, + fit_params=None, + iid=True, + error_score="raise", + return_train_score=_RETURN_TRAIN_SCORE_DEFAULT, + cache_cv=True, ): - X, y, groups = to_indexable(X, y, groups) cv = check_cv(cv, y, is_classifier(estimator)) # "pairwise" estimators require a different graph for CV splitting @@ -182,22 +182,21 @@ def build_cv_graph( def build_result_graph( - dsk, - main_token, - estimator, - X_name, - y_name, - fit_params, - n_splits, - error_score, - scorer, - candidate_params, - scores, - weights, - refit, - multimetric + dsk, + main_token, + estimator, + X_name, + y_name, + fit_params, + n_splits, + error_score, + scorer, + candidate_params, + scores, + weights, + refit, + multimetric ): - cv_results = "cv-results-" + main_token if multimetric: metrics = list(scorer.keys()) @@ -279,20 +278,20 @@ def _group_fit_params(steps, fit_params): def do_fit_and_score( - dsk, - main_token, - est, - cv, - fields, - tokens, - params, - X, - y, - fit_params, - n_splits, - error_score, - scorer, - return_train_score, + dsk, + main_token, + est, + cv, + fields, + tokens, + params, + X, + y, + fit_params, + n_splits, + error_score, + scorer, + return_train_score, ): if not isinstance(est, Pipeline): # Fitting and scoring can all be done as a single task @@ -385,18 +384,18 @@ def do_fit_and_score( def do_fit( - dsk, - next_token, - est, - cv, - fields, - tokens, - params, - Xs, - ys, - fit_params, - n_splits, - error_score, + dsk, + next_token, + est, + cv, + fields, + tokens, + params, + Xs, + ys, + fit_params, + n_splits, + error_score, ): if isinstance(est, Pipeline) and params is not None: return _do_pipeline( @@ -455,18 +454,18 @@ def do_fit( def do_fit_transform( - dsk, - next_token, - est, - cv, - fields, - tokens, - params, - Xs, - ys, - fit_params, - n_splits, - error_score, + dsk, + next_token, + est, + cv, + fields, + tokens, + params, + Xs, + ys, + fit_params, + n_splits, + error_score, ): if isinstance(est, Pipeline) and params is not None: return _do_pipeline( @@ -573,24 +572,24 @@ def new_group(): def _do_fit_step( - dsk, - next_token, - step, - cv, - fields, - tokens, - params, - Xs, - ys, - fit_params, - n_splits, - error_score, - step_fields_lk, - fit_params_lk, - field_to_index, - step_name, - none_passthrough, - is_transform, + dsk, + next_token, + step, + cv, + fields, + tokens, + params, + Xs, + ys, + fit_params, + n_splits, + error_score, + step_fields_lk, + fit_params_lk, + field_to_index, + step_name, + none_passthrough, + is_transform, ): sub_fields, sub_inds = map(list, unzip(step_fields_lk[step_name], 2)) sub_fit_params = fit_params_lk[step_name] @@ -712,19 +711,19 @@ def _do_fit_step( def _do_pipeline( - dsk, - next_token, - est, - cv, - fields, - tokens, - params, - Xs, - ys, - fit_params, - n_splits, - error_score, - is_transform, + dsk, + next_token, + est, + cv, + fields, + tokens, + params, + Xs, + ys, + fit_params, + n_splits, + error_score, + is_transform, ): if "steps" in fields: raise NotImplementedError("Setting Pipeline.steps in a gridsearch") @@ -805,18 +804,18 @@ def _do_n_samples(dsk, token, Xs, n_splits): def _do_featureunion( - dsk, - next_token, - est, - cv, - fields, - tokens, - params, - Xs, - ys, - fit_params, - n_splits, - error_score, + dsk, + next_token, + est, + cv, + fields, + tokens, + params, + Xs, + ys, + fit_params, + n_splits, + error_score, ): if "transformer_list" in fields: raise NotImplementedError( @@ -887,7 +886,7 @@ def _do_featureunion( m = 0 seen = {} for steps, Xs, wt, (w, wl), nsamp in zip( - zip(*fit_steps), zip(*tr_Xs), weight_tokens, weights, n_samples + zip(*fit_steps), zip(*tr_Xs), weight_tokens, weights, n_samples ): if (steps, wt) in seen: out_append(seen[steps, wt]) @@ -966,7 +965,7 @@ def compute_n_splits(cv, X, y=None, groups=None): return cv.get_n_splits(X, None, None) elif isinstance(cv, (LeaveOneGroupOut, LeavePGroupsOut)) and not is_dask_collection( - groups + groups ): # Only `groups` is referenced for these classes return cv.get_n_splits(None, None, groups) @@ -1023,17 +1022,17 @@ class DaskBaseSearchCV(BaseEstimator, MetaEstimatorMixin): """Base class for hyper parameter search with cross-validation.""" def __init__( - self, - estimator, - scoring=None, - iid=True, - refit=True, - cv=None, - error_score="raise", - return_train_score=_RETURN_TRAIN_SCORE_DEFAULT, - scheduler=None, - n_jobs=-1, - cache_cv=True, + self, + estimator, + scoring=None, + iid=True, + refit=True, + cv=None, + error_score="raise", + return_train_score=_RETURN_TRAIN_SCORE_DEFAULT, + scheduler=None, + n_jobs=-1, + cache_cv=True, ): self.scoring = scoring self.estimator = estimator @@ -1162,9 +1161,9 @@ def fit(self, X, y=None, groups=None, **fit_params): if self.multimetric_: if self.refit is not False and ( - not isinstance(self.refit, str) - # This will work for both dict / list (tuple) - or self.refit not in scorer + not isinstance(self.refit, str) + # This will work for both dict / list (tuple) + or self.refit not in scorer ): raise ValueError( "For multi-metric scoring, the parameter " @@ -1186,20 +1185,21 @@ def fit(self, X, y=None, groups=None, **fit_params): ) candidate_params = list(self._get_param_iterator()) - dsk, keys, n_splits, main_token, X_name, y_name, weights, fit_params = build_cv_graph( - estimator, - self.cv, - self.scorer_, - candidate_params, - X, - y=y, - groups=groups, - fit_params=fit_params, - iid=self.iid, - error_score=error_score, - return_train_score=self.return_train_score, - cache_cv=self.cache_cv - ) + (dsk, keys, n_splits, main_token, X_name, y_name, weights, fit_params) = \ + build_cv_graph( + estimator, + self.cv, + self.scorer_, + candidate_params, + X, + y=y, + groups=groups, + fit_params=fit_params, + iid=self.iid, + error_score=error_score, + return_train_score=self.return_train_score, + cache_cv=self.cache_cv + ) n_jobs = _normalize_n_jobs(self.n_jobs) scheduler = dask.base.get_scheduler(scheduler=self.scheduler) @@ -1514,18 +1514,18 @@ class GridSearchCV(StaticDaskSearchMixin, DaskBaseSearchCV): ) def __init__( - self, - estimator, - param_grid, - scoring=None, - iid=True, - refit=True, - cv=None, - error_score="raise", - return_train_score=_RETURN_TRAIN_SCORE_DEFAULT, - scheduler=None, - n_jobs=-1, - cache_cv=True, + self, + estimator, + param_grid, + scoring=None, + iid=True, + refit=True, + cv=None, + error_score="raise", + return_train_score=_RETURN_TRAIN_SCORE_DEFAULT, + scheduler=None, + n_jobs=-1, + cache_cv=True, ): super(GridSearchCV, self).__init__( estimator=estimator, @@ -1617,20 +1617,20 @@ class RandomizedSearchCV(StaticDaskSearchMixin, DaskBaseSearchCV): ) def __init__( - self, - estimator, - param_distributions, - n_iter=10, - random_state=None, - scoring=None, - iid=True, - refit=True, - cv=None, - error_score="raise", - return_train_score=_RETURN_TRAIN_SCORE_DEFAULT, - scheduler=None, - n_jobs=-1, - cache_cv=True, + self, + estimator, + param_distributions, + n_iter=10, + random_state=None, + scoring=None, + iid=True, + refit=True, + cv=None, + error_score="raise", + return_train_score=_RETURN_TRAIN_SCORE_DEFAULT, + scheduler=None, + n_jobs=-1, + cache_cv=True, ): super(RandomizedSearchCV, self).__init__( estimator=estimator, From 577d987c799d08c61c48eebf408e2d2d623a702a Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Fri, 7 Dec 2018 21:59:51 -0800 Subject: [PATCH 29/83] remove as completed test until we find a better way. --- .../dask_searchcv/test_model_selection.py | 50 ------------------- 1 file changed, 50 deletions(-) diff --git a/tests/model_selection/dask_searchcv/test_model_selection.py b/tests/model_selection/dask_searchcv/test_model_selection.py index 58bbc3bcb..2066e7231 100644 --- a/tests/model_selection/dask_searchcv/test_model_selection.py +++ b/tests/model_selection/dask_searchcv/test_model_selection.py @@ -797,56 +797,6 @@ def f(dask_scheduler): assert client.run_on_scheduler(f) # some work happened on cluster -from time import sleep -from sklearn.base import BaseEstimator -import os.path -from dask.distributed import get_client -import sys - -class TestAsCompletedEstimator(BaseEstimator): - def __init__(self, i=None, out_path=None, num_cv=None, scheduler=None, loop=None): - self.i = i - self.num_cv = num_cv - self.out_path = out_path - self.scheduler = scheduler - self.loop = loop - - def fit(self, X, y): - out_file = self.out_path.join(f'{self.i}.txt') - - if self.i == (self.num_cv-1): - files = os.listdir(self.out_path) - while len(files) < (self.num_cv-1): - files = os.listdir(self.out_path) - sleep(0.05) - c = get_client() - t = 1 - - out_file.write('done') - - return 1 - - def score(self, X, y): - return 1 - - def transform(self): - pass - -""" -@pytest.mark.skipif("not has_distributed") -def test_gather_as_completed_distributed(loop, tmpdir): - num_cv = 3 - ids = list(range(0, num_cv)) - - X, y = make_classification(n_samples=100, n_features=10, random_state=0) - with cluster() as (s, [a, b]): - with Client(s["address"], loop=loop) as client: - gs = dcv.GridSearchCV(TestAsCompletedEstimator(out_path=tmpdir, num_cv=num_cv), {"i": ids}, cv=3) - gs.fit(X, y) - - tmpdir.remove() -""" - def test_cv_multiplemetrics(): X, y = make_classification(random_state=0) From 0d8dbc6def3829d973e5d636da741618fdb4fb5c Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Fri, 7 Dec 2018 22:16:14 -0800 Subject: [PATCH 30/83] more flake8 fixes. --- dask_ml/model_selection/_search.py | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index dc1f041f2..ea1e68624 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -70,7 +70,6 @@ _RETURN_TRAIN_SCORE_DEFAULT = "warn" - def handle_deprecated_train_score(results, return_train_score): if return_train_score == "warn": results = DeprecationDict(results) @@ -89,7 +88,6 @@ def handle_deprecated_train_score(results, return_train_score): else: _RETURN_TRAIN_SCORE_DEFAULT = False - def handle_deprecated_train_score(results, return_train_score): return results @@ -1187,19 +1185,19 @@ def fit(self, X, y=None, groups=None, **fit_params): candidate_params = list(self._get_param_iterator()) (dsk, keys, n_splits, main_token, X_name, y_name, weights, fit_params) = \ build_cv_graph( - estimator, - self.cv, - self.scorer_, - candidate_params, - X, - y=y, - groups=groups, - fit_params=fit_params, - iid=self.iid, - error_score=error_score, - return_train_score=self.return_train_score, - cache_cv=self.cache_cv - ) + estimator, + self.cv, + self.scorer_, + candidate_params, + X, + y=y, + groups=groups, + fit_params=fit_params, + iid=self.iid, + error_score=error_score, + return_train_score=self.return_train_score, + cache_cv=self.cache_cv + ) n_jobs = _normalize_n_jobs(self.n_jobs) scheduler = dask.base.get_scheduler(scheduler=self.scheduler) From cc70fe078d8e591cd23de4567f69690cfe98fcaa Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Fri, 7 Dec 2018 22:31:57 -0800 Subject: [PATCH 31/83] more flake8 fixes. --- dask_ml/model_selection/_search.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index ea1e68624..336c45e6d 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -1185,19 +1185,19 @@ def fit(self, X, y=None, groups=None, **fit_params): candidate_params = list(self._get_param_iterator()) (dsk, keys, n_splits, main_token, X_name, y_name, weights, fit_params) = \ build_cv_graph( - estimator, - self.cv, - self.scorer_, - candidate_params, - X, - y=y, - groups=groups, - fit_params=fit_params, - iid=self.iid, - error_score=error_score, - return_train_score=self.return_train_score, - cache_cv=self.cache_cv - ) + estimator, + self.cv, + self.scorer_, + candidate_params, + X, + y=y, + groups=groups, + fit_params=fit_params, + iid=self.iid, + error_score=error_score, + return_train_score=self.return_train_score, + cache_cv=self.cache_cv + ) n_jobs = _normalize_n_jobs(self.n_jobs) scheduler = dask.base.get_scheduler(scheduler=self.scheduler) From 7967c5c6699eb9c13b4f61c025003554077bb948 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Fri, 7 Dec 2018 22:46:31 -0800 Subject: [PATCH 32/83] remove uncessary format changes. --- dask_ml/model_selection/_search.py | 246 ++++++++++++++--------------- 1 file changed, 123 insertions(+), 123 deletions(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index 336c45e6d..faf207766 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -276,20 +276,20 @@ def _group_fit_params(steps, fit_params): def do_fit_and_score( - dsk, - main_token, - est, - cv, - fields, - tokens, - params, - X, - y, - fit_params, - n_splits, - error_score, - scorer, - return_train_score, + dsk, + main_token, + est, + cv, + fields, + tokens, + params, + X, + y, + fit_params, + n_splits, + error_score, + scorer, + return_train_score, ): if not isinstance(est, Pipeline): # Fitting and scoring can all be done as a single task @@ -382,18 +382,18 @@ def do_fit_and_score( def do_fit( - dsk, - next_token, - est, - cv, - fields, - tokens, - params, - Xs, - ys, - fit_params, - n_splits, - error_score, + dsk, + next_token, + est, + cv, + fields, + tokens, + params, + Xs, + ys, + fit_params, + n_splits, + error_score, ): if isinstance(est, Pipeline) and params is not None: return _do_pipeline( @@ -452,18 +452,18 @@ def do_fit( def do_fit_transform( - dsk, - next_token, - est, - cv, - fields, - tokens, - params, - Xs, - ys, - fit_params, - n_splits, - error_score, + dsk, + next_token, + est, + cv, + fields, + tokens, + params, + Xs, + ys, + fit_params, + n_splits, + error_score, ): if isinstance(est, Pipeline) and params is not None: return _do_pipeline( @@ -570,24 +570,24 @@ def new_group(): def _do_fit_step( - dsk, - next_token, - step, - cv, - fields, - tokens, - params, - Xs, - ys, - fit_params, - n_splits, - error_score, - step_fields_lk, - fit_params_lk, - field_to_index, - step_name, - none_passthrough, - is_transform, + dsk, + next_token, + step, + cv, + fields, + tokens, + params, + Xs, + ys, + fit_params, + n_splits, + error_score, + step_fields_lk, + fit_params_lk, + field_to_index, + step_name, + none_passthrough, + is_transform, ): sub_fields, sub_inds = map(list, unzip(step_fields_lk[step_name], 2)) sub_fit_params = fit_params_lk[step_name] @@ -709,19 +709,19 @@ def _do_fit_step( def _do_pipeline( - dsk, - next_token, - est, - cv, - fields, - tokens, - params, - Xs, - ys, - fit_params, - n_splits, - error_score, - is_transform, + dsk, + next_token, + est, + cv, + fields, + tokens, + params, + Xs, + ys, + fit_params, + n_splits, + error_score, + is_transform, ): if "steps" in fields: raise NotImplementedError("Setting Pipeline.steps in a gridsearch") @@ -802,18 +802,18 @@ def _do_n_samples(dsk, token, Xs, n_splits): def _do_featureunion( - dsk, - next_token, - est, - cv, - fields, - tokens, - params, - Xs, - ys, - fit_params, - n_splits, - error_score, + dsk, + next_token, + est, + cv, + fields, + tokens, + params, + Xs, + ys, + fit_params, + n_splits, + error_score, ): if "transformer_list" in fields: raise NotImplementedError( @@ -884,7 +884,7 @@ def _do_featureunion( m = 0 seen = {} for steps, Xs, wt, (w, wl), nsamp in zip( - zip(*fit_steps), zip(*tr_Xs), weight_tokens, weights, n_samples + zip(*fit_steps), zip(*tr_Xs), weight_tokens, weights, n_samples ): if (steps, wt) in seen: out_append(seen[steps, wt]) @@ -963,7 +963,7 @@ def compute_n_splits(cv, X, y=None, groups=None): return cv.get_n_splits(X, None, None) elif isinstance(cv, (LeaveOneGroupOut, LeavePGroupsOut)) and not is_dask_collection( - groups + groups ): # Only `groups` is referenced for these classes return cv.get_n_splits(None, None, groups) @@ -1020,17 +1020,17 @@ class DaskBaseSearchCV(BaseEstimator, MetaEstimatorMixin): """Base class for hyper parameter search with cross-validation.""" def __init__( - self, - estimator, - scoring=None, - iid=True, - refit=True, - cv=None, - error_score="raise", - return_train_score=_RETURN_TRAIN_SCORE_DEFAULT, - scheduler=None, - n_jobs=-1, - cache_cv=True, + self, + estimator, + scoring=None, + iid=True, + refit=True, + cv=None, + error_score="raise", + return_train_score=_RETURN_TRAIN_SCORE_DEFAULT, + scheduler=None, + n_jobs=-1, + cache_cv=True, ): self.scoring = scoring self.estimator = estimator @@ -1159,9 +1159,9 @@ def fit(self, X, y=None, groups=None, **fit_params): if self.multimetric_: if self.refit is not False and ( - not isinstance(self.refit, str) - # This will work for both dict / list (tuple) - or self.refit not in scorer + not isinstance(self.refit, str) + # This will work for both dict / list (tuple) + or self.refit not in scorer ): raise ValueError( "For multi-metric scoring, the parameter " @@ -1512,18 +1512,18 @@ class GridSearchCV(StaticDaskSearchMixin, DaskBaseSearchCV): ) def __init__( - self, - estimator, - param_grid, - scoring=None, - iid=True, - refit=True, - cv=None, - error_score="raise", - return_train_score=_RETURN_TRAIN_SCORE_DEFAULT, - scheduler=None, - n_jobs=-1, - cache_cv=True, + self, + estimator, + param_grid, + scoring=None, + iid=True, + refit=True, + cv=None, + error_score="raise", + return_train_score=_RETURN_TRAIN_SCORE_DEFAULT, + scheduler=None, + n_jobs=-1, + cache_cv=True, ): super(GridSearchCV, self).__init__( estimator=estimator, @@ -1615,20 +1615,20 @@ class RandomizedSearchCV(StaticDaskSearchMixin, DaskBaseSearchCV): ) def __init__( - self, - estimator, - param_distributions, - n_iter=10, - random_state=None, - scoring=None, - iid=True, - refit=True, - cv=None, - error_score="raise", - return_train_score=_RETURN_TRAIN_SCORE_DEFAULT, - scheduler=None, - n_jobs=-1, - cache_cv=True, + self, + estimator, + param_distributions, + n_iter=10, + random_state=None, + scoring=None, + iid=True, + refit=True, + cv=None, + error_score="raise", + return_train_score=_RETURN_TRAIN_SCORE_DEFAULT, + scheduler=None, + n_jobs=-1, + cache_cv=True, ): super(RandomizedSearchCV, self).__init__( estimator=estimator, From 8105ea86aeadfbe505e8f019dbf03a4a0167e27e Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Fri, 7 Dec 2018 23:01:48 -0800 Subject: [PATCH 33/83] black formatting --- dask_ml/model_selection/_search.py | 92 ++++++++++++++++-------------- 1 file changed, 50 insertions(+), 42 deletions(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index faf207766..0dbdc40c0 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -105,18 +105,18 @@ def __call__(self, est): def build_cv_graph( - estimator, - cv, - scorer, - candidate_params, - X, - y=None, - groups=None, - fit_params=None, - iid=True, - error_score="raise", - return_train_score=_RETURN_TRAIN_SCORE_DEFAULT, - cache_cv=True, + estimator, + cv, + scorer, + candidate_params, + X, + y=None, + groups=None, + fit_params=None, + iid=True, + error_score="raise", + return_train_score=_RETURN_TRAIN_SCORE_DEFAULT, + cache_cv=True, ): X, y, groups = to_indexable(X, y, groups) cv = check_cv(cv, y, is_classifier(estimator)) @@ -180,20 +180,20 @@ def build_cv_graph( def build_result_graph( - dsk, - main_token, - estimator, - X_name, - y_name, - fit_params, - n_splits, - error_score, - scorer, - candidate_params, - scores, - weights, - refit, - multimetric + dsk, + main_token, + estimator, + X_name, + y_name, + fit_params, + n_splits, + error_score, + scorer, + candidate_params, + scores, + weights, + refit, + multimetric, ): cv_results = "cv-results-" + main_token if multimetric: @@ -1183,20 +1183,28 @@ def fit(self, X, y=None, groups=None, **fit_params): ) candidate_params = list(self._get_param_iterator()) - (dsk, keys, n_splits, main_token, X_name, y_name, weights, fit_params) = \ - build_cv_graph( - estimator, - self.cv, - self.scorer_, - candidate_params, - X, - y=y, - groups=groups, - fit_params=fit_params, - iid=self.iid, - error_score=error_score, - return_train_score=self.return_train_score, - cache_cv=self.cache_cv + ( + dsk, + keys, + n_splits, + main_token, + X_name, + y_name, + weights, + fit_params, + ) = build_cv_graph( + estimator, + self.cv, + self.scorer_, + candidate_params, + X, + y=y, + groups=groups, + fit_params=fit_params, + iid=self.iid, + error_score=error_score, + return_train_score=self.return_train_score, + cache_cv=self.cache_cv, ) n_jobs = _normalize_n_jobs(self.n_jobs) @@ -1207,7 +1215,7 @@ def fit(self, X, y=None, groups=None, **fit_params): if scheduler is dask.threaded.get and n_jobs == 1: scheduler = dask.local.get_sync - if 'Client' in type(getattr(scheduler, '__self__', None)).__name__: + if "Client" in type(getattr(scheduler, "__self__", None)).__name__: futures = scheduler(dsk, keys, num_workers=n_jobs, sync=False) scores_map = { f.key: res @@ -1235,7 +1243,7 @@ def fit(self, X, y=None, groups=None, **fit_params): scores, weights, self.refit, - multimetric + multimetric, ) out = scheduler(dsk, keys, num_workers=n_jobs) From fc84640cec1310d38ab817910beda62d7768937b Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Sat, 8 Dec 2018 23:13:17 -0800 Subject: [PATCH 34/83] compute results locally, in progress refit graph --- dask_ml/model_selection/_search.py | 140 +++++++++++++---------------- 1 file changed, 60 insertions(+), 80 deletions(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index 0dbdc40c0..50f7dc6e2 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -175,68 +175,38 @@ def build_cv_graph( scorer, return_train_score, ) + keys = [weights] + scores + return dsk, keys, n_splits, main_token, X_name, y_name, fit_params - return dsk, scores, n_splits, main_token, X_name, y_name, weights, fit_params - -def build_result_graph( - dsk, - main_token, - estimator, - X_name, - y_name, - fit_params, - n_splits, - error_score, - scorer, - candidate_params, - scores, - weights, - refit, - multimetric, -): - cv_results = "cv-results-" + main_token - if multimetric: - metrics = list(scorer.keys()) - else: - metrics = None - dsk[cv_results] = ( - create_cv_results, - scores, +def build_refit_graph( + dsk, + main_token, + estimator, + scorer, + X_name, + y_name, + fit_params, candidate_params, - n_splits, - error_score, - weights, - metrics, - ) - keys = [cv_results] + cv_results - if refit: - if multimetric: - scorer = refit - else: - scorer = "score" - - best_params = "best-params-" + main_token - dsk[best_params] = (get_best_params, candidate_params, cv_results, scorer) - best_estimator = "best-estimator-" + main_token - if fit_params: - fit_params = ( - dict, - (zip, list(fit_params.keys()), list(pluck(1, fit_params.values()))), - ) - dsk[best_estimator] = ( - fit_best, - clone(estimator), - best_params, - X_name, - y_name, - fit_params, +): + best_params = get_best_params(candidate_params, cv_results, scorer) + best_estimator = "best-estimator-" + main_token + if fit_params: + fit_params = ( + dict, + (zip, list(fit_params.keys()), list(pluck(1, fit_params.values()))), ) - keys.append(best_estimator) - - return dsk, keys - + dsk[best_estimator] = ( + fit_best, + clone(estimator), + best_params, + X_name, + y_name, + fit_params, + ) + return dsk, [best_params] def normalize_params(params): """Take a list of dictionaries, and tokenize/normalize.""" @@ -1190,7 +1160,6 @@ def fit(self, X, y=None, groups=None, **fit_params): main_token, X_name, y_name, - weights, fit_params, ) = build_cv_graph( estimator, @@ -1217,51 +1186,62 @@ def fit(self, X, y=None, groups=None, **fit_params): if "Client" in type(getattr(scheduler, "__self__", None)).__name__: futures = scheduler(dsk, keys, num_workers=n_jobs, sync=False) - scores_map = { + result_map = { f.key: res for batch in as_completed(futures, with_results=True).batches() for f, res in batch } - scores = [scores_map[k] for k in keys] + out = [result_map[k] for k in keys] else: - scores = scheduler(dsk, keys, num_workers=n_jobs) + out = scheduler(dsk, keys, num_workers=n_jobs) - for key in keys: - dsk.pop(key, None) + weights = out[0] + scores = out[1::] - dsk, keys = build_result_graph( - dsk, - main_token, - estimator, - X_name, - y_name, - fit_params, + if multimetric: + metrics = list(scorer.keys()) + else: + metrics = None + + cv_results = create_cv_results( + scores, + candidate_params, n_splits, error_score, - self.scorer_, - candidate_params, - scores, weights, - self.refit, - multimetric, + metrics, ) - out = scheduler(dsk, keys, num_workers=n_jobs) - results = handle_deprecated_train_score(out[0], self.return_train_score) + results = handle_deprecated_train_score(cv_results, self.return_train_score) self.dask_graph_ = dsk self.n_splits_ = n_splits self.cv_results_ = results if self.refit: if self.multimetric_: - key = self.refit + scorer = self.refit else: - key = "score" + scorer = "score" + + dsk, keys = build_refit_graph( + dsk, + main_token, + estimator, + scorer, + X_name, + y_name, + fit_params, + candidate_params, + cv_results + ) + + out = scheduler(dsk, keys, num_workers=n_jobs) + self.best_index_ = np.flatnonzero(results["rank_test_{}".format(key)] == 1)[ 0 ] - self.best_estimator_ = out[1] + self.best_estimator_ = out[0] return self From bb545229144c77a459465b4322a7e48b23ffaffe Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Sat, 8 Dec 2018 23:25:15 -0800 Subject: [PATCH 35/83] fix a few broken tests. 2 more left --- dask_ml/model_selection/_search.py | 36 +++++++++------- .../dask_searchcv/test_model_selection.py | 41 +++++++++++++++++++ 2 files changed, 62 insertions(+), 15 deletions(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index 50f7dc6e2..8c1b05eef 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -176,21 +176,31 @@ def build_cv_graph( return_train_score, ) keys = [weights] + scores - return dsk, keys, n_splits, main_token, X_name, y_name, fit_params + return dsk, keys, n_splits, fit_params def build_refit_graph( - dsk, - main_token, estimator, scorer, - X_name, - y_name, + X, + y, + groups, fit_params, candidate_params, cv_results - ): + X, y, groups = to_indexable(X, y, groups) + dsk = {} + X_name, y_name, groups_name = to_keys(dsk, X, y, groups) + + main_token = tokenize( + normalize_estimator(estimator), + X_name, + y_name, + groups_name, + fit_params, + ) + best_params = get_best_params(candidate_params, cv_results, scorer) best_estimator = "best-estimator-" + main_token if fit_params: @@ -206,7 +216,7 @@ def build_refit_graph( y_name, fit_params, ) - return dsk, [best_params] + return dsk, [best_estimator] def normalize_params(params): """Take a list of dictionaries, and tokenize/normalize.""" @@ -1157,9 +1167,6 @@ def fit(self, X, y=None, groups=None, **fit_params): dsk, keys, n_splits, - main_token, - X_name, - y_name, fit_params, ) = build_cv_graph( estimator, @@ -1224,12 +1231,11 @@ def fit(self, X, y=None, groups=None, **fit_params): scorer = "score" dsk, keys = build_refit_graph( - dsk, - main_token, estimator, scorer, - X_name, - y_name, + X, + y, + groups, fit_params, candidate_params, cv_results @@ -1237,7 +1243,7 @@ def fit(self, X, y=None, groups=None, **fit_params): out = scheduler(dsk, keys, num_workers=n_jobs) - self.best_index_ = np.flatnonzero(results["rank_test_{}".format(key)] == 1)[ + self.best_index_ = np.flatnonzero(results["rank_test_{}".format(scorer)] == 1)[ 0 ] diff --git a/tests/model_selection/dask_searchcv/test_model_selection.py b/tests/model_selection/dask_searchcv/test_model_selection.py index 2066e7231..e96abd5ad 100644 --- a/tests/model_selection/dask_searchcv/test_model_selection.py +++ b/tests/model_selection/dask_searchcv/test_model_selection.py @@ -798,6 +798,47 @@ def f(dask_scheduler): assert client.run_on_scheduler(f) # some work happened on cluster +from distributed.diagnostics.plugin import SchedulerPlugin + +class AsCompletedPlugin(SchedulerPlugin): + def __init__(self): + self.finshed_tasks = set() + self.events = [] + + def transition(self, key, start, finish, *args, **kwargs): + self.events.append((key, start, finish)) + + if 'fit-score' in key and key in self.finshed_tasks: + print(f'{key,start,finish} shiiiiit') + raise Exception + + if start == 'memory' and finish == 'released': + self.finshed_tasks.add(key) + + + def restart(self, schduler, **kwargs): + self.finshed_tasks = set() + self.events = [] + +from dask.distributed import LocalCluster +@pytest.mark.skipif("not has_distributed") +def test_as_completed_distributed(): + + lc = LocalCluster(n_workers=2) + acpg = AsCompletedPlugin() + lc.scheduler.add_plugin(acpg) + X, y = make_classification(n_samples=100, n_features=10, random_state=0) + with Client(lc) as client: + gs = dcv.GridSearchCV(MockClassifier(), {"foo_param": [0, 1, 2]}, cv=3, refit=False) + gs.fit(X, y) + + def f(dask_scheduler): + return len(dask_scheduler.transition_log) + + client.run_on_scheduler(f) + t = 1 + assert client.run_on_scheduler(f) # some work happened on cluster + def test_cv_multiplemetrics(): X, y = make_classification(random_state=0) From 6d5e242260fe3f4f2348be910e9e9e8df9a531ef Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Sun, 9 Dec 2018 08:46:21 -0800 Subject: [PATCH 36/83] fix multimetric tests, flake8 and black formatting --- dask_ml/model_selection/_search.py | 48 ++++++++++-------------------- 1 file changed, 16 insertions(+), 32 deletions(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index 8c1b05eef..9ce909efa 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -175,30 +175,19 @@ def build_cv_graph( scorer, return_train_score, ) - keys = [weights] + scores + keys = [weights] + scores if weights else scores return dsk, keys, n_splits, fit_params def build_refit_graph( - estimator, - scorer, - X, - y, - groups, - fit_params, - candidate_params, - cv_results + estimator, scorer, X, y, groups, fit_params, candidate_params, cv_results ): X, y, groups = to_indexable(X, y, groups) dsk = {} X_name, y_name, groups_name = to_keys(dsk, X, y, groups) main_token = tokenize( - normalize_estimator(estimator), - X_name, - y_name, - groups_name, - fit_params, + normalize_estimator(estimator), X_name, y_name, groups_name, fit_params ) best_params = get_best_params(candidate_params, cv_results, scorer) @@ -218,6 +207,7 @@ def build_refit_graph( ) return dsk, [best_estimator] + def normalize_params(params): """Take a list of dictionaries, and tokenize/normalize.""" # Collect a set of all fields @@ -1163,12 +1153,7 @@ def fit(self, X, y=None, groups=None, **fit_params): ) candidate_params = list(self._get_param_iterator()) - ( - dsk, - keys, - n_splits, - fit_params, - ) = build_cv_graph( + (dsk, keys, n_splits, fit_params) = build_cv_graph( estimator, self.cv, self.scorer_, @@ -1202,8 +1187,12 @@ def fit(self, X, y=None, groups=None, **fit_params): else: out = scheduler(dsk, keys, num_workers=n_jobs) - weights = out[0] - scores = out[1::] + if self.iid: + weights = out[0] + scores = out[1::] + else: + weights = None + scores = out if multimetric: metrics = list(scorer.keys()) @@ -1211,12 +1200,7 @@ def fit(self, X, y=None, groups=None, **fit_params): metrics = None cv_results = create_cv_results( - scores, - candidate_params, - n_splits, - error_score, - weights, - metrics, + scores, candidate_params, n_splits, error_score, weights, metrics ) results = handle_deprecated_train_score(cv_results, self.return_train_score) @@ -1238,14 +1222,14 @@ def fit(self, X, y=None, groups=None, **fit_params): groups, fit_params, candidate_params, - cv_results + cv_results, ) out = scheduler(dsk, keys, num_workers=n_jobs) - self.best_index_ = np.flatnonzero(results["rank_test_{}".format(scorer)] == 1)[ - 0 - ] + self.best_index_ = np.flatnonzero( + results["rank_test_{}".format(scorer)] == 1 + )[0] self.best_estimator_ = out[0] From c55513f3204916c90790a7bcf796f41feab3d0d0 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Sun, 9 Dec 2018 11:20:02 -0800 Subject: [PATCH 37/83] simplifly build_refit_graph args --- dask_ml/model_selection/_search.py | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index 9ce909efa..3782bcf7e 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -48,7 +48,6 @@ fit_and_score, fit_best, fit_transform, - get_best_params, pipeline, score, ) @@ -179,9 +178,7 @@ def build_cv_graph( return dsk, keys, n_splits, fit_params -def build_refit_graph( - estimator, scorer, X, y, groups, fit_params, candidate_params, cv_results -): +def build_refit_graph(estimator, X, y, groups, best_params, fit_params): X, y, groups = to_indexable(X, y, groups) dsk = {} X_name, y_name, groups_name = to_keys(dsk, X, y, groups) @@ -190,7 +187,6 @@ def build_refit_graph( normalize_estimator(estimator), X_name, y_name, groups_name, fit_params ) - best_params = get_best_params(candidate_params, cv_results, scorer) best_estimator = "best-estimator-" + main_token if fit_params: fit_params = ( @@ -1214,23 +1210,16 @@ def fit(self, X, y=None, groups=None, **fit_params): else: scorer = "score" - dsk, keys = build_refit_graph( - estimator, - scorer, - X, - y, - groups, - fit_params, - candidate_params, - cv_results, - ) - - out = scheduler(dsk, keys, num_workers=n_jobs) - self.best_index_ = np.flatnonzero( results["rank_test_{}".format(scorer)] == 1 )[0] + best_params = candidate_params[self.best_index_] + dsk, keys = build_refit_graph( + estimator, X, y, groups, best_params, fit_params + ) + + out = scheduler(dsk, keys, num_workers=n_jobs) self.best_estimator_ = out[0] return self From 0ba7619ef76d3276eec34bad2330e7df87428673 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Mon, 10 Dec 2018 12:10:49 -0800 Subject: [PATCH 38/83] properly separate refit and search --- dask_ml/model_selection/_search.py | 41 ++++++++++++++++-------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index 3782bcf7e..9634d1b7d 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -103,6 +103,19 @@ def __call__(self, est): return self.token if c == 0 else self.token + str(c) +def map_fit_params(dsk, fit_params): + if fit_params: + # A mapping of {name: (name, graph-key)} + param_values = to_indexable(*fit_params.values(), allow_scalars=True) + fit_params = { + k: (k, v) for (k, v) in zip(fit_params, to_keys(dsk, *param_values)) + } + else: + fit_params = {} + + return fit_params + + def build_cv_graph( estimator, cv, @@ -126,14 +139,7 @@ def build_cv_graph( X_name, y_name, groups_name = to_keys(dsk, X, y, groups) n_splits = compute_n_splits(cv, X, y, groups) - if fit_params: - # A mapping of {name: (name, graph-key)} - param_values = to_indexable(*fit_params.values(), allow_scalars=True) - fit_params = { - k: (k, v) for (k, v) in zip(fit_params, to_keys(dsk, *param_values)) - } - else: - fit_params = {} + fit_params = map_fit_params(dsk, fit_params) fields, tokens, params = normalize_params(candidate_params) main_token = tokenize( @@ -175,17 +181,16 @@ def build_cv_graph( return_train_score, ) keys = [weights] + scores if weights else scores - return dsk, keys, n_splits, fit_params + return dsk, keys, n_splits -def build_refit_graph(estimator, X, y, groups, best_params, fit_params): - X, y, groups = to_indexable(X, y, groups) +def build_refit_graph(estimator, X, y, best_params, fit_params): + X, y = to_indexable(X, y) dsk = {} - X_name, y_name, groups_name = to_keys(dsk, X, y, groups) + X_name, y_name = to_keys(dsk, X, y) - main_token = tokenize( - normalize_estimator(estimator), X_name, y_name, groups_name, fit_params - ) + fit_params = map_fit_params(dsk, fit_params) + main_token = tokenize(normalize_estimator(estimator), X_name, y_name, fit_params) best_estimator = "best-estimator-" + main_token if fit_params: @@ -1149,7 +1154,7 @@ def fit(self, X, y=None, groups=None, **fit_params): ) candidate_params = list(self._get_param_iterator()) - (dsk, keys, n_splits, fit_params) = build_cv_graph( + dsk, keys, n_splits = build_cv_graph( estimator, self.cv, self.scorer_, @@ -1215,9 +1220,7 @@ def fit(self, X, y=None, groups=None, **fit_params): )[0] best_params = candidate_params[self.best_index_] - dsk, keys = build_refit_graph( - estimator, X, y, groups, best_params, fit_params - ) + dsk, keys = build_refit_graph(estimator, X, y, best_params, fit_params) out = scheduler(dsk, keys, num_workers=n_jobs) self.best_estimator_ = out[0] From 28490abe70150bc3067c7c4cd5fa40eb552b7a70 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Mon, 10 Dec 2018 12:18:40 -0800 Subject: [PATCH 39/83] flake8/black formatting. comment out as_completed tests for now --- .../dask_searchcv/test_model_selection.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/tests/model_selection/dask_searchcv/test_model_selection.py b/tests/model_selection/dask_searchcv/test_model_selection.py index e96abd5ad..ab0142c31 100644 --- a/tests/model_selection/dask_searchcv/test_model_selection.py +++ b/tests/model_selection/dask_searchcv/test_model_selection.py @@ -798,8 +798,10 @@ def f(dask_scheduler): assert client.run_on_scheduler(f) # some work happened on cluster +""" from distributed.diagnostics.plugin import SchedulerPlugin + class AsCompletedPlugin(SchedulerPlugin): def __init__(self): self.finshed_tasks = set() @@ -808,19 +810,21 @@ def __init__(self): def transition(self, key, start, finish, *args, **kwargs): self.events.append((key, start, finish)) - if 'fit-score' in key and key in self.finshed_tasks: - print(f'{key,start,finish} shiiiiit') + if "fit-score" in key and key in self.finshed_tasks: + print(f"{key,start,finish} shiiiiit") raise Exception - if start == 'memory' and finish == 'released': + if start == "memory" and finish == "released": self.finshed_tasks.add(key) - def restart(self, schduler, **kwargs): self.finshed_tasks = set() self.events = [] + from dask.distributed import LocalCluster + + @pytest.mark.skipif("not has_distributed") def test_as_completed_distributed(): @@ -829,7 +833,9 @@ def test_as_completed_distributed(): lc.scheduler.add_plugin(acpg) X, y = make_classification(n_samples=100, n_features=10, random_state=0) with Client(lc) as client: - gs = dcv.GridSearchCV(MockClassifier(), {"foo_param": [0, 1, 2]}, cv=3, refit=False) + gs = dcv.GridSearchCV( + MockClassifier(), {"foo_param": [0, 1, 2]}, cv=3, refit=False + ) gs.fit(X, y) def f(dask_scheduler): @@ -839,6 +845,9 @@ def f(dask_scheduler): t = 1 assert client.run_on_scheduler(f) # some work happened on cluster +""" + + def test_cv_multiplemetrics(): X, y = make_classification(random_state=0) From 751b82478346adeb58c6d0d917fc65415e0a8e6f Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Mon, 10 Dec 2018 12:31:23 -0800 Subject: [PATCH 40/83] single colon slicing --- dask_ml/model_selection/_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index 9634d1b7d..8bcb01ede 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -1190,7 +1190,7 @@ def fit(self, X, y=None, groups=None, **fit_params): if self.iid: weights = out[0] - scores = out[1::] + scores = out[1:] else: weights = None scores = out From 39a2343c5a2d4d2e3d4f28d1786e19e0607748d3 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Mon, 10 Dec 2018 12:32:36 -0800 Subject: [PATCH 41/83] only check multimetric once --- dask_ml/model_selection/_search.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index 8bcb01ede..5ff1f3498 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -1197,8 +1197,10 @@ def fit(self, X, y=None, groups=None, **fit_params): if multimetric: metrics = list(scorer.keys()) + scorer = self.refit else: metrics = None + scorer = "score" cv_results = create_cv_results( scores, candidate_params, n_splits, error_score, weights, metrics @@ -1210,11 +1212,6 @@ def fit(self, X, y=None, groups=None, **fit_params): self.cv_results_ = results if self.refit: - if self.multimetric_: - scorer = self.refit - else: - scorer = "score" - self.best_index_ = np.flatnonzero( results["rank_test_{}".format(scorer)] == 1 )[0] From 673d31673136680a0058a078299eafb64a1344f6 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Thu, 13 Dec 2018 19:51:42 -0800 Subject: [PATCH 42/83] As completed test --- dask_ml/model_selection/_search.py | 15 ++-- dask_ml/model_selection/utils_test.py | 43 ++++++++++ .../dask_searchcv/test_model_selection.py | 85 +++++++++---------- 3 files changed, 92 insertions(+), 51 deletions(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index 5ff1f3498..7641bcecc 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -1178,12 +1178,15 @@ def fit(self, X, y=None, groups=None, **fit_params): scheduler = dask.local.get_sync if "Client" in type(getattr(scheduler, "__self__", None)).__name__: - futures = scheduler(dsk, keys, num_workers=n_jobs, sync=False) - result_map = { - f.key: res - for batch in as_completed(futures, with_results=True).batches() - for f, res in batch - } + futures = scheduler(dsk, keys, allow_other_workers=True, num_workers=n_jobs, sync=False) + result_map = {} + while len(result_map) != len(keys): + for f in as_completed(futures): + if f.status == 'finished': + result_map[f.key] = f.result() + elif f.status == 'error': + f.retry() + out = [result_map[k] for k in keys] else: out = scheduler(dsk, keys, num_workers=n_jobs) diff --git a/dask_ml/model_selection/utils_test.py b/dask_ml/model_selection/utils_test.py index b271e1898..6371f5819 100644 --- a/dask_ml/model_selection/utils_test.py +++ b/dask_ml/model_selection/utils_test.py @@ -1,7 +1,18 @@ +from ast import literal_eval + import numpy as np +import pytest + from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.utils.validation import _num_samples, check_array +try: + from dask.distributed import get_worker + has_distributed = True +except ImportError: + get_worker = pytest.fixture(lambda: None) + has_distributed = False + # This class doesn't inherit from BaseEstimator to test hyperparameter search # on user-defined classifiers. @@ -188,3 +199,35 @@ def score(self, X=None, Y=None): else: score = 0.0 return score + + +class AsCompletedEstimator(BaseEstimator): + def __init__(self, killed_workers, lock, counter, foo_param=None): + self.foo_param = foo_param + self.killed_workers = killed_workers + self.lock = lock + self.counter = counter + + def fit(self, X, y): + w = get_worker() + for e in w.executing: + t = literal_eval(e) + self.lock.acquire() + c = self.counter.get() + killed_workers = self.killed_workers.get() + self.counter.set(self.counter.get() + 1) + self.lock.release() + if c >=8 and t not in killed_workers: + killed_workers[t] = True + self.killed_workers.set(killed_workers) + exit(1) + return self + + def transform(self, X): + return X + + def predict(self, X): + return 1 + + def score(self, X, y): + return 1 \ No newline at end of file diff --git a/tests/model_selection/dask_searchcv/test_model_selection.py b/tests/model_selection/dask_searchcv/test_model_selection.py index ab0142c31..acf4feaef 100644 --- a/tests/model_selection/dask_searchcv/test_model_selection.py +++ b/tests/model_selection/dask_searchcv/test_model_selection.py @@ -44,6 +44,7 @@ from dask_ml.model_selection._search import _normalize_n_jobs from dask_ml.model_selection.methods import CVCache from dask_ml.model_selection.utils_test import ( + AsCompletedEstimator, CheckXClassifier, FailingClassifier, MockClassifier, @@ -54,6 +55,8 @@ try: from distributed import Client from distributed.utils_test import cluster, loop + from distributed.diagnostics.plugin import SchedulerPlugin + from dask.distributed import LocalCluster, Lock, Variable, wait has_distributed = True except ImportError: @@ -798,54 +801,46 @@ def f(dask_scheduler): assert client.run_on_scheduler(f) # some work happened on cluster -""" -from distributed.diagnostics.plugin import SchedulerPlugin - - -class AsCompletedPlugin(SchedulerPlugin): - def __init__(self): - self.finshed_tasks = set() - self.events = [] - - def transition(self, key, start, finish, *args, **kwargs): - self.events.append((key, start, finish)) - - if "fit-score" in key and key in self.finshed_tasks: - print(f"{key,start,finish} shiiiiit") - raise Exception - - if start == "memory" and finish == "released": - self.finshed_tasks.add(key) - - def restart(self, schduler, **kwargs): - self.finshed_tasks = set() - self.events = [] - - -from dask.distributed import LocalCluster - - @pytest.mark.skipif("not has_distributed") def test_as_completed_distributed(): + with LocalCluster() as clstr: + with Client(clstr) as client: + counter = Variable("counter") + counter.set(0) + lock = Lock("lock") + killed_workers = Variable("killed_workers") + killed_workers.set({}) + + X, y = make_classification(n_samples=100, n_features=10, random_state=0) + gs = dcv.GridSearchCV( + AsCompletedEstimator(killed_workers, lock, counter), + param_grid={"foo_param": [0, 1, 2]}, + cv=3, + refit=False, + cache_cv=False, + ) + gs.fit(X, y) - lc = LocalCluster(n_workers=2) - acpg = AsCompletedPlugin() - lc.scheduler.add_plugin(acpg) - X, y = make_classification(n_samples=100, n_features=10, random_state=0) - with Client(lc) as client: - gs = dcv.GridSearchCV( - MockClassifier(), {"foo_param": [0, 1, 2]}, cv=3, refit=False - ) - gs.fit(X, y) - - def f(dask_scheduler): - return len(dask_scheduler.transition_log) - - client.run_on_scheduler(f) - t = 1 - assert client.run_on_scheduler(f) # some work happened on cluster - -""" + def f(dask_scheduler): + return dask_scheduler.transition_log + + def check_reprocess(transition_log): + finished = set() + for transition in transition_log: + key, start_state, end_state = ( + transition[0], + transition[1], + transition[2], + ) + assert key not in finished + if ( + "score" in key + and start_state == "memory" + and end_state == "forgotten" + ): + finished.add(key) + + check_reprocess(client.run_on_scheduler(f)) def test_cv_multiplemetrics(): From 7df0d20dc71ac8dc987483d74cdc6fd15d42fe34 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Thu, 13 Dec 2018 19:55:14 -0800 Subject: [PATCH 43/83] Parameterize min complete and black formatting --- dask_ml/model_selection/_search.py | 8 +++++--- dask_ml/model_selection/utils_test.py | 8 +++++--- .../model_selection/dask_searchcv/test_model_selection.py | 2 +- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index 7641bcecc..865f01d7d 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -1178,13 +1178,15 @@ def fit(self, X, y=None, groups=None, **fit_params): scheduler = dask.local.get_sync if "Client" in type(getattr(scheduler, "__self__", None)).__name__: - futures = scheduler(dsk, keys, allow_other_workers=True, num_workers=n_jobs, sync=False) + futures = scheduler( + dsk, keys, allow_other_workers=True, num_workers=n_jobs, sync=False + ) result_map = {} while len(result_map) != len(keys): for f in as_completed(futures): - if f.status == 'finished': + if f.status == "finished": result_map[f.key] = f.result() - elif f.status == 'error': + elif f.status == "error": f.retry() out = [result_map[k] for k in keys] diff --git a/dask_ml/model_selection/utils_test.py b/dask_ml/model_selection/utils_test.py index 6371f5819..2d17a4564 100644 --- a/dask_ml/model_selection/utils_test.py +++ b/dask_ml/model_selection/utils_test.py @@ -8,6 +8,7 @@ try: from dask.distributed import get_worker + has_distributed = True except ImportError: get_worker = pytest.fixture(lambda: None) @@ -202,10 +203,11 @@ def score(self, X=None, Y=None): class AsCompletedEstimator(BaseEstimator): - def __init__(self, killed_workers, lock, counter, foo_param=None): + def __init__(self, killed_workers, lock, counter, min_complete, foo_param=None): self.foo_param = foo_param self.killed_workers = killed_workers self.lock = lock + self.min_complete = min_complete self.counter = counter def fit(self, X, y): @@ -217,7 +219,7 @@ def fit(self, X, y): killed_workers = self.killed_workers.get() self.counter.set(self.counter.get() + 1) self.lock.release() - if c >=8 and t not in killed_workers: + if c > self.min_complete and t not in killed_workers: killed_workers[t] = True self.killed_workers.set(killed_workers) exit(1) @@ -230,4 +232,4 @@ def predict(self, X): return 1 def score(self, X, y): - return 1 \ No newline at end of file + return 1 diff --git a/tests/model_selection/dask_searchcv/test_model_selection.py b/tests/model_selection/dask_searchcv/test_model_selection.py index acf4feaef..177ed1de8 100644 --- a/tests/model_selection/dask_searchcv/test_model_selection.py +++ b/tests/model_selection/dask_searchcv/test_model_selection.py @@ -813,7 +813,7 @@ def test_as_completed_distributed(): X, y = make_classification(n_samples=100, n_features=10, random_state=0) gs = dcv.GridSearchCV( - AsCompletedEstimator(killed_workers, lock, counter), + AsCompletedEstimator(killed_workers, lock, counter, min_complete=7), param_grid={"foo_param": [0, 1, 2]}, cv=3, refit=False, From db8e1535953a898da337d96e5b15c222a88c2ddc Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Wed, 13 Feb 2019 15:01:24 -0800 Subject: [PATCH 44/83] remove dask distributed check --- .../dask_searchcv/test_model_selection.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/tests/model_selection/dask_searchcv/test_model_selection.py b/tests/model_selection/dask_searchcv/test_model_selection.py index 177ed1de8..e34fb43cd 100644 --- a/tests/model_selection/dask_searchcv/test_model_selection.py +++ b/tests/model_selection/dask_searchcv/test_model_selection.py @@ -52,16 +52,9 @@ ScalingTransformer, ) -try: - from distributed import Client - from distributed.utils_test import cluster, loop - from distributed.diagnostics.plugin import SchedulerPlugin - from dask.distributed import LocalCluster, Lock, Variable, wait - - has_distributed = True -except ImportError: - loop = pytest.fixture(lambda: None) - has_distributed = False +from distributed import Client +from distributed.utils_test import cluster +from dask.distributed import LocalCluster, Lock, Variable class assert_dask_compute(Callback): From f61976615bc2fdfa3bbb01f17380abe018bb4561 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Wed, 13 Feb 2019 15:31:17 -0800 Subject: [PATCH 45/83] remove pytest check for has_distributed --- tests/model_selection/dask_searchcv/test_model_selection.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/model_selection/dask_searchcv/test_model_selection.py b/tests/model_selection/dask_searchcv/test_model_selection.py index e34fb43cd..118c1974d 100644 --- a/tests/model_selection/dask_searchcv/test_model_selection.py +++ b/tests/model_selection/dask_searchcv/test_model_selection.py @@ -53,7 +53,7 @@ ) from distributed import Client -from distributed.utils_test import cluster +from distributed.utils_test import cluster, loop from dask.distributed import LocalCluster, Lock, Variable @@ -780,7 +780,6 @@ def test_scheduler_param(scheduler, n_jobs): gs.fit(X, y) -@pytest.mark.skipif("not has_distributed") def test_scheduler_param_distributed(loop): X, y = make_classification(n_samples=100, n_features=10, random_state=0) with cluster() as (s, [a, b]): @@ -794,7 +793,6 @@ def f(dask_scheduler): assert client.run_on_scheduler(f) # some work happened on cluster -@pytest.mark.skipif("not has_distributed") def test_as_completed_distributed(): with LocalCluster() as clstr: with Client(clstr) as client: From 6de0af09296e891c89ed711c92a852689feb4743 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Wed, 13 Feb 2019 15:48:46 -0800 Subject: [PATCH 46/83] context manager for lock --- dask_ml/model_selection/utils_test.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/dask_ml/model_selection/utils_test.py b/dask_ml/model_selection/utils_test.py index 2d17a4564..e5d50d69e 100644 --- a/dask_ml/model_selection/utils_test.py +++ b/dask_ml/model_selection/utils_test.py @@ -214,11 +214,10 @@ def fit(self, X, y): w = get_worker() for e in w.executing: t = literal_eval(e) - self.lock.acquire() - c = self.counter.get() - killed_workers = self.killed_workers.get() - self.counter.set(self.counter.get() + 1) - self.lock.release() + with self.lock: + c = self.counter.get() + killed_workers = self.killed_workers.get() + self.counter.set(self.counter.get() + 1) if c > self.min_complete and t not in killed_workers: killed_workers[t] = True self.killed_workers.set(killed_workers) From fa0f65eb1333729e9637e70bd3d882e71e94c248 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Wed, 13 Feb 2019 23:05:38 -0800 Subject: [PATCH 47/83] add logging to retry and use with_results --- dask_ml/model_selection/_search.py | 19 +++++--- dask_ml/model_selection/utils_test.py | 45 ++++++++++--------- .../dask_searchcv/test_model_selection.py | 23 +++++----- 3 files changed, 49 insertions(+), 38 deletions(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index 865f01d7d..1f68948db 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -1,5 +1,6 @@ from __future__ import absolute_import, division, print_function +import logging import numbers from collections import defaultdict from itertools import repeat @@ -53,6 +54,8 @@ ) from .utils import DeprecationDict, is_dask_collection, to_indexable, to_keys, unzip +logger = logging.getLogger(__name__) + try: from cytoolz import get, pluck except ImportError: # pragma: no cover @@ -1183,12 +1186,16 @@ def fit(self, X, y=None, groups=None, **fit_params): ) result_map = {} while len(result_map) != len(keys): - for f in as_completed(futures): - if f.status == "finished": - result_map[f.key] = f.result() - elif f.status == "error": - f.retry() - + failed_futures = [] + for future, result in as_completed(futures, with_results=True, raise_errors=False): + if future.status == "finished": + result_map[future.key] = result + elif future.status == "error": + future.retry() + logger.warning('{} has failed... retrying'.format(future.key)) + failed_futures.append(future) + + futures = failed_futures out = [result_map[k] for k in keys] else: out = scheduler(dsk, keys, num_workers=n_jobs) diff --git a/dask_ml/model_selection/utils_test.py b/dask_ml/model_selection/utils_test.py index e5d50d69e..a62d65036 100644 --- a/dask_ml/model_selection/utils_test.py +++ b/dask_ml/model_selection/utils_test.py @@ -1,18 +1,11 @@ from ast import literal_eval import numpy as np -import pytest from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.utils.validation import _num_samples, check_array -try: - from dask.distributed import get_worker - - has_distributed = True -except ImportError: - get_worker = pytest.fixture(lambda: None) - has_distributed = False +from distributed import get_worker, Lock, Variable, Worker # This class doesn't inherit from BaseEstimator to test hyperparameter search @@ -203,25 +196,33 @@ def score(self, X=None, Y=None): class AsCompletedEstimator(BaseEstimator): - def __init__(self, killed_workers, lock, counter, min_complete, foo_param=None): + def __init__(self, killed_workers_name, lock_name, counter_name, min_complete, foo_param=None): self.foo_param = foo_param - self.killed_workers = killed_workers - self.lock = lock + self.counter_name = counter_name + self.killed_workers_name = killed_workers_name + self.lock_name = lock_name self.min_complete = min_complete - self.counter = counter + + def fit(self, X, y): - w = get_worker() - for e in w.executing: + w: Worker = get_worker() + # self.lock = Lock(self.lock_name) + # self.counter = Variable(self.counter_name) + self.killed_workers = Variable(self.killed_workers_name) + # + for e in list(w.executing): t = literal_eval(e) - with self.lock: - c = self.counter.get() - killed_workers = self.killed_workers.get() - self.counter.set(self.counter.get() + 1) - if c > self.min_complete and t not in killed_workers: - killed_workers[t] = True - self.killed_workers.set(killed_workers) - exit(1) + print(self.killed_workers.get()) + # + # #c = self.counter.get() + # killed_workers = self.killed_workers_name.get() + # #self.counter.set(self.counter.get() + 1) + # print(killed_workers) + # if c > self.min_complete and t not in killed_workers: + # killed_workers[t] = True + # killed_workers.set(killed_workers) + # #exit(1) return self def transform(self, X): diff --git a/tests/model_selection/dask_searchcv/test_model_selection.py b/tests/model_selection/dask_searchcv/test_model_selection.py index 118c1974d..b482f0bf5 100644 --- a/tests/model_selection/dask_searchcv/test_model_selection.py +++ b/tests/model_selection/dask_searchcv/test_model_selection.py @@ -52,9 +52,8 @@ ScalingTransformer, ) -from distributed import Client +from distributed import Client, Nanny, Variable from distributed.utils_test import cluster, loop -from dask.distributed import LocalCluster, Lock, Variable class assert_dask_compute(Callback): @@ -793,22 +792,26 @@ def f(dask_scheduler): assert client.run_on_scheduler(f) # some work happened on cluster -def test_as_completed_distributed(): - with LocalCluster() as clstr: - with Client(clstr) as client: - counter = Variable("counter") +def test_as_completed_distributed(loop): + with cluster(active_rpc_timeout=10, nanny=Nanny) as (s, [a, b]): + with Client(s['address'], loop=loop) as c: + counter_name = 'counter_name' + counter = Variable(counter_name) counter.set(0) - lock = Lock("lock") - killed_workers = Variable("killed_workers") + lock_name = 'lock' + + killed_workers_name = 'killed_workers' + killed_workers = Variable(killed_workers_name) killed_workers.set({}) X, y = make_classification(n_samples=100, n_features=10, random_state=0) gs = dcv.GridSearchCV( - AsCompletedEstimator(killed_workers, lock, counter, min_complete=7), + AsCompletedEstimator(killed_workers_name, lock_name, counter_name, min_complete=2), param_grid={"foo_param": [0, 1, 2]}, cv=3, refit=False, cache_cv=False, + scheduler=c ) gs.fit(X, y) @@ -831,7 +834,7 @@ def check_reprocess(transition_log): ): finished.add(key) - check_reprocess(client.run_on_scheduler(f)) + check_reprocess(c.run_on_scheduler(f)) def test_cv_multiplemetrics(): From 946ba5bcf5d774506614dcb8fea54008b16d1080 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Thu, 14 Feb 2019 00:26:12 -0800 Subject: [PATCH 48/83] seems like as completed tests work now --- dask_ml/model_selection/utils_test.py | 35 +++++++++++-------- .../dask_searchcv/test_model_selection.py | 6 ++-- 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/dask_ml/model_selection/utils_test.py b/dask_ml/model_selection/utils_test.py index a62d65036..9994452c3 100644 --- a/dask_ml/model_selection/utils_test.py +++ b/dask_ml/model_selection/utils_test.py @@ -194,6 +194,9 @@ def score(self, X=None, Y=None): score = 0.0 return score +from time import sleep +import os +import signal class AsCompletedEstimator(BaseEstimator): def __init__(self, killed_workers_name, lock_name, counter_name, min_complete, foo_param=None): @@ -207,22 +210,24 @@ def __init__(self, killed_workers_name, lock_name, counter_name, min_complete, f def fit(self, X, y): w: Worker = get_worker() - # self.lock = Lock(self.lock_name) - # self.counter = Variable(self.counter_name) - self.killed_workers = Variable(self.killed_workers_name) - # + dsk_lock = Lock(self.lock_name, client=w.client) + dsk_counter = Variable(self.counter_name, client=w.client) + dsk_killed_workers = Variable(self.killed_workers_name, client=w.client) + for e in list(w.executing): - t = literal_eval(e) - print(self.killed_workers.get()) - # - # #c = self.counter.get() - # killed_workers = self.killed_workers_name.get() - # #self.counter.set(self.counter.get() + 1) - # print(killed_workers) - # if c > self.min_complete and t not in killed_workers: - # killed_workers[t] = True - # killed_workers.set(killed_workers) - # #exit(1) + should_die = False + with dsk_lock: + t = literal_eval(e) + c = dsk_counter.get() + dsk_counter.set(c + 1) + killed_workers = dsk_killed_workers.get() + if c > self.min_complete and t not in killed_workers: + killed_workers[t] = True + should_die = True + dsk_killed_workers.set(killed_workers) + + if should_die: + os.kill(os.getpid(), 9) return self def transform(self, X): diff --git a/tests/model_selection/dask_searchcv/test_model_selection.py b/tests/model_selection/dask_searchcv/test_model_selection.py index b482f0bf5..cefedd54f 100644 --- a/tests/model_selection/dask_searchcv/test_model_selection.py +++ b/tests/model_selection/dask_searchcv/test_model_selection.py @@ -791,17 +791,18 @@ def f(dask_scheduler): assert client.run_on_scheduler(f) # some work happened on cluster +from dask.distributed import LocalCluster def test_as_completed_distributed(loop): with cluster(active_rpc_timeout=10, nanny=Nanny) as (s, [a, b]): with Client(s['address'], loop=loop) as c: counter_name = 'counter_name' - counter = Variable(counter_name) + counter = Variable(counter_name, client=c) counter.set(0) lock_name = 'lock' killed_workers_name = 'killed_workers' - killed_workers = Variable(killed_workers_name) + killed_workers = Variable(killed_workers_name, client=c) killed_workers.set({}) X, y = make_classification(n_samples=100, n_features=10, random_state=0) @@ -833,7 +834,6 @@ def check_reprocess(transition_log): and end_state == "forgotten" ): finished.add(key) - check_reprocess(c.run_on_scheduler(f)) From b00b558c520316f37232e8a9c9b501f25d1fdf4c Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Thu, 14 Feb 2019 00:26:48 -0800 Subject: [PATCH 49/83] min complete to 7 --- tests/model_selection/dask_searchcv/test_model_selection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/model_selection/dask_searchcv/test_model_selection.py b/tests/model_selection/dask_searchcv/test_model_selection.py index cefedd54f..efd699e17 100644 --- a/tests/model_selection/dask_searchcv/test_model_selection.py +++ b/tests/model_selection/dask_searchcv/test_model_selection.py @@ -807,7 +807,7 @@ def test_as_completed_distributed(loop): X, y = make_classification(n_samples=100, n_features=10, random_state=0) gs = dcv.GridSearchCV( - AsCompletedEstimator(killed_workers_name, lock_name, counter_name, min_complete=2), + AsCompletedEstimator(killed_workers_name, lock_name, counter_name, min_complete=7 ), param_grid={"foo_param": [0, 1, 2]}, cv=3, refit=False, From 19cf8da74777c12d0032dda3c5fe84ba30a29c75 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Thu, 14 Feb 2019 15:00:25 -0800 Subject: [PATCH 50/83] remove type hint --- dask_ml/model_selection/utils_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_ml/model_selection/utils_test.py b/dask_ml/model_selection/utils_test.py index 9994452c3..34c0d4b52 100644 --- a/dask_ml/model_selection/utils_test.py +++ b/dask_ml/model_selection/utils_test.py @@ -209,7 +209,7 @@ def __init__(self, killed_workers_name, lock_name, counter_name, min_complete, f def fit(self, X, y): - w: Worker = get_worker() + w = get_worker() dsk_lock = Lock(self.lock_name, client=w.client) dsk_counter = Variable(self.counter_name, client=w.client) dsk_killed_workers = Variable(self.killed_workers_name, client=w.client) From 6171a976d09e18a5e3436f46449c532d72f0838d Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Thu, 14 Feb 2019 15:18:25 -0800 Subject: [PATCH 51/83] try latest tornado? --- ci/environment-3.6.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/environment-3.6.yml b/ci/environment-3.6.yml index 8d5c01a2b..a50217248 100644 --- a/ci/environment-3.6.yml +++ b/ci/environment-3.6.yml @@ -39,7 +39,7 @@ dependencies: - sphinx-gallery - tensorflow - testpath<0.4 - - tornado + - tornado==6.0.0b1 - toolz - xgboost - zict From f6e0160833e627f132494c43a7f56fe8aa586535 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Thu, 14 Feb 2019 15:22:56 -0800 Subject: [PATCH 52/83] try tornado 6.0.0b1 in pip --- ci/environment-3.6.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/environment-3.6.yml b/ci/environment-3.6.yml index a50217248..649f146fd 100644 --- a/ci/environment-3.6.yml +++ b/ci/environment-3.6.yml @@ -39,7 +39,6 @@ dependencies: - sphinx-gallery - tensorflow - testpath<0.4 - - tornado==6.0.0b1 - toolz - xgboost - zict @@ -51,3 +50,4 @@ dependencies: - git+https://github.com/dask/dask-xgboost.git - dask_sphinx_theme >=1.1.0 - graphviz + - tornado==6.0.0b1 From d6441e7931a82875ae5bc73869dae6971a2aacc3 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Thu, 14 Feb 2019 15:36:15 -0800 Subject: [PATCH 53/83] revert back to latest tornado --- ci/environment-3.6.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/environment-3.6.yml b/ci/environment-3.6.yml index 649f146fd..8d5c01a2b 100644 --- a/ci/environment-3.6.yml +++ b/ci/environment-3.6.yml @@ -39,6 +39,7 @@ dependencies: - sphinx-gallery - tensorflow - testpath<0.4 + - tornado - toolz - xgboost - zict @@ -50,4 +51,3 @@ dependencies: - git+https://github.com/dask/dask-xgboost.git - dask_sphinx_theme >=1.1.0 - graphviz - - tornado==6.0.0b1 From 922d27c1ae5237872e4922e4217f9449782e2249 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Thu, 14 Feb 2019 18:54:54 -0800 Subject: [PATCH 54/83] add classification mixin --- dask_ml/model_selection/utils_test.py | 7 +++---- .../model_selection/dask_searchcv/test_model_selection.py | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/dask_ml/model_selection/utils_test.py b/dask_ml/model_selection/utils_test.py index 34c0d4b52..3d51366f8 100644 --- a/dask_ml/model_selection/utils_test.py +++ b/dask_ml/model_selection/utils_test.py @@ -197,8 +197,9 @@ def score(self, X=None, Y=None): from time import sleep import os import signal +from sklearn.base import ClassifierMixin -class AsCompletedEstimator(BaseEstimator): +class AsCompletedEstimator(BaseEstimator, ClassifierMixin): def __init__(self, killed_workers_name, lock_name, counter_name, min_complete, foo_param=None): self.foo_param = foo_param self.counter_name = counter_name @@ -206,8 +207,6 @@ def __init__(self, killed_workers_name, lock_name, counter_name, min_complete, f self.lock_name = lock_name self.min_complete = min_complete - - def fit(self, X, y): w = get_worker() dsk_lock = Lock(self.lock_name, client=w.client) @@ -234,7 +233,7 @@ def transform(self, X): return X def predict(self, X): - return 1 + return X def score(self, X, y): return 1 diff --git a/tests/model_selection/dask_searchcv/test_model_selection.py b/tests/model_selection/dask_searchcv/test_model_selection.py index efd699e17..c94e83fa3 100644 --- a/tests/model_selection/dask_searchcv/test_model_selection.py +++ b/tests/model_selection/dask_searchcv/test_model_selection.py @@ -807,7 +807,7 @@ def test_as_completed_distributed(loop): X, y = make_classification(n_samples=100, n_features=10, random_state=0) gs = dcv.GridSearchCV( - AsCompletedEstimator(killed_workers_name, lock_name, counter_name, min_complete=7 ), + AsCompletedEstimator(killed_workers_name, lock_name, counter_name, min_complete=7), param_grid={"foo_param": [0, 1, 2]}, cv=3, refit=False, From a72388ec77a33515555986aadf3f021545f4d0ba Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Thu, 14 Feb 2019 19:19:58 -0800 Subject: [PATCH 55/83] looks like test estimators need kwargs to be serialized --- dask_ml/model_selection/utils_test.py | 11 ++++------- .../dask_searchcv/test_model_selection.py | 1 - 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/dask_ml/model_selection/utils_test.py b/dask_ml/model_selection/utils_test.py index 3d51366f8..3881dd29c 100644 --- a/dask_ml/model_selection/utils_test.py +++ b/dask_ml/model_selection/utils_test.py @@ -1,11 +1,12 @@ from ast import literal_eval +import os import numpy as np from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.utils.validation import _num_samples, check_array -from distributed import get_worker, Lock, Variable, Worker +from distributed import get_worker, Lock, Variable # This class doesn't inherit from BaseEstimator to test hyperparameter search @@ -194,13 +195,9 @@ def score(self, X=None, Y=None): score = 0.0 return score -from time import sleep -import os -import signal -from sklearn.base import ClassifierMixin -class AsCompletedEstimator(BaseEstimator, ClassifierMixin): - def __init__(self, killed_workers_name, lock_name, counter_name, min_complete, foo_param=None): +class AsCompletedEstimator(BaseEstimator): + def __init__(self, killed_workers_name='killed_workers', lock_name='lock', counter_name='counter', min_complete=7, foo_param=None): self.foo_param = foo_param self.counter_name = counter_name self.killed_workers_name = killed_workers_name diff --git a/tests/model_selection/dask_searchcv/test_model_selection.py b/tests/model_selection/dask_searchcv/test_model_selection.py index c94e83fa3..75fc551d0 100644 --- a/tests/model_selection/dask_searchcv/test_model_selection.py +++ b/tests/model_selection/dask_searchcv/test_model_selection.py @@ -791,7 +791,6 @@ def f(dask_scheduler): assert client.run_on_scheduler(f) # some work happened on cluster -from dask.distributed import LocalCluster def test_as_completed_distributed(loop): with cluster(active_rpc_timeout=10, nanny=Nanny) as (s, [a, b]): From a609ce5b1997b4df8c534e543bf22371b193f9a2 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Thu, 14 Feb 2019 19:29:48 -0800 Subject: [PATCH 56/83] try exit(1) --- dask_ml/model_selection/utils_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_ml/model_selection/utils_test.py b/dask_ml/model_selection/utils_test.py index 3881dd29c..651fd73c2 100644 --- a/dask_ml/model_selection/utils_test.py +++ b/dask_ml/model_selection/utils_test.py @@ -223,7 +223,7 @@ def fit(self, X, y): dsk_killed_workers.set(killed_workers) if should_die: - os.kill(os.getpid(), 9) + exit(1) return self def transform(self, X): From a0fdbd876e79e45c7a3c5139df524aed9cb15a78 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Thu, 14 Feb 2019 19:40:08 -0800 Subject: [PATCH 57/83] try localcluster --- .../dask_searchcv/test_model_selection.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/model_selection/dask_searchcv/test_model_selection.py b/tests/model_selection/dask_searchcv/test_model_selection.py index 75fc551d0..1e2b1638c 100644 --- a/tests/model_selection/dask_searchcv/test_model_selection.py +++ b/tests/model_selection/dask_searchcv/test_model_selection.py @@ -791,10 +791,16 @@ def f(dask_scheduler): assert client.run_on_scheduler(f) # some work happened on cluster +from distributed import Client +from distributed.utils_test import cluster, loop +from dask.distributed import LocalCluster, Lock, Variable + def test_as_completed_distributed(loop): - with cluster(active_rpc_timeout=10, nanny=Nanny) as (s, [a, b]): - with Client(s['address'], loop=loop) as c: + # with cluster(active_rpc_timeout=10, nanny=Nanny) as (s, [a, b]): + # with Client(s['address'], loop=loop) as c: + with LocalCluster() as clstr: + with Client(clstr) as c: counter_name = 'counter_name' counter = Variable(counter_name, client=c) counter.set(0) From 7c450a9cf84b145b45a40ded41cc74b5b6ae26b6 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Thu, 14 Feb 2019 20:03:55 -0800 Subject: [PATCH 58/83] catch exception if worker killed during .result() --- dask_ml/model_selection/_search.py | 23 ++++++++++++------- dask_ml/model_selection/utils_test.py | 2 +- .../dask_searchcv/test_model_selection.py | 8 +++---- 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index 1f68948db..a4d01875b 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -1184,17 +1184,24 @@ def fit(self, X, y=None, groups=None, **fit_params): futures = scheduler( dsk, keys, allow_other_workers=True, num_workers=n_jobs, sync=False ) + + def reschedule_future(f, fs): + f.retry() + logger.warning('{} has failed... retrying'.format(f.key)) + fs.append(f) + return fs + result_map = {} while len(result_map) != len(keys): failed_futures = [] - for future, result in as_completed(futures, with_results=True, raise_errors=False): - if future.status == "finished": - result_map[future.key] = result - elif future.status == "error": - future.retry() - logger.warning('{} has failed... retrying'.format(future.key)) - failed_futures.append(future) - + for future in as_completed(futures): + try: + if future.status == "finished": + result_map[future.key] = future.result() + elif future.status == "error": + failed_futures = reschedule_future(future, failed_futures) + except Exception as e: + failed_futures = reschedule_future(future, failed_futures) futures = failed_futures out = [result_map[k] for k in keys] else: diff --git a/dask_ml/model_selection/utils_test.py b/dask_ml/model_selection/utils_test.py index 651fd73c2..3881dd29c 100644 --- a/dask_ml/model_selection/utils_test.py +++ b/dask_ml/model_selection/utils_test.py @@ -223,7 +223,7 @@ def fit(self, X, y): dsk_killed_workers.set(killed_workers) if should_die: - exit(1) + os.kill(os.getpid(), 9) return self def transform(self, X): diff --git a/tests/model_selection/dask_searchcv/test_model_selection.py b/tests/model_selection/dask_searchcv/test_model_selection.py index 1e2b1638c..0a92090f2 100644 --- a/tests/model_selection/dask_searchcv/test_model_selection.py +++ b/tests/model_selection/dask_searchcv/test_model_selection.py @@ -797,10 +797,10 @@ def f(dask_scheduler): def test_as_completed_distributed(loop): - # with cluster(active_rpc_timeout=10, nanny=Nanny) as (s, [a, b]): - # with Client(s['address'], loop=loop) as c: - with LocalCluster() as clstr: - with Client(clstr) as c: + with cluster(active_rpc_timeout=10, nanny=Nanny) as (s, [a, b]): + with Client(s['address'], loop=loop) as c: + # with LocalCluster() as clstr: + # with Client(clstr) as c: counter_name = 'counter_name' counter = Variable(counter_name, client=c) counter.set(0) From b2d2cf966325c941850aa80cd5769fe9b1ed8817 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Thu, 14 Feb 2019 20:33:35 -0800 Subject: [PATCH 59/83] see if this retry works on linux --- dask_ml/model_selection/_search.py | 15 ++++----------- dask_ml/model_selection/utils_test.py | 11 ++++++----- .../dask_searchcv/test_model_selection.py | 6 ------ 3 files changed, 10 insertions(+), 22 deletions(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index a4d01875b..78dd49839 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -1185,23 +1185,16 @@ def fit(self, X, y=None, groups=None, **fit_params): dsk, keys, allow_other_workers=True, num_workers=n_jobs, sync=False ) - def reschedule_future(f, fs): - f.retry() - logger.warning('{} has failed... retrying'.format(f.key)) - fs.append(f) - return fs - result_map = {} while len(result_map) != len(keys): failed_futures = [] for future in as_completed(futures): try: - if future.status == "finished": - result_map[future.key] = future.result() - elif future.status == "error": - failed_futures = reschedule_future(future, failed_futures) + result_map[future.key] = future.result() except Exception as e: - failed_futures = reschedule_future(future, failed_futures) + future.retry() + logger.warning('{} has failed... retrying'.format(future.key)) + failed_futures.append(future) futures = failed_futures out = [result_map[k] for k in keys] else: diff --git a/dask_ml/model_selection/utils_test.py b/dask_ml/model_selection/utils_test.py index 3881dd29c..a9f55789b 100644 --- a/dask_ml/model_selection/utils_test.py +++ b/dask_ml/model_selection/utils_test.py @@ -196,7 +196,7 @@ def score(self, X=None, Y=None): return score -class AsCompletedEstimator(BaseEstimator): +class AsCompletedEstimator(MockClassifier): def __init__(self, killed_workers_name='killed_workers', lock_name='lock', counter_name='counter', min_complete=7, foo_param=None): self.foo_param = foo_param self.counter_name = counter_name @@ -204,7 +204,7 @@ def __init__(self, killed_workers_name='killed_workers', lock_name='lock', count self.lock_name = lock_name self.min_complete = min_complete - def fit(self, X, y): + def fit(self, X=None, y=None): w = get_worker() dsk_lock = Lock(self.lock_name, client=w.client) dsk_counter = Variable(self.counter_name, client=w.client) @@ -223,14 +223,15 @@ def fit(self, X, y): dsk_killed_workers.set(killed_workers) if should_die: + #exit(1) os.kill(os.getpid(), 9) return self - def transform(self, X): + def transform(self, X=None): return X - def predict(self, X): + def predict(self, X=None): return X - def score(self, X, y): + def score(self, X=None, y=None): return 1 diff --git a/tests/model_selection/dask_searchcv/test_model_selection.py b/tests/model_selection/dask_searchcv/test_model_selection.py index 0a92090f2..75fc551d0 100644 --- a/tests/model_selection/dask_searchcv/test_model_selection.py +++ b/tests/model_selection/dask_searchcv/test_model_selection.py @@ -791,16 +791,10 @@ def f(dask_scheduler): assert client.run_on_scheduler(f) # some work happened on cluster -from distributed import Client -from distributed.utils_test import cluster, loop -from dask.distributed import LocalCluster, Lock, Variable - def test_as_completed_distributed(loop): with cluster(active_rpc_timeout=10, nanny=Nanny) as (s, [a, b]): with Client(s['address'], loop=loop) as c: - # with LocalCluster() as clstr: - # with Client(clstr) as c: counter_name = 'counter_name' counter = Variable(counter_name, client=c) counter.set(0) From ca5b07e009d2884cbe5eff0a93e4fb41d97b9459 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Thu, 14 Feb 2019 20:35:51 -0800 Subject: [PATCH 60/83] remove commented exit(1) --- dask_ml/model_selection/utils_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dask_ml/model_selection/utils_test.py b/dask_ml/model_selection/utils_test.py index a9f55789b..c2ab2e405 100644 --- a/dask_ml/model_selection/utils_test.py +++ b/dask_ml/model_selection/utils_test.py @@ -223,7 +223,6 @@ def fit(self, X=None, y=None): dsk_killed_workers.set(killed_workers) if should_die: - #exit(1) os.kill(os.getpid(), 9) return self From cde1f5c9d64c331e8be2d469c6246848917cf6ad Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Thu, 14 Feb 2019 20:42:15 -0800 Subject: [PATCH 61/83] black formatting and isort --- dask_ml/model_selection/_search.py | 2 +- dask_ml/model_selection/utils_test.py | 15 +++++++++----- .../dask_searchcv/test_model_selection.py | 20 ++++++++++--------- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index 78dd49839..f21d10646 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -1193,7 +1193,7 @@ def fit(self, X, y=None, groups=None, **fit_params): result_map[future.key] = future.result() except Exception as e: future.retry() - logger.warning('{} has failed... retrying'.format(future.key)) + logger.warning("{} has failed... retrying".format(future.key)) failed_futures.append(future) futures = failed_futures out = [result_map[k] for k in keys] diff --git a/dask_ml/model_selection/utils_test.py b/dask_ml/model_selection/utils_test.py index c2ab2e405..75d971647 100644 --- a/dask_ml/model_selection/utils_test.py +++ b/dask_ml/model_selection/utils_test.py @@ -1,13 +1,11 @@ -from ast import literal_eval import os +from ast import literal_eval import numpy as np - +from distributed import Lock, Variable, get_worker from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.utils.validation import _num_samples, check_array -from distributed import get_worker, Lock, Variable - # This class doesn't inherit from BaseEstimator to test hyperparameter search # on user-defined classifiers. @@ -197,7 +195,14 @@ def score(self, X=None, Y=None): class AsCompletedEstimator(MockClassifier): - def __init__(self, killed_workers_name='killed_workers', lock_name='lock', counter_name='counter', min_complete=7, foo_param=None): + def __init__( + self, + killed_workers_name="killed_workers", + lock_name="lock", + counter_name="counter", + min_complete=7, + foo_param=None, + ): self.foo_param = foo_param self.counter_name = counter_name self.killed_workers_name = killed_workers_name diff --git a/tests/model_selection/dask_searchcv/test_model_selection.py b/tests/model_selection/dask_searchcv/test_model_selection.py index 75fc551d0..ad80b98c8 100644 --- a/tests/model_selection/dask_searchcv/test_model_selection.py +++ b/tests/model_selection/dask_searchcv/test_model_selection.py @@ -14,6 +14,8 @@ from dask.callbacks import Callback from dask.delayed import delayed from dask.utils import tmpdir +from distributed import Client, Nanny, Variable +from distributed.utils_test import cluster, loop from sklearn.datasets import load_iris, make_classification from sklearn.decomposition import PCA from sklearn.ensemble import RandomForestClassifier @@ -52,9 +54,6 @@ ScalingTransformer, ) -from distributed import Client, Nanny, Variable -from distributed.utils_test import cluster, loop - class assert_dask_compute(Callback): def __init__(self, compute=False): @@ -794,24 +793,26 @@ def f(dask_scheduler): def test_as_completed_distributed(loop): with cluster(active_rpc_timeout=10, nanny=Nanny) as (s, [a, b]): - with Client(s['address'], loop=loop) as c: - counter_name = 'counter_name' + with Client(s["address"], loop=loop) as c: + counter_name = "counter_name" counter = Variable(counter_name, client=c) counter.set(0) - lock_name = 'lock' + lock_name = "lock" - killed_workers_name = 'killed_workers' + killed_workers_name = "killed_workers" killed_workers = Variable(killed_workers_name, client=c) killed_workers.set({}) X, y = make_classification(n_samples=100, n_features=10, random_state=0) gs = dcv.GridSearchCV( - AsCompletedEstimator(killed_workers_name, lock_name, counter_name, min_complete=7), + AsCompletedEstimator( + killed_workers_name, lock_name, counter_name, min_complete=7 + ), param_grid={"foo_param": [0, 1, 2]}, cv=3, refit=False, cache_cv=False, - scheduler=c + scheduler=c, ) gs.fit(X, y) @@ -833,6 +834,7 @@ def check_reprocess(transition_log): and end_state == "forgotten" ): finished.add(key) + check_reprocess(c.run_on_scheduler(f)) From 63ef4c39b67b3e856f169fac6e57f38deef4369b Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Thu, 14 Feb 2019 20:53:53 -0800 Subject: [PATCH 62/83] print exception with retry --- dask_ml/model_selection/_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index f21d10646..a898cea5e 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -1193,7 +1193,7 @@ def fit(self, X, y=None, groups=None, **fit_params): result_map[future.key] = future.result() except Exception as e: future.retry() - logger.warning("{} has failed... retrying".format(future.key)) + logger.warning("{} has failed due to {}... retrying".format(future.key, e)) failed_futures.append(future) futures = failed_futures out = [result_map[k] for k in keys] From 7a455ce558bd42553016baf5e148dcd244b1183e Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Thu, 14 Feb 2019 21:04:29 -0800 Subject: [PATCH 63/83] flake8 fix --- dask_ml/model_selection/_search.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index a898cea5e..65639a8de 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -1193,7 +1193,9 @@ def fit(self, X, y=None, groups=None, **fit_params): result_map[future.key] = future.result() except Exception as e: future.retry() - logger.warning("{} has failed due to {}... retrying".format(future.key, e)) + logger.warning( + "{} has failed due to {}... retrying".format(future.key, e) + ) failed_futures.append(future) futures = failed_futures out = [result_map[k] for k in keys] From 8dc53ab8606b689e69c215d8471bb71211bedf7f Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Thu, 14 Feb 2019 21:10:52 -0800 Subject: [PATCH 64/83] more flake8 --- tests/model_selection/dask_searchcv/test_model_selection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/model_selection/dask_searchcv/test_model_selection.py b/tests/model_selection/dask_searchcv/test_model_selection.py index ad80b98c8..1eec98ecf 100644 --- a/tests/model_selection/dask_searchcv/test_model_selection.py +++ b/tests/model_selection/dask_searchcv/test_model_selection.py @@ -15,7 +15,7 @@ from dask.delayed import delayed from dask.utils import tmpdir from distributed import Client, Nanny, Variable -from distributed.utils_test import cluster, loop +from distributed.utils_test import cluster from sklearn.datasets import load_iris, make_classification from sklearn.decomposition import PCA from sklearn.ensemble import RandomForestClassifier From 6c4150ec839decdaaf9e0e560c0772ed04cb7ea5 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Thu, 14 Feb 2019 21:19:15 -0800 Subject: [PATCH 65/83] match imports with master given loop fails flake8 --- .../dask_searchcv/test_model_selection.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/model_selection/dask_searchcv/test_model_selection.py b/tests/model_selection/dask_searchcv/test_model_selection.py index 1eec98ecf..9bd85f34b 100644 --- a/tests/model_selection/dask_searchcv/test_model_selection.py +++ b/tests/model_selection/dask_searchcv/test_model_selection.py @@ -14,8 +14,6 @@ from dask.callbacks import Callback from dask.delayed import delayed from dask.utils import tmpdir -from distributed import Client, Nanny, Variable -from distributed.utils_test import cluster from sklearn.datasets import load_iris, make_classification from sklearn.decomposition import PCA from sklearn.ensemble import RandomForestClassifier @@ -54,6 +52,15 @@ ScalingTransformer, ) +try: + from distributed import Client, Nanny, Variable + from distributed.utils_test import cluster, loop + + has_distributed = True +except ImportError: + loop = pytest.fixture(lambda: None) + has_distributed = False + class assert_dask_compute(Callback): def __init__(self, compute=False): From 250e08c8e653d671db73aa3a317b8e8c7490a206 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Thu, 14 Feb 2019 21:21:07 -0800 Subject: [PATCH 66/83] match master with has distributed --- tests/model_selection/dask_searchcv/test_model_selection.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/model_selection/dask_searchcv/test_model_selection.py b/tests/model_selection/dask_searchcv/test_model_selection.py index 9bd85f34b..cc35287da 100644 --- a/tests/model_selection/dask_searchcv/test_model_selection.py +++ b/tests/model_selection/dask_searchcv/test_model_selection.py @@ -785,6 +785,7 @@ def test_scheduler_param(scheduler, n_jobs): gs.fit(X, y) +@pytest.mark.skipif("not has_distributed") def test_scheduler_param_distributed(loop): X, y = make_classification(n_samples=100, n_features=10, random_state=0) with cluster() as (s, [a, b]): @@ -798,6 +799,7 @@ def f(dask_scheduler): assert client.run_on_scheduler(f) # some work happened on cluster +@pytest.mark.skipif("not has_distributed") def test_as_completed_distributed(loop): with cluster(active_rpc_timeout=10, nanny=Nanny) as (s, [a, b]): with Client(s["address"], loop=loop) as c: From b962af226ccaf2c0ad4063c65a448bb7b7de414a Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Thu, 14 Feb 2019 23:20:42 -0800 Subject: [PATCH 67/83] clean up AsCompletedEstimator --- dask_ml/model_selection/utils_test.py | 16 +++++++--------- .../dask_searchcv/test_model_selection.py | 2 +- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/dask_ml/model_selection/utils_test.py b/dask_ml/model_selection/utils_test.py index 75d971647..c61101462 100644 --- a/dask_ml/model_selection/utils_test.py +++ b/dask_ml/model_selection/utils_test.py @@ -197,10 +197,10 @@ def score(self, X=None, Y=None): class AsCompletedEstimator(MockClassifier): def __init__( self, - killed_workers_name="killed_workers", - lock_name="lock", - counter_name="counter", - min_complete=7, + killed_workers_name, + lock_name, + counter_name, + min_complete, foo_param=None, ): self.foo_param = foo_param @@ -209,7 +209,7 @@ def __init__( self.lock_name = lock_name self.min_complete = min_complete - def fit(self, X=None, y=None): + def fit(self, X, y=None): w = get_worker() dsk_lock = Lock(self.lock_name, client=w.client) dsk_counter = Variable(self.counter_name, client=w.client) @@ -231,11 +231,9 @@ def fit(self, X=None, y=None): os.kill(os.getpid(), 9) return self - def transform(self, X=None): + def transform(self, X): return X - def predict(self, X=None): + def predict(self, X): return X - def score(self, X=None, y=None): - return 1 diff --git a/tests/model_selection/dask_searchcv/test_model_selection.py b/tests/model_selection/dask_searchcv/test_model_selection.py index cc35287da..e6c24676c 100644 --- a/tests/model_selection/dask_searchcv/test_model_selection.py +++ b/tests/model_selection/dask_searchcv/test_model_selection.py @@ -815,7 +815,7 @@ def test_as_completed_distributed(loop): X, y = make_classification(n_samples=100, n_features=10, random_state=0) gs = dcv.GridSearchCV( AsCompletedEstimator( - killed_workers_name, lock_name, counter_name, min_complete=7 + killed_workers_name, lock_name, counter_name, 7 ), param_grid={"foo_param": [0, 1, 2]}, cv=3, From f206bc577cd3cb547a7750a21f01b36e2631aff8 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Thu, 14 Feb 2019 23:22:54 -0800 Subject: [PATCH 68/83] use base estimator for AsCompletedEstimator --- dask_ml/model_selection/utils_test.py | 4 +++- tests/model_selection/dask_searchcv/test_model_selection.py | 4 +--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dask_ml/model_selection/utils_test.py b/dask_ml/model_selection/utils_test.py index c61101462..0a6291b31 100644 --- a/dask_ml/model_selection/utils_test.py +++ b/dask_ml/model_selection/utils_test.py @@ -194,7 +194,7 @@ def score(self, X=None, Y=None): return score -class AsCompletedEstimator(MockClassifier): +class AsCompletedEstimator(BaseEstimator): def __init__( self, killed_workers_name, @@ -237,3 +237,5 @@ def transform(self, X): def predict(self, X): return X + def score(self, X, y): + return 1 diff --git a/tests/model_selection/dask_searchcv/test_model_selection.py b/tests/model_selection/dask_searchcv/test_model_selection.py index e6c24676c..00362f7a8 100644 --- a/tests/model_selection/dask_searchcv/test_model_selection.py +++ b/tests/model_selection/dask_searchcv/test_model_selection.py @@ -814,9 +814,7 @@ def test_as_completed_distributed(loop): X, y = make_classification(n_samples=100, n_features=10, random_state=0) gs = dcv.GridSearchCV( - AsCompletedEstimator( - killed_workers_name, lock_name, counter_name, 7 - ), + AsCompletedEstimator(killed_workers_name, lock_name, counter_name, 7), param_grid={"foo_param": [0, 1, 2]}, cv=3, refit=False, From 6a18f30865d387416e89641085a903485855b2b6 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Thu, 14 Feb 2019 23:35:24 -0800 Subject: [PATCH 69/83] looks like we need to use mock classifier --- dask_ml/model_selection/utils_test.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/dask_ml/model_selection/utils_test.py b/dask_ml/model_selection/utils_test.py index 0a6291b31..6c2b4742f 100644 --- a/dask_ml/model_selection/utils_test.py +++ b/dask_ml/model_selection/utils_test.py @@ -194,7 +194,7 @@ def score(self, X=None, Y=None): return score -class AsCompletedEstimator(BaseEstimator): +class AsCompletedEstimator(MockClassifier): def __init__( self, killed_workers_name, @@ -236,6 +236,3 @@ def transform(self, X): def predict(self, X): return X - - def score(self, X, y): - return 1 From fa9fb176a1b76279f9557b2ead7ed9c0a3bdad55 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Thu, 14 Feb 2019 23:36:00 -0800 Subject: [PATCH 70/83] init w/ foo param --- dask_ml/model_selection/utils_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_ml/model_selection/utils_test.py b/dask_ml/model_selection/utils_test.py index 6c2b4742f..cc61a351a 100644 --- a/dask_ml/model_selection/utils_test.py +++ b/dask_ml/model_selection/utils_test.py @@ -203,7 +203,7 @@ def __init__( min_complete, foo_param=None, ): - self.foo_param = foo_param + super().__init__(foo_param) self.counter_name = counter_name self.killed_workers_name = killed_workers_name self.lock_name = lock_name From bfdea5e74b7d41d9e51a760c613159163e511a9c Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Thu, 14 Feb 2019 23:37:06 -0800 Subject: [PATCH 71/83] 2.7 super init --- dask_ml/model_selection/utils_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_ml/model_selection/utils_test.py b/dask_ml/model_selection/utils_test.py index cc61a351a..c4e52640b 100644 --- a/dask_ml/model_selection/utils_test.py +++ b/dask_ml/model_selection/utils_test.py @@ -203,7 +203,7 @@ def __init__( min_complete, foo_param=None, ): - super().__init__(foo_param) + super(MockClassifier, self).__init__(foo_param) self.counter_name = counter_name self.killed_workers_name = killed_workers_name self.lock_name = lock_name From 8ef5338adf5a15a0d2e7ba73d9bf2e2779bb4d4d Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Thu, 14 Feb 2019 23:40:35 -0800 Subject: [PATCH 72/83] need to use AsCompletedEstimator in super --- dask_ml/model_selection/utils_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_ml/model_selection/utils_test.py b/dask_ml/model_selection/utils_test.py index c4e52640b..48b143a79 100644 --- a/dask_ml/model_selection/utils_test.py +++ b/dask_ml/model_selection/utils_test.py @@ -203,7 +203,7 @@ def __init__( min_complete, foo_param=None, ): - super(MockClassifier, self).__init__(foo_param) + super(AsCompletedEstimator, self).__init__(foo_param) self.counter_name = counter_name self.killed_workers_name = killed_workers_name self.lock_name = lock_name From 869e8b9b1462d1aa834f549bf5921b820493648e Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Fri, 15 Feb 2019 00:01:57 -0800 Subject: [PATCH 73/83] black reformatting --- dask_ml/model_selection/utils_test.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/dask_ml/model_selection/utils_test.py b/dask_ml/model_selection/utils_test.py index 48b143a79..3515269b3 100644 --- a/dask_ml/model_selection/utils_test.py +++ b/dask_ml/model_selection/utils_test.py @@ -196,12 +196,7 @@ def score(self, X=None, Y=None): class AsCompletedEstimator(MockClassifier): def __init__( - self, - killed_workers_name, - lock_name, - counter_name, - min_complete, - foo_param=None, + self, killed_workers_name, lock_name, counter_name, min_complete, foo_param=None ): super(AsCompletedEstimator, self).__init__(foo_param) self.counter_name = counter_name From 9c034abd70ad3ac6fb9b8bfe5ef5d39a3a02d304 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Fri, 15 Feb 2019 00:11:56 -0800 Subject: [PATCH 74/83] move liter eval out of lock --- dask_ml/model_selection/utils_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_ml/model_selection/utils_test.py b/dask_ml/model_selection/utils_test.py index 3515269b3..e14e83801 100644 --- a/dask_ml/model_selection/utils_test.py +++ b/dask_ml/model_selection/utils_test.py @@ -212,8 +212,8 @@ def fit(self, X, y=None): for e in list(w.executing): should_die = False + t = literal_eval(e) with dsk_lock: - t = literal_eval(e) c = dsk_counter.get() dsk_counter.set(c + 1) killed_workers = dsk_killed_workers.get() From e482ed9143aa83bee30c54a59c5675945a2a5f19 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Fri, 15 Feb 2019 00:31:57 -0800 Subject: [PATCH 75/83] don't need fit and transform for AsCompleted Test --- dask_ml/model_selection/utils_test.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/dask_ml/model_selection/utils_test.py b/dask_ml/model_selection/utils_test.py index e14e83801..471309edb 100644 --- a/dask_ml/model_selection/utils_test.py +++ b/dask_ml/model_selection/utils_test.py @@ -225,9 +225,3 @@ def fit(self, X, y=None): if should_die: os.kill(os.getpid(), 9) return self - - def transform(self, X): - return X - - def predict(self, X): - return X From 90087449d67dce6ade9626c10f01f15fd17badee Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Sat, 16 Feb 2019 08:51:00 -0800 Subject: [PATCH 76/83] use built in reschedule for as_completed --- dask_ml/model_selection/_search.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index 65639a8de..96b6203cd 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -1186,18 +1186,17 @@ def fit(self, X, y=None, groups=None, **fit_params): ) result_map = {} - while len(result_map) != len(keys): - failed_futures = [] - for future in as_completed(futures): - try: - result_map[future.key] = future.result() - except Exception as e: - future.retry() - logger.warning( - "{} has failed due to {}... retrying".format(future.key, e) - ) - failed_futures.append(future) - futures = failed_futures + ac = as_completed(futures) + for future in ac: + try: + result_map[future.key] = future.result() + except Exception as e: + logger.warning( + "{} has failed due to {}... retrying".format(future.key, e) + ) + future.retry() + ac.add(future) + out = [result_map[k] for k in keys] else: out = scheduler(dsk, keys, num_workers=n_jobs) From 762300326fb730708afb108c7e0dfdeeb0542e57 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Sat, 16 Feb 2019 09:02:50 -0800 Subject: [PATCH 77/83] retry in batches --- dask_ml/model_selection/_search.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index 96b6203cd..85804e7c2 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -1186,16 +1186,17 @@ def fit(self, X, y=None, groups=None, **fit_params): ) result_map = {} - ac = as_completed(futures) - for future in ac: - try: - result_map[future.key] = future.result() - except Exception as e: - logger.warning( - "{} has failed due to {}... retrying".format(future.key, e) - ) - future.retry() - ac.add(future) + ac = as_completed(futures, with_results=True, raise_errors=False) + for batch in ac.batches(): + for future, result in batch: + if future.status == 'finished': + result_map[future.key] = result + else: + logger.warning( + "{} has failed... retrying".format(future.key) + ) + future.retry() + ac.add(future) out = [result_map[k] for k in keys] else: From 9da5b8f0cfa5960a7ee70f216a684bff96a450b2 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Sat, 16 Feb 2019 09:09:23 -0800 Subject: [PATCH 78/83] black reformatting --- dask_ml/model_selection/_search.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index 85804e7c2..9bd722290 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -1189,12 +1189,10 @@ def fit(self, X, y=None, groups=None, **fit_params): ac = as_completed(futures, with_results=True, raise_errors=False) for batch in ac.batches(): for future, result in batch: - if future.status == 'finished': + if future.status == "finished": result_map[future.key] = result else: - logger.warning( - "{} has failed... retrying".format(future.key) - ) + logger.warning("{} has failed... retrying".format(future.key)) future.retry() ac.add(future) From e86bc1f7eeef77e6dfac049d0fc63205c2e7a85b Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Sat, 4 May 2019 09:54:10 -0700 Subject: [PATCH 79/83] remove dead code --- dask_ml/model_selection/methods.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/dask_ml/model_selection/methods.py b/dask_ml/model_selection/methods.py index 435514ac9..c31efb43f 100644 --- a/dask_ml/model_selection/methods.py +++ b/dask_ml/model_selection/methods.py @@ -171,10 +171,6 @@ def cv_extract_params(cvs, keys, vals, n): return {k: cvs.extract_param(tok, v, n) for (k, tok), v in zip(keys, vals)} -def decompress_params(fields, params): - return [{k: v for k, v in zip(fields, p) if v is not MISSING} for p in params] - - def _maybe_timed(x): """Unpack (est, fit_time) tuples if provided""" return x if isinstance(x, tuple) and len(x) == 2 else (x, 0.0) @@ -452,11 +448,6 @@ def create_cv_results( return results -def get_best_params(candidate_params, cv_results, scorer): - best_index = np.flatnonzero(cv_results["rank_test_{}".format(scorer)] == 1)[0] - return candidate_params[best_index] - - def fit_best(estimator, params, X, y, fit_params): estimator = copy_estimator(estimator).set_params(**params) estimator.fit(X, y, **fit_params) From 687a83bc119dcc6aa57127aa01eff1288f1b5ce1 Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Sat, 4 May 2019 16:24:24 -0700 Subject: [PATCH 80/83] Remove optional distributed check as it is a dependency now --- dask_ml/model_selection/_search.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index 9bd722290..f9ad646ee 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -12,6 +12,7 @@ import packaging.version from dask.base import tokenize from dask.delayed import delayed +from dask.distributed import as_completed from dask.utils import derived_from from sklearn import model_selection from sklearn.base import BaseEstimator, MetaEstimatorMixin, clone, is_classifier @@ -61,10 +62,6 @@ except ImportError: # pragma: no cover from toolz import get, pluck -try: - from dask.distributed import as_completed -except ImportError: - pass __all__ = ["GridSearchCV", "RandomizedSearchCV"] From 1eb113c64c7cb7bac878af9ec516f8d8bbb3f8cb Mon Sep 17 00:00:00 2001 From: Paul Vecchio Date: Mon, 6 May 2019 20:05:13 -0700 Subject: [PATCH 81/83] Remove has_distributed check from tests --- .../dask_searchcv/test_model_selection.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/tests/model_selection/dask_searchcv/test_model_selection.py b/tests/model_selection/dask_searchcv/test_model_selection.py index 00362f7a8..da0d89ea8 100644 --- a/tests/model_selection/dask_searchcv/test_model_selection.py +++ b/tests/model_selection/dask_searchcv/test_model_selection.py @@ -14,6 +14,8 @@ from dask.callbacks import Callback from dask.delayed import delayed from dask.utils import tmpdir +from distributed import Client, Nanny, Variable +from distributed.utils_test import cluster, loop from sklearn.datasets import load_iris, make_classification from sklearn.decomposition import PCA from sklearn.ensemble import RandomForestClassifier @@ -52,15 +54,6 @@ ScalingTransformer, ) -try: - from distributed import Client, Nanny, Variable - from distributed.utils_test import cluster, loop - - has_distributed = True -except ImportError: - loop = pytest.fixture(lambda: None) - has_distributed = False - class assert_dask_compute(Callback): def __init__(self, compute=False): @@ -785,7 +778,6 @@ def test_scheduler_param(scheduler, n_jobs): gs.fit(X, y) -@pytest.mark.skipif("not has_distributed") def test_scheduler_param_distributed(loop): X, y = make_classification(n_samples=100, n_features=10, random_state=0) with cluster() as (s, [a, b]): @@ -799,7 +791,6 @@ def f(dask_scheduler): assert client.run_on_scheduler(f) # some work happened on cluster -@pytest.mark.skipif("not has_distributed") def test_as_completed_distributed(loop): with cluster(active_rpc_timeout=10, nanny=Nanny) as (s, [a, b]): with Client(s["address"], loop=loop) as c: From d5b95e412aed29633be96816b69ec520a3c48dc9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 20 May 2019 13:05:34 -0500 Subject: [PATCH 82/83] Add compat for TPOT --- dask_ml/model_selection/_search.py | 66 +++++++++++++++++++++++++++++- 1 file changed, 65 insertions(+), 1 deletion(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index f9ad646ee..1bf708037 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -116,6 +116,70 @@ def map_fit_params(dsk, fit_params): return fit_params +def build_graph( + estimator, + cv, + scorer, + candidate_params, + X, + y=None, + groups=None, + fit_params=None, + iid=True, + refit=True, + error_score="raise", + return_train_score=_RETURN_TRAIN_SCORE_DEFAULT, + cache_cv=True, + multimetric=False, +): + # This is provided for compatibility with TPOT. Remove + # once TPOT is updated and requires a dask-ml>=0.13.0 + def decompress_params(fields, params): + return [{k: v for k, v in zip(fields, p) if v is not MISSING} for p in params] + + fields, tokens, params = normalize_params(candidate_params) + dsk, keys, n_splits, main_token = build_cv_graph( + estimator, + cv, + scorer, + candidate_params, + X, + y=y, + groups=groups, + fit_params=fit_params, + iid=iid, + error_score=error_score, + return_train_score=return_train_score, + cache_cv=cache_cv, + ) + cv_name = "cv-split-" + main_token + if iid: + weights = "cv-n-samples-" + main_token + dsk[weights] = (cv_n_samples, cv_name) + scores = keys[1:] + else: + scores = keys + + cv_results = "cv-results-" + main_token + candidate_params_name = "cv-parameters-" + main_token + dsk[candidate_params_name] = (decompress_params, fields, params) + if multimetric: + metrics = list(scorer.keys()) + else: + metrics = None + dsk[cv_results] = ( + create_cv_results, + scores, + candidate_params_name, + n_splits, + error_score, + weights, + metrics, + ) + keys = [cv_results] + return dsk, keys, n_splits + + def build_cv_graph( estimator, cv, @@ -181,7 +245,7 @@ def build_cv_graph( return_train_score, ) keys = [weights] + scores if weights else scores - return dsk, keys, n_splits + return dsk, keys, n_splits, main_token def build_refit_graph(estimator, X, y, best_params, fit_params): From 33f6fd2c3dbc0560a0e3e9f433b720c79750bd09 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 20 May 2019 13:53:09 -0500 Subject: [PATCH 83/83] expecte token --- dask_ml/model_selection/_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py index 1bf708037..589be296b 100644 --- a/dask_ml/model_selection/_search.py +++ b/dask_ml/model_selection/_search.py @@ -1218,7 +1218,7 @@ def fit(self, X, y=None, groups=None, **fit_params): ) candidate_params = list(self._get_param_iterator()) - dsk, keys, n_splits = build_cv_graph( + dsk, keys, n_splits, _ = build_cv_graph( estimator, self.cv, self.scorer_,