fix issue #14 for scikit-learn update

yzhao062 · Feb 2, 2024 · 93fa075 · 93fa075
1 parent ac651a4
commit 93fa075
Show file tree

Hide file tree

Showing 14 changed files with 59 additions and 117 deletions.
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -19,4 +19,5 @@ v<0.0.8>, <10/01/2021> -- Urgent update for scikit-learn 1.0.
 v<0.0.9>, <07/16/2023> -- Urgent update for scikit-learn 1.3.
 v<0.1.0>, <10/26/2023> -- Fix the issue for python 3.7.
 v<0.1.0>, <10/26/2023> -- Update docs.
-v<0.1.1>, <11/19/2023> -- Optimize for model compatibility.
+v<0.1.1>, <11/19/2023> -- Optimize for model compatibility.
+v<0.1.2>, <02/01/2024> -- Permanently fix the sklearn incompatibility issue.
diff --git a/suod/models/base.py b/suod/models/base.py
@@ -25,6 +25,7 @@
 from pyod.models.sklearn_base import _pprint
 from pyod.utils.utility import check_parameter
 
+from suod.models.cost_predictor import build_cost_predictor
 from suod.models.parallel_processes import cost_forecast_meta
 from suod.models.parallel_processes import balanced_scheduling
 from suod.models.parallel_processes import _parallel_fit
@@ -33,7 +34,7 @@
 from suod.models.parallel_processes import _parallel_decision_function
 from suod.models.parallel_processes import _partition_estimators
 from suod.models.parallel_processes import _parallel_approx_estimators
-from ..utils.utility import _unfold_parallel, build_codes, _get_sklearn_version
+from ..utils.utility import _unfold_parallel, build_codes
 
 import warnings
 from collections import defaultdict
@@ -45,36 +46,6 @@
 from inspect import signature
 
 
-def load_predictor_train(cost_forecast_loc_fit):
-	this_directory = os.path.abspath(os.path.dirname(__file__))
-
-	# validate the trained model
-	if cost_forecast_loc_fit is None:
-		try:
-			cost_predictor = joblib.load(os.path.join(
-				this_directory, 'saved_models', 'bps_train.joblib'))
-		except KeyError:
-			cost_predictor = joblib.load(os.path.join(
-				this_directory, 'saved_models', 'bps_train_old.joblib'))
-
-		return cost_predictor
-
-
-def load_predictor_prediction(cost_forecast_loc_pred):
-	this_directory = os.path.abspath(os.path.dirname(__file__))
-
-	# validate the trained model
-	if cost_forecast_loc_pred is None:
-		try:
-			cost_predictor = joblib.load(os.path.join(
-				this_directory, 'saved_models', 'bps_prediction.joblib'))
-		except KeyError:
-			cost_predictor = joblib.load(os.path.join(
-				this_directory, 'saved_models', 'bps_prediction_old.joblib'))
-
-		return cost_predictor
-
-
 # noinspection PyPep8
 class SUOD(object):
 	"""SUOD (Scalable Unsupervised Outlier Detection) is an acceleration
@@ -142,11 +113,6 @@ class SUOD(object):
 	approx_clf : object, optional (default: sklearn RandomForestRegressor)
 		The supervised model used to approximate unsupervised models.
 
-	cost_forecast_loc_fit : str, optional
-		The location of the pretrained cost prediction forecast for training.
-
-	cost_forecast_loc_pred : str, optional
-		The location of the pretrained cost prediction forecast for prediction.
 
 	verbose : bool, optional (default=False)
 		Controls the verbosity of the building process.
@@ -157,7 +123,6 @@ def __init__(self, base_estimators, contamination=0.1, n_jobs=None,
 				 target_dim_frac=0.5, jl_method='basic', bps_flag=True,
 				 approx_clf_list=None, approx_ng_clf_list=None,
 				 approx_flag_global=True, approx_clf=None,
-				 cost_forecast_loc_fit=None, cost_forecast_loc_pred=None,
 				 verbose=False):
 
 		assert (isinstance(base_estimators, (list)))
@@ -172,8 +137,6 @@ def __init__(self, base_estimators, contamination=0.1, n_jobs=None,
 		self.verbose = verbose
 		self.approx_flag_global = approx_flag_global
 		self.contamination = contamination
-		self.cost_forecast_loc_fit = cost_forecast_loc_fit
-		self.cost_forecast_loc_pred = cost_forecast_loc_pred
 
 		self._parameter_validation(contamination, n_jobs, rp_clf_list,
 								   rp_ng_clf_list, approx_clf_list,
@@ -245,29 +208,6 @@ def _parameter_validation(self, contamination, n_jobs, rp_clf_list,
 		else:
 			self.approx_ng_clf_list = approx_ng_clf_list
 
-		# this_directory = os.path.abspath(os.path.dirname(__file__))
-		#
-		# # validate the trained model
-		# if cost_forecast_loc_fit is None:
-		#
-		#     sklearn_version = _get_sklearn_version()
-		#     if sklearn_version[:3] >= '1.3':
-		#         self.cost_forecast_loc_fit_ = os.path.join(
-		#             this_directory, 'saved_models', 'bps_train.joblib')
-		#     else:
-		#         self.cost_forecast_loc_fit_ = os.path.join(
-		#             this_directory, 'saved_models', 'bps_train_old.joblib')
-		# else:
-		#     self.cost_forecast_loc_fit_ = cost_forecast_loc_fit
-		#
-		# if cost_forecast_loc_pred is None:
-		#     self.cost_forecast_loc_pred_ = os.path.join(
-		#         this_directory, 'saved_models', 'bps_prediction.joblib')
-		# else:
-		#     self.cost_forecast_loc_pred_ = cost_forecast_loc_pred
-		#
-		# return self
-
 	def fit(self, X):
 		"""Fit all base estimators.
 
@@ -303,7 +243,16 @@ def fit(self, X):
 		# it is turned off
 		if self.bps_flag:
 			# load the pre-trained cost predictor to forecast the train cost
-			cost_predictor = load_predictor_train(self.cost_forecast_loc_fit)
+			this_directory = os.path.abspath(os.path.dirname(__file__))
+			train_file = os.path.join(this_directory, "saved_models",
+									  "bps_train_curr.joblib")
+			build_cost_predictor(
+				file_name=os.path.join(this_directory, 'saved_models',
+									   'summary_train.txt'),
+				output_file=train_file,
+				save_to_local=True)
+
+			cost_predictor = joblib.load(train_file)
 
 			print(cost_predictor)
 			time_cost_pred = cost_forecast_meta(cost_predictor, X,
@@ -327,7 +276,7 @@ def fit(self, X):
 		# https://github.com/joblib/joblib/issues/806
 		# a fix is on the way: https://github.com/joblib/joblib/pull/966
 		# max_nbytes can be dropped on other OS
-		all_results = Parallel(n_jobs=n_jobs, max_nbytes=None, verbose=True)(
+		all_results = Parallel(n_jobs=n_jobs, verbose=True)(
 			delayed(_parallel_fit)(
 				n_estimators_list[i],
 				self.base_estimators[starts[i]:starts[i + 1]],
@@ -418,9 +367,19 @@ def predict(self, X):
 		# decide whether bps is needed
 		# it is turned off
 		if self.bps_flag:
+
+			this_directory = os.path.abspath(os.path.dirname(__file__))
+			prediction_file = os.path.join(this_directory, "saved_models",
+										   "bps_prediction_curr.joblib")
+
+			build_cost_predictor(
+				file_name=os.path.join(this_directory, 'saved_models',
+									   'summary_prediction.txt'),
+				output_file=prediction_file,
+				save_to_local=True)
+
 			# load the pre-trained cost predictor to forecast the prediction cost
-			cost_predictor = load_predictor_prediction(
-				self.cost_forecast_loc_pred)
+			cost_predictor = joblib.load(prediction_file)
 
 			time_cost_pred = cost_forecast_meta(cost_predictor, X,
 												self.base_estimator_names)
@@ -437,10 +396,7 @@ def predict(self, X):
 			print('Parallel label prediction...')
 			start = time.time()
 
-		# TODO: code cleanup. There is an existing bug for joblib on Windows:
-		# https://github.com/joblib/joblib/issues/806
-		# max_nbytes can be dropped on other OS
-		all_results_pred = Parallel(n_jobs=n_jobs, max_nbytes=None,
+		all_results_pred = Parallel(n_jobs=n_jobs,
 									verbose=True)(
 			delayed(_parallel_predict)(
 				n_estimators_list[i],
@@ -492,8 +448,18 @@ def decision_function(self, X):
 		# it is turned off
 		if self.bps_flag:
 			# load the pre-trained cost predictor to forecast the train cost
-			cost_predictor = load_predictor_prediction(
-				self.cost_forecast_loc_pred)
+			this_directory = os.path.abspath(os.path.dirname(__file__))
+			prediction_file = os.path.join(this_directory, "saved_models",
+										   "bps_prediction_curr.joblib")
+
+			build_cost_predictor(
+				file_name=os.path.join(this_directory, 'saved_models',
+									   'summary_prediction.txt'),
+				output_file=prediction_file,
+				save_to_local=True)
+
+			# load the pre-trained cost predictor to forecast the prediction cost
+			cost_predictor = joblib.load(prediction_file)
 
 			time_cost_pred = cost_forecast_meta(cost_predictor, X,
 												self.base_estimator_names)
@@ -510,10 +476,7 @@ def decision_function(self, X):
 			print('Parallel score prediction...')
 			start = time.time()
 
-		# TODO: code cleanup. There is an existing bug for joblib on Windows:
-		# https://github.com/joblib/joblib/issues/806
-		# max_nbytes can be dropped on other OS
-		all_results_scores = Parallel(n_jobs=n_jobs, max_nbytes=None,
+		all_results_scores = Parallel(n_jobs=n_jobs,
 									  verbose=True)(
 			delayed(_parallel_decision_function)(
 				n_estimators_list[i],
@@ -568,9 +531,18 @@ def predict_proba(self, X):
 		# decide whether bps is needed
 		# it is turned off
 		if self.bps_flag:
+			this_directory = os.path.abspath(os.path.dirname(__file__))
+			prediction_file = os.path.join(this_directory, "saved_models",
+										   "bps_prediction_curr.joblib")
+
+			build_cost_predictor(
+				file_name=os.path.join(this_directory, 'saved_models',
+									   'summary_prediction.txt'),
+				output_file=prediction_file,
+				save_to_local=True)
+
 			# load the pre-trained cost predictor to forecast the prediction cost
-			cost_predictor = load_predictor_prediction(
-				self.cost_forecast_loc_pred)
+			cost_predictor = joblib.load(prediction_file)
 
 			time_cost_pred = cost_forecast_meta(cost_predictor, X,
 												self.base_estimator_names)
@@ -587,10 +559,7 @@ def predict_proba(self, X):
 			print('Parallel score prediction...')
 			start = time.time()
 
-		# TODO: code cleanup. There is an existing bug for joblib on Windows:
-		# https://github.com/joblib/joblib/issues/806
-		# max_nbytes can be dropped on other OS
-		all_results_scores = Parallel(n_jobs=n_jobs, max_nbytes=None,
+		all_results_scores = Parallel(n_jobs=n_jobs,
 									  verbose=True)(
 			delayed(_parallel_predict_proba)(
 				n_estimators_list[i],

diff --git a/suod/models/cost_predictor.py b/suod/models/cost_predictor.py
@@ -108,7 +108,7 @@ def build_cost_predictor(file_name, output_file, save_to_local=True):
         pearson.append(pearsonr(y_test, test_pred)[0])
         spearman.append(spearmanr(y_test, test_pred)[0])
 
-    print('Spearman Rank', np.mean(spearman))
+    # print('Spearman Rank', np.mean(spearman))
 
     clf.fit(X, y)
 
@@ -121,9 +121,9 @@ def build_cost_predictor(file_name, output_file, save_to_local=True):
     # this should be only executed if the pre-trained model is missing.
     build_cost_predictor(
         file_name=os.path.join('saved_models', 'summary_train.txt'),
-        output_file="bps_train.joblib",
+        output_file="bps_train_curr.joblib",
         save_to_local=False)
     build_cost_predictor(
         file_name=os.path.join('saved_models', 'summary_prediction.txt'),
-        output_file="bps_prediction.joblib",
+        output_file="bps_prediction_curr.joblib",
         save_to_local=False)
diff --git a/suod/models/saved_models/bps_prediction.joblib b/suod/models/saved_models/bps_prediction.joblib
diff --git a/suod/models/saved_models/bps_prediction_old.joblib b/suod/models/saved_models/bps_prediction_old.joblib
diff --git a/suod/models/saved_models/bps_train.joblib b/suod/models/saved_models/bps_train.joblib
diff --git a/suod/models/saved_models/bps_train_old.joblib b/suod/models/saved_models/bps_train_old.joblib
diff --git a/suod/test/saved_models/bps_prediction.joblib b/suod/test/saved_models/bps_prediction.joblib
diff --git a/suod/test/saved_models/bps_prediction_old.joblib b/suod/test/saved_models/bps_prediction_old.joblib
diff --git a/suod/test/saved_models/bps_train.joblib b/suod/test/saved_models/bps_train.joblib
diff --git a/suod/test/saved_models/bps_train_old.joblib b/suod/test/saved_models/bps_train_old.joblib
diff --git a/suod/test/test_base.py b/suod/test/test_base.py
@@ -18,8 +18,8 @@
 from pyod.models.pca import PCA
 from pyod.models.hbos import HBOS
 from pyod.models.lscp import LSCP
-from suod.models.base import load_predictor_train
-from suod.models.base import load_predictor_prediction
+from suod.models.cost_predictor import build_cost_predictor
+import joblib
 
 
 class TestBASE(unittest.TestCase):
@@ -47,22 +47,10 @@ def setUp(self):
 				random_state=self.random_state)
 		]
 
-		this_directory = os.path.abspath(os.path.dirname(__file__))
-
-		self.cost_forecast_loc_fit_ = load_predictor_train(
-			os.path.join(this_directory,
-						 'saved_models/bps_train.joblib'))
-
-		self.cost_forecast_loc_pred_ = load_predictor_prediction(
-			os.path.join(this_directory,
-						 'saved_models/bps_prediction.joblib'))
-
 		self.model = SUOD(base_estimators=self.base_estimators, n_jobs=2,
 						  rp_flag_global=True, bps_flag=True,
 						  contamination=self.contamination,
 						  approx_flag_global=True,
-						  cost_forecast_loc_fit=self.cost_forecast_loc_fit_,
-						  cost_forecast_loc_pred=self.cost_forecast_loc_pred_,
 						  verbose=True)
 
 	def test_initialization(self):

diff --git a/suod/test/test_model_save_load.py b/suod/test/test_model_save_load.py
@@ -18,10 +18,6 @@
 from joblib import dump, load
 
 
-from suod.models.base import load_predictor_train
-from suod.models.base import load_predictor_prediction
-
-
 class TestModelSaveLoad(unittest.TestCase):
 	def setUp(self):
 		self.n_train = 1000
@@ -47,22 +43,10 @@ def setUp(self):
 				random_state=self.random_state)
 		]
 
-		this_directory = os.path.abspath(os.path.dirname(__file__))
-
-		self.cost_forecast_loc_fit_ = load_predictor_train(
-			os.path.join(this_directory,
-						 'saved_models/bps_train.joblib'))
-
-		self.cost_forecast_loc_pred_ = load_predictor_prediction(
-			os.path.join(this_directory,
-						 'saved_models/bps_prediction.joblib'))
-
 		self.model = SUOD(base_estimators=self.base_estimators, n_jobs=2,
 						  rp_flag_global=True, bps_flag=True,
 						  contamination=self.contamination,
 						  approx_flag_global=True,
-						  cost_forecast_loc_fit=self.cost_forecast_loc_fit_,
-						  cost_forecast_loc_pred=self.cost_forecast_loc_pred_,
 						  verbose=True)
 
 	def test_save(self):

diff --git a/suod/version.py b/suod/version.py
@@ -20,4 +20,4 @@
 # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
 # 'X.Y.dev0' is the canonical version of 'X.Y.dev'
 #
-__version__ = '0.1.1'  # pragma: no cover
+__version__ = '0.1.2'  # pragma: no cover