Skip to content

Commit

Permalink
Fix issue #14 for scikit-learn update
Browse files Browse the repository at this point in the history
  • Loading branch information
yzhao062 committed Feb 2, 2024
1 parent ac651a4 commit 93fa075
Show file tree
Hide file tree
Showing 14 changed files with 59 additions and 117 deletions.
3 changes: 2 additions & 1 deletion CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,5 @@ v<0.0.8>, <10/01/2021> -- Urgent update for scikit-learn 1.0.
v<0.0.9>, <07/16/2023> -- Urgent update for scikit-learn 1.3.
v<0.1.0>, <10/26/2023> -- Fix the issue for python 3.7.
v<0.1.0>, <10/26/2023> -- Update docs.
v<0.1.1>, <11/19/2023> -- Optimize for model compatibility.
v<0.1.1>, <11/19/2023> -- Optimize for model compatibility.
v<0.1.2>, <02/01/2024> -- Permanently fix the issue of sklearn incompatibility.
133 changes: 51 additions & 82 deletions suod/models/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from pyod.models.sklearn_base import _pprint
from pyod.utils.utility import check_parameter

from suod.models.cost_predictor import build_cost_predictor
from suod.models.parallel_processes import cost_forecast_meta
from suod.models.parallel_processes import balanced_scheduling
from suod.models.parallel_processes import _parallel_fit
Expand All @@ -33,7 +34,7 @@
from suod.models.parallel_processes import _parallel_decision_function
from suod.models.parallel_processes import _partition_estimators
from suod.models.parallel_processes import _parallel_approx_estimators
from ..utils.utility import _unfold_parallel, build_codes, _get_sklearn_version
from ..utils.utility import _unfold_parallel, build_codes

import warnings
from collections import defaultdict
Expand All @@ -45,36 +46,6 @@
from inspect import signature


def load_predictor_train(cost_forecast_loc_fit):
    """Load the cost predictor used to forecast the training cost.

    Parameters
    ----------
    cost_forecast_loc_fit : str or None
        Location of a saved cost predictor for training. When None, the
        predictor stored in the ``saved_models`` directory next to this
        module is loaded instead.

    Returns
    -------
    cost_predictor : object
        Whatever ``joblib.load`` returns for the selected file.
    """
    # NOTE(review): joblib.load unpickles the file, so only trusted
    # locations should be passed in.
    if cost_forecast_loc_fit is not None:
        # The original fell through to ``return cost_predictor`` without ever
        # assigning it in this case (UnboundLocalError); load the location
        # the caller supplied instead.
        return joblib.load(cost_forecast_loc_fit)

    saved_models_dir = os.path.join(
        os.path.abspath(os.path.dirname(__file__)), 'saved_models')

    try:
        return joblib.load(os.path.join(saved_models_dir, 'bps_train.joblib'))
    except KeyError:
        # presumably raised when the default file does not match the
        # installed sklearn version -- TODO confirm; fall back to the
        # alternative saved model.
        return joblib.load(
            os.path.join(saved_models_dir, 'bps_train_old.joblib'))


def load_predictor_prediction(cost_forecast_loc_pred):
    """Load the cost predictor used to forecast the prediction cost.

    Parameters
    ----------
    cost_forecast_loc_pred : str or None
        Location of a saved cost predictor for prediction. When None, the
        predictor stored in the ``saved_models`` directory next to this
        module is loaded instead.

    Returns
    -------
    cost_predictor : object
        Whatever ``joblib.load`` returns for the selected file.
    """
    # NOTE(review): joblib.load unpickles the file, so only trusted
    # locations should be passed in.
    if cost_forecast_loc_pred is not None:
        # The original fell through to ``return cost_predictor`` without ever
        # assigning it in this case (UnboundLocalError); load the location
        # the caller supplied instead.
        return joblib.load(cost_forecast_loc_pred)

    saved_models_dir = os.path.join(
        os.path.abspath(os.path.dirname(__file__)), 'saved_models')

    try:
        return joblib.load(
            os.path.join(saved_models_dir, 'bps_prediction.joblib'))
    except KeyError:
        # presumably raised when the default file does not match the
        # installed sklearn version -- TODO confirm; fall back to the
        # alternative saved model.
        return joblib.load(
            os.path.join(saved_models_dir, 'bps_prediction_old.joblib'))


# noinspection PyPep8
class SUOD(object):
"""SUOD (Scalable Unsupervised Outlier Detection) is an acceleration
Expand Down Expand Up @@ -142,11 +113,6 @@ class SUOD(object):
approx_clf : object, optional (default: sklearn RandomForestRegressor)
The supervised model used to approximate unsupervised models.
cost_forecast_loc_fit : str, optional
The location of the pretrained cost prediction forecast for training.
cost_forecast_loc_pred : str, optional
The location of the pretrained cost prediction forecast for prediction.
verbose : bool, optional (default=False)
Controls the verbosity of the building process.
Expand All @@ -157,7 +123,6 @@ def __init__(self, base_estimators, contamination=0.1, n_jobs=None,
target_dim_frac=0.5, jl_method='basic', bps_flag=True,
approx_clf_list=None, approx_ng_clf_list=None,
approx_flag_global=True, approx_clf=None,
cost_forecast_loc_fit=None, cost_forecast_loc_pred=None,
verbose=False):

assert (isinstance(base_estimators, (list)))
Expand All @@ -172,8 +137,6 @@ def __init__(self, base_estimators, contamination=0.1, n_jobs=None,
self.verbose = verbose
self.approx_flag_global = approx_flag_global
self.contamination = contamination
self.cost_forecast_loc_fit = cost_forecast_loc_fit
self.cost_forecast_loc_pred = cost_forecast_loc_pred

self._parameter_validation(contamination, n_jobs, rp_clf_list,
rp_ng_clf_list, approx_clf_list,
Expand Down Expand Up @@ -245,29 +208,6 @@ def _parameter_validation(self, contamination, n_jobs, rp_clf_list,
else:
self.approx_ng_clf_list = approx_ng_clf_list

# this_directory = os.path.abspath(os.path.dirname(__file__))
#
# # validate the trained model
# if cost_forecast_loc_fit is None:
#
# sklearn_version = _get_sklearn_version()
# if sklearn_version[:3] >= '1.3':
# self.cost_forecast_loc_fit_ = os.path.join(
# this_directory, 'saved_models', 'bps_train.joblib')
# else:
# self.cost_forecast_loc_fit_ = os.path.join(
# this_directory, 'saved_models', 'bps_train_old.joblib')
# else:
# self.cost_forecast_loc_fit_ = cost_forecast_loc_fit
#
# if cost_forecast_loc_pred is None:
# self.cost_forecast_loc_pred_ = os.path.join(
# this_directory, 'saved_models', 'bps_prediction.joblib')
# else:
# self.cost_forecast_loc_pred_ = cost_forecast_loc_pred
#
# return self

def fit(self, X):
"""Fit all base estimators.
Expand Down Expand Up @@ -303,7 +243,16 @@ def fit(self, X):
# it is turned off
if self.bps_flag:
# load the pre-trained cost predictor to forecast the train cost
cost_predictor = load_predictor_train(self.cost_forecast_loc_fit)
this_directory = os.path.abspath(os.path.dirname(__file__))
train_file = os.path.join(this_directory, "saved_models",
"bps_train_curr.joblib")
build_cost_predictor(
file_name=os.path.join(this_directory, 'saved_models',
'summary_train.txt'),
output_file=train_file,
save_to_local=True)

cost_predictor = joblib.load(train_file)

print(cost_predictor)
time_cost_pred = cost_forecast_meta(cost_predictor, X,
Expand All @@ -327,7 +276,7 @@ def fit(self, X):
# https://github.com/joblib/joblib/issues/806
# a fix is on the way: https://github.com/joblib/joblib/pull/966
# max_nbytes can be dropped on other OS
all_results = Parallel(n_jobs=n_jobs, max_nbytes=None, verbose=True)(
all_results = Parallel(n_jobs=n_jobs, verbose=True)(
delayed(_parallel_fit)(
n_estimators_list[i],
self.base_estimators[starts[i]:starts[i + 1]],
Expand Down Expand Up @@ -418,9 +367,19 @@ def predict(self, X):
# decide whether bps is needed
# it is turned off
if self.bps_flag:

this_directory = os.path.abspath(os.path.dirname(__file__))
prediction_file = os.path.join(this_directory, "saved_models",
"bps_prediction_curr.joblib")

build_cost_predictor(
file_name=os.path.join(this_directory, 'saved_models',
'summary_prediction.txt'),
output_file=prediction_file,
save_to_local=True)

# load the pre-trained cost predictor to forecast the train cost
cost_predictor = load_predictor_prediction(
self.cost_forecast_loc_pred)
cost_predictor = joblib.load(prediction_file)

time_cost_pred = cost_forecast_meta(cost_predictor, X,
self.base_estimator_names)
Expand All @@ -437,10 +396,7 @@ def predict(self, X):
print('Parallel label prediction...')
start = time.time()

# TODO: code cleanup. There is an existing bug for joblib on Windows:
# https://github.com/joblib/joblib/issues/806
# max_nbytes can be dropped on other OS
all_results_pred = Parallel(n_jobs=n_jobs, max_nbytes=None,
all_results_pred = Parallel(n_jobs=n_jobs,
verbose=True)(
delayed(_parallel_predict)(
n_estimators_list[i],
Expand Down Expand Up @@ -492,8 +448,18 @@ def decision_function(self, X):
# it is turned off
if self.bps_flag:
# load the pre-trained cost predictor to forecast the train cost
cost_predictor = load_predictor_prediction(
self.cost_forecast_loc_pred)
this_directory = os.path.abspath(os.path.dirname(__file__))
prediction_file = os.path.join(this_directory, "saved_models",
"bps_prediction_curr.joblib")

build_cost_predictor(
file_name=os.path.join(this_directory, 'saved_models',
'summary_prediction.txt'),
output_file=prediction_file,
save_to_local=True)

# load the pre-trained cost predictor to forecast the train cost
cost_predictor = joblib.load(prediction_file)

time_cost_pred = cost_forecast_meta(cost_predictor, X,
self.base_estimator_names)
Expand All @@ -510,10 +476,7 @@ def decision_function(self, X):
print('Parallel score prediction...')
start = time.time()

# TODO: code cleanup. There is an existing bug for joblib on Windows:
# https://github.com/joblib/joblib/issues/806
# max_nbytes can be dropped on other OS
all_results_scores = Parallel(n_jobs=n_jobs, max_nbytes=None,
all_results_scores = Parallel(n_jobs=n_jobs,
verbose=True)(
delayed(_parallel_decision_function)(
n_estimators_list[i],
Expand Down Expand Up @@ -568,9 +531,18 @@ def predict_proba(self, X):
# decide whether bps is needed
# it is turned off
if self.bps_flag:
this_directory = os.path.abspath(os.path.dirname(__file__))
prediction_file = os.path.join(this_directory, "saved_models",
"bps_prediction_curr.joblib")

build_cost_predictor(
file_name=os.path.join(this_directory, 'saved_models',
'summary_prediction.txt'),
output_file=prediction_file,
save_to_local=True)

# load the pre-trained cost predictor to forecast the train cost
cost_predictor = load_predictor_prediction(
self.cost_forecast_loc_pred)
cost_predictor = joblib.load(prediction_file)

time_cost_pred = cost_forecast_meta(cost_predictor, X,
self.base_estimator_names)
Expand All @@ -587,10 +559,7 @@ def predict_proba(self, X):
print('Parallel score prediction...')
start = time.time()

# TODO: code cleanup. There is an existing bug for joblib on Windows:
# https://github.com/joblib/joblib/issues/806
# max_nbytes can be dropped on other OS
all_results_scores = Parallel(n_jobs=n_jobs, max_nbytes=None,
all_results_scores = Parallel(n_jobs=n_jobs,
verbose=True)(
delayed(_parallel_predict_proba)(
n_estimators_list[i],
Expand Down
6 changes: 3 additions & 3 deletions suod/models/cost_predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def build_cost_predictor(file_name, output_file, save_to_local=True):
pearson.append(pearsonr(y_test, test_pred)[0])
spearman.append(spearmanr(y_test, test_pred)[0])

print('Spearman Rank', np.mean(spearman))
# print('Spearman Rank', np.mean(spearman))

clf.fit(X, y)

Expand All @@ -121,9 +121,9 @@ def build_cost_predictor(file_name, output_file, save_to_local=True):
# this should be only executed if the pre-trained model is missing.
build_cost_predictor(
file_name=os.path.join('saved_models', 'summary_train.txt'),
output_file="bps_train.joblib",
output_file="bps_train_curr.joblib",
save_to_local=False)
build_cost_predictor(
file_name=os.path.join('saved_models', 'summary_prediction.txt'),
output_file="bps_prediction.joblib",
output_file="bps_prediction_curr.joblib",
save_to_local=False)
Binary file removed suod/models/saved_models/bps_prediction.joblib
Binary file not shown.
Binary file removed suod/models/saved_models/bps_prediction_old.joblib
Binary file not shown.
Binary file removed suod/models/saved_models/bps_train.joblib
Binary file not shown.
Binary file removed suod/models/saved_models/bps_train_old.joblib
Binary file not shown.
Binary file removed suod/test/saved_models/bps_prediction.joblib
Binary file not shown.
Binary file removed suod/test/saved_models/bps_prediction_old.joblib
Binary file not shown.
Binary file removed suod/test/saved_models/bps_train.joblib
Binary file not shown.
Binary file removed suod/test/saved_models/bps_train_old.joblib
Binary file not shown.
16 changes: 2 additions & 14 deletions suod/test/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@
from pyod.models.pca import PCA
from pyod.models.hbos import HBOS
from pyod.models.lscp import LSCP
from suod.models.base import load_predictor_train
from suod.models.base import load_predictor_prediction
from suod.models.cost_predictor import build_cost_predictor
import joblib


class TestBASE(unittest.TestCase):
Expand Down Expand Up @@ -47,22 +47,10 @@ def setUp(self):
random_state=self.random_state)
]

this_directory = os.path.abspath(os.path.dirname(__file__))

self.cost_forecast_loc_fit_ = load_predictor_train(
os.path.join(this_directory,
'saved_models/bps_train.joblib'))

self.cost_forecast_loc_pred_ = load_predictor_prediction(
os.path.join(this_directory,
'saved_models/bps_prediction.joblib'))

self.model = SUOD(base_estimators=self.base_estimators, n_jobs=2,
rp_flag_global=True, bps_flag=True,
contamination=self.contamination,
approx_flag_global=True,
cost_forecast_loc_fit=self.cost_forecast_loc_fit_,
cost_forecast_loc_pred=self.cost_forecast_loc_pred_,
verbose=True)

def test_initialization(self):
Expand Down
16 changes: 0 additions & 16 deletions suod/test/test_model_save_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,6 @@
from joblib import dump, load


from suod.models.base import load_predictor_train
from suod.models.base import load_predictor_prediction


class TestModelSaveLoad(unittest.TestCase):
def setUp(self):
self.n_train = 1000
Expand All @@ -47,22 +43,10 @@ def setUp(self):
random_state=self.random_state)
]

this_directory = os.path.abspath(os.path.dirname(__file__))

self.cost_forecast_loc_fit_ = load_predictor_train(
os.path.join(this_directory,
'saved_models/bps_train.joblib'))

self.cost_forecast_loc_pred_ = load_predictor_prediction(
os.path.join(this_directory,
'saved_models/bps_prediction.joblib'))

self.model = SUOD(base_estimators=self.base_estimators, n_jobs=2,
rp_flag_global=True, bps_flag=True,
contamination=self.contamination,
approx_flag_global=True,
cost_forecast_loc_fit=self.cost_forecast_loc_fit_,
cost_forecast_loc_pred=self.cost_forecast_loc_pred_,
verbose=True)

def test_save(self):
Expand Down
2 changes: 1 addition & 1 deletion suod/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,4 @@
# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
#
__version__ = '0.1.1' # pragma: no cover
__version__ = '0.1.2' # pragma: no cover

0 comments on commit 93fa075

Please sign in to comment.