-
Notifications
You must be signed in to change notification settings - Fork 8
/
models.py
92 lines (81 loc) · 3.53 KB
/
models.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# -*- coding: utf-8 -*-
"""
This file lists all the 10 models used on the dataset
Outputs the result and report into all_models_report.txt file
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#scikit imports
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.metrics import classification_report, accuracy_score
#classifiers
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process.kernels import Matern
from sklearn.ensemble import VotingClassifier
#Transformation
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")
# Reset the report file so each run starts from an empty report.
with open('all_models_report.txt', 'w'):
    pass

# Load the training dataset; the 80/20 split below provides the hold-out set.
train_Data = pd.read_csv('data/training_new_data.csv')

# Predictor columns: baseline viral load / CD4 count, sequence length, and
# nucleotide-composition features for the PR and RT regions.
featureSet = ["VL.t0", "CD4.t0", "rtlength", "pr_A", "pr_C", "pr_G",
              "pr_R", "pr_T", "pr_Y", "PR_GC", "RT_A", "RT_C", "RT_G",
              "RT_R", "RT_T", "RT_Y", "RT_GC"]
X = train_Data[featureSet]
y = train_Data.Resp

# Hold out 20% of rows for final evaluation; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise features to mean 0 / SD 1. The scaler is fitted on the training
# partition only, then applied unchanged to the test partition.
standard_scaler = StandardScaler()
X_train = standard_scaler.fit_transform(X_train)
X_test = standard_scaler.transform(X_test)

# Cross-validation scoring metric used for every model below.
scoring = 'accuracy'
# Define models: display names, aligned index-for-index with `classifiers` below.
names = [" Random Forest","Neural Net", "AdaBoost","XGBoost", "Logistic Regression ",
"Support Vector Machine", "K Nearest Neighbours", "Linear Discriminant Analysis",
"Gaussian Process", "Gaussian Naive Bayes"]

# The ten candidate classifiers, in the same order as `names`.
classifiers = [
    # max_features='sqrt' is what the old 'auto' option meant for classifiers;
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3, so it would
    # raise a TypeError/InvalidParameterError on current versions.
    RandomForestClassifier(bootstrap=True, max_depth=10, n_estimators=550, criterion="entropy",
                           max_features='sqrt', class_weight="balanced", n_jobs=5),
    MLPClassifier(alpha=1, batch_size=30),
    AdaBoostClassifier(),
    XGBClassifier(),
    LogisticRegressionCV(verbose=5, solver='lbfgs'),
    SVC(gamma='scale', kernel='poly', degree=3, class_weight="balanced"),
    KNeighborsClassifier(n_neighbors=3, p=2, n_jobs=10),
    LinearDiscriminantAnalysis(),
    # Gaussian process with a Matern kernel (nu=1.5) and bounded length scale.
    GaussianProcessClassifier(kernel=1.0 * Matern(length_scale=1.0,
                                                  length_scale_bounds=(1e-1, 10.0), nu=1.5)),
    GaussianNB()
]
seed = 1
# Pair each display name with its classifier. NOTE: `zip` captures the current
# list objects, so rebinding `names` to a fresh list below does not break this
# iterator — but it is fragile; the rebound `names` just re-collects the same
# strings in evaluation order.
models = zip(names, classifiers)

# Evaluate each model: 10-fold CV accuracy on the training partition, then a
# single fit/predict on the held-out test partition, appended to the report file.
results = []
names = []
for name, model in models:
    # shuffle=True is required for random_state to take effect: scikit-learn
    # >= 0.24 raises ValueError for KFold(random_state=...) with shuffle=False.
    kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    # Coerce any float outputs to hard 0/1 labels before scoring.
    predictions = [round(value) for value in predictions]
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    # Append this model's CV summary and test-set report to the shared file.
    with open('all_models_report.txt', 'a') as f:
        print(msg, file=f)
        print('--------------------------------------------------', file=f)
        print(accuracy_score(y_test, predictions), file=f)
        print(classification_report(y_test, predictions), file=f)
        print('--------------------------------------------------', file=f)