# adaboost.py (forked from jjfiv/cs451-practicals)
import random
import math
import typing as T
from dataclasses import dataclass, field

import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.utils import resample

# start off by seeding random number generators:
RANDOM_SEED = 12345
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# import data; choose feature space
from dataset_poetry import y_train, Xd_train, y_vali, Xd_vali

X_train = Xd_train["numeric"]
X_vali = Xd_vali["numeric"]
@dataclass
class WeightedEnsemble(ClassifierMixin):
    """A weighted ensemble is a list of (weight, classifier) tuples."""

    members: T.List[T.Tuple[float, T.Any]] = field(default_factory=list)

    def predict_one(self, x: np.ndarray) -> bool:
        # Sum signed, weighted votes from every member classifier:
        vote_sum = 0.0
        for weight, clf in self.members:
            y = clf.predict([x])[0]
            if y:
                vote_sum += weight
            else:
                vote_sum -= weight
        return vote_sum > 0

    def predict(self, X: np.ndarray) -> np.ndarray:
        (N, D) = X.shape
        class_votes = np.zeros(N)
        for weight, clf in self.members:
            ys = clf.predict(X)
            for i, y in enumerate(ys):
                if y:
                    class_votes[i] += weight
                else:
                    class_votes[i] -= weight
        return class_votes > 0
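
# A quick, illustrative sanity check of the weighted-vote logic above.
# (Not part of the original practical; the DummyClassifier stubs and the
# 2.0 / 1.0 weights are arbitrary choices for demonstration.)
from sklearn.dummy import DummyClassifier

_always_true = DummyClassifier(strategy="constant", constant=True).fit([[0.0]], [True])
_always_false = DummyClassifier(strategy="constant", constant=False).fit([[0.0]], [False])
_demo = WeightedEnsemble(members=[(2.0, _always_true), (1.0, _always_false)])
assert _demo.predict_one(np.array([0.0]))  # the +2.0 vote outweighs the -1.0 vote
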
def adaboost(
    X: np.ndarray,
    y: np.ndarray,
    k: int,
    learning_rate: float = 0.5,
    make_weak_learner=lambda: DecisionTreeClassifier(max_depth=4),
) -> WeightedEnsemble:
    (N, D) = X.shape
    importances = np.ones(N) / N  # uniform init
    output = WeightedEnsemble()

    # Train up to k classifiers:
    for _ in range(k):
        # Train a weak learner on the current sample weights:
        m = make_weak_learner()
        m.fit(X, y, sample_weight=importances)

        # Assess how it did:
        y_pred = m.predict(X)
        assert y_pred.shape == (N,)
        mistakes = np.asarray(y_pred != y, dtype=int)
        error = np.sum(mistakes * importances)

        # Stop if classification is perfect:
        if error <= 0:
            break

        # Compute the importance of this learner: alpha = lr * log((1 - error) / error)
        alpha = learning_rate * (math.log(1.0 - error) - math.log(error))
        output.members.append((alpha, m))

        # Update importances: up-weight the mistakes, down-weight correct predictions.
        signs = np.where(mistakes == 1, 1.0, -1.0)
        importances *= np.exp(alpha * signs)

        # Re-normalize so importances stay a probability distribution:
        sample_weight_sum = np.sum(importances)
        if sample_weight_sum <= 0:
            break
        importances /= sample_weight_sum

    return output
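
# Worked example of the learner-weight formula above (my own numbers, not from
# the original practical), with learning_rate = 1.0:
#   weighted error 0.1 -> alpha = log(0.9 / 0.1) = log(9) ~  2.20  (strong vote)
#   weighted error 0.5 -> alpha = log(1.0)       =  0            (useless learner)
#   weighted error 0.9 -> alpha = log(1 / 9)     ~ -2.20         (vote is inverted)
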
def make_dtree():
    return DecisionTreeClassifier(
        max_depth=4  # type:ignore
    )


def make_linear():
    return LogisticRegression()

#%% Train up AdaBoost models:
m = adaboost(X_train, y_train, 100, learning_rate=1.0, make_weak_learner=make_dtree)
print("Adaboost[DT].score = {:.3}".format(m.score(X_vali, y_vali)))
m = adaboost(X_train, y_train, 200, make_weak_learner=make_linear)
print("Adaboost[LR].score = {:.3}".format(m.score(X_vali, y_vali)))
# Don't make your own at home.
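# (Note: scikit-learn 1.2+ renames this argument from `base_estimator` to `estimator`.)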
msk = AdaBoostClassifier(base_estimator=make_dtree(), n_estimators=100)
msk.fit(X_train, y_train)
print("Adaboost[sk].score = {:.3}".format(msk.score(X_vali, y_vali)))
# Generalization of Adaboost:
gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
print("GBC[sk].score = {:.3}".format(gbc.score(X_vali, y_vali)))