ml-finance-python
Python scripts for machine learning in finance
git clone https://9o.is/git/ml-finance-python.git
adaboost.py
(5633B)
1 from __future__ import division, print_function
2 import numpy as np
3 import math
4 from sklearn import datasets
5 import matplotlib.pyplot as plt
6 import pandas as pd
7
8 # Import helper functions
9 from mlfromscratch.utils import train_test_split, accuracy_score, Plot
10
11 # Decision stump used as weak classifier in this impl. of Adaboost
class DecisionStump:
    """Container for the parameters of one weak classifier used by Adaboost.

    A freshly created stump is untrained: only the polarity has a usable
    default, the remaining attributes stay None until a training routine
    assigns them.
    """

    def __init__(self):
        # Which feature (column index) the stump looks at, and the value
        # that feature is compared against
        self.feature_index = None
        self.threshold = None
        # Weight of this stump in the ensemble; indicative of its accuracy
        self.alpha = None
        # Decides whether a sample is labelled -1 or 1 given the threshold
        self.polarity = 1
22
class Adaboost():
    """Boosting method that uses a number of weak classifiers in
    ensemble to make a strong classifier. This implementation uses decision
    stumps, which is a one level Decision Tree.

    Labels are expected to be encoded as -1 and 1.

    Parameters:
    -----------
    n_clf: int
        The number of weak classifiers that will be used.
    """
    def __init__(self, n_clf=5):
        self.n_clf = n_clf

    @staticmethod
    def _stump_predict(X, feature_index, threshold, polarity):
        """Return the -1/1 label a stump assigns to every row of X.

        With polarity 1, samples whose feature value is strictly below the
        threshold are labelled -1 and all others 1. Polarity -1 yields the
        exact negation of those labels, so a sample lying on the threshold
        is treated the same way during the threshold search in fit(), the
        weight update and predict().
        """
        labels = np.where(X[:, feature_index] < threshold, -1.0, 1.0)
        return polarity * labels

    def fit(self, X, y):
        """Train n_clf decision stumps on the samples X with labels y.

        Parameters:
        -----------
        X: array-like of shape (n_samples, n_features)
            Training samples.
        y: array-like of shape (n_samples,)
            Labels, encoded as -1 and 1.

        Raises:
        -------
        ValueError
            If X has no samples or no features.
        """
        X = np.asarray(X)
        y = np.asarray(y).ravel()
        n_samples, n_features = np.shape(X)
        if n_samples == 0 or n_features == 0:
            raise ValueError("X must contain at least one sample and one feature")

        # Initialize weights to 1/N
        w = np.full(n_samples, 1.0 / n_samples)

        self.clfs = []
        # Iterate through classifiers
        for _ in range(self.n_clf):
            clf = DecisionStump()
            # Minimum weighted error found so far for any
            # (feature, threshold, polarity) combination
            min_error = float('inf')
            # Iterate through every unique feature value and see what value
            # makes the best threshold for predicting y
            for feature_i in range(n_features):
                for threshold in np.unique(X[:, feature_i]):
                    p = 1
                    prediction = self._stump_predict(X, feature_i, threshold, p)
                    # Error = sum of weights of misclassified samples
                    error = np.sum(w[y != prediction])

                    # If the error is over 50% we flip the polarity so that
                    # samples that were classified as -1 are classified as 1,
                    # and vice versa. The weights sum to one, so the flipped
                    # stump has error (1 - error).
                    # E.g error = 0.8 => (1 - error) = 0.2
                    if error > 0.5:
                        error = 1 - error
                        p = -1

                    # If this threshold resulted in the smallest error we
                    # save the configuration
                    if error < min_error:
                        clf.polarity = p
                        clf.threshold = threshold
                        clf.feature_index = feature_i
                        min_error = error

            # Calculate the alpha which is used to update the sample weights.
            # Alpha is also an approximation of this classifier's proficiency.
            # The small constant avoids a division by zero for a perfect stump.
            clf.alpha = 0.5 * math.log((1.0 - min_error) / (min_error + 1e-10))
            predictions = self._stump_predict(
                X, clf.feature_index, clf.threshold, clf.polarity)
            # Calculate new weights: misclassified samples get larger
            # weights and correctly classified samples smaller ones
            w *= np.exp(-clf.alpha * y * predictions)
            # Normalize to one
            w /= np.sum(w)

            # Save classifier
            self.clfs.append(clf)

    def predict(self, X):
        """Return the predicted label for every row of X.

        Each fitted stump votes -1 or 1, weighted by its alpha; the result
        is the sign of the weighted sum, returned as a 1-D float array
        (an exact tie yields 0).
        """
        X = np.asarray(X)
        y_pred = np.zeros(np.shape(X)[0])
        # Add every stump's prediction weighted by its alpha
        # (alpha indicative of classifier's proficiency)
        for clf in self.clfs:
            y_pred += clf.alpha * self._stump_predict(
                X, clf.feature_index, clf.threshold, clf.polarity)

        # Return sign of prediction sum
        return np.sign(y_pred)
115
116
def main():
    """Train Adaboost to separate two digit classes and report the accuracy.

    Loads the sklearn digits data, keeps only the samples of two digits,
    relabels them to -1 / 1, fits on half of the data, prints the accuracy
    on the other half and plots the predictions.
    """
    digits = datasets.load_digits()

    # The two digit classes to tell apart
    digit1 = 1
    digit2 = 8

    # Indices of all samples of either digit (digit1 samples come first)
    digit1_rows = np.where(digits.target == digit1)[0]
    digit2_rows = np.where(digits.target == digit2)[0]
    idx = np.append(digit1_rows, digit2_rows)

    X = digits.data[idx]
    y = digits.target[idx]
    # Change labels to {-1, 1}
    y[y == digit1] = -1
    y[y == digit2] = 1

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

    # Adaboost classification with 5 weak classifiers
    model = Adaboost(n_clf=5)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    # Plot the test predictions in 2d (the helper is expected to reduce
    # the dimensions, e.g. using pca)
    Plot().plot_in_2d(X_test, y_pred, title="Adaboost", accuracy=accuracy)


# Run the demo only when executed as a script
if __name__ == "__main__":
    main()