ml-finance-python

Python scripts for machine learning in finance
git clone https://9o.is/git/ml-finance-python.git
adaboost.py

(5633B)
      1 from __future__ import division, print_function
      2 import numpy as np
      3 import math
      4 from sklearn import datasets
      5 import matplotlib.pyplot as plt
      6 import pandas as pd
      7 
      8 # Import helper functions
      9 from mlfromscratch.utils import train_test_split, accuracy_score, Plot
     10 
     11 # Decision stump used as weak classifier in this impl. of Adaboost
     12 class DecisionStump():
     13     def __init__(self):
     14         # Determines if sample shall be classified as -1 or 1 given threshold
     15         self.polarity = 1
     16         # The index of the feature used to make classification
     17         self.feature_index = None
     18         # The threshold value that the feature should be measured against
     19         self.threshold = None
     20         # Value indicative of the classifier's accuracy
     21         self.alpha = None
     22 
     23 class Adaboost():
     24     """Boosting method that uses a number of weak classifiers in 
     25     ensemble to make a strong classifier. This implementation uses decision
     26     stumps, which is a one level Decision Tree. 
     27 
     28     Parameters:
     29     -----------
     30     n_clf: int
     31         The number of weak classifiers that will be used. 
     32     """
     33     def __init__(self, n_clf=5):
     34         self.n_clf = n_clf
     35 
     36     def fit(self, X, y):
     37         n_samples, n_features = np.shape(X)
     38 
     39         # Initialize weights to 1/N
     40         w = np.full(n_samples, (1 / n_samples))
     41         
     42         self.clfs = []
     43         # Iterate through classifiers
     44         for _ in range(self.n_clf):
     45             clf = DecisionStump()
     46             # Minimum error given for using a certain feature value threshold
     47             # for predicting sample label
     48             min_error = float('inf')
     49             # Iterate throught every unique feature value and see what value
     50             # makes the best threshold for predicting y
     51             for feature_i in range(n_features):
     52                 feature_values = np.expand_dims(X[:, feature_i], axis=1)
     53                 unique_values = np.unique(feature_values)
     54                 # Try every unique feature value as threshold
     55                 for threshold in unique_values:
     56                     p = 1
     57                     # Set all predictions to '1' initially
     58                     prediction = np.ones(np.shape(y))
     59                     # Label the samples whose values are below threshold as '-1'
     60                     prediction[X[:, feature_i] < threshold] = -1
     61                     # Error = sum of weights of misclassified samples
     62                     error = sum(w[y != prediction])
     63                     
     64                     # If the error is over 50% we flip the polarity so that samples that
     65                     # were classified as 0 are classified as 1, and vice versa
     66                     # E.g error = 0.8 => (1 - error) = 0.2
     67                     if error > 0.5:
     68                         error = 1 - error
     69                         p = -1
     70 
     71                     # If this threshold resulted in the smallest error we save the
     72                     # configuration
     73                     if error < min_error:
     74                         clf.polarity = p
     75                         clf.threshold = threshold
     76                         clf.feature_index = feature_i
     77                         min_error = error
     78             # Calculate the alpha which is used to update the sample weights,
     79             # Alpha is also an approximation of this classifier's proficiency
     80             clf.alpha = 0.5 * math.log((1.0 - min_error) / (min_error + 1e-10))
     81             # Set all predictions to '1' initially
     82             predictions = np.ones(np.shape(y))
     83             # The indexes where the sample values are below threshold
     84             negative_idx = (clf.polarity * X[:, clf.feature_index] < clf.polarity * clf.threshold)
     85             # Label those as '-1'
     86             predictions[negative_idx] = -1
     87             # Calculate new weights 
     88             # Missclassified samples gets larger weights and correctly classified samples smaller
     89             w *= np.exp(-clf.alpha * y * predictions)
     90             # Normalize to one
     91             w /= np.sum(w)
     92 
     93             # Save classifier
     94             self.clfs.append(clf)
     95 
     96     def predict(self, X):
     97         n_samples = np.shape(X)[0]
     98         y_pred = np.zeros((n_samples, 1))
     99         # For each classifier => label the samples
    100         for clf in self.clfs:
    101             # Set all predictions to '1' initially
    102             predictions = np.ones(np.shape(y_pred))
    103             # The indexes where the sample values are below threshold
    104             negative_idx = (clf.polarity * X[:, clf.feature_index] < clf.polarity * clf.threshold)
    105             # Label those as '-1'
    106             predictions[negative_idx] = -1
    107             # Add predictions weighted by the classifiers alpha
    108             # (alpha indicative of classifier's proficiency)
    109             y_pred += clf.alpha * predictions
    110 
    111         # Return sign of prediction sum
    112         y_pred = np.sign(y_pred).flatten()
    113 
    114         return y_pred
    115 
    116 
    117 def main():
    118     data = datasets.load_digits()
    119     X = data.data
    120     y = data.target
    121 
    122     digit1 = 1
    123     digit2 = 8
    124     idx = np.append(np.where(y == digit1)[0], np.where(y == digit2)[0])
    125     y = data.target[idx]
    126     # Change labels to {-1, 1}
    127     y[y == digit1] = -1
    128     y[y == digit2] = 1
    129     X = data.data[idx]
    130 
    131     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
    132 
    133     # Adaboost classification with 5 weak classifiers
    134     clf = Adaboost(n_clf=5)
    135     clf.fit(X_train, y_train)
    136     y_pred = clf.predict(X_test)
    137 
    138     accuracy = accuracy_score(y_test, y_pred)
    139     print ("Accuracy:", accuracy)
    140 
    141     # Reduce dimensions to 2d using pca and plot the results
    142     Plot().plot_in_2d(X_test, y_pred, title="Adaboost", accuracy=accuracy)
    143 
    144 
# Run the demo only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()