ml-finance-python

Python scripts for machine learning in finance

git clone https://9o.is/git/ml-finance-python.git

xgboost.py

(3726B)


      1 from __future__ import division, print_function
      2 import numpy as np
      3 import progressbar
      4 
      5 from mlfromscratch.utils import train_test_split, standardize, to_categorical, normalize
      6 from mlfromscratch.utils import mean_squared_error, accuracy_score
      7 from mlfromscratch.supervised_learning import XGBoostRegressionTree
      8 from mlfromscratch.deep_learning.activation_functions import Sigmoid
      9 from mlfromscratch.utils.misc import bar_widgets
     10 from mlfromscratch.utils import Plot
     11 
     12 
     13 class LogisticLoss():
     14     def __init__(self):
     15         sigmoid = Sigmoid()
     16         self.log_func = sigmoid
     17         self.log_grad = sigmoid.gradient
     18 
     19     def loss(self, y, y_pred):
     20         y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
     21         p = self.log_func(y_pred)
     22         return y * np.log(p) + (1 - y) * np.log(1 - p)
     23 
     24     # gradient w.r.t y_pred
     25     def gradient(self, y, y_pred):
     26         p = self.log_func(y_pred)
     27         return -(y - p)
     28 
     29     # w.r.t y_pred
     30     def hess(self, y, y_pred):
     31         p = self.log_func(y_pred)
     32         return p * (1 - p)
     33 
     34 
     35 class XGBoost(object):
     36     """The XGBoost classifier.
     37 
     38     Reference: http://xgboost.readthedocs.io/en/latest/model.html
     39 
     40     Parameters:
     41     -----------
     42     n_estimators: int
     43         The number of classification trees that are used.
     44     learning_rate: float
     45         The step length that will be taken when following the negative gradient during
     46         training.
     47     min_samples_split: int
     48         The minimum number of samples needed to make a split when building a tree.
     49     min_impurity: float
     50         The minimum impurity required to split the tree further. 
     51     max_depth: int
     52         The maximum depth of a tree.
     53     """
     54     def __init__(self, n_estimators=200, learning_rate=0.001, min_samples_split=2,
     55                  min_impurity=1e-7, max_depth=2):
     56         self.n_estimators = n_estimators            # Number of trees
     57         self.learning_rate = learning_rate          # Step size for weight update
     58         self.min_samples_split = min_samples_split  # The minimum n of sampels to justify split
     59         self.min_impurity = min_impurity              # Minimum variance reduction to continue
     60         self.max_depth = max_depth                  # Maximum depth for tree
     61 
     62         self.bar = progressbar.ProgressBar(widgets=bar_widgets)
     63         
     64         # Log loss for classification
     65         self.loss = LogisticLoss()
     66 
     67         # Initialize regression trees
     68         self.trees = []
     69         for _ in range(n_estimators):
     70             tree = XGBoostRegressionTree(
     71                     min_samples_split=self.min_samples_split,
     72                     min_impurity=min_impurity,
     73                     max_depth=self.max_depth,
     74                     loss=self.loss)
     75 
     76             self.trees.append(tree)
     77 
     78     def fit(self, X, y):
     79         y = to_categorical(y)
     80 
     81         y_pred = np.zeros(np.shape(y))
     82         for i in self.bar(range(self.n_estimators)):
     83             tree = self.trees[i]
     84             y_and_pred = np.concatenate((y, y_pred), axis=1)
     85             tree.fit(X, y_and_pred)
     86             update_pred = tree.predict(X)
     87 
     88             y_pred -= np.multiply(self.learning_rate, update_pred)
     89 
     90     def predict(self, X):
     91         y_pred = None
     92         # Make predictions
     93         for tree in self.trees:
     94             # Estimate gradient and update prediction
     95             update_pred = tree.predict(X)
     96             if y_pred is None:
     97                 y_pred = np.zeros_like(update_pred)
     98             y_pred -= np.multiply(self.learning_rate, update_pred)
     99 
    100         # Turn into probability distribution (Softmax)
    101         y_pred = np.exp(y_pred) / np.sum(np.exp(y_pred), axis=1, keepdims=True)
    102         # Set label to the value that maximizes probability
    103         y_pred = np.argmax(y_pred, axis=1)
    104         return y_pred