ml-finance-python
Python scripts for machine learning in finance
git clone https://9o.is/git/ml-finance-python.git
xgboost.py
(3726B)
1 from __future__ import division, print_function
2 import numpy as np
3 import progressbar
4
5 from mlfromscratch.utils import train_test_split, standardize, to_categorical, normalize
6 from mlfromscratch.utils import mean_squared_error, accuracy_score
7 from mlfromscratch.supervised_learning import XGBoostRegressionTree
8 from mlfromscratch.deep_learning.activation_functions import Sigmoid
9 from mlfromscratch.utils.misc import bar_widgets
10 from mlfromscratch.utils import Plot
11
12
class LogisticLoss():
    """Logistic (binary cross-entropy) loss evaluated on raw scores.

    Every ``y_pred`` argument is a raw, pre-activation score. Each method
    first maps it through ``self.log_func`` (a ``Sigmoid`` instance; assumed
    to be the logistic function 1 / (1 + exp(-x)) -- confirm against
    mlfromscratch) to obtain a probability ``p``.
    """

    def __init__(self):
        sigmoid = Sigmoid()
        self.log_func = sigmoid
        self.log_grad = sigmoid.gradient

    def loss(self, y, y_pred):
        """Return the element-wise negative log-likelihood of ``y``.

        Parameters:
        -----------
        y: array-like
            Target values (0/1 indicators).
        y_pred: array-like
            Raw scores, same shape as ``y``.

        Returns:
        --------
        -(y * log(p) + (1 - y) * log(1 - p)), where p = log_func(y_pred).
        """
        p = self.log_func(y_pred)
        # Clip the probability, not the raw score: the guard exists only to
        # keep both log() calls away from 0. Clipping the score itself would
        # squash every prediction into a narrow band around p = 0.5..0.73.
        p = np.clip(p, 1e-15, 1 - 1e-15)
        # Negated so that `gradient` below is the derivative of this value.
        return -(y * np.log(p) + (1 - y) * np.log(1 - p))

    # gradient w.r.t y_pred
    def gradient(self, y, y_pred):
        """Return the first derivative of the loss w.r.t. the raw score."""
        p = self.log_func(y_pred)
        return -(y - p)

    # w.r.t y_pred
    def hess(self, y, y_pred):
        """Return the second derivative of the loss w.r.t. the raw score.

        ``y`` is unused; it is accepted so the signature mirrors `gradient`.
        """
        p = self.log_func(y_pred)
        return p * (1 - p)
33
34
class XGBoost(object):
    """The XGBoost classifier.

    Reference: http://xgboost.readthedocs.io/en/latest/model.html

    Parameters:
    -----------
    n_estimators: int
        The number of classification trees that are used.
    learning_rate: float
        The step length that will be taken when following the negative gradient during
        training.
    min_samples_split: int
        The minimum number of samples needed to make a split when building a tree.
    min_impurity: float
        The minimum impurity required to split the tree further.
    max_depth: int
        The maximum depth of a tree.
    """
    def __init__(self, n_estimators=200, learning_rate=0.001, min_samples_split=2,
                 min_impurity=1e-7, max_depth=2):
        self.n_estimators = n_estimators            # Number of trees
        self.learning_rate = learning_rate          # Step size for weight update
        self.min_samples_split = min_samples_split  # The minimum n of samples to justify split
        self.min_impurity = min_impurity            # Minimum variance reduction to continue
        self.max_depth = max_depth                  # Maximum depth for tree

        self.bar = progressbar.ProgressBar(widgets=bar_widgets)

        # Log loss for classification
        self.loss = LogisticLoss()

        # Initialize regression trees (one per boosting round, all sharing
        # the same hyper-parameters and loss object)
        self.trees = []
        for _ in range(n_estimators):
            tree = XGBoostRegressionTree(
                min_samples_split=self.min_samples_split,
                min_impurity=self.min_impurity,
                max_depth=self.max_depth,
                loss=self.loss)

            self.trees.append(tree)

    def fit(self, X, y):
        """Fit the trees sequentially on X and the labels y.

        ``y`` is passed through ``to_categorical`` (presumably one-hot
        encoding -- confirm against mlfromscratch). Each tree is fitted on X
        with the encoded targets and the current scores concatenated along
        axis 1, and its prediction scaled by ``learning_rate`` is subtracted
        from the running scores.
        """
        y = to_categorical(y)

        y_pred = np.zeros(np.shape(y))
        for i in self.bar(range(self.n_estimators)):
            tree = self.trees[i]
            # The tree receives targets and current scores side by side in a
            # single array.
            y_and_pred = np.concatenate((y, y_pred), axis=1)
            tree.fit(X, y_and_pred)
            update_pred = tree.predict(X)

            y_pred -= np.multiply(self.learning_rate, update_pred)

    def predict(self, X):
        """Return the predicted class index for every row of X.

        The scores are accumulated over all trees exactly as in `fit`,
        turned into a per-row probability distribution with a softmax, and
        the index of the largest probability is returned.

        Raises:
        -------
        ValueError
            If the ensemble contains no trees, since the number of classes
            cannot be determined.
        """
        if not self.trees:
            raise ValueError("Cannot predict with an ensemble of zero trees.")

        y_pred = None
        # Make predictions
        for tree in self.trees:
            # Estimate gradient and update prediction
            update_pred = tree.predict(X)
            if y_pred is None:
                # Float accumulator, so the in-place update below is valid
                # even when a tree returns integer values.
                y_pred = np.zeros(np.shape(update_pred), dtype=float)
            y_pred -= np.multiply(self.learning_rate, update_pred)

        # Turn into probability distribution (Softmax). Subtracting the row
        # maximum leaves the result unchanged mathematically but keeps exp()
        # from overflowing to inf (inf / inf = nan would corrupt argmax).
        shifted = y_pred - np.max(y_pred, axis=1, keepdims=True)
        exp_scores = np.exp(shifted)
        y_pred = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
        # Set label to the value that maximizes probability
        y_pred = np.argmax(y_pred, axis=1)
        return y_pred