ml-finance-python
Python scripts for machine learning in finance
git clone https://9o.is/git/ml-finance-python.git
gradient_boosting.py
(4281B)
1 from __future__ import division, print_function
2 import numpy as np
3 import progressbar
4
5 # Import helper functions
6 from mlfromscratch.utils import train_test_split, standardize, to_categorical
7 from mlfromscratch.utils import mean_squared_error, accuracy_score
8 from mlfromscratch.deep_learning.loss_functions import SquareLoss, CrossEntropy
9 from mlfromscratch.supervised_learning.decision_tree import RegressionTree
10 from mlfromscratch.utils.misc import bar_widgets
11
12
class GradientBoosting(object):
    """Super class of GradientBoostingClassifier and GradientBoostingRegressor.

    Uses a collection of regression trees, each trained to predict the
    gradient of the loss function with respect to the current prediction.

    Parameters:
    -----------
    n_estimators: int
        The number of regression trees that are used.
    learning_rate: float
        The step length that will be taken when following the negative
        gradient during training.
    min_samples_split: int
        The minimum number of samples needed to make a split when building a
        tree.
    min_impurity: float
        The minimum impurity required to split the tree further.
    max_depth: int
        The maximum depth of a tree.
    regression: boolean
        True or false depending on if we're doing regression or
        classification.
    """
    def __init__(self, n_estimators, learning_rate, min_samples_split,
                 min_impurity, max_depth, regression):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.min_samples_split = min_samples_split
        self.min_impurity = min_impurity
        self.max_depth = max_depth
        self.regression = regression
        self.bar = progressbar.ProgressBar(widgets=bar_widgets)

        # Square loss for regression, cross entropy for classification.
        self.loss = SquareLoss() if self.regression else CrossEntropy()

        # Constant model the boosting starts from; set by fit().
        self.initial_prediction = None

        # Initialize regression trees
        self.trees = [
            RegressionTree(
                min_samples_split=self.min_samples_split,
                min_impurity=self.min_impurity,
                max_depth=self.max_depth)
            for _ in range(n_estimators)
        ]

    def fit(self, X, y):
        """Fit the trees sequentially on the gradient of the loss.

        The running prediction starts at the mean of y (taken over axis 0).
        Each tree is fitted to the loss gradient at the current prediction,
        and the prediction is then moved against that tree's output, scaled
        by the learning rate.
        """
        # Remember the starting point: predict() has to begin from the same
        # constant, otherwise its output is shifted relative to training.
        self.initial_prediction = np.mean(y, axis=0)
        y_pred = np.full(np.shape(y), self.initial_prediction)
        for i in self.bar(range(self.n_estimators)):
            gradient = self.loss.gradient(y, y_pred)
            self.trees[i].fit(X, gradient)
            update = self.trees[i].predict(X)
            # Step along the negative gradient
            y_pred -= np.multiply(self.learning_rate, update)

    def predict(self, X):
        """Predict for the samples in X.

        Replays the training recursion: the stored starting constant minus
        the learning-rate-scaled output of every tree. For regression the
        accumulated values are returned. For classification the index of the
        largest accumulated score in each row is returned.
        """
        baseline = getattr(self, "initial_prediction", None)
        # An instance without a stored baseline (e.g. one restored from an
        # older pickle) falls back to the previous zero starting point.
        y_pred = np.asarray(0.0 if baseline is None else baseline, dtype=float)

        for tree in self.trees:
            # Broadcasting expands the constant baseline over the samples.
            y_pred = y_pred - np.multiply(self.learning_rate, tree.predict(X))

        if not self.trees:
            # No trees: the model is just the constant, one row per sample.
            y_pred = np.full((np.shape(X)[0],) + y_pred.shape, y_pred)

        if not self.regression:
            # Softmax is strictly increasing, so the most probable class is
            # the one with the largest raw score. Taking argmax directly
            # avoids exp() overflowing to inf/inf = nan for large scores.
            y_pred = np.argmax(y_pred, axis=1)
        return y_pred
84
85
class GradientBoostingRegressor(GradientBoosting):
    """Gradient boosting configured for regression.

    Forwards its settings to GradientBoosting with ``regression=True``;
    ``min_var_red`` is passed on as the trees' ``min_impurity``. The ``debug``
    argument is accepted but not used by this constructor.
    """
    def __init__(self, n_estimators=200, learning_rate=0.5, min_samples_split=2,
                 min_var_red=1e-7, max_depth=4, debug=False):
        base_settings = dict(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            min_samples_split=min_samples_split,
            min_impurity=min_var_red,
            max_depth=max_depth,
            regression=True,
        )
        super(GradientBoostingRegressor, self).__init__(**base_settings)
95
class GradientBoostingClassifier(GradientBoosting):
    """Gradient boosting configured for classification.

    Forwards its settings to GradientBoosting with ``regression=False``;
    ``min_info_gain`` is passed on as the trees' ``min_impurity``. The
    ``debug`` argument is accepted but not used by this constructor.
    """
    def __init__(self, n_estimators=200, learning_rate=.5, min_samples_split=2,
                 min_info_gain=1e-7, max_depth=2, debug=False):
        base_settings = dict(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            min_samples_split=min_samples_split,
            min_impurity=min_info_gain,
            max_depth=max_depth,
            regression=False,
        )
        super(GradientBoostingClassifier, self).__init__(**base_settings)

    def fit(self, X, y):
        """Encode the labels with to_categorical, then run the base-class fit."""
        # NOTE(review): to_categorical presumably one-hot encodes integer
        # labels -- confirm against mlfromscratch.utils.
        encoded_labels = to_categorical(y)
        super(GradientBoostingClassifier, self).fit(X, encoded_labels)
109