ml-finance-python
Python scripts for machine learning in finance
git clone https://9o.is/git/ml-finance-python.git
random_forest.py
(3683B)
1 from __future__ import division, print_function
2 import numpy as np
3 import math
4 import progressbar
5
6 # Import helper functions
7 from mlfromscratch.utils import divide_on_feature, train_test_split, get_random_subsets, normalize
8 from mlfromscratch.utils import accuracy_score, calculate_entropy
9 from mlfromscratch.unsupervised_learning import PCA
10 from mlfromscratch.supervised_learning import ClassificationTree
11 from mlfromscratch.utils.misc import bar_widgets
12 from mlfromscratch.utils import Plot
13
14
class RandomForest():
    """Random Forest classifier.

    Uses a collection of classification trees, each trained on a random
    subset of the data using a random subset of the features. The predicted
    class of a sample is the majority vote of the individual trees.

    Parameters:
    -----------
    n_estimators: int
        The number of classification trees that are used.
    max_features: int
        The maximum number of features that the classification trees are
        allowed to use. If falsy (None or 0), it is set to
        int(sqrt(n_features)) the first time fit() is called.
    min_samples_split: int
        The minimum number of samples needed to make a split when building a
        tree.
    min_gain: float
        The minimum impurity required to split the tree further (forwarded to
        each tree as `min_impurity`).
    max_depth: int
        The maximum depth of a tree.
    """
    def __init__(self, n_estimators=100, max_features=None, min_samples_split=2,
                 min_gain=0, max_depth=float("inf")):
        self.n_estimators = n_estimators    # Number of trees
        self.max_features = max_features    # Maximum number of features per tree
        self.min_samples_split = min_samples_split
        self.min_gain = min_gain            # Minimum information gain req. to continue
        self.max_depth = max_depth          # Maximum depth for tree
        self.progressbar = progressbar.ProgressBar(widgets=bar_widgets)

        # Initialize decision trees (they are fitted later, in fit())
        self.trees = [
            ClassificationTree(
                min_samples_split=self.min_samples_split,
                min_impurity=min_gain,
                max_depth=self.max_depth)
            for _ in range(n_estimators)
        ]

    def fit(self, X, y):
        """Fit every tree on its own data subset and random feature subset.

        Parameters:
        -----------
        X: array-like of shape (n_samples, n_features)
            Training samples.
        y: array-like
            Class labels for the samples in X.

        Each tree gets a `feature_indices` attribute holding the column
        indices it was trained on, so predict() can select the same columns.
        """
        n_features = np.shape(X)[1]
        # If max_features have not been defined => select it as
        # sqrt(n_features)
        if not self.max_features:
            self.max_features = int(math.sqrt(n_features))
        # A tree cannot use more distinct features than the data provides
        n_selected = min(self.max_features, n_features)

        # Choose one random subset of the data for each tree
        # (subsets[i] is unpacked below as an (X_subset, y_subset) pair)
        subsets = get_random_subsets(X, y, self.n_estimators)

        for i in self.progressbar(range(self.n_estimators)):
            X_subset, y_subset = subsets[i]
            # Feature bagging: draw distinct feature indices. Sampling
            # without replacement avoids handing a tree duplicated columns,
            # which would silently lower the number of features it sees.
            idx = np.random.choice(n_features, size=n_selected, replace=False)
            # Save the indices of the features for prediction
            self.trees[i].feature_indices = idx
            # Fit the tree to the selected features of its data subset
            self.trees[i].fit(X_subset[:, idx], y_subset)

    def predict(self, X):
        """Predict a class label for every row of X by majority vote.

        Parameters:
        -----------
        X: array-like of shape (n_samples, n_features)
            Samples to classify.

        Returns:
        --------
        list
            One integer label per sample. Ties are resolved in favour of the
            smallest label. Tree predictions are cast to int before voting.
        """
        X = np.asarray(X)
        y_preds = np.empty((X.shape[0], len(self.trees)))
        # Let each tree make a prediction on the data
        for i, tree in enumerate(self.trees):
            # Indices of the features that the tree has trained on
            idx = tree.feature_indices
            # Make a prediction based on those features
            y_preds[:, i] = tree.predict(X[:, idx])

        y_pred = []
        # For each sample
        for sample_predictions in y_preds:
            # Select the most common class prediction. np.unique (unlike
            # np.bincount) also accepts negative labels such as -1/+1; its
            # sorted output keeps the smallest-label tie-break.
            labels, counts = np.unique(sample_predictions.astype('int'),
                                       return_counts=True)
            y_pred.append(labels[counts.argmax()])
        return y_pred