ml-finance-python

python scripts for finance machine learning
git clone https://9o.is/git/ml-finance-python.git
random_forest.py

(3683B)
      1 from __future__ import division, print_function
      2 import numpy as np
      3 import math
      4 import progressbar
      5 
      6 # Import helper functions
      7 from mlfromscratch.utils import divide_on_feature, train_test_split, get_random_subsets, normalize
      8 from mlfromscratch.utils import accuracy_score, calculate_entropy
      9 from mlfromscratch.unsupervised_learning import PCA
     10 from mlfromscratch.supervised_learning import ClassificationTree
     11 from mlfromscratch.utils.misc import bar_widgets
     12 from mlfromscratch.utils import Plot
     13 
     14 
     15 class RandomForest():
     16     """Random Forest classifier. Uses a collection of classification trees that
     17     trains on random subsets of the data using a random subsets of the features.
     18 
     19     Parameters:
     20     -----------
     21     n_estimators: int
     22         The number of classification trees that are used.
     23     max_features: int
     24         The maximum number of features that the classification trees are allowed to
     25         use.
     26     min_samples_split: int
     27         The minimum number of samples needed to make a split when building a tree.
     28     min_gain: float
     29         The minimum impurity required to split the tree further. 
     30     max_depth: int
     31         The maximum depth of a tree.
     32     """
     33     def __init__(self, n_estimators=100, max_features=None, min_samples_split=2,
     34                  min_gain=0, max_depth=float("inf")):
     35         self.n_estimators = n_estimators    # Number of trees
     36         self.max_features = max_features    # Maxmimum number of features per tree
     37         self.min_samples_split = min_samples_split
     38         self.min_gain = min_gain            # Minimum information gain req. to continue
     39         self.max_depth = max_depth          # Maximum depth for tree
     40         self.progressbar = progressbar.ProgressBar(widgets=bar_widgets)
     41 
     42         # Initialize decision trees
     43         self.trees = []
     44         for _ in range(n_estimators):
     45             self.trees.append(
     46                 ClassificationTree(
     47                     min_samples_split=self.min_samples_split,
     48                     min_impurity=min_gain,
     49                     max_depth=self.max_depth))
     50 
     51     def fit(self, X, y):
     52         n_features = np.shape(X)[1]
     53         # If max_features have not been defined => select it as
     54         # sqrt(n_features)
     55         if not self.max_features:
     56             self.max_features = int(math.sqrt(n_features))
     57 
     58         # Choose one random subset of the data for each tree
     59         subsets = get_random_subsets(X, y, self.n_estimators)
     60 
     61         for i in self.progressbar(range(self.n_estimators)):
     62             X_subset, y_subset = subsets[i]
     63             # Feature bagging (select random subsets of the features)
     64             idx = np.random.choice(range(n_features), size=self.max_features, replace=True)
     65             # Save the indices of the features for prediction
     66             self.trees[i].feature_indices = idx
     67             # Choose the features corresponding to the indices
     68             X_subset = X_subset[:, idx]
     69             # Fit the tree to the data
     70             self.trees[i].fit(X_subset, y_subset)
     71 
     72     def predict(self, X):
     73         y_preds = np.empty((X.shape[0], len(self.trees)))
     74         # Let each tree make a prediction on the data
     75         for i, tree in enumerate(self.trees):
     76             # Indices of the features that the tree has trained on
     77             idx = tree.feature_indices
     78             # Make a prediction based on those features
     79             prediction = tree.predict(X[:, idx])
     80             y_preds[:, i] = prediction
     81             
     82         y_pred = []
     83         # For each sample
     84         for sample_predictions in y_preds:
     85             # Select the most common class prediction
     86             y_pred.append(np.bincount(sample_predictions.astype('int')).argmax())
     87         return y_pred