ml-finance-python

Python scripts for machine learning in finance

git clone https://9o.is/git/ml-finance-python.git

naive_bayes.py

(3363B)


      1 from __future__ import division, print_function
      2 import numpy as np
      3 import math
      4 from mlfromscratch.utils import train_test_split, normalize
      5 from mlfromscratch.utils import Plot, accuracy_score
      6 
      7 class NaiveBayes():
      8     """The Gaussian Naive Bayes classifier. """
      9     def fit(self, X, y):
     10         self.X, self.y = X, y
     11         self.classes = np.unique(y)
     12         self.parameters = []
     13         # Calculate the mean and variance of each feature for each class
     14         for i, c in enumerate(self.classes):
     15             # Only select the rows where the label equals the given class
     16             X_where_c = X[np.where(y == c)]
     17             self.parameters.append([])
     18             # Add the mean and variance for each feature (column)
     19             for col in X_where_c.T:
     20                 parameters = {"mean": col.mean(), "var": col.var()}
     21                 self.parameters[i].append(parameters)
     22 
     23     def _calculate_likelihood(self, mean, var, x):
     24         """ Gaussian likelihood of the data x given mean and var """
     25         eps = 1e-4 # Added in denominator to prevent division by zero
     26         coeff = 1.0 / math.sqrt(2.0 * math.pi * var + eps)
     27         exponent = math.exp(-(math.pow(x - mean, 2) / (2 * var + eps)))
     28         return coeff * exponent
     29 
     30     def _calculate_prior(self, c):
     31         """ Calculate the prior of class c
     32         (samples where class == c / total number of samples)"""
     33         frequency = np.mean(self.y == c)
     34         return frequency
     35 
     36     def _classify(self, sample):
     37         """ Classification using Bayes Rule P(Y|X) = P(X|Y)*P(Y)/P(X),
     38             or Posterior = Likelihood * Prior / Scaling Factor
     39 
     40         P(Y|X) - The posterior is the probability that sample x is of class y given the
     41                  feature values of x being distributed according to distribution of y and the prior.
     42         P(X|Y) - Likelihood of data X given class distribution Y.
     43                  Gaussian distribution (given by _calculate_likelihood)
     44         P(Y)   - Prior (given by _calculate_prior)
     45         P(X)   - Scales the posterior to make it a proper probability distribution.
     46                  This term is ignored in this implementation since it doesn't affect
     47                  which class distribution the sample is most likely to belong to.
     48 
     49         Classifies the sample as the class that results in the largest P(Y|X) (posterior)
     50         """
     51         posteriors = []
     52         # Go through list of classes
     53         for i, c in enumerate(self.classes):
     54             # Initialize posterior as prior
     55             posterior = self._calculate_prior(c)
     56             # Naive assumption (independence):
     57             # P(x1,x2,x3|Y) = P(x1|Y)*P(x2|Y)*P(x3|Y)
     58             # Posterior is product of prior and likelihoods (ignoring scaling factor)
     59             for feature_value, params in zip(sample, self.parameters[i]):
     60                 # Likelihood of feature value given distribution of feature values given y
     61                 likelihood = self._calculate_likelihood(params["mean"], params["var"], feature_value)
     62                 posterior *= likelihood
     63             posteriors.append(posterior)
     64         # Return the class with the largest posterior probability
     65         return self.classes[np.argmax(posteriors)]
     66 
     67     def predict(self, X):
     68         """ Predict the class labels of the samples in X """
     69         y_pred = [self._classify(sample) for sample in X]
     70         return y_pred