ml-finance-python
Python scripts for machine learning in finance
git clone https://9o.is/git/ml-finance-python.git
data_operation.py
(2240B)
1 from __future__ import division
2 import numpy as np
3 import math
4 import sys
5
6
def calculate_entropy(y):
    """ Calculate the Shannon entropy (in bits) of label array y.

    Parameters
    ----------
    y : array-like
        Class labels. Plain lists and tuples are accepted as well as
        numpy arrays; the input is flattened before counting.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` where ``p`` is the relative frequency of each
        distinct label. Returns 0.0 when y is empty.
    """
    # Convert up front: boolean-mask counting only works on ndarrays, so a
    # plain Python sequence would otherwise fail.
    labels = np.asarray(y).ravel()
    if labels.size == 0:
        return 0.0
    # One pass yields the count of every distinct label, instead of
    # re-scanning the whole array once per label.
    _, counts = np.unique(labels, return_counts=True)
    probabilities = counts / labels.size
    return float(-np.sum(probabilities * np.log2(probabilities)))
17
18
def mean_squared_error(y_true, y_pred):
    """ Mean of the squared element-wise differences y_true - y_pred """
    residuals = y_true - y_pred
    return np.mean(np.square(residuals))
23
24
def calculate_variance(X):
    """ Return the variance of the features in dataset X.

    Parameters
    ----------
    X : array-like
        Dataset with samples along the first axis (one row per sample).

    Returns
    -------
    numpy.ndarray
        Population variance taken along the first axis, i.e. the squared
        deviations from the mean are divided by n_samples (not
        n_samples - 1). For 1-D input this is a single scalar value.
    """
    # np.var reduces along axis 0 directly, so the full feature-by-feature
    # product matrix is never built just to read off its diagonal.
    return np.var(np.asarray(X), axis=0)
32
33
def calculate_std_dev(X):
    """ Standard deviation of each feature in dataset X, taken as the
    square root of calculate_variance(X) """
    variance = calculate_variance(X)
    return np.sqrt(variance)
38
39
def euclidean_distance(x1, x2):
    """ L2 (straight-line) distance between the vectors x1 and x2 """
    # The length of x1 drives the iteration; x2 is read at the same indices.
    squared_total = sum((x1[k] - x2[k]) ** 2 for k in range(len(x1)))
    return math.sqrt(squared_total)
47
48
def accuracy_score(y_true, y_pred):
    """ Compare y_true to y_pred and return the accuracy.

    Parameters
    ----------
    y_true : array-like
        Reference labels.
    y_pred : array-like
        Predicted labels, compared element-wise against y_true.

    Returns
    -------
    The number of matching entries, summed along the first axis, divided
    by len(y_true). For 1-D input this is one fraction; for 2-D input
    there is one value per column.
    """
    # Without the conversion, two plain lists compare as whole objects
    # (a single bool) rather than element by element.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    matches = y_true == y_pred
    return np.sum(matches, axis=0) / len(y_true)
53
54
def calculate_covariance_matrix(X, Y=None):
    """ Sample covariance matrix between the features of X and those of Y
    (Y defaults to X), normalised by n_samples - 1 """
    other = X if Y is None else Y
    scale = 1 / (np.shape(X)[0] - 1)
    # Subtract the per-feature means before forming the cross-products.
    centered_x = X - X.mean(axis=0)
    centered_other = other - other.mean(axis=0)
    cross_products = centered_x.T.dot(centered_other)
    return np.array(scale * cross_products, dtype=float)
63
64
def calculate_correlation_matrix(X, Y=None):
    """ Correlation matrix between the features of X and those of Y
    (Y defaults to X) """
    other = X if Y is None else Y
    scale = 1 / np.shape(X)[0]
    # Cross-products of the mean-centred data, scaled by 1 / n_samples.
    centered_x = X - X.mean(0)
    centered_other = other - other.mean(0)
    covariance = scale * centered_x.T.dot(centered_other)
    # Column vectors of standard deviations; their product holds the
    # normalising factor for every (feature of X, feature of Y) pair.
    sigma_x = np.expand_dims(calculate_std_dev(X), 1)
    sigma_other = np.expand_dims(calculate_std_dev(other), 1)
    normaliser = sigma_x.dot(sigma_other.T)
    return np.array(covariance / normaliser, dtype=float)