ml-finance-python
Python scripts for machine learning in finance
git clone https://9o.is/git/ml-finance-python.git
data_manipulation.py
(5089B)
1 from __future__ import division
2 from itertools import combinations_with_replacement
3 import numpy as np
4 import math
5 import sys
6
7
def shuffle_data(X, y, seed=None):
    """ Return X and y shuffled with one shared random permutation.

    Both X and y are indexed along their first axis with the same permuted
    index array, so row i of the shuffled X still belongs to element i of
    the shuffled y. X must expose `.shape`, and both inputs must accept
    indexing by a NumPy integer array.

    When `seed` is not None, NumPy's global random generator is re-seeded
    with it before the permutation is drawn. That is a side effect on the
    global `np.random` state. A seed of 0 is honoured like any other seed.

    Returns the tuple (X_shuffled, y_shuffled).
    """
    # Compare against None explicitly: a truthiness test would silently
    # skip seeding for the perfectly valid seed 0.
    if seed is not None:
        np.random.seed(seed)
    idx = np.arange(X.shape[0])
    np.random.shuffle(idx)
    return X[idx], y[idx]
15
16
def batch_iterator(X, y=None, batch_size=64):
    """ Generate consecutive mini-batches of X, paired with y when given.

    The first axis of X is walked in steps of `batch_size`; the final batch
    is shorter when the number of samples is not a multiple of the step.
    Yields `X_batch` alone when y is None, otherwise `(X_batch, y_batch)`
    cut with the same slice.
    """
    total = X.shape[0]
    for start in np.arange(0, total, batch_size):
        # Clamp the upper bound so the last window stops at the data end.
        window = slice(start, min(start + batch_size, total))
        if y is None:
            yield X[window]
        else:
            yield X[window], y[window]
26
27
def divide_on_feature(X, feature_i, threshold):
    """ Split the samples of X in two based on the value of one feature.

    For a numeric threshold (Python int/float or a NumPy integer/floating
    scalar) a sample goes to the first part when
    `sample[feature_i] >= threshold`. For any other threshold type the test
    is `sample[feature_i] == threshold`. Samples failing the test go to the
    second part. The original sample order is kept within each part.

    Returns a 1-D object array of length 2 holding `[X_1, X_2]`, so callers
    can unpack it (`X_1, X_2 = divide_on_feature(...)`) or index it
    regardless of whether the two parts have the same number of rows.
    """
    # NumPy integer scalars are not subclasses of the builtin int, so they
    # are listed explicitly; otherwise a threshold taken from an integer
    # array would fall through to the equality test.
    if isinstance(threshold, (int, float, np.integer, np.floating)):
        def split_func(sample):
            return sample[feature_i] >= threshold
    else:
        def split_func(sample):
            return sample[feature_i] == threshold

    # Evaluate the predicate once per sample and reuse the resulting mask
    # for both halves.
    mask = np.array([bool(split_func(sample)) for sample in X], dtype=bool)
    X = np.asarray(X)
    X_1 = X[mask]
    X_2 = X[~mask]

    # The halves usually differ in length; an explicit object container
    # avoids asking NumPy to build a ragged array, which newer releases
    # reject with a ValueError.
    parts = np.empty(2, dtype=object)
    parts[0] = X_1
    parts[1] = X_2
    return parts
41
42
def polynomial_features(X, degree):
    """ Expand X with every product of its columns up to `degree` factors.

    X is a 2-D array of shape (n_samples, n_features). One output column is
    produced for each combination-with-replacement of column indices of
    size 0, 1, ..., degree, in that order. The size-0 combination yields a
    column of ones. Returns a float array with one row per sample.
    """
    n_rows, n_cols = np.shape(X)

    # Index tuples for every term, grouped by the number of factors.
    terms = [
        term
        for power in range(0, degree + 1)
        for term in combinations_with_replacement(range(n_cols), power)
    ]

    expanded = np.empty((n_rows, len(terms)))
    for col, term in enumerate(terms):
        # Select the participating columns and multiply them row-wise.
        expanded[:, col] = np.prod(X[:, term], axis=1)
    return expanded
59
60
def get_random_subsets(X, y, n_subsets, replacements=True):
    """ Draw `n_subsets` random row samples of the data.

    y is attached to X as an extra last column, the joined rows are shuffled
    with NumPy's global random generator, and then each subset is drawn by
    sampling row indices. With `replacements` true every subset has as many
    rows as X and rows may repeat; otherwise each subset holds half of the
    rows (rounded down) without repeats.

    Returns a list of `[X_subset, y_subset]` pairs.
    """
    n_rows = np.shape(X)[0]

    # Join features and targets so they are shuffled and sampled together.
    target_col = y.reshape((1, len(y))).T
    joined = np.concatenate((X, target_col), axis=1)
    np.random.shuffle(joined)

    # 100% of the rows with replacement, 50% without.
    draw_size = n_rows if replacements else int(n_rows // 2)

    result = []
    for _ in range(n_subsets):
        picked = np.random.choice(
            range(n_rows),
            size=(draw_size,),
            replace=replacements)
        sample = joined[picked]
        result.append([sample[:, :-1], sample[:, -1]])
    return result
83
84
def normalize(X, axis=-1, order=2):
    """ Scale X so that its vectors along `axis` have unit norm.

    The norm of the given `order` is computed along `axis` with
    `np.linalg.norm`, and X is divided by it. Vectors whose norm is zero are
    divided by 1 instead, so they are returned unchanged.
    """
    norms = np.atleast_1d(np.linalg.norm(X, ord=order, axis=axis))
    # Avoid division by zero for all-zero vectors.
    norms[norms == 0] = 1
    # Re-insert the reduced axis so the division broadcasts against X.
    divisor = np.expand_dims(norms, axis)
    return X / divisor
90
91
def standardize(X):
    """ Return a standardized copy of the 2-D dataset X.

    Each column with a non-zero standard deviation is shifted by its mean
    and divided by its standard deviation (both computed over axis 0).
    Columns whose standard deviation is zero are left exactly as they are.

    The input is never modified. The result keeps the dtype of X when X is
    already floating point, and is float64 otherwise, so integer inputs are
    not truncated.
    """
    X = np.asarray(X)
    # astype always copies here, so the caller's array is left untouched.
    out_dtype = X.dtype if np.issubdtype(X.dtype, np.floating) else float
    X_std = X.astype(out_dtype)

    mean = X_std.mean(axis=0)
    std = X_std.std(axis=0)

    # Only rescale columns with spread; constant columns would divide by 0.
    scalable = std != 0
    X_std[:, scalable] = (X_std[:, scalable] - mean[scalable]) / std[scalable]
    return X_std
102
103
def train_test_split(X, y, test_size=0.5, shuffle=True, seed=None):
    """ Split the data into train and test sets.

    When `shuffle` is true, X and y are first shuffled together with
    `shuffle_data(X, y, seed)`. The last `floor(len(y) * test_size)` samples
    then form the test set and the remaining leading samples the training
    set.

    `test_size` is the fraction of samples placed in the test set and must
    lie in [0, 1]; a ValueError is raised otherwise. A value of 0 yields an
    empty test set and a value of 1 an empty training set.

    Returns the tuple (X_train, X_test, y_train, y_test).
    """
    if not 0 <= test_size <= 1:
        raise ValueError(
            "test_size must be between 0 and 1, got %r" % (test_size,))

    if shuffle:
        X, y = shuffle_data(X, y, seed)

    n_samples = len(y)
    # Multiply instead of floor-dividing by 1 / test_size: the reciprocal is
    # inexact in binary floating point and frequently lands one sample short
    # (e.g. 10 // (1 / 0.3) == 2). Rounding before truncation absorbs the
    # remaining representation error of the product.
    n_test = int(round(n_samples * test_size, 8))
    split_i = n_samples - n_test

    X_train, X_test = X[:split_i], X[split_i:]
    y_train, y_test = y[:split_i], y[split_i:]

    return X_train, X_test, y_train, y_test
115
116
def k_fold_cross_validation_sets(X, y, k, shuffle=True):
    """ Split the data into k sets of training / test data.

    When `shuffle` is true, X and y are first shuffled together with
    `shuffle_data`. The first `len(y) - len(y) % k` samples are cut into k
    equal, consecutive folds; fold i serves as the test data of set i and
    the other folds, concatenated in order, as its training data. Samples
    left over by the division are appended to the training data of the last
    set only.

    Raises ValueError unless 2 <= k <= len(y).

    Returns an object array of shape (k, 4) whose rows are
    `[X_train, X_test, y_train, y_test]`, so it can be iterated and unpacked
    row by row.
    """
    n_samples = len(y)
    if not 2 <= k <= n_samples:
        raise ValueError(
            "k must be between 2 and the number of samples (%d), got %r"
            % (n_samples, k))

    if shuffle:
        X, y = shuffle_data(X, y)

    # Set aside the samples that do not fit into k equal folds.
    n_left_overs = n_samples % k
    if n_left_overs != 0:
        left_over_X = X[-n_left_overs:]
        left_over_y = y[-n_left_overs:]
        X = X[:-n_left_overs]
        y = y[:-n_left_overs]

    X_split = np.split(X, k)
    y_split = np.split(y, k)

    # Train and test parts differ in length, so the result is stored in an
    # explicit object array rather than letting NumPy infer a (ragged) shape.
    sets = np.empty((k, 4), dtype=object)
    for i in range(k):
        X_train = np.concatenate(X_split[:i] + X_split[i + 1:], axis=0)
        y_train = np.concatenate(y_split[:i] + y_split[i + 1:], axis=0)

        # Add left over samples to last set as training samples. The
        # concatenation result must be kept: NumPy returns a new array
        # instead of extending the existing one in place.
        if n_left_overs != 0 and i == k - 1:
            X_train = np.concatenate((X_train, left_over_X), axis=0)
            y_train = np.concatenate((y_train, left_over_y), axis=0)

        sets[i, 0] = X_train
        sets[i, 1] = X_split[i]
        sets[i, 2] = y_train
        sets[i, 3] = y_split[i]

    return sets
146
147
def to_categorical(x, n_col=None):
    """ One-hot encode the integer labels in the 1-D array x.

    Row i of the result is all zeros except for a 1 in column `x[i]`.
    When `n_col` is not given (or is falsy) the number of columns is
    `max(x) + 1`. Returns a float array of shape (len(x), n_col).
    """
    n_col = n_col if n_col else np.amax(x) + 1
    n_rows = x.shape[0]
    encoded = np.zeros((n_rows, n_col))
    # Pair each row index with its label to set exactly one cell per row.
    encoded[np.arange(n_rows), x] = 1
    return encoded
155
156
def to_nominal(x):
    """ Convert one-hot encoded rows back to nominal values.

    Returns, for every row of the 2-D input, the column index of its
    largest entry (`np.argmax` along axis 1).
    """
    labels = np.argmax(x, axis=1)
    return labels
160
161
def make_diagonal(x):
    """ Convert a vector into a diagonal matrix.

    Returns a float array of shape (len(x), len(x)) that is zero everywhere
    except on the main diagonal, which holds the values of x in order. An
    empty x yields an empty (0, 0) matrix.
    """
    n = len(x)
    m = np.zeros((n, n))
    idx = np.arange(n)
    # Write the whole diagonal in one indexed assignment. Flattening to
    # length n also accepts an (n, 1) column vector.
    m[idx, idx] = np.asarray(x).reshape(n)
    return m