ml-finance-python

Python scripts for machine learning in finance

git clone https://9o.is/git/ml-finance-python.git

data_manipulation.py

(5089B)


      1 from __future__ import division
      2 from itertools import combinations_with_replacement
      3 import numpy as np
      4 import math
      5 import sys
      6 
      7 
      8 def shuffle_data(X, y, seed=None):
      9     """ Random shuffle of the samples in X and y """
     10     if seed:
     11         np.random.seed(seed)
     12     idx = np.arange(X.shape[0])
     13     np.random.shuffle(idx)
     14     return X[idx], y[idx]
     15 
     16 
     17 def batch_iterator(X, y=None, batch_size=64):
     18     """ Simple batch generator """
     19     n_samples = X.shape[0]
     20     for i in np.arange(0, n_samples, batch_size):
     21         begin, end = i, min(i+batch_size, n_samples)
     22         if y is not None:
     23             yield X[begin:end], y[begin:end]
     24         else:
     25             yield X[begin:end]
     26 
     27 
     28 def divide_on_feature(X, feature_i, threshold):
     29     """ Divide dataset based on if sample value on feature index is larger than
     30         the given threshold """
     31     split_func = None
     32     if isinstance(threshold, int) or isinstance(threshold, float):
     33         split_func = lambda sample: sample[feature_i] >= threshold
     34     else:
     35         split_func = lambda sample: sample[feature_i] == threshold
     36 
     37     X_1 = np.array([sample for sample in X if split_func(sample)])
     38     X_2 = np.array([sample for sample in X if not split_func(sample)])
     39 
     40     return np.array([X_1, X_2])
     41 
     42 
     43 def polynomial_features(X, degree):
     44     n_samples, n_features = np.shape(X)
     45 
     46     def index_combinations():
     47         combs = [combinations_with_replacement(range(n_features), i) for i in range(0, degree + 1)]
     48         flat_combs = [item for sublist in combs for item in sublist]
     49         return flat_combs
     50     
     51     combinations = index_combinations()
     52     n_output_features = len(combinations)
     53     X_new = np.empty((n_samples, n_output_features))
     54     
     55     for i, index_combs in enumerate(combinations):  
     56         X_new[:, i] = np.prod(X[:, index_combs], axis=1)
     57 
     58     return X_new
     59 
     60 
     61 def get_random_subsets(X, y, n_subsets, replacements=True):
     62     """ Return random subsets (with replacements) of the data """
     63     n_samples = np.shape(X)[0]
     64     # Concatenate x and y and do a random shuffle
     65     X_y = np.concatenate((X, y.reshape((1, len(y))).T), axis=1)
     66     np.random.shuffle(X_y)
     67     subsets = []
     68 
     69     # Uses 50% of training samples without replacements
     70     subsample_size = int(n_samples // 2)
     71     if replacements:
     72         subsample_size = n_samples      # 100% with replacements
     73 
     74     for _ in range(n_subsets):
     75         idx = np.random.choice(
     76             range(n_samples),
     77             size=np.shape(range(subsample_size)),
     78             replace=replacements)
     79         X = X_y[idx][:, :-1]
     80         y = X_y[idx][:, -1]
     81         subsets.append([X, y])
     82     return subsets
     83 
     84 
     85 def normalize(X, axis=-1, order=2):
     86     """ Normalize the dataset X """
     87     l2 = np.atleast_1d(np.linalg.norm(X, order, axis))
     88     l2[l2 == 0] = 1
     89     return X / np.expand_dims(l2, axis)
     90 
     91 
     92 def standardize(X):
     93     """ Standardize the dataset X """
     94     X_std = X
     95     mean = X.mean(axis=0)
     96     std = X.std(axis=0)
     97     for col in range(np.shape(X)[1]):
     98         if std[col]:
     99             X_std[:, col] = (X_std[:, col] - mean[col]) / std[col]
    100     # X_std = (X - X.mean(axis=0)) / X.std(axis=0)
    101     return X_std
    102 
    103 
    104 def train_test_split(X, y, test_size=0.5, shuffle=True, seed=None):
    105     """ Split the data into train and test sets """
    106     if shuffle:
    107         X, y = shuffle_data(X, y, seed)
    108     # Split the training data from test data in the ratio specified in
    109     # test_size
    110     split_i = len(y) - int(len(y) // (1 / test_size))
    111     X_train, X_test = X[:split_i], X[split_i:]
    112     y_train, y_test = y[:split_i], y[split_i:]
    113 
    114     return X_train, X_test, y_train, y_test
    115 
    116 
    117 def k_fold_cross_validation_sets(X, y, k, shuffle=True):
    118     """ Split the data into k sets of training / test data """
    119     if shuffle:
    120         X, y = shuffle_data(X, y)
    121 
    122     n_samples = len(y)
    123     left_overs = {}
    124     n_left_overs = (n_samples % k)
    125     if n_left_overs != 0:
    126         left_overs["X"] = X[-n_left_overs:]
    127         left_overs["y"] = y[-n_left_overs:]
    128         X = X[:-n_left_overs]
    129         y = y[:-n_left_overs]
    130 
    131     X_split = np.split(X, k)
    132     y_split = np.split(y, k)
    133     sets = []
    134     for i in range(k):
    135         X_test, y_test = X_split[i], y_split[i]
    136         X_train = np.concatenate(X_split[:i] + X_split[i + 1:], axis=0)
    137         y_train = np.concatenate(y_split[:i] + y_split[i + 1:], axis=0)
    138         sets.append([X_train, X_test, y_train, y_test])
    139 
    140     # Add left over samples to last set as training samples
    141     if n_left_overs != 0:
    142         np.append(sets[-1][0], left_overs["X"], axis=0)
    143         np.append(sets[-1][2], left_overs["y"], axis=0)
    144 
    145     return np.array(sets)
    146 
    147 
    148 def to_categorical(x, n_col=None):
    149     """ One-hot encoding of nominal values """
    150     if not n_col:
    151         n_col = np.amax(x) + 1
    152     one_hot = np.zeros((x.shape[0], n_col))
    153     one_hot[np.arange(x.shape[0]), x] = 1
    154     return one_hot
    155 
    156 
    157 def to_nominal(x):
    158     """ Conversion from one-hot encoding to nominal """
    159     return np.argmax(x, axis=1)
    160 
    161 
    162 def make_diagonal(x):
    163     """ Converts a vector into an diagonal matrix """
    164     m = np.zeros((len(x), len(x)))
    165     for i in range(len(m[0])):
    166         m[i, i] = x[i]
    167     return m