ml-finance-python

Python scripts for machine learning in finance
git clone https://9o.is/git/ml-finance-python.git
05_how_to_optimize_a_NN_architecture.py

(4447B)
      1 # coding: utf-8
      2 
      3 import warnings
      4 
      5 warnings.filterwarnings('ignore')
      6 
      7 import numpy as np
      8 import pandas as pd
      9 from joblib import dump
     10 
     11 from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
     12 from sklearn.metrics import roc_auc_score
     13 import tensorflow as tf
     14 from keras.models import Sequential
     15 from keras.wrappers.scikit_learn import KerasClassifier
     16 from keras.layers import Dense, Dropout, Activation
     17 from keras.callbacks import EarlyStopping, TensorBoard
     18 
     19 np.random.seed(42)
     20 
     21 data = pd.read_hdf('data.h5', 'returns')
     22 test_data = data['2017':]
     23 X_train = data[:'2016'].drop('label', axis=1)
     24 y_train = data[:'2016'].label
     25 
     26 del data
     27 
     28 input_dim = X_train.shape[1]
     29 
     30 
     31 def auc_roc(y_true, y_pred):
     32     # any tensorflow metric
     33     value, update_op = tf.metrics.auc(y_true, y_pred)
     34 
     35     # find all variables created for this metric
     36     metric_vars = [i for i in tf.local_variables() if 'auc_roc' in i.name.split('/')[1]]
     37 
     38     # Add metric variables to GLOBAL_VARIABLES collection.
     39     # They will be initialized for new session.
     40     for v in metric_vars:
     41         tf.add_to_collection(tf.GraphKeys.GLOBAL_VARIABLES, v)
     42 
     43     # force to update metric values
     44     with tf.control_dependencies([update_op]):
     45         value = tf.identity(value)
     46         return value
     47 
     48 
     49 def make_model(dense_layers, activation, dropout):
     50     '''Creates a multi-layer perceptron model
     51     
     52     dense_layers: List of layer sizes; one number per layer
     53     '''
     54 
     55     model = Sequential()
     56     for i, layer_size in enumerate(dense_layers, 1):
     57         if i == 1:
     58             model.add(Dense(layer_size, input_dim=input_dim))
     59             model.add(Activation(activation))
     60         else:
     61             model.add(Dense(layer_size))
     62             model.add(Activation(activation))
     63     model.add(Dropout(dropout))
     64     model.add(Dense(1))
     65     model.add(Activation('sigmoid'))
     66 
     67     model.compile(loss='binary_crossentropy',
     68                   optimizer='Adam',
     69                   metrics=['binary_accuracy', auc_roc])
     70 
     71     return model
     72 
     73 
     74 clf = KerasClassifier(make_model, epochs=10, batch_size=32)
     75 
     76 
     77 class OneStepTimeSeriesSplit:
     78     """Generates tuples of train_idx, test_idx pairs
     79     Assumes the index contains a level labeled 'date'"""
     80 
     81     def __init__(self, n_splits=3, test_period_length=1, shuffle=False):
     82         self.n_splits = n_splits
     83         self.test_period_length = test_period_length
     84         self.shuffle = shuffle
     85         self.test_end = n_splits * test_period_length
     86 
     87     @staticmethod
     88     def chunks(l, chunk_size):
     89         for i in range(0, len(l), chunk_size):
     90             yield l[i:i + chunk_size]
     91 
     92     def split(self, X, y=None, groups=None):
     93         unique_dates = (X.index
     94                             .get_level_values('date')
     95                             .unique()
     96                             .sort_values(ascending=False)[:self.test_end])
     97 
     98         dates = X.reset_index()[['date']]
     99         for test_date in self.chunks(unique_dates, self.test_period_length):
    100             train_idx = dates[dates.date < min(test_date)].index
    101             test_idx = dates[dates.date.isin(test_date)].index
    102             if self.shuffle:
    103                 np.random.shuffle(list(train_idx))
    104             yield train_idx, test_idx
    105 
    106     def get_n_splits(self, X, y, groups=None):
    107         return self.n_splits
    108 
    109 
    110 cv = OneStepTimeSeriesSplit(n_splits=12)
    111 
    112 param_grid = {'dense_layers': [[32], [32, 32], [64], [64, 64], [64, 64, 32], [64, 32], [128]],
    113               'activation'  : ['relu', 'tanh'],
    114               'dropout'     : [.25, .5, .75],
    115               }
    116 
    117 gs = GridSearchCV(estimator=clf,
    118                   param_grid=param_grid,
    119                   scoring='roc_auc',
    120                   cv=cv,
    121                   refit=True,
    122                   return_train_score=True,
    123                   n_jobs=-1,
    124                   verbose=1,
    125                   error_score=np.nan
    126                   )
    127 
    128 fit_params = dict(callbacks=[EarlyStopping(monitor='auc_roc', patience=300, verbose=1, mode='max')],
    129                   verbose=2,
    130                   epochs=50)
    131 
    132 gs.fit(X=X_train.astype(float), y=y_train, **fit_params)
    133 print('\nBest Score: {:.2%}'.format(gs.best_score_))
    134 print('Best Params:\n', pd.Series(gs.best_params_))
    135 
    136 dump(gs, 'gs.joblib')
    137 gs.best_estimator_.model.save('best_model.h5')
    138 pd.DataFrame(gs.cv_results_).to_csv('cv_results.csv', index=False)
    139 
    140 y_pred = gs.best_estimator_.model.predict(test_data.drop('label', axis=1))
    141 print(roc_auc_score(y_true=test_data.label, y_score=y_pred))