ml-finance-python
Python scripts for machine learning in finance
git clone https://9o.is/git/ml-finance-python.git
05_how_to_optimize_a_NN_architecture.py
(4447B)
1 # coding: utf-8
2
3 import warnings
4
5 warnings.filterwarnings('ignore')
6
7 import numpy as np
8 import pandas as pd
9 from joblib import dump
10
11 from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
12 from sklearn.metrics import roc_auc_score
13 import tensorflow as tf
14 from keras.models import Sequential
15 from keras.wrappers.scikit_learn import KerasClassifier
16 from keras.layers import Dense, Dropout, Activation
17 from keras.callbacks import EarlyStopping, TensorBoard
18
# Seed NumPy's global RNG so runs are repeatable.
np.random.seed(42)

# Load the 'returns' table and split it with label-based slices on the index:
# everything through '2016' is used for training, everything from '2017' on
# is held out for the final evaluation.
# NOTE(review): assumes the index can be sliced by year strings — confirm.
data = pd.read_hdf('data.h5', key='returns')
test_data = data['2017':]
X_train = data[:'2016'].drop(columns='label')
y_train = data[:'2016']['label']

# The full frame is no longer needed once the splits exist.
del data

# Number of feature columns; read by make_model for the first Dense layer.
input_dim = X_train.shape[1]
29
30
def auc_roc(y_true, y_pred):
    """Keras-compatible metric returning the TensorFlow streaming AUC.

    y_true: tensor of true labels
    y_pred: tensor of predicted scores

    Returns a tensor holding the AUC value; evaluating it also runs the
    metric's update op.
    """
    # tf.metrics.auc yields the current value and the op that updates it
    auc_value, auc_update = tf.metrics.auc(y_true, y_pred)

    # Pick out the local variables whose second name-scope component mentions
    # this metric.
    # NOTE(review): presumably the metric is built under a scope named after
    # this function — confirm against the Keras version in use.
    auc_vars = []
    for local_var in tf.local_variables():
        scope_parts = local_var.name.split('/')
        if 'auc_roc' in scope_parts[1]:
            auc_vars.append(local_var)

    # Register them as global variables so they get initialized with a new
    # session.
    for auc_var in auc_vars:
        tf.add_to_collection(tf.GraphKeys.GLOBAL_VARIABLES, auc_var)

    # Tie the returned tensor to the update op so the metric is refreshed
    # every time the value is evaluated.
    with tf.control_dependencies([auc_update]):
        auc_value = tf.identity(auc_value)
    return auc_value
47
48
def make_model(dense_layers, activation, dropout):
    """Build and compile a multi-layer perceptron with one sigmoid output.

    dense_layers: list of hidden-layer sizes; one number per layer
    activation: activation applied after every hidden Dense layer
    dropout: rate passed to the Dropout layer

    Returns the compiled Sequential model.
    """
    model = Sequential()

    for position, units in enumerate(dense_layers):
        if position == 0:
            # only the first layer declares the input width (module-level
            # input_dim)
            hidden = Dense(units, input_dim=input_dim)
        else:
            hidden = Dense(units)
        model.add(hidden)
        model.add(Activation(activation))

    # NOTE(review): Dropout is placed once, after the hidden stack; the
    # listing this was taken from had lost its indentation — confirm.
    model.add(Dropout(dropout))

    # single-unit sigmoid head
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    model.compile(
        loss='binary_crossentropy',
        optimizer='Adam',
        metrics=['binary_accuracy', auc_roc],
    )
    return model
72
73
# scikit-learn compatible wrapper; make_model is called to build the network,
# and epochs / batch_size are the wrapper's default training settings.
clf = KerasClassifier(
    build_fn=make_model,
    epochs=10,
    batch_size=32,
)
75
76
class OneStepTimeSeriesSplit:
    """Walk-forward cross-validation splitter keyed on a 'date' index level.

    The ``n_splits * test_period_length`` most recent unique dates are cut
    into consecutive chunks of ``test_period_length`` dates, newest first.
    Each chunk is one test fold; its training fold is every row dated
    strictly before the earliest date of that chunk.

    Assumes the index of ``X`` contains a level labeled 'date'.
    """

    def __init__(self, n_splits=3, test_period_length=1, shuffle=False):
        """Store the split configuration.

        n_splits: number of folds reported by get_n_splits
        test_period_length: number of unique dates in each test fold
        shuffle: if True, the training positions of each fold are shuffled
        """
        self.n_splits = n_splits
        self.test_period_length = test_period_length
        self.shuffle = shuffle
        # total number of most-recent unique dates reserved for testing
        self.test_end = n_splits * test_period_length

    @staticmethod
    def chunks(l, chunk_size):
        """Yield successive slices of ``l`` holding ``chunk_size`` items each.

        The last slice is shorter when ``len(l)`` is not a multiple of
        ``chunk_size``.
        """
        for start in range(0, len(l), chunk_size):
            yield l[start:start + chunk_size]

    def split(self, X, y=None, groups=None):
        """Generate (train_idx, test_idx) pairs of integer row positions.

        X: pandas object whose index has a level named 'date'
        y, groups: accepted for scikit-learn compatibility; not used

        Yields one pair per chunk of test dates, newest chunk first.
        ``test_idx`` is a pandas Index; ``train_idx`` is a pandas Index, or
        a shuffled NumPy array when ``shuffle`` is True.
        """
        unique_dates = (X.index
                        .get_level_values('date')
                        .unique()
                        .sort_values(ascending=False)[:self.test_end])

        # reset_index gives a default 0..n-1 index, so the index of any
        # filtered view below holds row positions within X
        dates = X.reset_index()[['date']]
        for test_dates in self.chunks(unique_dates, self.test_period_length):
            train_idx = dates[dates['date'] < test_dates.min()].index
            test_idx = dates[dates['date'].isin(test_dates)].index
            if self.shuffle:
                # np.random.shuffle works in place, so it must be applied to
                # the array that is actually yielded (shuffling a temporary
                # list copy would leave the training order untouched)
                train_idx = np.array(train_idx)
                np.random.shuffle(train_idx)
            yield train_idx, test_idx

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the configured number of splits; all arguments are ignored."""
        return self.n_splits
108
109
# Twelve walk-forward folds, one test date each (the splitter's default
# test_period_length).
cv = OneStepTimeSeriesSplit(n_splits=12)

# Hyper-parameter grid; every key is an argument of make_model.
param_grid = dict(
    dense_layers=[[32], [32, 32], [64], [64, 64], [64, 64, 32], [64, 32], [128]],
    activation=['relu', 'tanh'],
    dropout=[0.25, 0.5, 0.75],
)

# Exhaustive search over the grid, scored by ROC AUC on each fold and refit
# on the full training set with the best combination.
gs = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv,
    refit=True,
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
    error_score=np.nan,
)
127
# Extra keyword arguments handed to the estimator's fit through
# GridSearchCV.fit.
# NOTE(review): patience (300) exceeds epochs (50), so early stopping cannot
# trigger within a single fit — confirm this is intended.
fit_params = dict(
    callbacks=[EarlyStopping(monitor='auc_roc', patience=300, verbose=1, mode='max')],
    verbose=2,
    epochs=50,
)

# Run the grid search on float-cast training features.
gs.fit(X=X_train.astype(float), y=y_train, **fit_params)
print('\nBest Score: {:.2%}'.format(gs.best_score_))
print('Best Params:\n', pd.Series(gs.best_params_))

# Persist the search object, the refit Keras model and the per-fold results.
dump(gs, 'gs.joblib')
gs.best_estimator_.model.save('best_model.h5')
pd.DataFrame(gs.cv_results_).to_csv('cv_results.csv', index=False)

# Score the held-out period. The features are cast to float exactly as the
# training features were, so the model sees the same input dtype at
# prediction time.
y_pred = gs.best_estimator_.model.predict(
    test_data.drop('label', axis=1).astype(float))
print(roc_auc_score(y_true=test_data.label, y_score=y_pred))