ml-finance-python
python scripts for finance machine learning
git clone https://9o.is/git/ml-finance-python.git
02_stacked_lstm_with_feature_embeddings.ipynb
(26000B)
1 {
2 "cells": [
3 {
4 "cell_type": "markdown",
5 "metadata": {},
6 "source": [
7 "# Stacked LSTMs for Time Series Classification"
8 ]
9 },
10 {
11 "cell_type": "markdown",
12 "metadata": {},
13 "source": [
14 "We'll now build a slightly deeper model by stacking two LSTM layers using the Quandl stock price data (the implementation details follow below). Furthermore, we will include features that are not sequential in nature, namely indicator variables for identifying the equity and the month."
15 ]
16 },
17 {
18 "cell_type": "markdown",
19 "metadata": {},
20 "source": [
21 "## Run inside docker container for GPU acceleration"
22 ]
23 },
24 {
25 "cell_type": "markdown",
26 "metadata": {},
27 "source": [
28 "See [tensorflow guide](https://www.tensorflow.org/install/docker) and more detailed [instructions](https://blog.sicara.com/tensorflow-gpu-opencv-jupyter-docker-10705b6cd1d)"
29 ]
30 },
31 {
32 "cell_type": "markdown",
33 "metadata": {},
34 "source": [
35 "`docker run -it -p 8889:8888 -v /path/to/machine-learning-for-trading/18_recurrent_neural_nets:/rnn --name tensorflow tensorflow/tensorflow:latest-gpu-py3 bash`"
36 ]
37 },
38 {
39 "cell_type": "markdown",
40 "metadata": {},
41 "source": [
42 "Inside docker container: \n",
43 "`jupyter notebook --ip 0.0.0.0 --no-browser --allow-root`"
44 ]
45 },
46 {
47 "cell_type": "markdown",
48 "metadata": {},
49 "source": [
50 "## Imports"
51 ]
52 },
53 {
54 "cell_type": "code",
55 "execution_count": 18,
56 "metadata": {},
57 "outputs": [],
58 "source": [
59 "%matplotlib inline\n",
60 "import numpy as np\n",
61 "import pandas as pd\n",
62 "import matplotlib.pyplot as plt\n",
63 "import seaborn as sns\n",
64 "from datetime import datetime, date\n",
65 "from sklearn.metrics import mean_squared_error, roc_auc_score\n",
66 "from sklearn.preprocessing import minmax_scale\n",
67 "from keras.callbacks import ModelCheckpoint, EarlyStopping\n",
68 "from keras.models import Sequential, Model\n",
69 "from keras.layers import Dense, LSTM, Input, concatenate, Embedding, Reshape\n",
70 "import keras\n",
71 "import keras.backend as K\n",
72 "import tensorflow as tf"
73 ]
74 },
75 {
76 "cell_type": "code",
77 "execution_count": 19,
78 "metadata": {},
79 "outputs": [],
80 "source": [
81 "sns.set_style('whitegrid')\n",
82 "np.random.seed(42)\n",
83 "K.clear_session()"
84 ]
85 },
86 {
87 "cell_type": "markdown",
88 "metadata": {},
89 "source": [
90 "## Data"
91 ]
92 },
93 {
94 "cell_type": "markdown",
95 "metadata": {},
96 "source": [
97 "Data produced by the notebook [build_dataset](00_build_dataset.ipynb)."
98 ]
99 },
100 {
101 "cell_type": "code",
102 "execution_count": 20,
103 "metadata": {
104 "scrolled": true
105 },
106 "outputs": [
107 {
108 "name": "stdout",
109 "output_type": "stream",
110 "text": [
111 "<class 'pandas.core.frame.DataFrame'>\n",
112 "DatetimeIndex: 1167341 entries, 2009-01-01 to 2017-12-01\n",
113 "Data columns (total 66 columns):\n",
114 "ticker 1167341 non-null int64\n",
115 "1 1167341 non-null float64\n",
116 "2 1167341 non-null float64\n",
117 "3 1167341 non-null float64\n",
118 "4 1167341 non-null float64\n",
119 "5 1167341 non-null float64\n",
120 "6 1167341 non-null float64\n",
121 "7 1167341 non-null float64\n",
122 "8 1167341 non-null float64\n",
123 "9 1167341 non-null float64\n",
124 "10 1167341 non-null float64\n",
125 "11 1167341 non-null float64\n",
126 "12 1167341 non-null float64\n",
127 "13 1167341 non-null float64\n",
128 "14 1167341 non-null float64\n",
129 "15 1167341 non-null float64\n",
130 "16 1167341 non-null float64\n",
131 "17 1167341 non-null float64\n",
132 "18 1167341 non-null float64\n",
133 "19 1167341 non-null float64\n",
134 "20 1167341 non-null float64\n",
135 "21 1167341 non-null float64\n",
136 "22 1167341 non-null float64\n",
137 "23 1167341 non-null float64\n",
138 "24 1167341 non-null float64\n",
139 "25 1167341 non-null float64\n",
140 "26 1167341 non-null float64\n",
141 "27 1167341 non-null float64\n",
142 "28 1167341 non-null float64\n",
143 "29 1167341 non-null float64\n",
144 "30 1167341 non-null float64\n",
145 "31 1167341 non-null float64\n",
146 "32 1167341 non-null float64\n",
147 "33 1167341 non-null float64\n",
148 "34 1167341 non-null float64\n",
149 "35 1167341 non-null float64\n",
150 "36 1167341 non-null float64\n",
151 "37 1167341 non-null float64\n",
152 "38 1167341 non-null float64\n",
153 "39 1167341 non-null float64\n",
154 "40 1167341 non-null float64\n",
155 "41 1167341 non-null float64\n",
156 "42 1167341 non-null float64\n",
157 "43 1167341 non-null float64\n",
158 "44 1167341 non-null float64\n",
159 "45 1167341 non-null float64\n",
160 "46 1167341 non-null float64\n",
161 "47 1167341 non-null float64\n",
162 "48 1167341 non-null float64\n",
163 "49 1167341 non-null float64\n",
164 "50 1167341 non-null float64\n",
165 "51 1167341 non-null float64\n",
166 "52 1167341 non-null float64\n",
167 "label 1167341 non-null int64\n",
168 "month_1 1167341 non-null uint8\n",
169 "month_2 1167341 non-null uint8\n",
170 "month_3 1167341 non-null uint8\n",
171 "month_4 1167341 non-null uint8\n",
172 "month_5 1167341 non-null uint8\n",
173 "month_6 1167341 non-null uint8\n",
174 "month_7 1167341 non-null uint8\n",
175 "month_8 1167341 non-null uint8\n",
176 "month_9 1167341 non-null uint8\n",
177 "month_10 1167341 non-null uint8\n",
178 "month_11 1167341 non-null uint8\n",
179 "month_12 1167341 non-null uint8\n",
180 "dtypes: float64(52), int64(2), uint8(12)\n",
181 "memory usage: 503.2 MB\n"
182 ]
183 }
184 ],
185 "source": [
186 "data = pd.read_hdf('data.h5', 'returns_weekly')\n",
187 "data = data.drop([c for c in data.columns if str(c).startswith('year')], axis=1)\n",
188 "data.info()"
189 ]
190 },
191 {
192 "cell_type": "markdown",
193 "metadata": {},
194 "source": [
195 "## Train-test split\n",
196 "\n",
197 "To respect the time series nature of the data, we set aside the data at the end of the sample as hold-out or test set. More specifically, we'll use the data for 2017."
198 ]
199 },
200 {
201 "cell_type": "code",
202 "execution_count": 4,
203 "metadata": {},
204 "outputs": [],
205 "source": [
206 "window_size=52\n",
207 "ticker = 1\n",
208 "months = 12\n",
209 "n_tickers = data.ticker.nunique()"
210 ]
211 },
212 {
213 "cell_type": "code",
214 "execution_count": 5,
215 "metadata": {},
216 "outputs": [],
217 "source": [
218 "train_data = data[:'2016']\n",
219 "test_data = data['2017']\n",
220 "del data"
221 ]
222 },
223 {
224 "cell_type": "markdown",
225 "metadata": {},
226 "source": [
227 "For both the train and the test dataset, we generate a list of three inputs containing the return series, the stock ticker (converted to integer values), and the month (one-hot encoded as twelve indicator columns), as shown here:"
228 ]
229 },
230 {
231 "cell_type": "code",
232 "execution_count": 6,
233 "metadata": {},
234 "outputs": [
235 {
236 "data": {
237 "text/plain": [
238 "([(1035424, 52, 1), (1035424,), (1035424, 12)], (1035424,))"
239 ]
240 },
241 "execution_count": 6,
242 "metadata": {},
243 "output_type": "execute_result"
244 }
245 ],
246 "source": [
247 "X_train = [\n",
248 " train_data.loc[:, list(range(1, window_size+1))].values.reshape(-1, window_size , 1),\n",
249 " train_data.ticker,\n",
250 " train_data.filter(like='month')\n",
251 "]\n",
252 "y_train = train_data.label\n",
253 "[x.shape for x in X_train], y_train.shape"
254 ]
255 },
256 {
257 "cell_type": "code",
258 "execution_count": 7,
259 "metadata": {},
260 "outputs": [
261 {
262 "data": {
263 "text/plain": [
264 "([(131917, 52, 1), (131917,), (131917, 12)], (131917,))"
265 ]
266 },
267 "execution_count": 7,
268 "metadata": {},
269 "output_type": "execute_result"
270 }
271 ],
272 "source": [
273 "# keep the last year for testing\n",
274 "X_test = [\n",
275 " test_data.loc[:, list(range(1, window_size+1))].values.reshape(-1, window_size , 1),\n",
276 " test_data.ticker,\n",
277 " test_data.filter(like='month')\n",
278 "]\n",
279 "y_test = test_data.label\n",
280 "[x.shape for x in X_test], y_test.shape"
281 ]
282 },
283 {
284 "cell_type": "markdown",
285 "metadata": {},
286 "source": [
287 "## Custom Metric"
288 ]
289 },
290 {
291 "cell_type": "code",
292 "execution_count": 8,
293 "metadata": {},
294 "outputs": [],
295 "source": [
296 "def roc_auc(y_true, y_pred):\n",
297 " # any tensorflow metric\n",
298 " value, update_op = tf.metrics.auc(y_true, y_pred)\n",
299 "\n",
300 " # find all variables created for this metric\n",
301 " metric_vars = [i for i in tf.local_variables() if 'auc_roc' in i.name.split('/')[1]]\n",
302 "\n",
303 " # Add metric variables to GLOBAL_VARIABLES collection.\n",
304 " # They will be initialized for new session.\n",
305 " for v in metric_vars:\n",
306 " tf.add_to_collection(tf.GraphKeys.GLOBAL_VARIABLES, v)\n",
307 "\n",
308 " # force to update metric values\n",
309 " with tf.control_dependencies([update_op]):\n",
310 " value = tf.identity(value)\n",
311 " return value"
312 ]
313 },
314 {
315 "cell_type": "code",
316 "execution_count": 9,
317 "metadata": {},
318 "outputs": [
319 {
320 "name": "stdout",
321 "output_type": "stream",
322 "text": [
323 "WARNING:tensorflow:From /home/stefan/.pyenv/versions/miniconda3-latest/envs/ml4t/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n",
324 "Instructions for updating:\n",
325 "Colocations handled automatically by placer.\n"
326 ]
327 }
328 ],
329 "source": [
330 "# source: https://github.com/keras-team/keras/issues/3230\n",
331 "def auc(y_true, y_pred):\n",
332 " ptas = tf.stack([binary_PTA(y_true, y_pred, k) for k in np.linspace(0, 1, 1000)], axis=0)\n",
333 " pfas = tf.stack([binary_PFA(y_true, y_pred, k) for k in np.linspace(0, 1, 1000)], axis=0)\n",
334 " pfas = tf.concat([tf.ones((1,)), pfas], axis=0)\n",
335 " binSizes = -(pfas[1:] - pfas[:-1])\n",
336 " s = ptas * binSizes\n",
337 " return K.sum(s, axis=0)\n",
338 "\n",
339 "\n",
340 "def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)):\n",
341 " \"\"\"prob false alert for binary classifier\"\"\"\n",
342 " y_pred = K.cast(y_pred >= threshold, 'float32')\n",
343 " # N = total number of negative labels\n",
344 " N = K.sum(1 - y_true)\n",
345 " # FP = total number of false alerts, alerts from the negative class labels\n",
346 " FP = K.sum(y_pred - y_pred * y_true)\n",
347 " return FP / (N + 1)\n",
348 "\n",
349 "\n",
350 "def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)):\n",
351 " \"\"\"prob true alerts for binary classifier\"\"\"\n",
352 " y_pred = K.cast(y_pred >= threshold, 'float32')\n",
353 " # P = total number of positive labels\n",
354 " P = K.sum(y_true)\n",
355 " # TP = total number of correct alerts, alerts from the positive class labels\n",
356 " TP = K.sum(y_pred * y_true)\n",
357 " return TP / (P + 1)"
358 ]
359 },
360 {
361 "cell_type": "markdown",
362 "metadata": {},
363 "source": [
364 "## Define the Model Architecture"
365 ]
366 },
367 {
368 "cell_type": "markdown",
369 "metadata": {},
370 "source": [
371 "The functional API of Keras makes it easy to design architectures with multiple inputs and outputs. This example illustrates a network with three inputs, as follows:\n",
372 "\n",
373 "- Two stacked LSTM layers with 25 and 10 units, respectively\n",
374 "- An embedding layer that learns a 10-dimensional real-valued representation of the equities\n",
375 "- A one-hot encoded representation of the month\n",
376 "\n",
377 "This can be constructed using just a few lines - see e.g., \n",
378 "- the [general Keras documentation](https://keras.io/getting-started/sequential-model-guide/), \n",
379 "- the [LSTM documentation](https://keras.io/layers/recurrent/).\n",
380 "\n",
381 "Make sure you are initializing your optimizer given the [keras-recommended approach for RNNs](https://keras.io/optimizers/) \n",
382 "\n",
383 "We begin by defining the three inputs with their respective shapes, as described here:"
384 ]
385 },
386 {
387 "cell_type": "code",
388 "execution_count": null,
389 "metadata": {},
390 "outputs": [],
391 "source": [
392 "returns = Input(shape=(window_size, n_features), name='Returns')\n",
393 "tickers = Input(shape=(1,), name='Tickers')\n",
394 "months = Input(shape=(12,), name='Months')"
395 ]
396 },
397 {
398 "cell_type": "markdown",
399 "metadata": {},
400 "source": [
401 "### LSTM Layers"
402 ]
403 },
404 {
405 "cell_type": "markdown",
406 "metadata": {},
407 "source": [
408 "To define stacked LSTM layers, we set the `return_sequences` keyword to `True` for the first layer. This ensures that the first layer produces an output that conforms to the three-dimensional input format the second LSTM layer expects. Note that we also use dropout regularization, and observe how the functional API passes the tensor output of one layer to the subsequent layer:"
409 ]
410 },
411 {
412 "cell_type": "code",
413 "execution_count": 10,
414 "metadata": {},
415 "outputs": [],
416 "source": [
417 "lstm1_units = 25\n",
418 "lstm2_units = 10\n",
419 "n_features = 1"
420 ]
421 },
422 {
423 "cell_type": "code",
424 "execution_count": null,
425 "metadata": {},
426 "outputs": [],
427 "source": [
428 "lstm1 = LSTM(units=lstm1_units, \n",
429 " input_shape=(window_size, n_features), \n",
430 " name='LSTM1', \n",
431 " dropout=.2,\n",
432 " return_sequences=True)(returns)\n",
433 "lstm_model = LSTM(units=lstm2_units, \n",
434 " dropout=.2,\n",
435 " name='LSTM2')(lstm1)\n"
436 ]
437 },
438 {
439 "cell_type": "markdown",
440 "metadata": {},
441 "source": [
442 "### Embedding Layer"
443 ]
444 },
445 {
446 "cell_type": "markdown",
447 "metadata": {},
448 "source": [
449 "The embedding layer requires the `input_dim` keyword, which defines how many embeddings the layer will learn, the `output_dim` keyword, which defines the size of the embedding, and the `input_length` keyword to set the number of elements passed to the layer (here only one ticker per sample). \n",
450 "\n",
451 "To combine the embedding layer with the LSTM layer and the months input, we need to reshape (or flatten) it, as follows:"
452 ]
453 },
454 {
455 "cell_type": "code",
456 "execution_count": 11,
457 "metadata": {},
458 "outputs": [
459 {
460 "name": "stdout",
461 "output_type": "stream",
462 "text": [
463 "WARNING:tensorflow:From /home/stefan/.pyenv/versions/miniconda3-latest/envs/ml4t/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.\n",
464 "Instructions for updating:\n",
465 "Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.\n"
466 ]
467 }
468 ],
469 "source": [
470 "ticker_embedding = Embedding(input_dim=n_tickers, \n",
471 " output_dim=10, \n",
472 " input_length=1)(tickers)\n",
473 "ticker_embedding = Reshape(target_shape=(10,))(ticker_embedding)"
474 ]
475 },
476 {
477 "cell_type": "markdown",
478 "metadata": {},
479 "source": [
480 "### Concatenate Model components"
481 ]
482 },
483 {
484 "cell_type": "markdown",
485 "metadata": {},
486 "source": [
487 "Now we can concatenate the three tensors and add fully-connected layers to learn a mapping from the learned time series and ticker representations and the month indicators to the outcome, a positive or negative return in the following week, as shown here:"
488 ]
489 },
490 {
491 "cell_type": "code",
492 "execution_count": null,
493 "metadata": {},
494 "outputs": [],
495 "source": [
496 "merged = concatenate([lstm_model, ticker_embedding, months], name='Merged')\n",
497 "hidden_dense = Dense(10, name='FC1')(merged)\n",
498 "output = Dense(1, name='Output')(hidden_dense)\n",
499 "\n",
500 "rnn = Model(inputs=[returns, tickers, months], outputs=output)"
501 ]
502 },
503 {
504 "cell_type": "markdown",
505 "metadata": {},
506 "source": [
507 "The summary lays out this slightly more sophisticated architecture with 29,371 parameters, as follows:"
508 ]
509 },
510 {
511 "cell_type": "code",
512 "execution_count": 12,
513 "metadata": {
514 "scrolled": false
515 },
516 "outputs": [
517 {
518 "name": "stdout",
519 "output_type": "stream",
520 "text": [
521 "__________________________________________________________________________________________________\n",
522 "Layer (type) Output Shape Param # Connected to \n",
523 "==================================================================================================\n",
524 "Returns (InputLayer) (None, 52, 1) 0 \n",
525 "__________________________________________________________________________________________________\n",
526 "Tickers (InputLayer) (None, 1) 0 \n",
527 "__________________________________________________________________________________________________\n",
528 "LSTM1 (LSTM) (None, 52, 25) 2700 Returns[0][0] \n",
529 "__________________________________________________________________________________________________\n",
530 "embedding_1 (Embedding) (None, 1, 10) 24890 Tickers[0][0] \n",
531 "__________________________________________________________________________________________________\n",
532 "LSTM2 (LSTM) (None, 10) 1440 LSTM1[0][0] \n",
533 "__________________________________________________________________________________________________\n",
534 "reshape_1 (Reshape) (None, 10) 0 embedding_1[0][0] \n",
535 "__________________________________________________________________________________________________\n",
536 "Months (InputLayer) (None, 12) 0 \n",
537 "__________________________________________________________________________________________________\n",
538 "Merged (Concatenate) (None, 32) 0 LSTM2[0][0] \n",
539 " reshape_1[0][0] \n",
540 " Months[0][0] \n",
541 "__________________________________________________________________________________________________\n",
542 "FC1 (Dense) (None, 10) 330 Merged[0][0] \n",
543 "__________________________________________________________________________________________________\n",
544 "Output (Dense) (None, 1) 11 FC1[0][0] \n",
545 "==================================================================================================\n",
546 "Total params: 29,371\n",
547 "Trainable params: 29,371\n",
548 "Non-trainable params: 0\n",
549 "__________________________________________________________________________________________________\n"
550 ]
551 }
552 ],
553 "source": [
554 "rnn.summary()"
555 ]
556 },
557 {
558 "cell_type": "markdown",
559 "metadata": {},
560 "source": [
561 "## Train the Model"
562 ]
563 },
564 {
565 "cell_type": "markdown",
566 "metadata": {},
567 "source": [
568 "We compile the model to compute a custom auc metric as follows:"
569 ]
570 },
571 {
572 "cell_type": "code",
573 "execution_count": 13,
574 "metadata": {},
575 "outputs": [],
576 "source": [
577 "rnn.compile(loss='binary_crossentropy', \n",
578 " optimizer='adam',\n",
579 " metrics=['accuracy', auc])"
580 ]
581 },
582 {
583 "cell_type": "code",
584 "execution_count": 14,
585 "metadata": {},
586 "outputs": [],
587 "source": [
588 "rnn_path = 'models/quandl.lstm_months_{}_{}.weights.best.hdf5'.format(lstm1_units, lstm2_units)\n",
589 "checkpointer = ModelCheckpoint(filepath=rnn_path,\n",
590 " monitor='val_loss',\n",
591 " save_best_only=True,\n",
592 " save_weights_only=True,\n",
593 " period=5)"
594 ]
595 },
596 {
597 "cell_type": "code",
598 "execution_count": 15,
599 "metadata": {},
600 "outputs": [],
601 "source": [
602 "early_stopping = EarlyStopping(monitor='val_loss', \n",
603 " patience=5,\n",
604 " restore_best_weights=True)"
605 ]
606 },
607 {
608 "cell_type": "code",
609 "execution_count": null,
610 "metadata": {
611 "scrolled": false
612 },
613 "outputs": [],
614 "source": [
615 "result = rnn.fit(X_train,\n",
616 " y_train,\n",
617 " epochs=50,\n",
618 " batch_size=32,\n",
619 " validation_data=(X_test, y_test),\n",
620 " callbacks=[checkpointer, early_stopping],\n",
621 " verbose=1)"
622 ]
623 },
624 {
625 "cell_type": "markdown",
626 "metadata": {},
627 "source": [
628 "Training stops after 18 epochs, producing a test area under the curve (AUC) of 0.63 for the best model with 13 rounds of training (each of which takes around an hour on a single GPU)."
629 ]
630 },
631 {
632 "cell_type": "code",
633 "execution_count": null,
634 "metadata": {},
635 "outputs": [],
636 "source": [
637 "loss_history = pd.DataFrame(result.history)\n",
638 "loss_history"
639 ]
640 },
641 {
642 "cell_type": "code",
643 "execution_count": null,
644 "metadata": {},
645 "outputs": [],
646 "source": [
647 "def which_metric(m):\n",
648 " return m.split('_')[-1]"
649 ]
650 },
651 {
652 "cell_type": "code",
653 "execution_count": null,
654 "metadata": {},
655 "outputs": [],
656 "source": [
657 "loss_history.groupby(which_metric, axis=1).plot(figsize=(14, 6));"
658 ]
659 },
660 {
661 "cell_type": "markdown",
662 "metadata": {},
663 "source": [
664 "## Evaluate model performance"
665 ]
666 },
667 {
668 "cell_type": "code",
669 "execution_count": null,
670 "metadata": {},
671 "outputs": [],
672 "source": [
673 "test_predict = pd.Series(rnn.predict(X_test).squeeze(), index=y_test.index)"
674 ]
675 },
676 {
677 "cell_type": "code",
678 "execution_count": null,
679 "metadata": {},
680 "outputs": [],
681 "source": [
682 "roc_auc_score(y_score=test_predict, y_true=y_test)"
683 ]
684 },
685 {
686 "cell_type": "code",
687 "execution_count": null,
688 "metadata": {},
689 "outputs": [],
690 "source": [
691 "rnn.load_weights(rnn_path)"
692 ]
693 },
694 {
695 "cell_type": "code",
696 "execution_count": null,
697 "metadata": {},
698 "outputs": [],
699 "source": [
700 "test_predict = pd.Series(rnn.predict(X_test).squeeze(), index=y_test.index)"
701 ]
702 },
703 {
704 "cell_type": "code",
705 "execution_count": null,
706 "metadata": {},
707 "outputs": [],
708 "source": [
709 "roc_auc_score(y_score=test_predict, y_true=y_test)"
710 ]
711 },
712 {
713 "cell_type": "code",
714 "execution_count": null,
715 "metadata": {},
716 "outputs": [],
717 "source": [
718 "score"
719 ]
720 },
721 {
722 "cell_type": "code",
723 "execution_count": null,
724 "metadata": {},
725 "outputs": [],
726 "source": [
727 "predictions = (test_predict.to_frame('prediction').assign(data='test')\n",
728 " .append(train_predict.to_frame('prediction').assign(data='train')))\n",
729 "predictions.info()"
730 ]
731 },
732 {
733 "cell_type": "code",
734 "execution_count": null,
735 "metadata": {},
736 "outputs": [],
737 "source": [
738 "results = sp500_scaled.join(predictions).dropna()\n",
739 "results.info()"
740 ]
741 },
742 {
743 "cell_type": "code",
744 "execution_count": null,
745 "metadata": {},
746 "outputs": [],
747 "source": [
748 "corr = {}\n",
749 "for run, df in results.groupby('data'):\n",
750 " corr[run] = df.SP500.corr(df.prediction)"
751 ]
752 },
753 {
754 "cell_type": "code",
755 "execution_count": null,
756 "metadata": {},
757 "outputs": [],
758 "source": [
759 "sp500_scaled['Train Prediction'] = pd.Series(train_predict.squeeze(), index=y_train.index)\n",
760 "sp500_scaled['Test Prediction'] = pd.Series(test_predict.squeeze(), index=y_test.index)"
761 ]
762 },
763 {
764 "cell_type": "code",
765 "execution_count": null,
766 "metadata": {},
767 "outputs": [],
768 "source": [
769 "training_error = np.sqrt(rnn.evaluate(X_train, y_train, verbose=0))\n",
770 "testing_error = np.sqrt(rnn.evaluate(X_test, y_test, verbose=0))\n",
771 "print('Training Error: {:.4f} | Test Error: {:.4f}'.format(training_error, testing_error))"
772 ]
773 },
774 {
775 "cell_type": "code",
776 "execution_count": null,
777 "metadata": {},
778 "outputs": [],
779 "source": [
780 "sns.set_style('whitegrid')"
781 ]
782 }
783 ],
784 "metadata": {
785 "anaconda-cloud": {},
786 "kernelspec": {
787 "display_name": "Python 3",
788 "language": "python",
789 "name": "python3"
790 },
791 "language_info": {
792 "codemirror_mode": {
793 "name": "ipython",
794 "version": 3
795 },
796 "file_extension": ".py",
797 "mimetype": "text/x-python",
798 "name": "python",
799 "nbconvert_exporter": "python",
800 "pygments_lexer": "ipython3",
801 "version": "3.6.8"
802 },
803 "toc": {
804 "base_numbering": 1,
805 "nav_menu": {},
806 "number_sections": true,
807 "sideBar": true,
808 "skip_h1_title": true,
809 "title_cell": "Table of Contents",
810 "title_sidebar": "Contents",
811 "toc_cell": false,
812 "toc_position": {},
813 "toc_section_display": true,
814 "toc_window_display": true
815 }
816 },
817 "nbformat": 4,
818 "nbformat_minor": 1
819 }