ml-finance-python
python scripts for finance machine learning
git clone https://9o.is/git/ml-finance-python.git
00_build_dataset.ipynb
(25966B)
1 {
2 "cells": [
3 {
4 "cell_type": "markdown",
5 "metadata": {},
6 "source": [
7 "# Train a Deep NN to predict Asset Price movements"
8 ]
9 },
10 {
11 "cell_type": "markdown",
12 "metadata": {},
13 "source": [
14 "## Setup Docker for GPU acceleration"
15 ]
16 },
17 {
18 "cell_type": "markdown",
19 "metadata": {},
20 "source": [
21 "`docker run -it -p 8889:8888 -v /path/to/machine-learning-for-trading/16_convolutions_neural_nets/cnn:/cnn --name tensorflow tensorflow/tensorflow:latest-gpu-py3 bash`"
22 ]
23 },
24 {
25 "cell_type": "markdown",
26 "metadata": {},
27 "source": [
28 "## Imports & Settings"
29 ]
30 },
31 {
32 "cell_type": "code",
33 "execution_count": 1,
34 "metadata": {},
35 "outputs": [],
36 "source": [
37 "import warnings\n",
38 "warnings.filterwarnings('ignore')"
39 ]
40 },
41 {
42 "cell_type": "code",
43 "execution_count": 2,
44 "metadata": {},
45 "outputs": [
46 {
47 "name": "stderr",
48 "output_type": "stream",
49 "text": [
50 "Using TensorFlow backend.\n"
51 ]
52 }
53 ],
54 "source": [
55 "import os\n",
56 "from pathlib import Path\n",
57 "from importlib import reload\n",
58 "from joblib import dump, load\n",
59 "\n",
60 "import numpy as np\n",
61 "import pandas as pd\n",
62 "import matplotlib.pyplot as plt\n",
63 "import seaborn as sns\n",
64 "\n",
65 "from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold\n",
66 "from sklearn.metrics import roc_auc_score\n",
67 "\n",
68 "import tensorflow as tf\n",
69 "from keras.models import Sequential\n",
70 "from keras import backend as K\n",
71 "from keras.wrappers.scikit_learn import KerasClassifier\n",
72 "from keras.layers import Dense, Dropout, Activation\n",
73 "from keras.models import load_model\n",
74 "from keras.callbacks import Callback, EarlyStopping, TensorBoard, ModelCheckpoint"
75 ]
76 },
77 {
78 "cell_type": "code",
79 "execution_count": 3,
80 "metadata": {},
81 "outputs": [],
82 "source": [
83 "np.random.seed(42)"
84 ]
85 },
86 {
87 "cell_type": "markdown",
88 "metadata": {},
89 "source": [
90 "## Build Dataset"
91 ]
92 },
93 {
94 "cell_type": "markdown",
95 "metadata": {},
96 "source": [
97 "We load the Quandl adjusted stock price data:"
98 ]
99 },
100 {
101 "cell_type": "code",
102 "execution_count": 4,
103 "metadata": {},
104 "outputs": [
105 {
106 "name": "stdout",
107 "output_type": "stream",
108 "text": [
109 "<class 'pandas.core.frame.DataFrame'>\n",
110 "DatetimeIndex: 2896 entries, 2007-01-01 to 2018-03-27\n",
111 "Columns: 3199 entries, A to ZUMZ\n",
112 "dtypes: float64(3199)\n",
113 "memory usage: 70.7 MB\n"
114 ]
115 }
116 ],
117 "source": [
118 "prices = (pd.read_hdf('../data/assets.h5', 'quandl/wiki/prices')\n",
119 " .adj_close\n",
120 " .unstack().loc['2007':])\n",
121 "prices.info()"
122 ]
123 },
124 {
125 "cell_type": "markdown",
126 "metadata": {},
127 "source": [
128 "### Resample to weekly frequency"
129 ]
130 },
131 {
132 "cell_type": "markdown",
133 "metadata": {},
134 "source": [
135 "We start by generating weekly returns for close to 2,500 stocks without missing data for the 2008-17 period, as follows:"
136 ]
137 },
138 {
139 "cell_type": "code",
140 "execution_count": 5,
141 "metadata": {},
142 "outputs": [
143 {
144 "name": "stdout",
145 "output_type": "stream",
146 "text": [
147 "<class 'pandas.core.frame.DataFrame'>\n",
148 "DatetimeIndex: 522 entries, 2017-12-31 to 2008-01-06\n",
149 "Freq: -1W-SUN\n",
150 "Columns: 2489 entries, A to ZUMZ\n",
151 "dtypes: float64(2489)\n",
152 "memory usage: 9.9 MB\n"
153 ]
154 }
155 ],
156 "source": [
157 "returns = (prices\n",
158 " .resample('W')\n",
159 " .last()\n",
160 " .pct_change()\n",
161 " .loc['2008': '2017']\n",
162 " .dropna(axis=1)\n",
163 " .sort_index(ascending=False))\n",
164 "returns.info()"
165 ]
166 },
167 {
168 "cell_type": "code",
169 "execution_count": 6,
170 "metadata": {
171 "scrolled": true
172 },
173 "outputs": [
174 {
175 "data": {
176 "text/html": [
177 "<div>\n",
178 "<style scoped>\n",
179 " .dataframe tbody tr th:only-of-type {\n",
180 " vertical-align: middle;\n",
181 " }\n",
182 "\n",
183 " .dataframe tbody tr th {\n",
184 " vertical-align: top;\n",
185 " }\n",
186 "\n",
187 " .dataframe thead th {\n",
188 " text-align: right;\n",
189 " }\n",
190 "</style>\n",
191 "<table border=\"1\" class=\"dataframe\">\n",
192 " <thead>\n",
193 " <tr style=\"text-align: right;\">\n",
194 " <th>ticker</th>\n",
195 " <th>A</th>\n",
196 " <th>AAL</th>\n",
197 " <th>AAN</th>\n",
198 " <th>AAON</th>\n",
199 " <th>AAP</th>\n",
200 " <th>AAPL</th>\n",
201 " <th>AAWW</th>\n",
202 " <th>ABAX</th>\n",
203 " <th>ABC</th>\n",
204 " <th>ABCB</th>\n",
205 " <th>...</th>\n",
206 " <th>ZEUS</th>\n",
207 " <th>ZIGO</th>\n",
208 " <th>ZINC</th>\n",
209 " <th>ZION</th>\n",
210 " <th>ZIOP</th>\n",
211 " <th>ZIXI</th>\n",
212 " <th>ZLC</th>\n",
213 " <th>ZMH</th>\n",
214 " <th>ZQK</th>\n",
215 " <th>ZUMZ</th>\n",
216 " </tr>\n",
217 " <tr>\n",
218 " <th>date</th>\n",
219 " <th></th>\n",
220 " <th></th>\n",
221 " <th></th>\n",
222 " <th></th>\n",
223 " <th></th>\n",
224 " <th></th>\n",
225 " <th></th>\n",
226 " <th></th>\n",
227 " <th></th>\n",
228 " <th></th>\n",
229 " <th></th>\n",
230 " <th></th>\n",
231 " <th></th>\n",
232 " <th></th>\n",
233 " <th></th>\n",
234 " <th></th>\n",
235 " <th></th>\n",
236 " <th></th>\n",
237 " <th></th>\n",
238 " <th></th>\n",
239 " <th></th>\n",
240 " </tr>\n",
241 " </thead>\n",
242 " <tbody>\n",
243 " <tr>\n",
244 " <th>2017-12-31</th>\n",
245 " <td>-0.005642</td>\n",
246 " <td>-0.010648</td>\n",
247 " <td>-0.010184</td>\n",
248 " <td>-0.001361</td>\n",
249 " <td>-0.008553</td>\n",
250 " <td>-0.033027</td>\n",
251 " <td>-0.024938</td>\n",
252 " <td>-0.001814</td>\n",
253 " <td>-0.006922</td>\n",
254 " <td>-0.019329</td>\n",
255 " <td>...</td>\n",
256 " <td>-0.029797</td>\n",
257 " <td>0.000000</td>\n",
258 " <td>0.000000</td>\n",
259 " <td>-0.009741</td>\n",
260 " <td>0.022222</td>\n",
261 " <td>-0.015730</td>\n",
262 " <td>0.000000</td>\n",
263 " <td>0.000000</td>\n",
264 " <td>0.000000</td>\n",
265 " <td>-0.029138</td>\n",
266 " </tr>\n",
267 " <tr>\n",
268 " <th>2017-12-24</th>\n",
269 " <td>-0.003846</td>\n",
270 " <td>0.029965</td>\n",
271 " <td>0.090171</td>\n",
272 " <td>0.044034</td>\n",
273 " <td>-0.001490</td>\n",
274 " <td>0.006557</td>\n",
275 " <td>0.046087</td>\n",
276 " <td>0.032681</td>\n",
277 " <td>-0.007620</td>\n",
278 " <td>0.017598</td>\n",
279 " <td>...</td>\n",
280 " <td>0.032153</td>\n",
281 " <td>0.000000</td>\n",
282 " <td>0.000000</td>\n",
283 " <td>0.026395</td>\n",
284 " <td>-0.068966</td>\n",
285 " <td>-0.024123</td>\n",
286 " <td>0.000000</td>\n",
287 " <td>0.000000</td>\n",
288 " <td>0.000000</td>\n",
289 " <td>0.067164</td>\n",
290 " </tr>\n",
291 " <tr>\n",
292 " <th>2017-12-17</th>\n",
293 " <td>0.003413</td>\n",
294 " <td>0.000784</td>\n",
295 " <td>-0.052591</td>\n",
296 " <td>-0.014006</td>\n",
297 " <td>0.003888</td>\n",
298 " <td>0.026569</td>\n",
299 " <td>0.004367</td>\n",
300 " <td>0.008396</td>\n",
301 " <td>0.074625</td>\n",
302 " <td>0.026567</td>\n",
303 " <td>...</td>\n",
304 " <td>0.036715</td>\n",
305 " <td>0.000000</td>\n",
306 " <td>0.000000</td>\n",
307 " <td>-0.018064</td>\n",
308 " <td>-0.018059</td>\n",
309 " <td>0.075472</td>\n",
310 " <td>0.000000</td>\n",
311 " <td>0.000000</td>\n",
312 " <td>0.000000</td>\n",
313 " <td>-0.051887</td>\n",
314 " </tr>\n",
315 " <tr>\n",
316 " <th>2017-12-10</th>\n",
317 " <td>-0.019071</td>\n",
318 " <td>0.041012</td>\n",
319 " <td>-0.005359</td>\n",
320 " <td>-0.017882</td>\n",
321 " <td>0.010375</td>\n",
322 " <td>-0.009822</td>\n",
323 " <td>-0.028014</td>\n",
324 " <td>-0.010386</td>\n",
325 " <td>0.020600</td>\n",
326 " <td>-0.054271</td>\n",
327 " <td>...</td>\n",
328 " <td>-0.002410</td>\n",
329 " <td>0.000000</td>\n",
330 " <td>0.000000</td>\n",
331 " <td>0.016973</td>\n",
332 " <td>-0.015556</td>\n",
333 " <td>-0.055679</td>\n",
334 " <td>0.000000</td>\n",
335 " <td>0.000000</td>\n",
336 " <td>0.000000</td>\n",
337 " <td>0.062657</td>\n",
338 " </tr>\n",
339 " <tr>\n",
340 " <th>2017-12-03</th>\n",
341 " <td>-0.009660</td>\n",
342 " <td>0.009267</td>\n",
343 " <td>0.105501</td>\n",
344 " <td>0.013947</td>\n",
345 " <td>0.112630</td>\n",
346 " <td>-0.022404</td>\n",
347 " <td>0.073838</td>\n",
348 " <td>-0.028456</td>\n",
349 " <td>0.045796</td>\n",
350 " <td>0.024717</td>\n",
351 " <td>...</td>\n",
352 " <td>0.065742</td>\n",
353 " <td>0.000000</td>\n",
354 " <td>0.000000</td>\n",
355 " <td>0.080475</td>\n",
356 " <td>0.014656</td>\n",
357 " <td>-0.006637</td>\n",
358 " <td>0.000000</td>\n",
359 " <td>0.000000</td>\n",
360 " <td>0.000000</td>\n",
361 " <td>0.047244</td>\n",
362 " </tr>\n",
363 " <tr>\n",
364 " <th>2008-02-03</th>\n",
365 " <td>0.038265</td>\n",
366 " <td>0.252238</td>\n",
367 " <td>0.002941</td>\n",
368 " <td>0.095182</td>\n",
369 " <td>0.097833</td>\n",
370 " <td>0.028767</td>\n",
371 " <td>0.006245</td>\n",
372 " <td>-0.078058</td>\n",
373 " <td>0.036913</td>\n",
374 " <td>0.083217</td>\n",
375 " <td>...</td>\n",
376 " <td>0.137066</td>\n",
377 " <td>0.127561</td>\n",
378 " <td>0.286550</td>\n",
379 " <td>0.167722</td>\n",
380 " <td>-0.087879</td>\n",
381 " <td>0.069364</td>\n",
382 " <td>0.171949</td>\n",
383 " <td>0.193189</td>\n",
384 " <td>0.127811</td>\n",
385 " <td>0.149083</td>\n",
386 " </tr>\n",
387 " <tr>\n",
388 " <th>2008-01-27</th>\n",
389 " <td>-0.013963</td>\n",
390 " <td>-0.048762</td>\n",
391 " <td>0.191310</td>\n",
392 " <td>0.071788</td>\n",
393 " <td>0.043997</td>\n",
394 " <td>-0.194286</td>\n",
395 " <td>-0.008984</td>\n",
396 " <td>-0.090807</td>\n",
397 " <td>-0.034771</td>\n",
398 " <td>0.054572</td>\n",
399 " <td>...</td>\n",
400 " <td>0.018349</td>\n",
401 " <td>-0.026292</td>\n",
402 " <td>-0.046975</td>\n",
403 " <td>0.136418</td>\n",
404 " <td>-0.003021</td>\n",
405 " <td>0.145695</td>\n",
406 " <td>0.042164</td>\n",
407 " <td>-0.014553</td>\n",
408 " <td>0.141892</td>\n",
409 " <td>0.118666</td>\n",
410 " </tr>\n",
411 " <tr>\n",
412 " <th>2008-01-20</th>\n",
413 " <td>-0.065000</td>\n",
414 " <td>0.086627</td>\n",
415 " <td>-0.080541</td>\n",
416 " <td>-0.054762</td>\n",
417 " <td>-0.007176</td>\n",
418 " <td>-0.065609</td>\n",
419 " <td>0.015818</td>\n",
420 " <td>-0.019721</td>\n",
421 " <td>-0.015219</td>\n",
422 " <td>-0.044397</td>\n",
423 " <td>...</td>\n",
424 " <td>0.040573</td>\n",
425 " <td>0.010999</td>\n",
426 " <td>-0.167109</td>\n",
427 " <td>-0.051614</td>\n",
428 " <td>-0.054286</td>\n",
429 " <td>-0.124638</td>\n",
430 " <td>0.037172</td>\n",
431 " <td>-0.037312</td>\n",
432 " <td>-0.030144</td>\n",
433 " <td>-0.076969</td>\n",
434 " </tr>\n",
435 " <tr>\n",
436 " <th>2008-01-13</th>\n",
437 " <td>0.035375</td>\n",
438 " <td>-0.041902</td>\n",
439 " <td>-0.037818</td>\n",
440 " <td>-0.046538</td>\n",
441 " <td>-0.101486</td>\n",
442 " <td>-0.040878</td>\n",
443 " <td>-0.052095</td>\n",
444 " <td>0.097385</td>\n",
445 " <td>0.080137</td>\n",
446 " <td>-0.017313</td>\n",
447 " <td>...</td>\n",
448 " <td>-0.054176</td>\n",
449 " <td>-0.047993</td>\n",
450 " <td>-0.102381</td>\n",
451 " <td>0.037264</td>\n",
452 " <td>-0.022346</td>\n",
453 " <td>-0.172662</td>\n",
454 " <td>0.011799</td>\n",
455 " <td>0.051880</td>\n",
456 " <td>0.018692</td>\n",
457 " <td>-0.094249</td>\n",
458 " </tr>\n",
459 " <tr>\n",
460 " <th>2008-01-06</th>\n",
461 " <td>-0.072553</td>\n",
462 " <td>-0.156356</td>\n",
463 " <td>-0.068707</td>\n",
464 " <td>-0.133301</td>\n",
465 " <td>-0.065496</td>\n",
466 " <td>-0.098984</td>\n",
467 " <td>-0.029478</td>\n",
468 " <td>-0.098374</td>\n",
469 " <td>-0.037363</td>\n",
470 " <td>-0.132733</td>\n",
471 " <td>...</td>\n",
472 " <td>-0.027290</td>\n",
473 " <td>-0.075806</td>\n",
474 " <td>-0.004739</td>\n",
475 " <td>-0.081058</td>\n",
476 " <td>0.101538</td>\n",
477 " <td>-0.143737</td>\n",
478 " <td>-0.134100</td>\n",
479 " <td>0.000752</td>\n",
480 " <td>-0.133102</td>\n",
481 " <td>-0.269012</td>\n",
482 " </tr>\n",
483 " </tbody>\n",
484 "</table>\n",
485 "<p>10 rows × 2489 columns</p>\n",
486 "</div>"
487 ],
488 "text/plain": [
489 "ticker A AAL AAN AAON AAP AAPL \\\n",
490 "date \n",
491 "2017-12-31 -0.005642 -0.010648 -0.010184 -0.001361 -0.008553 -0.033027 \n",
492 "2017-12-24 -0.003846 0.029965 0.090171 0.044034 -0.001490 0.006557 \n",
493 "2017-12-17 0.003413 0.000784 -0.052591 -0.014006 0.003888 0.026569 \n",
494 "2017-12-10 -0.019071 0.041012 -0.005359 -0.017882 0.010375 -0.009822 \n",
495 "2017-12-03 -0.009660 0.009267 0.105501 0.013947 0.112630 -0.022404 \n",
496 "2008-02-03 0.038265 0.252238 0.002941 0.095182 0.097833 0.028767 \n",
497 "2008-01-27 -0.013963 -0.048762 0.191310 0.071788 0.043997 -0.194286 \n",
498 "2008-01-20 -0.065000 0.086627 -0.080541 -0.054762 -0.007176 -0.065609 \n",
499 "2008-01-13 0.035375 -0.041902 -0.037818 -0.046538 -0.101486 -0.040878 \n",
500 "2008-01-06 -0.072553 -0.156356 -0.068707 -0.133301 -0.065496 -0.098984 \n",
501 "\n",
502 "ticker AAWW ABAX ABC ABCB ... ZEUS ZIGO \\\n",
503 "date ... \n",
504 "2017-12-31 -0.024938 -0.001814 -0.006922 -0.019329 ... -0.029797 0.000000 \n",
505 "2017-12-24 0.046087 0.032681 -0.007620 0.017598 ... 0.032153 0.000000 \n",
506 "2017-12-17 0.004367 0.008396 0.074625 0.026567 ... 0.036715 0.000000 \n",
507 "2017-12-10 -0.028014 -0.010386 0.020600 -0.054271 ... -0.002410 0.000000 \n",
508 "2017-12-03 0.073838 -0.028456 0.045796 0.024717 ... 0.065742 0.000000 \n",
509 "2008-02-03 0.006245 -0.078058 0.036913 0.083217 ... 0.137066 0.127561 \n",
510 "2008-01-27 -0.008984 -0.090807 -0.034771 0.054572 ... 0.018349 -0.026292 \n",
511 "2008-01-20 0.015818 -0.019721 -0.015219 -0.044397 ... 0.040573 0.010999 \n",
512 "2008-01-13 -0.052095 0.097385 0.080137 -0.017313 ... -0.054176 -0.047993 \n",
513 "2008-01-06 -0.029478 -0.098374 -0.037363 -0.132733 ... -0.027290 -0.075806 \n",
514 "\n",
515 "ticker ZINC ZION ZIOP ZIXI ZLC ZMH \\\n",
516 "date \n",
517 "2017-12-31 0.000000 -0.009741 0.022222 -0.015730 0.000000 0.000000 \n",
518 "2017-12-24 0.000000 0.026395 -0.068966 -0.024123 0.000000 0.000000 \n",
519 "2017-12-17 0.000000 -0.018064 -0.018059 0.075472 0.000000 0.000000 \n",
520 "2017-12-10 0.000000 0.016973 -0.015556 -0.055679 0.000000 0.000000 \n",
521 "2017-12-03 0.000000 0.080475 0.014656 -0.006637 0.000000 0.000000 \n",
522 "2008-02-03 0.286550 0.167722 -0.087879 0.069364 0.171949 0.193189 \n",
523 "2008-01-27 -0.046975 0.136418 -0.003021 0.145695 0.042164 -0.014553 \n",
524 "2008-01-20 -0.167109 -0.051614 -0.054286 -0.124638 0.037172 -0.037312 \n",
525 "2008-01-13 -0.102381 0.037264 -0.022346 -0.172662 0.011799 0.051880 \n",
526 "2008-01-06 -0.004739 -0.081058 0.101538 -0.143737 -0.134100 0.000752 \n",
527 "\n",
528 "ticker ZQK ZUMZ \n",
529 "date \n",
530 "2017-12-31 0.000000 -0.029138 \n",
531 "2017-12-24 0.000000 0.067164 \n",
532 "2017-12-17 0.000000 -0.051887 \n",
533 "2017-12-10 0.000000 0.062657 \n",
534 "2017-12-03 0.000000 0.047244 \n",
535 "2008-02-03 0.127811 0.149083 \n",
536 "2008-01-27 0.141892 0.118666 \n",
537 "2008-01-20 -0.030144 -0.076969 \n",
538 "2008-01-13 0.018692 -0.094249 \n",
539 "2008-01-06 -0.133102 -0.269012 \n",
540 "\n",
541 "[10 rows x 2489 columns]"
542 ]
543 },
544 "execution_count": 6,
545 "metadata": {},
546 "output_type": "execute_result"
547 }
548 ],
549 "source": [
550 "returns.head().append(returns.tail())"
551 ]
552 },
553 {
554 "cell_type": "markdown",
555 "metadata": {},
556 "source": [
557 "### Create & stack 52-week sequences"
558 ]
559 },
560 {
561 "cell_type": "markdown",
562 "metadata": {},
563 "source": [
564 "We'll use 52-week sequences, which we'll create in a stacked format:"
565 ]
566 },
567 {
568 "cell_type": "code",
569 "execution_count": 7,
570 "metadata": {},
571 "outputs": [],
572 "source": [
573 "n = len(returns)\n",
574 "T = 52 # weeks\n",
575 "tcols = list(range(T))"
576 ]
577 },
578 {
579 "cell_type": "code",
580 "execution_count": 8,
581 "metadata": {},
582 "outputs": [
583 {
584 "name": "stdout",
585 "output_type": "stream",
586 "text": [
587 "0 50 100 150 200 250 300 350 400 450 <class 'pandas.core.frame.DataFrame'>\n",
588 "RangeIndex: 1244500 entries, 0 to 1244499\n",
589 "Data columns (total 25 columns):\n",
590 "ticker 1244500 non-null object\n",
591 "0 1244500 non-null float64\n",
592 "1 1244500 non-null float64\n",
593 "2 1244500 non-null float64\n",
594 "3 1244500 non-null float64\n",
595 "4 1244500 non-null float64\n",
596 "5 1244500 non-null float64\n",
597 "6 1244500 non-null float64\n",
598 "7 1244500 non-null float64\n",
599 "8 1244500 non-null float64\n",
600 "9 1244500 non-null float64\n",
601 "10 1244500 non-null float64\n",
602 "11 1244500 non-null float64\n",
603 "12 1244500 non-null float64\n",
604 "13 1244500 non-null float64\n",
605 "14 1244500 non-null float64\n",
606 "15 1244500 non-null float64\n",
607 "16 1244500 non-null float64\n",
608 "17 1244500 non-null float64\n",
609 "18 1244500 non-null float64\n",
610 "19 1244500 non-null float64\n",
611 "20 1244500 non-null float64\n",
612 "21 1244500 non-null float64\n",
613 "year 1244500 non-null int64\n",
614 "month 1244500 non-null int64\n",
615 "dtypes: float64(22), int64(2), object(1)\n",
616 "memory usage: 237.4+ MB\n"
617 ]
618 }
619 ],
620 "source": [
621 "data = pd.DataFrame()\n",
622 "for i in range(n-T-1):\n",
623 " if i % 50 == 0:\n",
624 " print(i, end=' ', flush=True)\n",
625 " df = returns.iloc[i:i+T+1]\n",
626 " data = pd.concat([data, (df\n",
627 " .reset_index(drop=True)\n",
628 " .transpose()\n",
629 " .reset_index()\n",
630 " .assign(year=df.index[0].year,\n",
631 " month=df.index[0].month))],\n",
632 " ignore_index=True)\n",
633 "data.info()"
634 ]
635 },
636 {
637 "cell_type": "markdown",
638 "metadata": {},
639 "source": [
640 "### Create categorical variables"
641 ]
642 },
643 {
644 "cell_type": "markdown",
645 "metadata": {},
646 "source": [
647 "We create dummy variables for different time periods, namely months and years:"
648 ]
649 },
650 {
651 "cell_type": "code",
652 "execution_count": 9,
653 "metadata": {},
654 "outputs": [
655 {
656 "name": "stdout",
657 "output_type": "stream",
658 "text": [
659 "<class 'pandas.core.frame.DataFrame'>\n",
660 "DatetimeIndex: 1244500 entries, 2008-06-01 to 2017-12-01\n",
661 "Data columns (total 45 columns):\n",
662 "ticker 1244500 non-null int64\n",
663 "1 1244500 non-null float64\n",
664 "2 1244500 non-null float64\n",
665 "3 1244500 non-null float64\n",
666 "4 1244500 non-null float64\n",
667 "5 1244500 non-null float64\n",
668 "6 1244500 non-null float64\n",
669 "7 1244500 non-null float64\n",
670 "8 1244500 non-null float64\n",
671 "9 1244500 non-null float64\n",
672 "10 1244500 non-null float64\n",
673 "11 1244500 non-null float64\n",
674 "12 1244500 non-null float64\n",
675 "13 1244500 non-null float64\n",
676 "14 1244500 non-null float64\n",
677 "15 1244500 non-null float64\n",
678 "16 1244500 non-null float64\n",
679 "17 1244500 non-null float64\n",
680 "18 1244500 non-null float64\n",
681 "19 1244500 non-null float64\n",
682 "20 1244500 non-null float64\n",
683 "21 1244500 non-null float64\n",
684 "label 1244500 non-null int64\n",
685 "year_2008 1244500 non-null uint8\n",
686 "year_2009 1244500 non-null uint8\n",
687 "year_2010 1244500 non-null uint8\n",
688 "year_2011 1244500 non-null uint8\n",
689 "year_2012 1244500 non-null uint8\n",
690 "year_2013 1244500 non-null uint8\n",
691 "year_2014 1244500 non-null uint8\n",
692 "year_2015 1244500 non-null uint8\n",
693 "year_2016 1244500 non-null uint8\n",
694 "year_2017 1244500 non-null uint8\n",
695 "month_1 1244500 non-null uint8\n",
696 "month_2 1244500 non-null uint8\n",
697 "month_3 1244500 non-null uint8\n",
698 "month_4 1244500 non-null uint8\n",
699 "month_5 1244500 non-null uint8\n",
700 "month_6 1244500 non-null uint8\n",
701 "month_7 1244500 non-null uint8\n",
702 "month_8 1244500 non-null uint8\n",
703 "month_9 1244500 non-null uint8\n",
704 "month_10 1244500 non-null uint8\n",
705 "month_11 1244500 non-null uint8\n",
706 "month_12 1244500 non-null uint8\n",
707 "dtypes: float64(21), int64(2), uint8(22)\n",
708 "memory usage: 254.0 MB\n"
709 ]
710 }
711 ],
712 "source": [
713 "data[tcols] = (data[tcols].apply(lambda x: x.clip(lower=x.quantile(.01),\n",
714 " upper=x.quantile(.99))))\n",
715 "data.ticker = pd.factorize(data.ticker)[0]\n",
716 "data['label'] = (data[0] > 0).astype(int)\n",
717 "data['date'] = pd.to_datetime(data.assign(day=1)[['year', 'month', 'day']])\n",
718 "data = pd.get_dummies((data.drop(0, axis=1)\n",
719 " .set_index('date')\n",
720 " .apply(pd.to_numeric)),\n",
721 " columns=['year', 'month']).sort_index()\n",
722 "data.info()"
723 ]
724 },
725 {
726 "cell_type": "code",
727 "execution_count": 10,
728 "metadata": {},
729 "outputs": [],
730 "source": [
731 "data.to_hdf('data.h5', 'returns_daily')"
732 ]
733 },
734 {
735 "cell_type": "code",
736 "execution_count": 11,
737 "metadata": {},
738 "outputs": [
739 {
740 "data": {
741 "text/plain": [
742 "(1244500, 45)"
743 ]
744 },
745 "execution_count": 11,
746 "metadata": {},
747 "output_type": "execute_result"
748 }
749 ],
750 "source": [
751 "data.shape"
752 ]
753 }
754 ],
755 "metadata": {
756 "kernelspec": {
757 "display_name": "Python 3",
758 "language": "python",
759 "name": "python3"
760 },
761 "language_info": {
762 "codemirror_mode": {
763 "name": "ipython",
764 "version": 3
765 },
766 "file_extension": ".py",
767 "mimetype": "text/x-python",
768 "name": "python",
769 "nbconvert_exporter": "python",
770 "pygments_lexer": "ipython3",
771 "version": "3.6.8"
772 },
773 "toc": {
774 "base_numbering": 1,
775 "nav_menu": {},
776 "number_sections": true,
777 "sideBar": true,
778 "skip_h1_title": true,
779 "title_cell": "Table of Contents",
780 "title_sidebar": "Contents",
781 "toc_cell": false,
782 "toc_position": {
783 "height": "calc(100% - 180px)",
784 "left": "10px",
785 "top": "150px",
786 "width": "282.222px"
787 },
788 "toc_section_display": true,
789 "toc_window_display": true
790 }
791 },
792 "nbformat": 4,
793 "nbformat_minor": 2
794 }