ml-finance-python
python scripts for finance machine learning
git clone https://9o.is/git/ml-finance-python.git
00_data_prep.ipynb
(20120B)
1 {
2 "cells": [
3 {
4 "cell_type": "markdown",
5 "metadata": {},
6 "source": [
7 "# How to prepare the data"
8 ]
9 },
10 {
11 "cell_type": "markdown",
12 "metadata": {},
13 "source": [
14 "We use a simplified version of the data set constructed in Chapter 4, Alpha factor research. It consists of daily stock prices provided by Quandl for the 2010-2017 period and various engineered features. \n",
15 "\n",
16 "The decision tree models in this chapter are not equipped to handle missing or categorical variables, so we will apply dummy encoding to the latter after dropping any of the former."
17 ]
18 },
19 {
20 "cell_type": "code",
21 "execution_count": 1,
22 "metadata": {
23 "ExecuteTime": {
24 "end_time": "2018-11-09T16:12:33.669796Z",
25 "start_time": "2018-11-09T16:12:33.468906Z"
26 }
27 },
28 "outputs": [],
29 "source": [
30 "%matplotlib inline\n",
31 "\n",
32 "import warnings\n",
33 "import os\n",
34 "from pathlib import Path\n",
35 "import quandl\n",
36 "import numpy as np\n",
37 "import pandas as pd\n",
38 "\n",
39 "import matplotlib.pyplot as plt\n",
40 "import seaborn as sns\n",
41 "import graphviz\n",
42 "from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz, _tree\n",
43 "from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression\n",
44 "from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV\n",
45 "from sklearn.metrics import roc_auc_score, roc_curve, mean_squared_error, precision_recall_curve\n",
46 "from sklearn.preprocessing import Imputer\n",
47 "import statsmodels.api as sm\n",
48 "from scipy.interpolate import interp1d, interp2d"
49 ]
50 },
51 {
52 "cell_type": "code",
53 "execution_count": 2,
54 "metadata": {
55 "ExecuteTime": {
56 "end_time": "2018-11-09T16:12:33.673698Z",
57 "start_time": "2018-11-09T16:12:33.671186Z"
58 }
59 },
60 "outputs": [],
61 "source": [
62 "warnings.filterwarnings('ignore')\n",
63 "plt.style.use('ggplot')"
64 ]
65 },
66 {
67 "cell_type": "markdown",
68 "metadata": {},
69 "source": [
70 "## Get Data"
71 ]
72 },
73 {
74 "cell_type": "code",
75 "execution_count": 3,
76 "metadata": {
77 "ExecuteTime": {
78 "end_time": "2018-11-09T16:12:37.787049Z",
79 "start_time": "2018-11-09T16:12:33.675024Z"
80 }
81 },
82 "outputs": [
83 {
84 "name": "stdout",
85 "output_type": "stream",
86 "text": [
87 "<class 'pandas.io.pytables.HDFStore'>\n",
88 "File path: ../data/assets.h5\n",
89 "/engineered_features frame (shape->[445640,33]) \n",
90 "/fred/assets frame (shape->[4826,5]) \n",
91 "/quandl/wiki/prices frame (shape->[15389314,12])\n",
92 "/quandl/wiki/stocks frame (shape->[1,2]) \n",
93 "/sp500/prices frame (shape->[37721,5]) \n",
94 "/sp500/stocks frame (shape->[1,7]) \n",
95 "/us_equities/stocks frame (shape->[6834,6]) \n"
96 ]
97 }
98 ],
99 "source": [
100 "with pd.HDFStore('../data/assets.h5') as store:\n",
101 " print(store.info())\n",
102 " prices = store['quandl/wiki/prices'].adj_close.unstack('ticker')\n",
103 " stocks = store['us_equities/stocks']"
104 ]
105 },
106 {
107 "cell_type": "code",
108 "execution_count": 4,
109 "metadata": {
110 "ExecuteTime": {
111 "end_time": "2018-11-09T16:12:37.806190Z",
112 "start_time": "2018-11-09T16:12:37.788666Z"
113 }
114 },
115 "outputs": [],
116 "source": [
117 "shared = prices.columns.intersection(stocks.index)\n",
118 "prices = prices.loc['2010': '2018', shared]\n",
119 "stocks = stocks.loc[shared, ['marketcap', 'ipoyear', 'sector']]"
120 ]
121 },
122 {
123 "cell_type": "code",
124 "execution_count": 5,
125 "metadata": {
126 "ExecuteTime": {
127 "end_time": "2018-11-09T16:12:37.812849Z",
128 "start_time": "2018-11-09T16:12:37.807928Z"
129 }
130 },
131 "outputs": [
132 {
133 "name": "stdout",
134 "output_type": "stream",
135 "text": [
136 "<class 'pandas.core.frame.DataFrame'>\n",
137 "Index: 2412 entries, A to ZUMZ\n",
138 "Data columns (total 3 columns):\n",
139 "marketcap 2407 non-null float64\n",
140 "ipoyear 1065 non-null float64\n",
141 "sector 2372 non-null object\n",
142 "dtypes: float64(2), object(1)\n",
143 "memory usage: 75.4+ KB\n"
144 ]
145 }
146 ],
147 "source": [
148 "stocks.info()"
149 ]
150 },
151 {
152 "cell_type": "code",
153 "execution_count": 6,
154 "metadata": {
155 "ExecuteTime": {
156 "end_time": "2018-11-09T16:12:37.884662Z",
157 "start_time": "2018-11-09T16:12:37.813852Z"
158 }
159 },
160 "outputs": [
161 {
162 "name": "stdout",
163 "output_type": "stream",
164 "text": [
165 "<class 'pandas.core.frame.DataFrame'>\n",
166 "DatetimeIndex: 2113 entries, 2010-01-04 to 2018-03-27\n",
167 "Columns: 2412 entries, A to ZUMZ\n",
168 "dtypes: float64(2412)\n",
169 "memory usage: 38.9 MB\n"
170 ]
171 }
172 ],
173 "source": [
174 "prices.info()"
175 ]
176 },
177 {
178 "cell_type": "markdown",
179 "metadata": {},
180 "source": [
181 "### Create monthly return series"
182 ]
183 },
184 {
185 "cell_type": "markdown",
186 "metadata": {},
187 "source": [
188 "Remove outliers"
189 ]
190 },
191 {
192 "cell_type": "code",
193 "execution_count": 8,
194 "metadata": {
195 "ExecuteTime": {
196 "end_time": "2018-11-09T16:12:37.918524Z",
197 "start_time": "2018-11-09T16:12:37.885693Z"
198 }
199 },
200 "outputs": [],
201 "source": [
202 "returns = prices.resample('M').last().pct_change().stack().swaplevel()\n",
203 "returns = (returns[returns.between(left=returns.quantile(.05), \n",
204 " right=returns.quantile(.95))].to_frame('returns'))"
205 ]
206 },
207 {
208 "cell_type": "markdown",
209 "metadata": {},
210 "source": [
211 "### Lagged Returns"
212 ]
213 },
214 {
215 "cell_type": "code",
216 "execution_count": 9,
217 "metadata": {
218 "ExecuteTime": {
219 "end_time": "2018-11-09T16:12:38.090053Z",
220 "start_time": "2018-11-09T16:12:37.919593Z"
221 }
222 },
223 "outputs": [],
224 "source": [
225 "for t in range(1, 13):\n",
226 " returns[f't-{t}'] = returns.groupby(level='ticker').returns.shift(t)\n",
227 "returns = returns.dropna()"
228 ]
229 },
230 {
231 "cell_type": "markdown",
232 "metadata": {},
233 "source": [
234 "### Time Period Dummies"
235 ]
236 },
237 {
238 "cell_type": "code",
239 "execution_count": 10,
240 "metadata": {
241 "ExecuteTime": {
242 "end_time": "2018-11-09T16:12:38.143960Z",
243 "start_time": "2018-11-09T16:12:38.091096Z"
244 }
245 },
246 "outputs": [],
247 "source": [
248 "# returns = returns.reset_index('date')\n",
249 "dates = returns.index.get_level_values('date')\n",
250 "returns['year'] = dates.year\n",
251 "returns['month'] = dates.month\n",
252 "returns = pd.get_dummies(returns, columns=['year', 'month'])"
253 ]
254 },
255 {
256 "cell_type": "code",
257 "execution_count": 11,
258 "metadata": {
259 "ExecuteTime": {
260 "end_time": "2018-11-09T16:12:38.211032Z",
261 "start_time": "2018-11-09T16:12:38.144952Z"
262 }
263 },
264 "outputs": [
265 {
266 "name": "stdout",
267 "output_type": "stream",
268 "text": [
269 "<class 'pandas.core.frame.DataFrame'>\n",
270 "MultiIndex: 171162 entries, (AAON, 2011-02-28 00:00:00) to (ZTS, 2018-03-31 00:00:00)\n",
271 "Data columns (total 33 columns):\n",
272 "returns 171162 non-null float64\n",
273 "t-1 171162 non-null float64\n",
274 "t-2 171162 non-null float64\n",
275 "t-3 171162 non-null float64\n",
276 "t-4 171162 non-null float64\n",
277 "t-5 171162 non-null float64\n",
278 "t-6 171162 non-null float64\n",
279 "t-7 171162 non-null float64\n",
280 "t-8 171162 non-null float64\n",
281 "t-9 171162 non-null float64\n",
282 "t-10 171162 non-null float64\n",
283 "t-11 171162 non-null float64\n",
284 "t-12 171162 non-null float64\n",
285 "year_2011 171162 non-null uint8\n",
286 "year_2012 171162 non-null uint8\n",
287 "year_2013 171162 non-null uint8\n",
288 "year_2014 171162 non-null uint8\n",
289 "year_2015 171162 non-null uint8\n",
290 "year_2016 171162 non-null uint8\n",
291 "year_2017 171162 non-null uint8\n",
292 "year_2018 171162 non-null uint8\n",
293 "month_1 171162 non-null uint8\n",
294 "month_2 171162 non-null uint8\n",
295 "month_3 171162 non-null uint8\n",
296 "month_4 171162 non-null uint8\n",
297 "month_5 171162 non-null uint8\n",
298 "month_6 171162 non-null uint8\n",
299 "month_7 171162 non-null uint8\n",
300 "month_8 171162 non-null uint8\n",
301 "month_9 171162 non-null uint8\n",
302 "month_10 171162 non-null uint8\n",
303 "month_11 171162 non-null uint8\n",
304 "month_12 171162 non-null uint8\n",
305 "dtypes: float64(13), uint8(20)\n",
306 "memory usage: 20.7+ MB\n"
307 ]
308 }
309 ],
310 "source": [
311 "returns.info()"
312 ]
313 },
314 {
315 "cell_type": "markdown",
316 "metadata": {},
317 "source": [
318 "### Get stock characteristics"
319 ]
320 },
321 {
322 "cell_type": "markdown",
323 "metadata": {},
324 "source": [
325 "#### Create age proxy"
326 ]
327 },
328 {
329 "cell_type": "code",
330 "execution_count": 12,
331 "metadata": {
332 "ExecuteTime": {
333 "end_time": "2018-11-09T16:12:38.216013Z",
334 "start_time": "2018-11-09T16:12:38.212117Z"
335 }
336 },
337 "outputs": [],
338 "source": [
339 "stocks['age'] = pd.qcut(stocks.ipoyear, q=5, labels=list(range(1, 6))).astype(float).fillna(0).astype(int)\n",
340 "stocks = stocks.drop('ipoyear', axis=1)"
341 ]
342 },
343 {
344 "cell_type": "markdown",
345 "metadata": {},
346 "source": [
347 "#### Create size proxy"
348 ]
349 },
350 {
351 "cell_type": "code",
352 "execution_count": 15,
353 "metadata": {},
354 "outputs": [
355 {
356 "name": "stdout",
357 "output_type": "stream",
358 "text": [
359 "<class 'pandas.core.frame.DataFrame'>\n",
360 "Index: 2412 entries, A to ZUMZ\n",
361 "Data columns (total 3 columns):\n",
362 "marketcap 2407 non-null float64\n",
363 "sector 2372 non-null object\n",
364 "age 2412 non-null int64\n",
365 "dtypes: float64(1), int64(1), object(1)\n",
366 "memory usage: 155.4+ KB\n"
367 ]
368 }
369 ],
370 "source": [
371 "stocks.info()"
372 ]
373 },
374 {
375 "cell_type": "code",
376 "execution_count": 16,
377 "metadata": {},
378 "outputs": [
379 {
380 "data": {
381 "text/plain": [
382 "ticker\n",
383 "A 1.960000e+10\n",
384 "AA 8.540000e+09\n",
385 "AAL 1.767000e+10\n",
386 "AAMC 1.104900e+08\n",
387 "AAN 3.200000e+09\n",
388 "Name: marketcap, dtype: float64"
389 ]
390 },
391 "execution_count": 16,
392 "metadata": {},
393 "output_type": "execute_result"
394 }
395 ],
396 "source": [
397 "stocks.marketcap.head()"
398 ]
399 },
400 {
401 "cell_type": "code",
402 "execution_count": 18,
403 "metadata": {
404 "ExecuteTime": {
405 "end_time": "2018-11-09T16:12:38.352174Z",
406 "start_time": "2018-11-09T16:12:38.240412Z"
407 }
408 },
409 "outputs": [],
410 "source": [
411 "stocks['size'] = pd.qcut(stocks.marketcap, q=10, labels=list(range(1, 11)))\n",
412 "stocks = stocks.drop(['marketcap'], axis=1)"
413 ]
414 },
415 {
416 "cell_type": "markdown",
417 "metadata": {},
418 "source": [
419 "#### Create Dummy variables"
420 ]
421 },
422 {
423 "cell_type": "code",
424 "execution_count": 19,
425 "metadata": {
426 "ExecuteTime": {
427 "end_time": "2018-11-09T16:12:38.357464Z",
428 "start_time": "2018-11-09T16:12:38.353287Z"
429 }
430 },
431 "outputs": [
432 {
433 "name": "stdout",
434 "output_type": "stream",
435 "text": [
436 "<class 'pandas.core.frame.DataFrame'>\n",
437 "Index: 2412 entries, A to ZUMZ\n",
438 "Data columns (total 3 columns):\n",
439 "sector 2372 non-null object\n",
440 "age 2412 non-null int64\n",
441 "size 2407 non-null category\n",
442 "dtypes: category(1), int64(1), object(1)\n",
443 "memory usage: 139.3+ KB\n"
444 ]
445 }
446 ],
447 "source": [
448 "stocks.info()"
449 ]
450 },
451 {
452 "cell_type": "code",
453 "execution_count": 20,
454 "metadata": {
455 "ExecuteTime": {
456 "end_time": "2018-11-09T16:12:38.369633Z",
457 "start_time": "2018-11-09T16:12:38.358581Z"
458 }
459 },
460 "outputs": [
461 {
462 "name": "stdout",
463 "output_type": "stream",
464 "text": [
465 "<class 'pandas.core.frame.DataFrame'>\n",
466 "Index: 2412 entries, A to ZUMZ\n",
467 "Data columns (total 28 columns):\n",
468 "size_1 2412 non-null uint8\n",
469 "size_2 2412 non-null uint8\n",
470 "size_3 2412 non-null uint8\n",
471 "size_4 2412 non-null uint8\n",
472 "size_5 2412 non-null uint8\n",
473 "size_6 2412 non-null uint8\n",
474 "size_7 2412 non-null uint8\n",
475 "size_8 2412 non-null uint8\n",
476 "size_9 2412 non-null uint8\n",
477 "size_10 2412 non-null uint8\n",
478 "age_0 2412 non-null uint8\n",
479 "age_1 2412 non-null uint8\n",
480 "age_2 2412 non-null uint8\n",
481 "age_3 2412 non-null uint8\n",
482 "age_4 2412 non-null uint8\n",
483 "age_5 2412 non-null uint8\n",
484 "Basic Industries 2412 non-null uint8\n",
485 "Capital Goods 2412 non-null uint8\n",
486 "Consumer Durables 2412 non-null uint8\n",
487 "Consumer Non-Durables 2412 non-null uint8\n",
488 "Consumer Services 2412 non-null uint8\n",
489 "Energy 2412 non-null uint8\n",
490 "Finance 2412 non-null uint8\n",
491 "Health Care 2412 non-null uint8\n",
492 "Miscellaneous 2412 non-null uint8\n",
493 "Public Utilities 2412 non-null uint8\n",
494 "Technology 2412 non-null uint8\n",
495 "Transportation 2412 non-null uint8\n",
496 "dtypes: uint8(28)\n",
497 "memory usage: 164.8+ KB\n"
498 ]
499 }
500 ],
501 "source": [
502 "stocks = pd.get_dummies(stocks, \n",
503 " columns=['size', 'age', 'sector'], \n",
504 " prefix=['size', 'age', ''], \n",
505 " prefix_sep=['_', '_', ''])\n",
506 "stocks.info()"
507 ]
508 },
509 {
510 "cell_type": "markdown",
511 "metadata": {},
512 "source": [
513 "### Combine data"
514 ]
515 },
516 {
517 "cell_type": "code",
518 "execution_count": 21,
519 "metadata": {
520 "ExecuteTime": {
521 "end_time": "2018-11-09T16:12:41.045331Z",
522 "start_time": "2018-11-09T16:12:38.370654Z"
523 },
524 "scrolled": true
525 },
526 "outputs": [
527 {
528 "name": "stdout",
529 "output_type": "stream",
530 "text": [
531 "<class 'pandas.core.frame.DataFrame'>\n",
532 "MultiIndex: 171162 entries, (A, 2011-03-31 00:00:00) to (ZUMZ, 2018-02-28 00:00:00)\n",
533 "Data columns (total 61 columns):\n",
534 "returns 171162 non-null float64\n",
535 "t-1 171162 non-null float64\n",
536 "t-2 171162 non-null float64\n",
537 "t-3 171162 non-null float64\n",
538 "t-4 171162 non-null float64\n",
539 "t-5 171162 non-null float64\n",
540 "t-6 171162 non-null float64\n",
541 "t-7 171162 non-null float64\n",
542 "t-8 171162 non-null float64\n",
543 "t-9 171162 non-null float64\n",
544 "t-10 171162 non-null float64\n",
545 "t-11 171162 non-null float64\n",
546 "t-12 171162 non-null float64\n",
547 "year_2011 171162 non-null uint8\n",
548 "year_2012 171162 non-null uint8\n",
549 "year_2013 171162 non-null uint8\n",
550 "year_2014 171162 non-null uint8\n",
551 "year_2015 171162 non-null uint8\n",
552 "year_2016 171162 non-null uint8\n",
553 "year_2017 171162 non-null uint8\n",
554 "year_2018 171162 non-null uint8\n",
555 "month_1 171162 non-null uint8\n",
556 "month_2 171162 non-null uint8\n",
557 "month_3 171162 non-null uint8\n",
558 "month_4 171162 non-null uint8\n",
559 "month_5 171162 non-null uint8\n",
560 "month_6 171162 non-null uint8\n",
561 "month_7 171162 non-null uint8\n",
562 "month_8 171162 non-null uint8\n",
563 "month_9 171162 non-null uint8\n",
564 "month_10 171162 non-null uint8\n",
565 "month_11 171162 non-null uint8\n",
566 "month_12 171162 non-null uint8\n",
567 "size_1 171162 non-null int8\n",
568 "size_2 171162 non-null int8\n",
569 "size_3 171162 non-null int8\n",
570 "size_4 171162 non-null int8\n",
571 "size_5 171162 non-null int8\n",
572 "size_6 171162 non-null int8\n",
573 "size_7 171162 non-null int8\n",
574 "size_8 171162 non-null int8\n",
575 "size_9 171162 non-null int8\n",
576 "size_10 171162 non-null int8\n",
577 "age_0 171162 non-null int8\n",
578 "age_1 171162 non-null int8\n",
579 "age_2 171162 non-null int8\n",
580 "age_3 171162 non-null int8\n",
581 "age_4 171162 non-null int8\n",
582 "age_5 171162 non-null int8\n",
583 "Basic Industries 171162 non-null int8\n",
584 "Capital Goods 171162 non-null int8\n",
585 "Consumer Durables 171162 non-null int8\n",
586 "Consumer Non-Durables 171162 non-null int8\n",
587 "Consumer Services 171162 non-null int8\n",
588 "Energy 171162 non-null int8\n",
589 "Finance 171162 non-null int8\n",
590 "Health Care 171162 non-null int8\n",
591 "Miscellaneous 171162 non-null int8\n",
592 "Public Utilities 171162 non-null int8\n",
593 "Technology 171162 non-null int8\n",
594 "Transportation 171162 non-null int8\n",
595 "dtypes: float64(13), int8(28), uint8(20)\n",
596 "memory usage: 25.3+ MB\n"
597 ]
598 }
599 ],
600 "source": [
601 "data = (returns\n",
602 " .reset_index('date')\n",
603 " .merge(stocks, left_index=True, right_index=True)\n",
604 " .dropna()\n",
605 " .set_index('date', append=True))\n",
606 "\n",
607 "s = len(returns.columns)\n",
608 "data.iloc[:, s:] = data.iloc[:, s:].astype(int).apply(pd.to_numeric, downcast='integer')\n",
609 "data.info()"
610 ]
611 },
612 {
613 "cell_type": "markdown",
614 "metadata": {},
615 "source": [
616 "### Store data"
617 ]
618 },
619 {
620 "cell_type": "code",
621 "execution_count": 22,
622 "metadata": {
623 "ExecuteTime": {
624 "end_time": "2018-11-09T16:12:41.079760Z",
625 "start_time": "2018-11-09T16:12:41.046398Z"
626 }
627 },
628 "outputs": [],
629 "source": [
630 "with pd.HDFStore('data.h5') as store:\n",
631 " store.put('data', data)"
632 ]
633 }
634 ],
635 "metadata": {
636 "kernelspec": {
637 "display_name": "Python 3",
638 "language": "python",
639 "name": "python3"
640 },
641 "language_info": {
642 "codemirror_mode": {
643 "name": "ipython",
644 "version": 3
645 },
646 "file_extension": ".py",
647 "mimetype": "text/x-python",
648 "name": "python",
649 "nbconvert_exporter": "python",
650 "pygments_lexer": "ipython3",
651 "version": "3.7.0"
652 },
653 "toc": {
654 "base_numbering": 1,
655 "nav_menu": {},
656 "number_sections": true,
657 "sideBar": true,
658 "skip_h1_title": false,
659 "title_cell": "Table of Contents",
660 "title_sidebar": "Contents",
661 "toc_cell": false,
662 "toc_position": {},
663 "toc_section_display": true,
664 "toc_window_display": true
665 }
666 },
667 "nbformat": 4,
668 "nbformat_minor": 2
669 }