ml-finance-python

python scripts for finance machine learning

git clone https://9o.is/git/ml-finance-python.git

00_data_prep.ipynb

(20120B)


      1 {
      2  "cells": [
      3   {
      4    "cell_type": "markdown",
      5    "metadata": {},
      6    "source": [
      7     "# How to prepare the data"
      8    ]
      9   },
     10   {
     11    "cell_type": "markdown",
     12    "metadata": {},
     13    "source": [
     14     "We use a simplified version of the data set constructed in Chapter 4, Alpha factor research. It consists of daily stock prices provided by Quandl for the 2010-2017 period and various engineered features. \n",
     15     "\n",
     16     "The decision tree models in this chapter are not equipped to handle missing or categorical variables, so we will apply dummy encoding to the latter after dropping any of the former."
     17    ]
     18   },
     19   {
     20    "cell_type": "code",
     21    "execution_count": 1,
     22    "metadata": {
     23     "ExecuteTime": {
     24      "end_time": "2018-11-09T16:12:33.669796Z",
     25      "start_time": "2018-11-09T16:12:33.468906Z"
     26     }
     27    },
     28    "outputs": [],
     29    "source": [
     30     "%matplotlib inline\n",
     31     "\n",
     32     "import warnings\n",
     33     "import os\n",
     34     "from pathlib import Path\n",
     35     "import quandl\n",
     36     "import numpy as np\n",
     37     "import pandas as pd\n",
     38     "\n",
     39     "import matplotlib.pyplot as plt\n",
     40     "import seaborn as sns\n",
     41     "import graphviz\n",
     42     "from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz, _tree\n",
     43     "from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression\n",
     44     "from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV\n",
     45     "from sklearn.metrics import roc_auc_score, roc_curve, mean_squared_error, precision_recall_curve\n",
     46     "from sklearn.preprocessing import Imputer\n",
     47     "import statsmodels.api as sm\n",
     48     "from scipy.interpolate import interp1d, interp2d"
     49    ]
     50   },
     51   {
     52    "cell_type": "code",
     53    "execution_count": 2,
     54    "metadata": {
     55     "ExecuteTime": {
     56      "end_time": "2018-11-09T16:12:33.673698Z",
     57      "start_time": "2018-11-09T16:12:33.671186Z"
     58     }
     59    },
     60    "outputs": [],
     61    "source": [
     62     "warnings.filterwarnings('ignore')\n",
     63     "plt.style.use('ggplot')"
     64    ]
     65   },
     66   {
     67    "cell_type": "markdown",
     68    "metadata": {},
     69    "source": [
     70     "## Get Data"
     71    ]
     72   },
     73   {
     74    "cell_type": "code",
     75    "execution_count": 3,
     76    "metadata": {
     77     "ExecuteTime": {
     78      "end_time": "2018-11-09T16:12:37.787049Z",
     79      "start_time": "2018-11-09T16:12:33.675024Z"
     80     }
     81    },
     82    "outputs": [
     83     {
     84      "name": "stdout",
     85      "output_type": "stream",
     86      "text": [
     87       "<class 'pandas.io.pytables.HDFStore'>\n",
     88       "File path: ../data/assets.h5\n",
     89       "/engineered_features            frame        (shape->[445640,33])  \n",
     90       "/fred/assets                    frame        (shape->[4826,5])     \n",
     91       "/quandl/wiki/prices             frame        (shape->[15389314,12])\n",
     92       "/quandl/wiki/stocks             frame        (shape->[1,2])        \n",
     93       "/sp500/prices                   frame        (shape->[37721,5])    \n",
     94       "/sp500/stocks                   frame        (shape->[1,7])        \n",
     95       "/us_equities/stocks             frame        (shape->[6834,6])     \n"
     96      ]
     97     }
     98    ],
     99    "source": [
    100     "with pd.HDFStore('../data/assets.h5') as store:\n",
    101     "    print(store.info())\n",
    102     "    prices = store['quandl/wiki/prices'].adj_close.unstack('ticker')\n",
    103     "    stocks = store['us_equities/stocks']"
    104    ]
    105   },
    106   {
    107    "cell_type": "code",
    108    "execution_count": 4,
    109    "metadata": {
    110     "ExecuteTime": {
    111      "end_time": "2018-11-09T16:12:37.806190Z",
    112      "start_time": "2018-11-09T16:12:37.788666Z"
    113     }
    114    },
    115    "outputs": [],
    116    "source": [
    117     "shared = prices.columns.intersection(stocks.index)\n",
    118     "prices = prices.loc['2010': '2018', shared]\n",
    119     "stocks = stocks.loc[shared, ['marketcap', 'ipoyear', 'sector']]"
    120    ]
    121   },
    122   {
    123    "cell_type": "code",
    124    "execution_count": 5,
    125    "metadata": {
    126     "ExecuteTime": {
    127      "end_time": "2018-11-09T16:12:37.812849Z",
    128      "start_time": "2018-11-09T16:12:37.807928Z"
    129     }
    130    },
    131    "outputs": [
    132     {
    133      "name": "stdout",
    134      "output_type": "stream",
    135      "text": [
    136       "<class 'pandas.core.frame.DataFrame'>\n",
    137       "Index: 2412 entries, A to ZUMZ\n",
    138       "Data columns (total 3 columns):\n",
    139       "marketcap    2407 non-null float64\n",
    140       "ipoyear      1065 non-null float64\n",
    141       "sector       2372 non-null object\n",
    142       "dtypes: float64(2), object(1)\n",
    143       "memory usage: 75.4+ KB\n"
    144      ]
    145     }
    146    ],
    147    "source": [
    148     "stocks.info()"
    149    ]
    150   },
    151   {
    152    "cell_type": "code",
    153    "execution_count": 6,
    154    "metadata": {
    155     "ExecuteTime": {
    156      "end_time": "2018-11-09T16:12:37.884662Z",
    157      "start_time": "2018-11-09T16:12:37.813852Z"
    158     }
    159    },
    160    "outputs": [
    161     {
    162      "name": "stdout",
    163      "output_type": "stream",
    164      "text": [
    165       "<class 'pandas.core.frame.DataFrame'>\n",
    166       "DatetimeIndex: 2113 entries, 2010-01-04 to 2018-03-27\n",
    167       "Columns: 2412 entries, A to ZUMZ\n",
    168       "dtypes: float64(2412)\n",
    169       "memory usage: 38.9 MB\n"
    170      ]
    171     }
    172    ],
    173    "source": [
    174     "prices.info()"
    175    ]
    176   },
    177   {
    178    "cell_type": "markdown",
    179    "metadata": {},
    180    "source": [
    181     "### Create monthly return series"
    182    ]
    183   },
    184   {
    185    "cell_type": "markdown",
    186    "metadata": {},
    187    "source": [
    188     "Remove outliers"
    189    ]
    190   },
    191   {
    192    "cell_type": "code",
    193    "execution_count": 8,
    194    "metadata": {
    195     "ExecuteTime": {
    196      "end_time": "2018-11-09T16:12:37.918524Z",
    197      "start_time": "2018-11-09T16:12:37.885693Z"
    198     }
    199    },
    200    "outputs": [],
    201    "source": [
    202     "returns = prices.resample('M').last().pct_change().stack().swaplevel()\n",
    203     "returns = (returns[returns.between(left=returns.quantile(.05), \n",
    204     "                                   right=returns.quantile(.95))].to_frame('returns'))"
    205    ]
    206   },
    207   {
    208    "cell_type": "markdown",
    209    "metadata": {},
    210    "source": [
    211     "### Lagged Returns"
    212    ]
    213   },
    214   {
    215    "cell_type": "code",
    216    "execution_count": 9,
    217    "metadata": {
    218     "ExecuteTime": {
    219      "end_time": "2018-11-09T16:12:38.090053Z",
    220      "start_time": "2018-11-09T16:12:37.919593Z"
    221     }
    222    },
    223    "outputs": [],
    224    "source": [
    225     "for t in range(1, 13):\n",
    226     "    returns[f't-{t}'] = returns.groupby(level='ticker').returns.shift(t)\n",
    227     "returns = returns.dropna()"
    228    ]
    229   },
    230   {
    231    "cell_type": "markdown",
    232    "metadata": {},
    233    "source": [
    234     "### Time Period Dummies"
    235    ]
    236   },
    237   {
    238    "cell_type": "code",
    239    "execution_count": 10,
    240    "metadata": {
    241     "ExecuteTime": {
    242      "end_time": "2018-11-09T16:12:38.143960Z",
    243      "start_time": "2018-11-09T16:12:38.091096Z"
    244     }
    245    },
    246    "outputs": [],
    247    "source": [
    248     "# returns = returns.reset_index('date')\n",
    249     "dates = returns.index.get_level_values('date')\n",
    250     "returns['year'] = dates.year\n",
    251     "returns['month'] = dates.month\n",
    252     "returns = pd.get_dummies(returns, columns=['year', 'month'])"
    253    ]
    254   },
    255   {
    256    "cell_type": "code",
    257    "execution_count": 11,
    258    "metadata": {
    259     "ExecuteTime": {
    260      "end_time": "2018-11-09T16:12:38.211032Z",
    261      "start_time": "2018-11-09T16:12:38.144952Z"
    262     }
    263    },
    264    "outputs": [
    265     {
    266      "name": "stdout",
    267      "output_type": "stream",
    268      "text": [
    269       "<class 'pandas.core.frame.DataFrame'>\n",
    270       "MultiIndex: 171162 entries, (AAON, 2011-02-28 00:00:00) to (ZTS, 2018-03-31 00:00:00)\n",
    271       "Data columns (total 33 columns):\n",
    272       "returns      171162 non-null float64\n",
    273       "t-1          171162 non-null float64\n",
    274       "t-2          171162 non-null float64\n",
    275       "t-3          171162 non-null float64\n",
    276       "t-4          171162 non-null float64\n",
    277       "t-5          171162 non-null float64\n",
    278       "t-6          171162 non-null float64\n",
    279       "t-7          171162 non-null float64\n",
    280       "t-8          171162 non-null float64\n",
    281       "t-9          171162 non-null float64\n",
    282       "t-10         171162 non-null float64\n",
    283       "t-11         171162 non-null float64\n",
    284       "t-12         171162 non-null float64\n",
    285       "year_2011    171162 non-null uint8\n",
    286       "year_2012    171162 non-null uint8\n",
    287       "year_2013    171162 non-null uint8\n",
    288       "year_2014    171162 non-null uint8\n",
    289       "year_2015    171162 non-null uint8\n",
    290       "year_2016    171162 non-null uint8\n",
    291       "year_2017    171162 non-null uint8\n",
    292       "year_2018    171162 non-null uint8\n",
    293       "month_1      171162 non-null uint8\n",
    294       "month_2      171162 non-null uint8\n",
    295       "month_3      171162 non-null uint8\n",
    296       "month_4      171162 non-null uint8\n",
    297       "month_5      171162 non-null uint8\n",
    298       "month_6      171162 non-null uint8\n",
    299       "month_7      171162 non-null uint8\n",
    300       "month_8      171162 non-null uint8\n",
    301       "month_9      171162 non-null uint8\n",
    302       "month_10     171162 non-null uint8\n",
    303       "month_11     171162 non-null uint8\n",
    304       "month_12     171162 non-null uint8\n",
    305       "dtypes: float64(13), uint8(20)\n",
    306       "memory usage: 20.7+ MB\n"
    307      ]
    308     }
    309    ],
    310    "source": [
    311     "returns.info()"
    312    ]
    313   },
    314   {
    315    "cell_type": "markdown",
    316    "metadata": {},
    317    "source": [
    318     "### Get stock characteristics"
    319    ]
    320   },
    321   {
    322    "cell_type": "markdown",
    323    "metadata": {},
    324    "source": [
    325     "#### Create age proxy"
    326    ]
    327   },
    328   {
    329    "cell_type": "code",
    330    "execution_count": 12,
    331    "metadata": {
    332     "ExecuteTime": {
    333      "end_time": "2018-11-09T16:12:38.216013Z",
    334      "start_time": "2018-11-09T16:12:38.212117Z"
    335     }
    336    },
    337    "outputs": [],
    338    "source": [
    339     "stocks['age'] = pd.qcut(stocks.ipoyear, q=5, labels=list(range(1, 6))).astype(float).fillna(0).astype(int)\n",
    340     "stocks = stocks.drop('ipoyear', axis=1)"
    341    ]
    342   },
    343   {
    344    "cell_type": "markdown",
    345    "metadata": {},
    346    "source": [
    347     "#### Create size proxy"
    348    ]
    349   },
    350   {
    351    "cell_type": "code",
    352    "execution_count": 15,
    353    "metadata": {},
    354    "outputs": [
    355     {
    356      "name": "stdout",
    357      "output_type": "stream",
    358      "text": [
    359       "<class 'pandas.core.frame.DataFrame'>\n",
    360       "Index: 2412 entries, A to ZUMZ\n",
    361       "Data columns (total 3 columns):\n",
    362       "marketcap    2407 non-null float64\n",
    363       "sector       2372 non-null object\n",
    364       "age          2412 non-null int64\n",
    365       "dtypes: float64(1), int64(1), object(1)\n",
    366       "memory usage: 155.4+ KB\n"
    367      ]
    368     }
    369    ],
    370    "source": [
    371     "stocks.info()"
    372    ]
    373   },
    374   {
    375    "cell_type": "code",
    376    "execution_count": 16,
    377    "metadata": {},
    378    "outputs": [
    379     {
    380      "data": {
    381       "text/plain": [
    382        "ticker\n",
    383        "A       1.960000e+10\n",
    384        "AA      8.540000e+09\n",
    385        "AAL     1.767000e+10\n",
    386        "AAMC    1.104900e+08\n",
    387        "AAN     3.200000e+09\n",
    388        "Name: marketcap, dtype: float64"
    389       ]
    390      },
    391      "execution_count": 16,
    392      "metadata": {},
    393      "output_type": "execute_result"
    394     }
    395    ],
    396    "source": [
    397     "stocks.marketcap.head()"
    398    ]
    399   },
    400   {
    401    "cell_type": "code",
    402    "execution_count": 18,
    403    "metadata": {
    404     "ExecuteTime": {
    405      "end_time": "2018-11-09T16:12:38.352174Z",
    406      "start_time": "2018-11-09T16:12:38.240412Z"
    407     }
    408    },
    409    "outputs": [],
    410    "source": [
    411     "stocks['size'] = pd.qcut(stocks.marketcap, q=10, labels=list(range(1, 11)))\n",
    412     "stocks = stocks.drop(['marketcap'], axis=1)"
    413    ]
    414   },
    415   {
    416    "cell_type": "markdown",
    417    "metadata": {},
    418    "source": [
    419     "#### Create Dummy variables"
    420    ]
    421   },
    422   {
    423    "cell_type": "code",
    424    "execution_count": 19,
    425    "metadata": {
    426     "ExecuteTime": {
    427      "end_time": "2018-11-09T16:12:38.357464Z",
    428      "start_time": "2018-11-09T16:12:38.353287Z"
    429     }
    430    },
    431    "outputs": [
    432     {
    433      "name": "stdout",
    434      "output_type": "stream",
    435      "text": [
    436       "<class 'pandas.core.frame.DataFrame'>\n",
    437       "Index: 2412 entries, A to ZUMZ\n",
    438       "Data columns (total 3 columns):\n",
    439       "sector    2372 non-null object\n",
    440       "age       2412 non-null int64\n",
    441       "size      2407 non-null category\n",
    442       "dtypes: category(1), int64(1), object(1)\n",
    443       "memory usage: 139.3+ KB\n"
    444      ]
    445     }
    446    ],
    447    "source": [
    448     "stocks.info()"
    449    ]
    450   },
    451   {
    452    "cell_type": "code",
    453    "execution_count": 20,
    454    "metadata": {
    455     "ExecuteTime": {
    456      "end_time": "2018-11-09T16:12:38.369633Z",
    457      "start_time": "2018-11-09T16:12:38.358581Z"
    458     }
    459    },
    460    "outputs": [
    461     {
    462      "name": "stdout",
    463      "output_type": "stream",
    464      "text": [
    465       "<class 'pandas.core.frame.DataFrame'>\n",
    466       "Index: 2412 entries, A to ZUMZ\n",
    467       "Data columns (total 28 columns):\n",
    468       "size_1                   2412 non-null uint8\n",
    469       "size_2                   2412 non-null uint8\n",
    470       "size_3                   2412 non-null uint8\n",
    471       "size_4                   2412 non-null uint8\n",
    472       "size_5                   2412 non-null uint8\n",
    473       "size_6                   2412 non-null uint8\n",
    474       "size_7                   2412 non-null uint8\n",
    475       "size_8                   2412 non-null uint8\n",
    476       "size_9                   2412 non-null uint8\n",
    477       "size_10                  2412 non-null uint8\n",
    478       "age_0                    2412 non-null uint8\n",
    479       "age_1                    2412 non-null uint8\n",
    480       "age_2                    2412 non-null uint8\n",
    481       "age_3                    2412 non-null uint8\n",
    482       "age_4                    2412 non-null uint8\n",
    483       "age_5                    2412 non-null uint8\n",
    484       "Basic Industries         2412 non-null uint8\n",
    485       "Capital Goods            2412 non-null uint8\n",
    486       "Consumer Durables        2412 non-null uint8\n",
    487       "Consumer Non-Durables    2412 non-null uint8\n",
    488       "Consumer Services        2412 non-null uint8\n",
    489       "Energy                   2412 non-null uint8\n",
    490       "Finance                  2412 non-null uint8\n",
    491       "Health Care              2412 non-null uint8\n",
    492       "Miscellaneous            2412 non-null uint8\n",
    493       "Public Utilities         2412 non-null uint8\n",
    494       "Technology               2412 non-null uint8\n",
    495       "Transportation           2412 non-null uint8\n",
    496       "dtypes: uint8(28)\n",
    497       "memory usage: 164.8+ KB\n"
    498      ]
    499     }
    500    ],
    501    "source": [
    502     "stocks = pd.get_dummies(stocks, \n",
    503     "                        columns=['size', 'age',  'sector'], \n",
    504     "                        prefix=['size', 'age', ''], \n",
    505     "                        prefix_sep=['_', '_', ''])\n",
    506     "stocks.info()"
    507    ]
    508   },
    509   {
    510    "cell_type": "markdown",
    511    "metadata": {},
    512    "source": [
    513     "### Combine data"
    514    ]
    515   },
    516   {
    517    "cell_type": "code",
    518    "execution_count": 21,
    519    "metadata": {
    520     "ExecuteTime": {
    521      "end_time": "2018-11-09T16:12:41.045331Z",
    522      "start_time": "2018-11-09T16:12:38.370654Z"
    523     },
    524     "scrolled": true
    525    },
    526    "outputs": [
    527     {
    528      "name": "stdout",
    529      "output_type": "stream",
    530      "text": [
    531       "<class 'pandas.core.frame.DataFrame'>\n",
    532       "MultiIndex: 171162 entries, (A, 2011-03-31 00:00:00) to (ZUMZ, 2018-02-28 00:00:00)\n",
    533       "Data columns (total 61 columns):\n",
    534       "returns                  171162 non-null float64\n",
    535       "t-1                      171162 non-null float64\n",
    536       "t-2                      171162 non-null float64\n",
    537       "t-3                      171162 non-null float64\n",
    538       "t-4                      171162 non-null float64\n",
    539       "t-5                      171162 non-null float64\n",
    540       "t-6                      171162 non-null float64\n",
    541       "t-7                      171162 non-null float64\n",
    542       "t-8                      171162 non-null float64\n",
    543       "t-9                      171162 non-null float64\n",
    544       "t-10                     171162 non-null float64\n",
    545       "t-11                     171162 non-null float64\n",
    546       "t-12                     171162 non-null float64\n",
    547       "year_2011                171162 non-null uint8\n",
    548       "year_2012                171162 non-null uint8\n",
    549       "year_2013                171162 non-null uint8\n",
    550       "year_2014                171162 non-null uint8\n",
    551       "year_2015                171162 non-null uint8\n",
    552       "year_2016                171162 non-null uint8\n",
    553       "year_2017                171162 non-null uint8\n",
    554       "year_2018                171162 non-null uint8\n",
    555       "month_1                  171162 non-null uint8\n",
    556       "month_2                  171162 non-null uint8\n",
    557       "month_3                  171162 non-null uint8\n",
    558       "month_4                  171162 non-null uint8\n",
    559       "month_5                  171162 non-null uint8\n",
    560       "month_6                  171162 non-null uint8\n",
    561       "month_7                  171162 non-null uint8\n",
    562       "month_8                  171162 non-null uint8\n",
    563       "month_9                  171162 non-null uint8\n",
    564       "month_10                 171162 non-null uint8\n",
    565       "month_11                 171162 non-null uint8\n",
    566       "month_12                 171162 non-null uint8\n",
    567       "size_1                   171162 non-null int8\n",
    568       "size_2                   171162 non-null int8\n",
    569       "size_3                   171162 non-null int8\n",
    570       "size_4                   171162 non-null int8\n",
    571       "size_5                   171162 non-null int8\n",
    572       "size_6                   171162 non-null int8\n",
    573       "size_7                   171162 non-null int8\n",
    574       "size_8                   171162 non-null int8\n",
    575       "size_9                   171162 non-null int8\n",
    576       "size_10                  171162 non-null int8\n",
    577       "age_0                    171162 non-null int8\n",
    578       "age_1                    171162 non-null int8\n",
    579       "age_2                    171162 non-null int8\n",
    580       "age_3                    171162 non-null int8\n",
    581       "age_4                    171162 non-null int8\n",
    582       "age_5                    171162 non-null int8\n",
    583       "Basic Industries         171162 non-null int8\n",
    584       "Capital Goods            171162 non-null int8\n",
    585       "Consumer Durables        171162 non-null int8\n",
    586       "Consumer Non-Durables    171162 non-null int8\n",
    587       "Consumer Services        171162 non-null int8\n",
    588       "Energy                   171162 non-null int8\n",
    589       "Finance                  171162 non-null int8\n",
    590       "Health Care              171162 non-null int8\n",
    591       "Miscellaneous            171162 non-null int8\n",
    592       "Public Utilities         171162 non-null int8\n",
    593       "Technology               171162 non-null int8\n",
    594       "Transportation           171162 non-null int8\n",
    595       "dtypes: float64(13), int8(28), uint8(20)\n",
    596       "memory usage: 25.3+ MB\n"
    597      ]
    598     }
    599    ],
    600    "source": [
    601     "data = (returns\n",
    602     "        .reset_index('date')\n",
    603     "        .merge(stocks, left_index=True, right_index=True)\n",
    604     "        .dropna()\n",
    605     "        .set_index('date', append=True))\n",
    606     "\n",
    607     "s = len(returns.columns)\n",
    608     "data.iloc[:, s:] = data.iloc[:, s:].astype(int).apply(pd.to_numeric, downcast='integer')\n",
    609     "data.info()"
    610    ]
    611   },
    612   {
    613    "cell_type": "markdown",
    614    "metadata": {},
    615    "source": [
    616     "### Store data"
    617    ]
    618   },
    619   {
    620    "cell_type": "code",
    621    "execution_count": 22,
    622    "metadata": {
    623     "ExecuteTime": {
    624      "end_time": "2018-11-09T16:12:41.079760Z",
    625      "start_time": "2018-11-09T16:12:41.046398Z"
    626     }
    627    },
    628    "outputs": [],
    629    "source": [
    630     "with pd.HDFStore('data.h5') as store:\n",
    631     "    store.put('data', data)"
    632    ]
    633   }
    634  ],
    635  "metadata": {
    636   "kernelspec": {
    637    "display_name": "Python 3",
    638    "language": "python",
    639    "name": "python3"
    640   },
    641   "language_info": {
    642    "codemirror_mode": {
    643     "name": "ipython",
    644     "version": 3
    645    },
    646    "file_extension": ".py",
    647    "mimetype": "text/x-python",
    648    "name": "python",
    649    "nbconvert_exporter": "python",
    650    "pygments_lexer": "ipython3",
    651    "version": "3.7.0"
    652   },
    653   "toc": {
    654    "base_numbering": 1,
    655    "nav_menu": {},
    656    "number_sections": true,
    657    "sideBar": true,
    658    "skip_h1_title": false,
    659    "title_cell": "Table of Contents",
    660    "title_sidebar": "Contents",
    661    "toc_cell": false,
    662    "toc_position": {},
    663    "toc_section_display": true,
    664    "toc_window_display": true
    665   }
    666  },
    667  "nbformat": 4,
    668  "nbformat_minor": 2
    669 }