ml-finance-python

python scripts for finance machine learning

git clone https://9o.is/git/ml-finance-python.git

00_build_dataset.ipynb

(25966B)


      1 {
      2  "cells": [
      3   {
      4    "cell_type": "markdown",
      5    "metadata": {},
      6    "source": [
      7     "# Train a Deep NN to predict Asset Price movements"
      8    ]
      9   },
     10   {
     11    "cell_type": "markdown",
     12    "metadata": {},
     13    "source": [
     14     "## Setup Docker for GPU acceleration"
     15    ]
     16   },
     17   {
     18    "cell_type": "markdown",
     19    "metadata": {},
     20    "source": [
     21     "`docker run -it -p 8889:8888 -v /path/to/machine-learning-for-trading/16_convolutions_neural_nets/cnn:/cnn --name tensorflow tensorflow/tensorflow:latest-gpu-py3 bash`"
     22    ]
     23   },
     24   {
     25    "cell_type": "markdown",
     26    "metadata": {},
     27    "source": [
     28     "## Imports & Settings"
     29    ]
     30   },
     31   {
     32    "cell_type": "code",
     33    "execution_count": 1,
     34    "metadata": {},
     35    "outputs": [],
     36    "source": [
     37     "import warnings\n",
     38     "warnings.filterwarnings('ignore')"
     39    ]
     40   },
     41   {
     42    "cell_type": "code",
     43    "execution_count": 2,
     44    "metadata": {},
     45    "outputs": [
     46     {
     47      "name": "stderr",
     48      "output_type": "stream",
     49      "text": [
     50       "Using TensorFlow backend.\n"
     51      ]
     52     }
     53    ],
     54    "source": [
     55     "import os\n",
     56     "from pathlib import Path\n",
     57     "from importlib import reload\n",
     58     "from joblib import dump, load\n",
     59     "\n",
     60     "import numpy as np\n",
     61     "import pandas as pd\n",
     62     "import matplotlib.pyplot as plt\n",
     63     "import seaborn as sns\n",
     64     "\n",
     65     "from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold\n",
     66     "from sklearn.metrics import roc_auc_score\n",
     67     "\n",
     68     "import tensorflow as tf\n",
     69     "from keras.models import Sequential\n",
     70     "from keras import backend as K\n",
     71     "from keras.wrappers.scikit_learn import KerasClassifier\n",
     72     "from keras.layers import Dense, Dropout, Activation\n",
     73     "from keras.models import load_model\n",
     74     "from keras.callbacks import Callback, EarlyStopping, TensorBoard, ModelCheckpoint"
     75    ]
     76   },
     77   {
     78    "cell_type": "code",
     79    "execution_count": 3,
     80    "metadata": {},
     81    "outputs": [],
     82    "source": [
     83     "np.random.seed(42)"
     84    ]
     85   },
     86   {
     87    "cell_type": "markdown",
     88    "metadata": {},
     89    "source": [
     90     "## Build Dataset"
     91    ]
     92   },
     93   {
     94    "cell_type": "markdown",
     95    "metadata": {},
     96    "source": [
     97     "We load the Quandl adjusted stock price data:"
     98    ]
     99   },
    100   {
    101    "cell_type": "code",
    102    "execution_count": 4,
    103    "metadata": {},
    104    "outputs": [
    105     {
    106      "name": "stdout",
    107      "output_type": "stream",
    108      "text": [
    109       "<class 'pandas.core.frame.DataFrame'>\n",
    110       "DatetimeIndex: 2896 entries, 2007-01-01 to 2018-03-27\n",
    111       "Columns: 3199 entries, A to ZUMZ\n",
    112       "dtypes: float64(3199)\n",
    113       "memory usage: 70.7 MB\n"
    114      ]
    115     }
    116    ],
    117    "source": [
    118     "prices = (pd.read_hdf('../data/assets.h5', 'quandl/wiki/prices')\n",
    119     "          .adj_close\n",
    120     "          .unstack().loc['2007':])\n",
    121     "prices.info()"
    122    ]
    123   },
    124   {
    125    "cell_type": "markdown",
    126    "metadata": {},
    127    "source": [
    128     "### Resample to weekly frequency"
    129    ]
    130   },
    131   {
    132    "cell_type": "markdown",
    133    "metadata": {},
    134    "source": [
    135     "We start by generating weekly returns for close to 2,500 stocks without missing data for the 2008-17 period, as follows:"
    136    ]
    137   },
    138   {
    139    "cell_type": "code",
    140    "execution_count": 5,
    141    "metadata": {},
    142    "outputs": [
    143     {
    144      "name": "stdout",
    145      "output_type": "stream",
    146      "text": [
    147       "<class 'pandas.core.frame.DataFrame'>\n",
    148       "DatetimeIndex: 522 entries, 2017-12-31 to 2008-01-06\n",
    149       "Freq: -1W-SUN\n",
    150       "Columns: 2489 entries, A to ZUMZ\n",
    151       "dtypes: float64(2489)\n",
    152       "memory usage: 9.9 MB\n"
    153      ]
    154     }
    155    ],
    156    "source": [
    157     "returns = (prices\n",
    158     "           .resample('W')\n",
    159     "           .last()\n",
    160     "           .pct_change()\n",
    161     "           .loc['2008': '2017']\n",
    162     "           .dropna(axis=1)\n",
    163     "           .sort_index(ascending=False))\n",
    164     "returns.info()"
    165    ]
    166   },
    167   {
    168    "cell_type": "code",
    169    "execution_count": 6,
    170    "metadata": {
    171     "scrolled": true
    172    },
    173    "outputs": [
    174     {
    175      "data": {
    176       "text/html": [
    177        "<div>\n",
    178        "<style scoped>\n",
    179        "    .dataframe tbody tr th:only-of-type {\n",
    180        "        vertical-align: middle;\n",
    181        "    }\n",
    182        "\n",
    183        "    .dataframe tbody tr th {\n",
    184        "        vertical-align: top;\n",
    185        "    }\n",
    186        "\n",
    187        "    .dataframe thead th {\n",
    188        "        text-align: right;\n",
    189        "    }\n",
    190        "</style>\n",
    191        "<table border=\"1\" class=\"dataframe\">\n",
    192        "  <thead>\n",
    193        "    <tr style=\"text-align: right;\">\n",
    194        "      <th>ticker</th>\n",
    195        "      <th>A</th>\n",
    196        "      <th>AAL</th>\n",
    197        "      <th>AAN</th>\n",
    198        "      <th>AAON</th>\n",
    199        "      <th>AAP</th>\n",
    200        "      <th>AAPL</th>\n",
    201        "      <th>AAWW</th>\n",
    202        "      <th>ABAX</th>\n",
    203        "      <th>ABC</th>\n",
    204        "      <th>ABCB</th>\n",
    205        "      <th>...</th>\n",
    206        "      <th>ZEUS</th>\n",
    207        "      <th>ZIGO</th>\n",
    208        "      <th>ZINC</th>\n",
    209        "      <th>ZION</th>\n",
    210        "      <th>ZIOP</th>\n",
    211        "      <th>ZIXI</th>\n",
    212        "      <th>ZLC</th>\n",
    213        "      <th>ZMH</th>\n",
    214        "      <th>ZQK</th>\n",
    215        "      <th>ZUMZ</th>\n",
    216        "    </tr>\n",
    217        "    <tr>\n",
    218        "      <th>date</th>\n",
    219        "      <th></th>\n",
    220        "      <th></th>\n",
    221        "      <th></th>\n",
    222        "      <th></th>\n",
    223        "      <th></th>\n",
    224        "      <th></th>\n",
    225        "      <th></th>\n",
    226        "      <th></th>\n",
    227        "      <th></th>\n",
    228        "      <th></th>\n",
    229        "      <th></th>\n",
    230        "      <th></th>\n",
    231        "      <th></th>\n",
    232        "      <th></th>\n",
    233        "      <th></th>\n",
    234        "      <th></th>\n",
    235        "      <th></th>\n",
    236        "      <th></th>\n",
    237        "      <th></th>\n",
    238        "      <th></th>\n",
    239        "      <th></th>\n",
    240        "    </tr>\n",
    241        "  </thead>\n",
    242        "  <tbody>\n",
    243        "    <tr>\n",
    244        "      <th>2017-12-31</th>\n",
    245        "      <td>-0.005642</td>\n",
    246        "      <td>-0.010648</td>\n",
    247        "      <td>-0.010184</td>\n",
    248        "      <td>-0.001361</td>\n",
    249        "      <td>-0.008553</td>\n",
    250        "      <td>-0.033027</td>\n",
    251        "      <td>-0.024938</td>\n",
    252        "      <td>-0.001814</td>\n",
    253        "      <td>-0.006922</td>\n",
    254        "      <td>-0.019329</td>\n",
    255        "      <td>...</td>\n",
    256        "      <td>-0.029797</td>\n",
    257        "      <td>0.000000</td>\n",
    258        "      <td>0.000000</td>\n",
    259        "      <td>-0.009741</td>\n",
    260        "      <td>0.022222</td>\n",
    261        "      <td>-0.015730</td>\n",
    262        "      <td>0.000000</td>\n",
    263        "      <td>0.000000</td>\n",
    264        "      <td>0.000000</td>\n",
    265        "      <td>-0.029138</td>\n",
    266        "    </tr>\n",
    267        "    <tr>\n",
    268        "      <th>2017-12-24</th>\n",
    269        "      <td>-0.003846</td>\n",
    270        "      <td>0.029965</td>\n",
    271        "      <td>0.090171</td>\n",
    272        "      <td>0.044034</td>\n",
    273        "      <td>-0.001490</td>\n",
    274        "      <td>0.006557</td>\n",
    275        "      <td>0.046087</td>\n",
    276        "      <td>0.032681</td>\n",
    277        "      <td>-0.007620</td>\n",
    278        "      <td>0.017598</td>\n",
    279        "      <td>...</td>\n",
    280        "      <td>0.032153</td>\n",
    281        "      <td>0.000000</td>\n",
    282        "      <td>0.000000</td>\n",
    283        "      <td>0.026395</td>\n",
    284        "      <td>-0.068966</td>\n",
    285        "      <td>-0.024123</td>\n",
    286        "      <td>0.000000</td>\n",
    287        "      <td>0.000000</td>\n",
    288        "      <td>0.000000</td>\n",
    289        "      <td>0.067164</td>\n",
    290        "    </tr>\n",
    291        "    <tr>\n",
    292        "      <th>2017-12-17</th>\n",
    293        "      <td>0.003413</td>\n",
    294        "      <td>0.000784</td>\n",
    295        "      <td>-0.052591</td>\n",
    296        "      <td>-0.014006</td>\n",
    297        "      <td>0.003888</td>\n",
    298        "      <td>0.026569</td>\n",
    299        "      <td>0.004367</td>\n",
    300        "      <td>0.008396</td>\n",
    301        "      <td>0.074625</td>\n",
    302        "      <td>0.026567</td>\n",
    303        "      <td>...</td>\n",
    304        "      <td>0.036715</td>\n",
    305        "      <td>0.000000</td>\n",
    306        "      <td>0.000000</td>\n",
    307        "      <td>-0.018064</td>\n",
    308        "      <td>-0.018059</td>\n",
    309        "      <td>0.075472</td>\n",
    310        "      <td>0.000000</td>\n",
    311        "      <td>0.000000</td>\n",
    312        "      <td>0.000000</td>\n",
    313        "      <td>-0.051887</td>\n",
    314        "    </tr>\n",
    315        "    <tr>\n",
    316        "      <th>2017-12-10</th>\n",
    317        "      <td>-0.019071</td>\n",
    318        "      <td>0.041012</td>\n",
    319        "      <td>-0.005359</td>\n",
    320        "      <td>-0.017882</td>\n",
    321        "      <td>0.010375</td>\n",
    322        "      <td>-0.009822</td>\n",
    323        "      <td>-0.028014</td>\n",
    324        "      <td>-0.010386</td>\n",
    325        "      <td>0.020600</td>\n",
    326        "      <td>-0.054271</td>\n",
    327        "      <td>...</td>\n",
    328        "      <td>-0.002410</td>\n",
    329        "      <td>0.000000</td>\n",
    330        "      <td>0.000000</td>\n",
    331        "      <td>0.016973</td>\n",
    332        "      <td>-0.015556</td>\n",
    333        "      <td>-0.055679</td>\n",
    334        "      <td>0.000000</td>\n",
    335        "      <td>0.000000</td>\n",
    336        "      <td>0.000000</td>\n",
    337        "      <td>0.062657</td>\n",
    338        "    </tr>\n",
    339        "    <tr>\n",
    340        "      <th>2017-12-03</th>\n",
    341        "      <td>-0.009660</td>\n",
    342        "      <td>0.009267</td>\n",
    343        "      <td>0.105501</td>\n",
    344        "      <td>0.013947</td>\n",
    345        "      <td>0.112630</td>\n",
    346        "      <td>-0.022404</td>\n",
    347        "      <td>0.073838</td>\n",
    348        "      <td>-0.028456</td>\n",
    349        "      <td>0.045796</td>\n",
    350        "      <td>0.024717</td>\n",
    351        "      <td>...</td>\n",
    352        "      <td>0.065742</td>\n",
    353        "      <td>0.000000</td>\n",
    354        "      <td>0.000000</td>\n",
    355        "      <td>0.080475</td>\n",
    356        "      <td>0.014656</td>\n",
    357        "      <td>-0.006637</td>\n",
    358        "      <td>0.000000</td>\n",
    359        "      <td>0.000000</td>\n",
    360        "      <td>0.000000</td>\n",
    361        "      <td>0.047244</td>\n",
    362        "    </tr>\n",
    363        "    <tr>\n",
    364        "      <th>2008-02-03</th>\n",
    365        "      <td>0.038265</td>\n",
    366        "      <td>0.252238</td>\n",
    367        "      <td>0.002941</td>\n",
    368        "      <td>0.095182</td>\n",
    369        "      <td>0.097833</td>\n",
    370        "      <td>0.028767</td>\n",
    371        "      <td>0.006245</td>\n",
    372        "      <td>-0.078058</td>\n",
    373        "      <td>0.036913</td>\n",
    374        "      <td>0.083217</td>\n",
    375        "      <td>...</td>\n",
    376        "      <td>0.137066</td>\n",
    377        "      <td>0.127561</td>\n",
    378        "      <td>0.286550</td>\n",
    379        "      <td>0.167722</td>\n",
    380        "      <td>-0.087879</td>\n",
    381        "      <td>0.069364</td>\n",
    382        "      <td>0.171949</td>\n",
    383        "      <td>0.193189</td>\n",
    384        "      <td>0.127811</td>\n",
    385        "      <td>0.149083</td>\n",
    386        "    </tr>\n",
    387        "    <tr>\n",
    388        "      <th>2008-01-27</th>\n",
    389        "      <td>-0.013963</td>\n",
    390        "      <td>-0.048762</td>\n",
    391        "      <td>0.191310</td>\n",
    392        "      <td>0.071788</td>\n",
    393        "      <td>0.043997</td>\n",
    394        "      <td>-0.194286</td>\n",
    395        "      <td>-0.008984</td>\n",
    396        "      <td>-0.090807</td>\n",
    397        "      <td>-0.034771</td>\n",
    398        "      <td>0.054572</td>\n",
    399        "      <td>...</td>\n",
    400        "      <td>0.018349</td>\n",
    401        "      <td>-0.026292</td>\n",
    402        "      <td>-0.046975</td>\n",
    403        "      <td>0.136418</td>\n",
    404        "      <td>-0.003021</td>\n",
    405        "      <td>0.145695</td>\n",
    406        "      <td>0.042164</td>\n",
    407        "      <td>-0.014553</td>\n",
    408        "      <td>0.141892</td>\n",
    409        "      <td>0.118666</td>\n",
    410        "    </tr>\n",
    411        "    <tr>\n",
    412        "      <th>2008-01-20</th>\n",
    413        "      <td>-0.065000</td>\n",
    414        "      <td>0.086627</td>\n",
    415        "      <td>-0.080541</td>\n",
    416        "      <td>-0.054762</td>\n",
    417        "      <td>-0.007176</td>\n",
    418        "      <td>-0.065609</td>\n",
    419        "      <td>0.015818</td>\n",
    420        "      <td>-0.019721</td>\n",
    421        "      <td>-0.015219</td>\n",
    422        "      <td>-0.044397</td>\n",
    423        "      <td>...</td>\n",
    424        "      <td>0.040573</td>\n",
    425        "      <td>0.010999</td>\n",
    426        "      <td>-0.167109</td>\n",
    427        "      <td>-0.051614</td>\n",
    428        "      <td>-0.054286</td>\n",
    429        "      <td>-0.124638</td>\n",
    430        "      <td>0.037172</td>\n",
    431        "      <td>-0.037312</td>\n",
    432        "      <td>-0.030144</td>\n",
    433        "      <td>-0.076969</td>\n",
    434        "    </tr>\n",
    435        "    <tr>\n",
    436        "      <th>2008-01-13</th>\n",
    437        "      <td>0.035375</td>\n",
    438        "      <td>-0.041902</td>\n",
    439        "      <td>-0.037818</td>\n",
    440        "      <td>-0.046538</td>\n",
    441        "      <td>-0.101486</td>\n",
    442        "      <td>-0.040878</td>\n",
    443        "      <td>-0.052095</td>\n",
    444        "      <td>0.097385</td>\n",
    445        "      <td>0.080137</td>\n",
    446        "      <td>-0.017313</td>\n",
    447        "      <td>...</td>\n",
    448        "      <td>-0.054176</td>\n",
    449        "      <td>-0.047993</td>\n",
    450        "      <td>-0.102381</td>\n",
    451        "      <td>0.037264</td>\n",
    452        "      <td>-0.022346</td>\n",
    453        "      <td>-0.172662</td>\n",
    454        "      <td>0.011799</td>\n",
    455        "      <td>0.051880</td>\n",
    456        "      <td>0.018692</td>\n",
    457        "      <td>-0.094249</td>\n",
    458        "    </tr>\n",
    459        "    <tr>\n",
    460        "      <th>2008-01-06</th>\n",
    461        "      <td>-0.072553</td>\n",
    462        "      <td>-0.156356</td>\n",
    463        "      <td>-0.068707</td>\n",
    464        "      <td>-0.133301</td>\n",
    465        "      <td>-0.065496</td>\n",
    466        "      <td>-0.098984</td>\n",
    467        "      <td>-0.029478</td>\n",
    468        "      <td>-0.098374</td>\n",
    469        "      <td>-0.037363</td>\n",
    470        "      <td>-0.132733</td>\n",
    471        "      <td>...</td>\n",
    472        "      <td>-0.027290</td>\n",
    473        "      <td>-0.075806</td>\n",
    474        "      <td>-0.004739</td>\n",
    475        "      <td>-0.081058</td>\n",
    476        "      <td>0.101538</td>\n",
    477        "      <td>-0.143737</td>\n",
    478        "      <td>-0.134100</td>\n",
    479        "      <td>0.000752</td>\n",
    480        "      <td>-0.133102</td>\n",
    481        "      <td>-0.269012</td>\n",
    482        "    </tr>\n",
    483        "  </tbody>\n",
    484        "</table>\n",
    485        "<p>10 rows × 2489 columns</p>\n",
    486        "</div>"
    487       ],
    488       "text/plain": [
    489        "ticker             A       AAL       AAN      AAON       AAP      AAPL  \\\n",
    490        "date                                                                     \n",
    491        "2017-12-31 -0.005642 -0.010648 -0.010184 -0.001361 -0.008553 -0.033027   \n",
    492        "2017-12-24 -0.003846  0.029965  0.090171  0.044034 -0.001490  0.006557   \n",
    493        "2017-12-17  0.003413  0.000784 -0.052591 -0.014006  0.003888  0.026569   \n",
    494        "2017-12-10 -0.019071  0.041012 -0.005359 -0.017882  0.010375 -0.009822   \n",
    495        "2017-12-03 -0.009660  0.009267  0.105501  0.013947  0.112630 -0.022404   \n",
    496        "2008-02-03  0.038265  0.252238  0.002941  0.095182  0.097833  0.028767   \n",
    497        "2008-01-27 -0.013963 -0.048762  0.191310  0.071788  0.043997 -0.194286   \n",
    498        "2008-01-20 -0.065000  0.086627 -0.080541 -0.054762 -0.007176 -0.065609   \n",
    499        "2008-01-13  0.035375 -0.041902 -0.037818 -0.046538 -0.101486 -0.040878   \n",
    500        "2008-01-06 -0.072553 -0.156356 -0.068707 -0.133301 -0.065496 -0.098984   \n",
    501        "\n",
    502        "ticker          AAWW      ABAX       ABC      ABCB  ...      ZEUS      ZIGO  \\\n",
    503        "date                                                ...                       \n",
    504        "2017-12-31 -0.024938 -0.001814 -0.006922 -0.019329  ... -0.029797  0.000000   \n",
    505        "2017-12-24  0.046087  0.032681 -0.007620  0.017598  ...  0.032153  0.000000   \n",
    506        "2017-12-17  0.004367  0.008396  0.074625  0.026567  ...  0.036715  0.000000   \n",
    507        "2017-12-10 -0.028014 -0.010386  0.020600 -0.054271  ... -0.002410  0.000000   \n",
    508        "2017-12-03  0.073838 -0.028456  0.045796  0.024717  ...  0.065742  0.000000   \n",
    509        "2008-02-03  0.006245 -0.078058  0.036913  0.083217  ...  0.137066  0.127561   \n",
    510        "2008-01-27 -0.008984 -0.090807 -0.034771  0.054572  ...  0.018349 -0.026292   \n",
    511        "2008-01-20  0.015818 -0.019721 -0.015219 -0.044397  ...  0.040573  0.010999   \n",
    512        "2008-01-13 -0.052095  0.097385  0.080137 -0.017313  ... -0.054176 -0.047993   \n",
    513        "2008-01-06 -0.029478 -0.098374 -0.037363 -0.132733  ... -0.027290 -0.075806   \n",
    514        "\n",
    515        "ticker          ZINC      ZION      ZIOP      ZIXI       ZLC       ZMH  \\\n",
    516        "date                                                                     \n",
    517        "2017-12-31  0.000000 -0.009741  0.022222 -0.015730  0.000000  0.000000   \n",
    518        "2017-12-24  0.000000  0.026395 -0.068966 -0.024123  0.000000  0.000000   \n",
    519        "2017-12-17  0.000000 -0.018064 -0.018059  0.075472  0.000000  0.000000   \n",
    520        "2017-12-10  0.000000  0.016973 -0.015556 -0.055679  0.000000  0.000000   \n",
    521        "2017-12-03  0.000000  0.080475  0.014656 -0.006637  0.000000  0.000000   \n",
    522        "2008-02-03  0.286550  0.167722 -0.087879  0.069364  0.171949  0.193189   \n",
    523        "2008-01-27 -0.046975  0.136418 -0.003021  0.145695  0.042164 -0.014553   \n",
    524        "2008-01-20 -0.167109 -0.051614 -0.054286 -0.124638  0.037172 -0.037312   \n",
    525        "2008-01-13 -0.102381  0.037264 -0.022346 -0.172662  0.011799  0.051880   \n",
    526        "2008-01-06 -0.004739 -0.081058  0.101538 -0.143737 -0.134100  0.000752   \n",
    527        "\n",
    528        "ticker           ZQK      ZUMZ  \n",
    529        "date                            \n",
    530        "2017-12-31  0.000000 -0.029138  \n",
    531        "2017-12-24  0.000000  0.067164  \n",
    532        "2017-12-17  0.000000 -0.051887  \n",
    533        "2017-12-10  0.000000  0.062657  \n",
    534        "2017-12-03  0.000000  0.047244  \n",
    535        "2008-02-03  0.127811  0.149083  \n",
    536        "2008-01-27  0.141892  0.118666  \n",
    537        "2008-01-20 -0.030144 -0.076969  \n",
    538        "2008-01-13  0.018692 -0.094249  \n",
    539        "2008-01-06 -0.133102 -0.269012  \n",
    540        "\n",
    541        "[10 rows x 2489 columns]"
    542       ]
    543      },
    544      "execution_count": 6,
    545      "metadata": {},
    546      "output_type": "execute_result"
    547     }
    548    ],
    549    "source": [
    550     "returns.head().append(returns.tail())"
    551    ]
    552   },
    553   {
    554    "cell_type": "markdown",
    555    "metadata": {},
    556    "source": [
    557     "### Create & stack 52-week sequences"
    558    ]
    559   },
    560   {
    561    "cell_type": "markdown",
    562    "metadata": {},
    563    "source": [
    564     "We'll use 52-week sequences, which we'll create in a stacked format:"
    565    ]
    566   },
    567   {
    568    "cell_type": "code",
    569    "execution_count": 7,
    570    "metadata": {},
    571    "outputs": [],
    572    "source": [
    573     "n = len(returns)\n",
    574     "T = 52 # weeks\n",
    575     "tcols = list(range(T))"
    576    ]
    577   },
    578   {
    579    "cell_type": "code",
    580    "execution_count": 8,
    581    "metadata": {},
    582    "outputs": [
    583     {
    584      "name": "stdout",
    585      "output_type": "stream",
    586      "text": [
    587       "0 50 100 150 200 250 300 350 400 450 <class 'pandas.core.frame.DataFrame'>\n",
    588       "RangeIndex: 1244500 entries, 0 to 1244499\n",
    589       "Data columns (total 25 columns):\n",
    590       "ticker    1244500 non-null object\n",
    591       "0         1244500 non-null float64\n",
    592       "1         1244500 non-null float64\n",
    593       "2         1244500 non-null float64\n",
    594       "3         1244500 non-null float64\n",
    595       "4         1244500 non-null float64\n",
    596       "5         1244500 non-null float64\n",
    597       "6         1244500 non-null float64\n",
    598       "7         1244500 non-null float64\n",
    599       "8         1244500 non-null float64\n",
    600       "9         1244500 non-null float64\n",
    601       "10        1244500 non-null float64\n",
    602       "11        1244500 non-null float64\n",
    603       "12        1244500 non-null float64\n",
    604       "13        1244500 non-null float64\n",
    605       "14        1244500 non-null float64\n",
    606       "15        1244500 non-null float64\n",
    607       "16        1244500 non-null float64\n",
    608       "17        1244500 non-null float64\n",
    609       "18        1244500 non-null float64\n",
    610       "19        1244500 non-null float64\n",
    611       "20        1244500 non-null float64\n",
    612       "21        1244500 non-null float64\n",
    613       "year      1244500 non-null int64\n",
    614       "month     1244500 non-null int64\n",
    615       "dtypes: float64(22), int64(2), object(1)\n",
    616       "memory usage: 237.4+ MB\n"
    617      ]
    618     }
    619    ],
    620    "source": [
    621     "data = pd.DataFrame()\n",
    622     "for i in range(n-T-1):\n",
    623     "    if i % 50 == 0:\n",
    624     "        print(i, end=' ', flush=True)\n",
    625     "    df = returns.iloc[i:i+T+1]\n",
    626     "    data = pd.concat([data, (df\n",
    627     "                             .reset_index(drop=True)\n",
    628     "                             .transpose()\n",
    629     "                             .reset_index()\n",
    630     "                             .assign(year=df.index[0].year,\n",
    631     "                                     month=df.index[0].month))],\n",
    632     "                     ignore_index=True)\n",
    633     "data.info()"
    634    ]
    635   },
    636   {
    637    "cell_type": "markdown",
    638    "metadata": {},
    639    "source": [
    640     "### Create categorical variables"
    641    ]
    642   },
    643   {
    644    "cell_type": "markdown",
    645    "metadata": {},
    646    "source": [
    647     "We create dummy variables for different time periods, namely months and years:"
    648    ]
    649   },
    650   {
    651    "cell_type": "code",
    652    "execution_count": 9,
    653    "metadata": {},
    654    "outputs": [
    655     {
    656      "name": "stdout",
    657      "output_type": "stream",
    658      "text": [
    659       "<class 'pandas.core.frame.DataFrame'>\n",
    660       "DatetimeIndex: 1244500 entries, 2008-06-01 to 2017-12-01\n",
    661       "Data columns (total 45 columns):\n",
    662       "ticker       1244500 non-null int64\n",
    663       "1            1244500 non-null float64\n",
    664       "2            1244500 non-null float64\n",
    665       "3            1244500 non-null float64\n",
    666       "4            1244500 non-null float64\n",
    667       "5            1244500 non-null float64\n",
    668       "6            1244500 non-null float64\n",
    669       "7            1244500 non-null float64\n",
    670       "8            1244500 non-null float64\n",
    671       "9            1244500 non-null float64\n",
    672       "10           1244500 non-null float64\n",
    673       "11           1244500 non-null float64\n",
    674       "12           1244500 non-null float64\n",
    675       "13           1244500 non-null float64\n",
    676       "14           1244500 non-null float64\n",
    677       "15           1244500 non-null float64\n",
    678       "16           1244500 non-null float64\n",
    679       "17           1244500 non-null float64\n",
    680       "18           1244500 non-null float64\n",
    681       "19           1244500 non-null float64\n",
    682       "20           1244500 non-null float64\n",
    683       "21           1244500 non-null float64\n",
    684       "label        1244500 non-null int64\n",
    685       "year_2008    1244500 non-null uint8\n",
    686       "year_2009    1244500 non-null uint8\n",
    687       "year_2010    1244500 non-null uint8\n",
    688       "year_2011    1244500 non-null uint8\n",
    689       "year_2012    1244500 non-null uint8\n",
    690       "year_2013    1244500 non-null uint8\n",
    691       "year_2014    1244500 non-null uint8\n",
    692       "year_2015    1244500 non-null uint8\n",
    693       "year_2016    1244500 non-null uint8\n",
    694       "year_2017    1244500 non-null uint8\n",
    695       "month_1      1244500 non-null uint8\n",
    696       "month_2      1244500 non-null uint8\n",
    697       "month_3      1244500 non-null uint8\n",
    698       "month_4      1244500 non-null uint8\n",
    699       "month_5      1244500 non-null uint8\n",
    700       "month_6      1244500 non-null uint8\n",
    701       "month_7      1244500 non-null uint8\n",
    702       "month_8      1244500 non-null uint8\n",
    703       "month_9      1244500 non-null uint8\n",
    704       "month_10     1244500 non-null uint8\n",
    705       "month_11     1244500 non-null uint8\n",
    706       "month_12     1244500 non-null uint8\n",
    707       "dtypes: float64(21), int64(2), uint8(22)\n",
    708       "memory usage: 254.0 MB\n"
    709      ]
    710     }
    711    ],
    712    "source": [
    713     "data[tcols] = (data[tcols].apply(lambda x: x.clip(lower=x.quantile(.01),\n",
    714     "                                                  upper=x.quantile(.99))))\n",
    715     "data.ticker = pd.factorize(data.ticker)[0]\n",
    716     "data['label'] = (data[0] > 0).astype(int)\n",
    717     "data['date'] = pd.to_datetime(data.assign(day=1)[['year', 'month', 'day']])\n",
    718     "data = pd.get_dummies((data.drop(0, axis=1)\n",
    719     "                       .set_index('date')\n",
    720     "                       .apply(pd.to_numeric)),\n",
    721     "                      columns=['year', 'month']).sort_index()\n",
    722     "data.info()"
    723    ]
    724   },
    725   {
    726    "cell_type": "code",
    727    "execution_count": 10,
    728    "metadata": {},
    729    "outputs": [],
    730    "source": [
    731     "data.to_hdf('data.h5', 'returns_daily')"
    732    ]
    733   },
    734   {
    735    "cell_type": "code",
    736    "execution_count": 11,
    737    "metadata": {},
    738    "outputs": [
    739     {
    740      "data": {
    741       "text/plain": [
    742        "(1244500, 45)"
    743       ]
    744      },
    745      "execution_count": 11,
    746      "metadata": {},
    747      "output_type": "execute_result"
    748     }
    749    ],
    750    "source": [
    751     "data.shape"
    752    ]
    753   }
    754  ],
    755  "metadata": {
    756   "kernelspec": {
    757    "display_name": "Python 3",
    758    "language": "python",
    759    "name": "python3"
    760   },
    761   "language_info": {
    762    "codemirror_mode": {
    763     "name": "ipython",
    764     "version": 3
    765    },
    766    "file_extension": ".py",
    767    "mimetype": "text/x-python",
    768    "name": "python",
    769    "nbconvert_exporter": "python",
    770    "pygments_lexer": "ipython3",
    771    "version": "3.6.8"
    772   },
    773   "toc": {
    774    "base_numbering": 1,
    775    "nav_menu": {},
    776    "number_sections": true,
    777    "sideBar": true,
    778    "skip_h1_title": true,
    779    "title_cell": "Table of Contents",
    780    "title_sidebar": "Contents",
    781    "toc_cell": false,
    782    "toc_position": {
    783     "height": "calc(100% - 180px)",
    784     "left": "10px",
    785     "top": "150px",
    786     "width": "282.222px"
    787    },
    788    "toc_section_display": true,
    789    "toc_window_display": true
    790   }
    791  },
    792  "nbformat": 4,
    793  "nbformat_minor": 2
    794 }