ml-finance-python

python scripts for finance machine learning
git clone https://9o.is/git/ml-finance-python.git
02_sklearn_gbm_tuning.ipynb

(11616B)
      1 {
      2  "cells": [
      3   {
      4    "cell_type": "markdown",
      5    "metadata": {},
      6    "source": [
      7     "# GBM Hyperparameter Tuning with sklearn"
      8    ]
      9   },
     10   {
     11    "cell_type": "markdown",
     12    "metadata": {},
     13    "source": [
     14     "## Imports & Settings"
     15    ]
     16   },
     17   {
     18    "cell_type": "code",
     19    "execution_count": 2,
     20    "metadata": {},
     21    "outputs": [],
     22    "source": [
     23     "from time import time\n",
     24     "import numpy as np\n",
     25     "import pandas as pd\n",
     26     "import warnings\n",
     27     "from sklearn.ensemble import GradientBoostingClassifier\n",
     28     "from sklearn.model_selection import GridSearchCV\n",
     29     "from itertools import product\n",
     30     "from sklearn.externals import joblib\n",
     31     "from pathlib import Path\n",
     32     "\n",
     33     "warnings.filterwarnings('ignore')\n",
     34     "np.random.seed(42)"
     35    ]
     36   },
     37   {
     38    "cell_type": "markdown",
     39    "metadata": {},
     40    "source": [
     41     "## Create one-hot encoding"
     42    ]
     43   },
     44   {
     45    "cell_type": "code",
     46    "execution_count": 5,
     47    "metadata": {},
     48    "outputs": [],
     49    "source": [
     50     "def get_one_hot_data(df, cols=('year', 'month', 'age', 'msize')):\n",
     51     "    cols = list(cols)\n",
     52     "    df = pd.get_dummies(df,\n",
     53     "                        columns=cols + ['sector'],\n",
     54     "                        prefix=cols + [''],\n",
     55     "                        prefix_sep=['_'] * len(cols) + [''])\n",
     56     "    return df.rename(columns={c: c.replace('.0', '').replace(' ', '_').lower() for c in df.columns})"
     57    ]
     58   },
     59   {
     60    "cell_type": "markdown",
     61    "metadata": {},
     62    "source": [
     63     "## Create holdout test set"
     64    ]
     65   },
     66   {
     67    "cell_type": "code",
     68    "execution_count": 6,
     69    "metadata": {},
     70    "outputs": [],
     71    "source": [
     72     "def get_holdout_set(target, features, period=6):\n",
     73     "    idx = pd.IndexSlice\n",
     74     "    label = target.name\n",
     75     "    dates = np.sort(target.index.get_level_values('date').unique())\n",
     76     "    cv_start, cv_end = dates[0], dates[-period - 2]\n",
     77     "    holdout_start, holdout_end = dates[-period - 1], dates[-1]\n",
     78     "\n",
     79     "    df = features.join(target.to_frame())\n",
     80     "    train = df.loc[idx[:, cv_start: cv_end], :]\n",
     81     "    y_train, X_train = train[label], train.drop(label, axis=1)\n",
     82     "\n",
     83     "    test = df.loc[idx[:, holdout_start: holdout_end], :]\n",
     84     "    y_test, X_test = test[label], test.drop(label, axis=1)\n",
     85     "    return y_train, X_train, y_test, X_test"
     86    ]
     87   },
     88   {
     89    "cell_type": "markdown",
     90    "metadata": {},
     91    "source": [
     92     "## Custom TimeSeriesSplit"
     93    ]
     94   },
     95   {
     96    "cell_type": "code",
     97    "execution_count": 8,
     98    "metadata": {},
     99    "outputs": [],
    100    "source": [
    101     "class OneStepTimeSeriesSplit:\n",
    102     "    \"\"\"Generates tuples of train_idx, test_idx pairs\n",
    103     "    Assumes the index contains a level labeled 'date'\"\"\"\n",
    104     "\n",
    105     "    def __init__(self, n_splits=3, test_period_length=1, shuffle=False):\n",
    106     "        self.n_splits = n_splits\n",
    107     "        self.test_period_length = test_period_length\n",
    108     "        self.shuffle = shuffle\n",
    109     "        self.test_end = n_splits * test_period_length\n",
    110     "\n",
    111     "    @staticmethod\n",
    112     "    def chunks(l, n):\n",
    113     "        for i in range(0, len(l), n):\n",
    114     "            yield l[i:i + n]\n",
    115     "\n",
    116     "    def split(self, X, y=None, groups=None):\n",
    117     "        unique_dates = (X\n",
    118     "                            .index\n",
    119     "                            .get_level_values('date')\n",
    120     "                            .unique()\n",
    121     "                            .sort_values(ascending=False)\n",
    122     "        [:self.test_end])\n",
    123     "\n",
    124     "        dates = X.reset_index()[['date']]\n",
    125     "        for test_date in self.chunks(unique_dates, self.test_period_length):\n",
    126     "            train_idx = dates[dates.date < min(test_date)].index\n",
    127     "            test_idx = dates[dates.date.isin(test_date)].index\n",
    128     "            if self.shuffle:\n",
    129     "                train_idx = np.random.permutation(train_idx)\n",
    130     "            yield train_idx, test_idx\n",
    131     "    \n",
    132     "    def get_n_splits(self, X=None, y=None, groups=None):\n",
    133     "        return self.n_splits            "
    134    ]
    135   },
    136   {
    137    "cell_type": "markdown",
    138    "metadata": {},
    139    "source": [
    140     "## Instantiate GradientBoostingClassifier"
    141    ]
    142   },
    143   {
    144    "cell_type": "code",
    145    "execution_count": 9,
    146    "metadata": {},
    147    "outputs": [],
    148    "source": [
    149     "gb_clf = GradientBoostingClassifier(loss='deviance',\n",
    150     "                                    learning_rate=0.1,\n",
    151     "                                    n_estimators=100,\n",
    152     "                                    subsample=1.0,\n",
    153     "                                    criterion='friedman_mse',\n",
    154     "                                    min_samples_split=2,\n",
    155     "                                    min_samples_leaf=1,\n",
    156     "                                    min_weight_fraction_leaf=0.0,\n",
    157     "                                    max_depth=3,\n",
    158     "                                    min_impurity_decrease=0.0,\n",
    159     "                                    min_impurity_split=None,\n",
    160     "                                    init=None,\n",
    161     "                                    random_state=None,\n",
    162     "                                    max_features=None,\n",
    163     "                                    verbose=0,\n",
    164     "                                    max_leaf_nodes=None,\n",
    165     "                                    warm_start=False,\n",
    166     "                                    presort='auto',\n",
    167     "                                    validation_fraction=0.1,\n",
    168     "                                    n_iter_no_change=None,\n",
    169     "                                    tol=0.0001)"
    170    ]
    171   },
    172   {
    173    "cell_type": "markdown",
    174    "metadata": {},
    175    "source": [
    176     "## Load Data"
    177    ]
    178   },
    179   {
    180    "cell_type": "markdown",
    181    "metadata": {},
    182    "source": [
    183     "We use the dataset generated by the notebook [feature-engineering](../04_alpha_factor_research/00_data/feature_engineering.ipynb) from [Chapter 4 on Alpha Factor Research](../04_alpha_factor_research); that notebook needs to be executed first so that the `engineered_features` table exists in the data store."
    184    ]
    185   },
    186   {
    187    "cell_type": "code",
    188    "execution_count": 3,
    189    "metadata": {},
    190    "outputs": [],
    191    "source": [
    192     "DATA_STORE = Path('../../data/assets.h5')"
    193    ]
    194   },
    195   {
    196    "cell_type": "code",
    197    "execution_count": 1,
    198    "metadata": {},
    199    "outputs": [],
    200    "source": [
    201     "def get_data(start='2000', end='2018', holding_period=1, dropna=False):\n",
    202     "    idx = pd.IndexSlice\n",
    203     "    target = f'target_{holding_period}m'\n",
    204     "    with pd.HDFStore(DATA_STORE) as store:\n",
    205     "        df = store['engineered_features']\n",
    206     "\n",
    207     "    if start is not None and end is not None:\n",
    208     "        df = df.loc[idx[:, start: end], :]\n",
    209     "    if dropna:\n",
    210     "        df = df.dropna()\n",
    211     "\n",
    212     "    y = (df[target] > 0).astype(int)\n",
    213     "    X = df.drop([c for c in df.columns if c.startswith('target')], axis=1)\n",
    214     "    return y, X"
    215    ]
    216   },
    217   {
    218    "cell_type": "code",
    219    "execution_count": null,
    220    "metadata": {},
    221    "outputs": [],
    222    "source": [
    223     "n_splits = 12\n",
    224     "y, features = get_data()\n",
    225     "X = get_one_hot_data(features).dropna()\n",
    226     "\n",
    227     "y, X, y_test, X_test = get_holdout_set(target=y,\n",
    228     "                                       features=X)"
    229    ]
    230   },
    231   {
    232    "cell_type": "code",
    233    "execution_count": null,
    234    "metadata": {},
    235    "outputs": [],
    236    "source": [
    237     "with pd.HDFStore('model_tuning.h5') as store:\n",
    238     "    store.put('holdout/features', X_test)\n",
    239     "    store.put('holdout/target', y_test)"
    240    ]
    241   },
    242   {
    243    "cell_type": "markdown",
    244    "metadata": {},
    245    "source": [
    246     "## Setup GridSearchCV"
    247    ]
    248   },
    249   {
    250    "cell_type": "markdown",
    251    "metadata": {},
    252    "source": [
    253     "The [GridSearchCV](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html) class in sklearn's [model_selection](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection) module facilitates the systematic evaluation of all combinations of the hyperparameter values that we would like to test.\n",
    254     " \n",
    255     "In the following code, we will illustrate this functionality for seven tuning parameters that will result in a total of $2^4 \\times 3^2 \\times 4 = 576$ different model configurations."
    256    ]
    257   },
    258   {
    259    "cell_type": "markdown",
    260    "metadata": {},
    261    "source": [
    262     "### Parameter Grid"
    263    ]
    264   },
    265   {
    266    "cell_type": "markdown",
    267    "metadata": {},
    268    "source": [
    269     "First we define the cross-validation iterator:"
    270    ]
    271   },
    272   {
    273    "cell_type": "code", "execution_count": null, "outputs": [],
    274    "metadata": {},
    275    "source": [
    276     "cv = OneStepTimeSeriesSplit(n_splits=n_splits)"
    277    ]
    278   },
    279   {
    280    "cell_type": "markdown",
    281    "metadata": {},
    282    "source": [
    283     "Next, we define the parameter grid:"
    284    ]
    285   },
    286   {
    287    "cell_type": "code",
    288    "execution_count": 10,
    289    "metadata": {},
    290    "outputs": [],
    291    "source": [
    292     "param_grid = dict(\n",
    293     "        learning_rate=[.01, .1, .2],\n",
    294     "        max_depth=list(range(3, 13, 3)),\n",
    295     "        max_features=['sqrt', .8, 1],\n",
    296     "        min_impurity_decrease=[0, .01],\n",
    297     "        min_samples_split=[10, 50],\n",
    298     "        n_estimators=[100, 300],\n",
    299     "        subsample=[.8, 1],\n",
    300     ")"
    301    ]
    302   },
    303   {
    304    "cell_type": "code",
    305    "execution_count": 11,
    306    "metadata": {},
    307    "outputs": [
    308     {
    309      "name": "stdout",
    310      "output_type": "stream",
    311      "text": [
    312       "# Models = : 576\n"
    313      ]
    314     }
    315    ],
    316    "source": [
    317     "all_params = list(product(*param_grid.values()))\n",
    318     "print('# Models = :', len(all_params))"
    319    ]
    320   },
    321   {
    322    "cell_type": "markdown",
    323    "metadata": {},
    324    "source": [
    325     "### Instantiate GridSearchCV"
    326    ]
    327   },
    328   {
    329    "cell_type": "code",
    330    "execution_count": null,
    331    "metadata": {},
    332    "outputs": [],
    333    "source": [
    334     "gs = GridSearchCV(gb_clf,\n",
    335     "                  param_grid,\n",
    336     "                  cv=cv,\n",
    337     "                  scoring='roc_auc',\n",
    338     "                  verbose=3,\n",
    339     "                  n_jobs=-1,\n",
    340     "                  return_train_score=True)\n",
    341     "\n"
    342    ]
    343   },
    344   {
    345    "cell_type": "markdown",
    346    "metadata": {},
    347    "source": [
    348     "### Fit GridSearchCV"
    349    ]
    350   },
    351   {
    352    "cell_type": "markdown",
    353    "metadata": {},
    354    "source": [
    355     "This can take several days..."
    356    ]
    357   },
    358   {
    359    "cell_type": "code",
    360    "execution_count": null,
    361    "metadata": {},
    362    "outputs": [],
    363    "source": [
    364     "start = time()\n",
    365     "gs.fit(X=X, y=y)\n",
    366     "done = time()"
    367    ]
    368   },
    369   {
    370    "cell_type": "markdown",
    371    "metadata": {},
    372    "source": [
    373     "### Persist Results"
    374    ]
    375   },
    376   {
    377    "cell_type": "code",
    378    "execution_count": null,
    379    "metadata": {},
    380    "outputs": [],
    381    "source": [
    382     "print(f'Done in {done - start:.2f}s')\n",
    383     "joblib.dump(gs, 'gbm_gridsearch.joblib')"
    384    ]
    385   }
    386  ],
    387  "metadata": {
    388   "kernelspec": {
    389    "display_name": "Python 3",
    390    "language": "python",
    391    "name": "python3"
    392   },
    393   "language_info": {
    394    "codemirror_mode": {
    395     "name": "ipython",
    396     "version": 3
    397    },
    398    "file_extension": ".py",
    399    "mimetype": "text/x-python",
    400    "name": "python",
    401    "nbconvert_exporter": "python",
    402    "pygments_lexer": "ipython3",
    403    "version": "3.7.0"
    404   },
    405   "toc": {
    406    "base_numbering": 1,
    407    "nav_menu": {},
    408    "number_sections": true,
    409    "sideBar": true,
    410    "skip_h1_title": true,
    411    "title_cell": "Table of Contents",
    412    "title_sidebar": "Contents",
    413    "toc_cell": false,
    414    "toc_position": {},
    415    "toc_section_display": true,
    416    "toc_window_display": false
    417   }
    418  },
    419  "nbformat": 4,
    420  "nbformat_minor": 2
    421 }