ml-finance-python
python scripts for finance machine learning
git clone https://9o.is/git/ml-finance-python.git
02_sklearn_gbm_tuning.ipynb
(11616B)
1 {
2 "cells": [
3 {
4 "cell_type": "markdown",
5 "metadata": {},
6 "source": [
7 "# GBM Hyperparameter Tuning with sklearn"
8 ]
9 },
10 {
11 "cell_type": "markdown",
12 "metadata": {},
13 "source": [
14 "## Imports & Settings"
15 ]
16 },
17 {
18 "cell_type": "code",
19 "execution_count": 2,
20 "metadata": {},
21 "outputs": [],
22 "source": [
23 "from time import time\n",
24 "import numpy as np\n",
25 "import pandas as pd\n",
26 "import warnings\n",
27 "from sklearn.ensemble import GradientBoostingClassifier\n",
28 "from sklearn.model_selection import GridSearchCV\n",
29 "from itertools import product\n",
30 "from sklearn.externals import joblib\n",
31 "from pathlib import Path\n",
32 "\n",
33 "warnings.filterwarnings('ignore')\n",
34 "np.random.seed(42)"
35 ]
36 },
37 {
38 "cell_type": "markdown",
39 "metadata": {},
40 "source": [
41 "## Create one-hot encoding"
42 ]
43 },
44 {
45 "cell_type": "code",
46 "execution_count": 5,
47 "metadata": {},
48 "outputs": [],
49 "source": [
50 "def get_one_hot_data(df, cols=('year', 'month', 'age', 'msize')):\n",
51 "    \"\"\"One-hot encode `cols` and the 'sector' column, then tidy the column names.\n",
52 "\n",
53 "    Dummies for `cols` are named '<col>_<value>'; 'sector' dummies get no prefix.\n",
54 "    Afterwards every column name has '.0' removed, spaces replaced by '_'\n",
55 "    and is lower-cased.\n",
56 "    \"\"\"\n",
57 "    prefixed = list(cols)\n",
58 "    encoded = pd.get_dummies(df,\n",
59 "                             columns=[*prefixed, 'sector'],\n",
60 "                             prefix=[*prefixed, ''],\n",
61 "                             prefix_sep=['_' for _ in prefixed] + [''])\n",
62 "    tidy_names = {}\n",
63 "    for name in encoded.columns:\n",
64 "        tidy_names[name] = name.replace('.0', '').replace(' ', '_').lower()\n",
65 "    return encoded.rename(columns=tidy_names)"
57 ]
58 },
59 {
60 "cell_type": "markdown",
61 "metadata": {},
62 "source": [
63 "## Create holdout test set"
64 ]
65 },
66 {
67 "cell_type": "code",
68 "execution_count": 6,
69 "metadata": {},
70 "outputs": [],
71 "source": [
72 "def get_holdout_set(target, features, period=6):\n",
73 "    \"\"\"Split target and features chronologically into a CV part and a trailing holdout.\n",
74 "\n",
75 "    Dates come from the 'date' level of target's index, which must be the second\n",
76 "    index level. Both label slices are inclusive, so the holdout spans the last\n",
77 "    period + 1 unique dates and the CV part everything before them.\n",
78 "\n",
79 "    Returns y_train, X_train, y_test, X_test.\n",
80 "    \"\"\"\n",
81 "    label = target.name\n",
82 "    all_dates = np.sort(target.index.get_level_values('date').unique())\n",
83 "    cv_window = slice(all_dates[0], all_dates[-period - 2])\n",
84 "    holdout_window = slice(all_dates[-period - 1], all_dates[-1])\n",
85 "\n",
86 "    data = features.join(target.to_frame())\n",
87 "\n",
88 "    def _split(window):\n",
89 "        # Select every first-level key within the given date window\n",
90 "        part = data.loc[(slice(None), window), :]\n",
91 "        return part[label], part.drop(label, axis=1)\n",
92 "\n",
93 "    y_train, X_train = _split(cv_window)\n",
94 "    y_test, X_test = _split(holdout_window)\n",
95 "    return y_train, X_train, y_test, X_test"
86 ]
87 },
88 {
89 "cell_type": "markdown",
90 "metadata": {},
91 "source": [
92 "## Custom TimeSeriesSplit"
93 ]
94 },
95 {
96 "cell_type": "code",
97 "execution_count": 8,
98 "metadata": {},
99 "outputs": [],
100 "source": [
101 "class OneStepTimeSeriesSplit:\n",
102 "    \"\"\"Generates tuples of train_idx, test_idx pairs.\n",
103 "\n",
104 "    Assumes the index contains a level labeled 'date'. The most recent\n",
105 "    n_splits * test_period_length dates are cut into consecutive test periods,\n",
106 "    newest first; each fold trains on every row dated strictly before its test period.\n",
107 "    \"\"\"\n",
108 "\n",
109 "    def __init__(self, n_splits=3, test_period_length=1, shuffle=False):\n",
110 "        self.n_splits = n_splits\n",
111 "        self.test_period_length = test_period_length\n",
112 "        self.shuffle = shuffle\n",
113 "        # Number of most-recent unique dates that are used for testing across all folds\n",
114 "        self.test_end = n_splits * test_period_length\n",
115 "\n",
116 "    @staticmethod\n",
117 "    def chunks(l, n):\n",
118 "        \"\"\"Yield successive slices of l holding n items each (the last may be shorter).\"\"\"\n",
119 "        for i in range(0, len(l), n):\n",
120 "            yield l[i:i + n]\n",
121 "\n",
122 "    def split(self, X, y=None, groups=None):\n",
123 "        \"\"\"Yield (train_idx, test_idx) pairs of positional row indices into X.\n",
124 "\n",
125 "        y and groups are accepted for scikit-learn compatibility and are not used.\n",
126 "        \"\"\"\n",
127 "        unique_dates = (X\n",
128 "                        .index\n",
129 "                        .get_level_values('date')\n",
130 "                        .unique()\n",
131 "                        .sort_values(ascending=False)\n",
132 "                        [:self.test_end])\n",
133 "\n",
134 "        # reset_index gives a 0..n-1 index, so the selected labels are row positions\n",
135 "        dates = X.reset_index()[['date']]\n",
136 "        for test_date in self.chunks(unique_dates, self.test_period_length):\n",
137 "            train_idx = dates[dates.date < min(test_date)].index\n",
138 "            test_idx = dates[dates.date.isin(test_date)].index\n",
139 "            if self.shuffle:\n",
140 "                # np.random.shuffle works in place and returns None, so shuffling a\n",
141 "                # throw-away list(train_idx) had no effect; permute a real array instead.\n",
142 "                train_idx = np.random.permutation(train_idx.values)\n",
143 "            yield train_idx, test_idx\n",
144 "\n",
145 "    def get_n_splits(self, X=None, y=None, groups=None):\n",
146 "        \"\"\"Return the configured number of folds; all arguments are ignored.\"\"\"\n",
147 "        return self.n_splits"
134 ]
135 },
136 {
137 "cell_type": "markdown",
138 "metadata": {},
139 "source": [
140 "## Instantiate GradientBoostingClassifier"
141 ]
142 },
143 {
144 "cell_type": "code",
145 "execution_count": 9,
146 "metadata": {},
147 "outputs": [],
148 "source": [
149 "gb_clf = GradientBoostingClassifier(loss='deviance',\n",
150 " learning_rate=0.1,\n",
151 " n_estimators=100,\n",
152 " subsample=1.0,\n",
153 " criterion='friedman_mse',\n",
154 " min_samples_split=2,\n",
155 " min_samples_leaf=1,\n",
156 " min_weight_fraction_leaf=0.0,\n",
157 " max_depth=3,\n",
158 " min_impurity_decrease=0.0,\n",
159 " min_impurity_split=None,\n",
160 " init=None,\n",
161 " random_state=None,\n",
162 " max_features=None,\n",
163 " verbose=0,\n",
164 " max_leaf_nodes=None,\n",
165 " warm_start=False,\n",
166 " presort='auto',\n",
167 " validation_fraction=0.1,\n",
168 " n_iter_no_change=None,\n",
169 " tol=0.0001)"
170 ]
171 },
172 {
173 "cell_type": "markdown",
174 "metadata": {},
175 "source": [
176 "## Load Data"
177 ]
178 },
179 {
180 "cell_type": "markdown",
181 "metadata": {},
182 "source": [
183 "We use the dataset generated by the notebook [feature-engineering](../04_alpha_factor_research/00_data/feature_engineering.ipynb) from [Chapter 4 on Alpha Factor Research](../04_alpha_factor_research) that needs to be executed first."
184 ]
185 },
186 {
187 "cell_type": "code",
188 "execution_count": 3,
189 "metadata": {},
190 "outputs": [],
191 "source": [
192 "DATA_STORE = Path('../../data/assets.h5')"
193 ]
194 },
195 {
196 "cell_type": "code",
197 "execution_count": 1,
198 "metadata": {},
199 "outputs": [],
200 "source": [
201 "def get_data(start='2000', end='2018', holding_period=1, dropna=False):\n",
202 "    \"\"\"Load the engineered features and derive a binary target.\n",
203 "\n",
204 "    Reads 'engineered_features' from DATA_STORE, optionally restricts the second\n",
205 "    index level to start:end (either bound may be None for an open-ended slice)\n",
206 "    and optionally drops rows that contain missing values.\n",
207 "\n",
208 "    Returns (y, X): y is 1 where target_{holding_period}m > 0 and 0 otherwise;\n",
209 "    X holds every column whose name does not start with 'target'.\n",
210 "    \"\"\"\n",
211 "    idx = pd.IndexSlice\n",
212 "    target = f'target_{holding_period}m'\n",
213 "    # Read-only: the default append mode would create an empty store if the path is wrong\n",
214 "    with pd.HDFStore(DATA_STORE, mode='r') as store:\n",
215 "        df = store['engineered_features']\n",
216 "\n",
217 "    # Slice whenever at least one bound is given; a None bound leaves that side open\n",
218 "    if start is not None or end is not None:\n",
219 "        df = df.loc[idx[:, start: end], :]\n",
220 "    if dropna:\n",
221 "        df = df.dropna()\n",
222 "\n",
223 "    # NOTE(review): with dropna=False a missing target compares False and is labelled 0 - confirm intended\n",
224 "    y = (df[target] > 0).astype(int)\n",
225 "    X = df.drop([c for c in df.columns if c.startswith('target')], axis=1)\n",
226 "    return y, X"
215 ]
216 },
217 {
218 "cell_type": "code",
219 "execution_count": null,
220 "metadata": {},
221 "outputs": [],
222 "source": [
223 "n_splits = 12\n",
224 "y, features = get_data()\n",
225 "X = get_one_hot_data(features).dropna()\n",
226 "\n",
227 "y, X, y_test, X_test = get_holdout_set(target=y,\n",
228 " features=X)"
229 ]
230 },
231 {
232 "cell_type": "code",
233 "execution_count": null,
234 "metadata": {},
235 "outputs": [],
236 "source": [
237 "with pd.HDFStore('model_tuning.h5') as store:\n",
238 " store.put('holdout/features', X_test)\n",
239 " store.put('holdout/target', y_test)"
240 ]
241 },
242 {
243 "cell_type": "markdown",
244 "metadata": {},
245 "source": [
246 "## Setup GridSearchCV"
247 ]
248 },
249 {
250 "cell_type": "markdown",
251 "metadata": {},
252 "source": [
253 "The [GridSearchCV](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html) class in sklearn's [model_selection](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection) module facilitates the systematic evaluation of all combinations of the hyperparameter values that we would like to test.\n",
254 " \n",
255 "In the following code, we will illustrate this functionality for seven tuning parameters that will result in a total of $2^4 \\times 3^2 \\times 4 = 576$ different model configurations."
256 ]
257 },
258 {
259 "cell_type": "markdown",
260 "metadata": {},
261 "source": [
262 "### Parameter Grid"
263 ]
264 },
265 {
266 "cell_type": "markdown",
267 "metadata": {},
268 "source": [
269 "First we define the cross-validation iterator:"
270 ]
271 },
272 {
273 "cell_type": "code",
274 "execution_count": null,
275 "metadata": {},
276 "outputs": [],
277 "source": [
278 "# One test fold per most-recent date (n_splits folds), each trained on all earlier dates\n",
279 "cv = OneStepTimeSeriesSplit(n_splits=n_splits)"
280 ]
281 },
279 {
280 "cell_type": "markdown",
281 "metadata": {},
282 "source": [
283 "And next the parameter grid"
284 ]
285 },
286 {
287 "cell_type": "code",
288 "execution_count": 10,
289 "metadata": {},
290 "outputs": [],
291 "source": [
292 "param_grid = dict(\n",
293 " learning_rate=[.01, .1, .2],\n",
294 " max_depth=list(range(3, 13, 3)),\n",
295 " max_features=['sqrt', .8, 1],\n",
296 " min_impurity_decrease=[0, .01],\n",
297 " min_samples_split=[10, 50],\n",
298 " n_estimators=[100, 300],\n",
299 " subsample=[.8, 1],\n",
300 ")"
301 ]
302 },
303 {
304 "cell_type": "code",
305 "execution_count": 11,
306 "metadata": {},
307 "outputs": [
308 {
309 "name": "stdout",
310 "output_type": "stream",
311 "text": [
312 "# Models = : 576\n"
313 ]
314 }
315 ],
316 "source": [
317 "all_params = list(product(*param_grid.values()))\n",
318 "print('# Models = :', len(all_params))"
319 ]
320 },
321 {
322 "cell_type": "markdown",
323 "metadata": {},
324 "source": [
325 "### Instantiate GridSearchCV"
326 ]
327 },
328 {
329 "cell_type": "code",
330 "execution_count": null,
331 "metadata": {},
332 "outputs": [],
333 "source": [
334 "gs = GridSearchCV(gb_clf,\n",
335 " param_grid,\n",
336 " cv=cv,\n",
337 " scoring='roc_auc',\n",
338 " verbose=3,\n",
339 " n_jobs=-1,\n",
340 " return_train_score=True)\n",
341 "\n"
342 ]
343 },
344 {
345 "cell_type": "markdown",
346 "metadata": {},
347 "source": [
348 "### Fit GridSearchCV"
349 ]
350 },
351 {
352 "cell_type": "markdown",
353 "metadata": {},
354 "source": [
355 "This can take several days..."
356 ]
357 },
358 {
359 "cell_type": "code",
360 "execution_count": null,
361 "metadata": {},
362 "outputs": [],
363 "source": [
364 "start = time()\n",
365 "gs.fit(X=X, y=y)\n",
366 "done = time()"
367 ]
368 },
369 {
370 "cell_type": "markdown",
371 "metadata": {},
372 "source": [
373 "### Persist Results"
374 ]
375 },
376 {
377 "cell_type": "code",
378 "execution_count": null,
379 "metadata": {},
380 "outputs": [],
381 "source": [
382 "# done and start are both time() stamps from the fit cell; report the elapsed seconds\n",
383 "print(f'Done in {done - start:.2f}s')\n",
384 "joblib.dump(gs, 'gbm_gridsearch.joblib')"
384 ]
385 }
386 ],
387 "metadata": {
388 "kernelspec": {
389 "display_name": "Python 3",
390 "language": "python",
391 "name": "python3"
392 },
393 "language_info": {
394 "codemirror_mode": {
395 "name": "ipython",
396 "version": 3
397 },
398 "file_extension": ".py",
399 "mimetype": "text/x-python",
400 "name": "python",
401 "nbconvert_exporter": "python",
402 "pygments_lexer": "ipython3",
403 "version": "3.7.0"
404 },
405 "toc": {
406 "base_numbering": 1,
407 "nav_menu": {},
408 "number_sections": true,
409 "sideBar": true,
410 "skip_h1_title": true,
411 "title_cell": "Table of Contents",
412 "title_sidebar": "Contents",
413 "toc_cell": false,
414 "toc_position": {},
415 "toc_section_display": true,
416 "toc_window_display": false
417 }
418 },
419 "nbformat": 4,
420 "nbformat_minor": 2
421 }