ml-finance-python
Python scripts for machine learning in finance
git clone https://9o.is/git/ml-finance-python.git
yelp_sentiment.ipynb
(80149B)
1 {
2 "cells": [
3 {
4 "cell_type": "code",
5 "execution_count": 1,
6 "metadata": {
7 "ExecuteTime": {
8 "end_time": "2018-12-28T02:16:30.922402Z",
9 "start_time": "2018-12-28T02:16:30.638781Z"
10 }
11 },
12 "outputs": [
13 {
14 "name": "stderr",
15 "output_type": "stream",
16 "text": [
17 "[nltk_data] Downloading package stopwords to /home/stefan/nltk_data...\n",
18 "[nltk_data] Package stopwords is already up-to-date!\n"
19 ]
20 },
21 {
22 "data": {
23 "text/plain": [
24 "True"
25 ]
26 },
27 "execution_count": 1,
28 "metadata": {},
29 "output_type": "execute_result"
30 }
31 ],
32 "source": [
33 "import nltk\n",
34 "nltk.download('stopwords')"
35 ]
36 },
37 {
38 "cell_type": "code",
39 "execution_count": 2,
40 "metadata": {
41 "ExecuteTime": {
42 "end_time": "2018-12-28T02:16:31.728129Z",
43 "start_time": "2018-12-28T02:16:30.925942Z"
44 }
45 },
46 "outputs": [],
47 "source": [
48 "from pathlib import Path\n",
49 "import numpy as np\n",
50 "import pandas as pd\n",
51 "from gensim.models import Doc2Vec\n",
52 "from gensim.models.doc2vec import TaggedDocument\n",
53 "import logging\n",
54 "import warnings\n",
55 "from random import shuffle\n",
56 "import lightgbm as lgb\n",
57 "from sklearn.model_selection import train_test_split\n",
58 "from nltk import RegexpTokenizer\n",
59 "from nltk.corpus import stopwords\n",
60 "from sklearn.linear_model import LogisticRegression\n",
61 "from sklearn.ensemble import RandomForestClassifier\n",
62 "from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score\n",
63 "from sklearn.utils import class_weight\n",
64 "import umap\n", "import seaborn as sns"
65 ]
66 },
67 {
68 "cell_type": "code",
69 "execution_count": 3,
70 "metadata": {
71 "ExecuteTime": {
72 "end_time": "2018-12-28T02:16:35.063506Z",
73 "start_time": "2018-12-28T02:16:35.061306Z"
74 }
75 },
76 "outputs": [],
77 "source": [
78 "warnings.filterwarnings('ignore')\n",
79 "pd.set_option('display.expand_frame_repr', False)\n",
80 "np.random.seed(42)"
81 ]
82 },
83 {
84 "cell_type": "code",
85 "execution_count": 4,
86 "metadata": {
87 "ExecuteTime": {
88 "end_time": "2018-12-28T02:16:35.246590Z",
89 "start_time": "2018-12-28T02:16:35.238924Z"
90 }
91 },
92 "outputs": [],
93 "source": [
94 "logging.basicConfig(\n",
95 " filename='doc2vec.log',\n",
96 " level=logging.DEBUG,\n",
97 " format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',\n",
98 " datefmt='%H:%M:%S')"
99 ]
100 },
101 {
102 "cell_type": "markdown",
103 "metadata": {},
104 "source": [
105 "## Load Data"
106 ]
107 },
108 {
109 "cell_type": "code",
110 "execution_count": 38,
111 "metadata": {
112 "ExecuteTime": {
113 "end_time": "2018-12-28T02:31:45.448079Z",
114 "start_time": "2018-12-28T02:31:21.942362Z"
115 }
116 },
117 "outputs": [],
118 "source": [
119 "df = pd.read_parquet('combined.parquet', engine='fastparquet').loc[:, ['stars', 'text']]"
120 ]
121 },
122 {
123 "cell_type": "code",
124 "execution_count": 39,
125 "metadata": {
126 "ExecuteTime": {
127 "end_time": "2018-12-28T02:31:45.481577Z",
128 "start_time": "2018-12-28T02:31:45.449302Z"
129 }
130 },
131 "outputs": [
132 {
133 "data": {
134 "text/plain": [
135 "5 2641880\n",
136 "4 1335957\n",
137 "1 858139\n",
138 "3 673206\n",
139 "2 487813\n",
140 "Name: stars, dtype: int64"
141 ]
142 },
143 "execution_count": 39,
144 "metadata": {},
145 "output_type": "execute_result"
146 }
147 ],
148 "source": [
149 "df.stars.value_counts()"
150 ]
151 },
152 {
153 "cell_type": "code",
154 "execution_count": 91,
155 "metadata": {
156 "ExecuteTime": {
157 "end_time": "2018-12-28T02:49:18.719387Z",
158 "start_time": "2018-12-28T02:49:18.715253Z"
159 }
160 },
161 "outputs": [],
162 "source": [
163 "stars = range(1, 6)"
164 ]
165 },
166 {
167 "cell_type": "code",
168 "execution_count": 40,
169 "metadata": {
170 "ExecuteTime": {
171 "end_time": "2018-12-28T02:32:45.883173Z",
172 "start_time": "2018-12-28T02:32:45.238120Z"
173 }
174 },
175 "outputs": [],
176 "source": [
177 "sample = pd.concat([df[df.stars==s].sample(n=100000) for s in stars])"
178 ]
179 },
180 {
181 "cell_type": "code",
182 "execution_count": 41,
183 "metadata": {
184 "ExecuteTime": {
185 "end_time": "2018-12-28T02:32:47.838880Z",
186 "start_time": "2018-12-28T02:32:47.743143Z"
187 }
188 },
189 "outputs": [
190 {
191 "name": "stdout",
192 "output_type": "stream",
193 "text": [
194 "<class 'pandas.core.frame.DataFrame'>\n",
195 "Int64Index: 500000 entries, 52085 to 3365007\n",
196 "Data columns (total 2 columns):\n",
197 "stars 500000 non-null int64\n",
198 "text 500000 non-null object\n",
199 "dtypes: int64(1), object(1)\n",
200 "memory usage: 11.4+ MB\n"
201 ]
202 }
203 ],
204 "source": [
205 "sample.info()"
206 ]
207 },
208 {
209 "cell_type": "code",
210 "execution_count": 66,
211 "metadata": {
212 "ExecuteTime": {
213 "end_time": "2018-12-28T00:07:55.455558Z",
214 "start_time": "2018-12-28T00:07:55.438396Z"
215 }
216 },
217 "outputs": [],
218 "source": [
219 "sample.stars = (sample.stars == 5).astype(int)"
220 ]
221 },
222 {
223 "cell_type": "code",
224 "execution_count": 42,
225 "metadata": {
226 "ExecuteTime": {
227 "end_time": "2018-12-28T02:32:54.195893Z",
228 "start_time": "2018-12-28T02:32:54.187161Z"
229 }
230 },
231 "outputs": [
232 {
233 "data": {
234 "text/plain": [
235 "5 100000\n",
236 "4 100000\n",
237 "3 100000\n",
238 "2 100000\n",
239 "1 100000\n",
240 "Name: stars, dtype: int64"
241 ]
242 },
243 "execution_count": 42,
244 "metadata": {},
245 "output_type": "execute_result"
246 }
247 ],
248 "source": [
249 "sample.stars.value_counts()"
250 ]
251 },
252 {
253 "cell_type": "code",
254 "execution_count": 43,
255 "metadata": {
256 "ExecuteTime": {
257 "end_time": "2018-12-28T02:33:04.902169Z",
258 "start_time": "2018-12-28T02:33:03.174809Z"
259 }
260 },
261 "outputs": [],
262 "source": [
263 "sample.to_parquet('yelp_sample_5.parquet')"
264 ]
265 },
266 {
267 "cell_type": "code",
268 "execution_count": 12,
269 "metadata": {
270 "ExecuteTime": {
271 "end_time": "2018-12-28T02:18:17.982897Z",
272 "start_time": "2018-12-28T02:18:17.121098Z"
273 }
274 },
275 "outputs": [],
276 "source": [
277 "sample = pd.read_parquet('yelp_sample_5.parquet').reset_index(drop=True)  # reload the file the previous cell writes"
278 ]
279 },
280 {
281 "cell_type": "code",
282 "execution_count": 44,
283 "metadata": {
284 "ExecuteTime": {
285 "end_time": "2018-12-28T02:33:08.533674Z",
286 "start_time": "2018-12-28T02:33:08.526044Z"
287 }
288 },
289 "outputs": [
290 {
291 "data": {
292 "text/html": [
293 "<div>\n",
294 "<style scoped>\n",
295 " .dataframe tbody tr th:only-of-type {\n",
296 " vertical-align: middle;\n",
297 " }\n",
298 "\n",
299 " .dataframe tbody tr th {\n",
300 " vertical-align: top;\n",
301 " }\n",
302 "\n",
303 " .dataframe thead th {\n",
304 " text-align: right;\n",
305 " }\n",
306 "</style>\n",
307 "<table border=\"1\" class=\"dataframe\">\n",
308 " <thead>\n",
309 " <tr style=\"text-align: right;\">\n",
310 " <th></th>\n",
311 " <th>stars</th>\n",
312 " <th>text</th>\n",
313 " </tr>\n",
314 " </thead>\n",
315 " <tbody>\n",
316 " <tr>\n",
317 " <th>52085</th>\n",
318 " <td>1</td>\n",
319 " <td>Just terrible.\\n\\nI used to love Chili's - it ...</td>\n",
320 " </tr>\n",
321 " <tr>\n",
322 " <th>527763</th>\n",
323 " <td>1</td>\n",
324 " <td>I love Cold Stone ice cream, but this location...</td>\n",
325 " </tr>\n",
326 " <tr>\n",
327 " <th>3797997</th>\n",
328 " <td>1</td>\n",
329 " <td>I don't understand why people give this place ...</td>\n",
330 " </tr>\n",
331 " <tr>\n",
332 " <th>4715860</th>\n",
333 " <td>1</td>\n",
334 " <td>Terrible disappointment. It was a special cel...</td>\n",
335 " </tr>\n",
336 " <tr>\n",
337 " <th>2230375</th>\n",
338 " <td>1</td>\n",
339 " <td>Staff is awful. One called his coworker a bitc...</td>\n",
340 " </tr>\n",
341 " </tbody>\n",
342 "</table>\n",
343 "</div>"
344 ],
345 "text/plain": [
346 " stars text\n",
347 "52085 1 Just terrible.\\n\\nI used to love Chili's - it ...\n",
348 "527763 1 I love Cold Stone ice cream, but this location...\n",
349 "3797997 1 I don't understand why people give this place ...\n",
350 "4715860 1 Terrible disappointment. It was a special cel...\n",
351 "2230375 1 Staff is awful. One called his coworker a bitc..."
352 ]
353 },
354 "execution_count": 44,
355 "metadata": {},
356 "output_type": "execute_result"
357 }
358 ],
359 "source": [
360 "sample.head()"
361 ]
362 },
363 {
364 "cell_type": "code",
365 "execution_count": 17,
366 "metadata": {
367 "ExecuteTime": {
368 "end_time": "2018-12-28T02:19:00.180749Z",
369 "start_time": "2018-12-28T02:18:56.814179Z"
370 }
371 },
372 "outputs": [
373 {
374 "data": {
375 "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYoAAAEKCAYAAAAMzhLIAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAIABJREFUeJzt3XuYXNV55/vvW7e+qVvdaklIqAUtLBkQ2BagYGzGyQTGWHgyluPgE2EfB58wh5wcM4nnjM8EZh57Ep5cTGYm2H7scQYbYuJEBg/jOIpDjLl5HBJH0GAZkGShBoTUkpBa91tf6vLOH3tXqyiqq3Z3V3dd9Ps8Tz1Ve+21V63dau23116Xbe6OiIjIZGK1roCIiNQ3BQoRESlLgUJERMpSoBARkbIUKEREpCwFChERKUuBQkREylKgEBGRshQoRESkrEStK1ANCxcu9P7+/lpXQ0SkoTz33HOH3H1RpXxNESj6+/sZGBiodTVERBqKmb0eJZ9uPYmISFkKFCIiUpYChYiIlKVAISIiZSlQiIhIWQoUIiJSlgKFiIiUpUAxRXp0rIicaxQopuCvt+zl3X/4BNv3n6h1VURE5owCxRRs33+SgyfH+MR9z7Dr0OlaV0dEZE5EChRmts7MdpjZoJndUWJ/i5k9FO7fbGb9YXqvmT1lZqfM7MsF+TvNbEvB65CZfSHc90kzGy7Y96+rc6ozd3xknI5UnJw7H//6Zg6eHK11lUREZl3FQGFmceArwI3AauBmM1tdlO1W4Ki7rwTuAe4O00eBzwKfKczs7ifdfU3+BbwOfKcgy0MF+78+nRObDcdH0iztbuMb/9fPsffYCN/9yd5aV0lEZNZFaVFcDQy6+6vuPg48CKwvyrMeeCD8/DBwvZmZu59296cJAkZJZrYKWAz8/ZRrP8eOnUkzvy3JO/u66WlPsuvwmVpXSURk1kUJFMuAPQXbQ2FayTzungGOA70R63AzQQuicDjRr5jZC2b2sJktj1jOrDs+kqa7LQnABb0d7FagEJFzQJRAYSXSiseIRskzmQ3Atwq2/wbod/d3Ao9ztqXy5i80u83MBsxsYHh4OOJXzUy+RQFw4YJ2Xj+iDm0RaX5RAsUQUPhXfR+wb7I8ZpYA5gNHKhVsZu8CEu7+XD7N3Q+7+1i4+TXgqlLHuvu97r7W3dcuWlTxuRsztnHzbg6dGuONE6Ns3LybE6Np9h4dIZ3Nzfp3i4jUUpRA8SywysxWmFmKoAWwqSjPJuCW8PNNwJMebWbazby5NYGZLS3Y/BCwPUI5sy6bc8YyOdqScQB6O1LkHPYdG6lxzUREZlfFJ9y5e8bMbgceBeLA/e6+1czuAgbcfRNwH/BNMxskaElsyB9vZruALiBlZh8GbnD3beHu/wP4YNFX/paZfQjIhGV9cgbnVzWj6SwAbakgUCzoaAHg9cNnuLC3o2b1EhGZbZEeherujwCPFKV9ruDzKPDRSY7tL1PuRSXS7gTujFKvuTSSDxTJfKBIAfD6EXVoi0hz08zsiEbGg0DRHrYoOlsTJGLG7sPq0BaR5qZAEVFxiyJmRk9Hit1qUYhIk1OgiOhM2KJoDVsUEHRov665FCLS5BQoIsq3KNpTZ7t1FoQtCi09LiLNTIEiopHxDHD21hMEgeLMeJZDp8ZrVS0RkVmnQBHRyHiWVCJGPHZ2Enp+5JP6KUSkmSlQRDSSzr6pNQGFgUIjn0SkeSlQRDQynp0YGpvX057CDHVoi0hTU6CI6Ew6S2tRiyIZj7Gkq1WryIpIU1OgiKhUiwLgggXtmp0tIk1NgSKi0RJ9FADLF7QzdFSBQkSalwJFRGfGSweKvp42Dp4cYyyTrUGtRERmnwJFBKPpLJmcT6wcW2hZdxvusP/YpE97FRFpaAoUERwfSQOUDBR9Pe0ADB3VcylEpDkpUEQwESgmufUEsPeY+ilEpDkpUERw7MzkLYol81
uJmVoUItK8FCgiyLco2pNvfc5TMh5j6fw29ipQiEiTUqCI4NiZYNG/Ui0KCDq01aIQkWYV6VGo57pyfRQbN+8mnc3x2qHTbNy8eyL9Y+++YM7qJyIym9SiiOD4SBoDWpKlf1zd7SmOj6TJ5vRcChFpPpEChZmtM7MdZjZoZneU2N9iZg+F+zebWX+Y3mtmT5nZKTP7ctExPwzL3BK+Fpcrq5aOj6RpTcaJmZXc39OexIETYctDRKSZVAwUZhYHvgLcCKwGbjaz1UXZbgWOuvtK4B7g7jB9FPgs8JlJiv+4u68JXwcrlFUzx0fSk/ZPQNCiADh6Rg8wEpHmE6VFcTUw6O6vuvs48CCwvijPeuCB8PPDwPVmZu5+2t2fJggYUZUsawrHV92xM+mS/RN5Pe1JAI6eUYtCRJpPlECxDNhTsD0UppXM4+4Z4DjQG6HsPwtvO322IBhMt6xZc3wkXXLl2Lz5bUmMs6OjRESaSZRAUeqv+eJe2yh5in3c3d8BvC98fWIqZZnZbWY2YGYDw8PDFb5qZvJ9FJNJxGN0tibUohCRphQlUAwBywu2+4B9k+UxswQwHzhSrlB33xu+nwQ2EtziilyWu9/r7mvdfe2iRYsinMb0jaazpOLlf1Td7Sm1KESkKUUJFM8Cq8xshZmlgA3ApqI8m4Bbws83AU+6+6QtCjNLmNnC8HMS+CXgpemUNRfSWScWK99N0tOeVGe2iDSlihPu3D1jZrcDjwJx4H5332pmdwED7r4JuA/4ppkNEvz1vyF/vJntArqAlJl9GLgBeB14NAwSceBx4GvhIZOWVSvpbI4KDQq621O8uPc4OfdJh9GKiDSiSDOz3f0R4JGitM8VfB4FPjrJsf2TFHvVJPknLatWMtkc8QoX/572FDkP5lLkh8uKiDQDzcyOIJ1z4hVuPXVriKyINCkFiggy2VzFPorutiBQHNfsbBFpMgoUFeRyTs6peOtpfrsChYg0JwWKCtK5HEDFW08tiThtybiGyIpI01GgqCCdDUbmRhnJ1N2eVItCRJqOAkUFmWy0FgUES3koUIhIs1GgqCDfoogaKI5p1JOINBkFigoy+T6KSLeeUoyks4xlsrNdLRGROaNAUUEm30cRoUUxMURWrQoRaSIKFBWkJ/ooKufNT7o7pn4KEWkiChQVTGXU03y1KESkCSlQVJBvUSQi3HrqbA0fYDSiuRQi0jwUKCrI5KL3UcRjRpeGyIpIk1GgqGBiHkXEpcM1RFZEmo0CRQXpKYx6gqBDW53ZItJMFCgqmMo8CgiGyJ4YSZPL1fShfCIiVaNAUUF6Ckt4QHDrKZNzDp9Wh7aINAcFigqmsoQHMPF0u33HRmatTiIic0mBooKpzMyGs3Mp9h9XoBCR5qBAUcF0+igA9h4bnbU6iYjMpUiBwszWmdkOMxs0sztK7G8xs4fC/ZvNrD9M7zWzp8zslJl9uSB/u5n9rZn9zMy2mtnnC/Z90syGzWxL+PrXMz/N6Zvqrae2VJxk3HTrSUSaRsVAYWZx4CvAjcBq4GYzW12U7VbgqLuvBO4B7g7TR4HPAp8pUfR/cfdLgCuAa83sxoJ9D7n7mvD19SmdUZXl51FEjBOYGd3tKYaOnpnFWomIzJ0oLYqrgUF3f9Xdx4EHgfVFedYDD4SfHwauNzNz99Pu/jRBwJjg7mfc/anw8zjwPNA3g/OYNVMd9QSwoD3F0FG1KESkOUQJFMuAPQXbQ2FayTzungGOA71RKmBm3cC/Ap4oSP4VM3vBzB42s+VRypktU731BMGkuz1H1KIQkeYQJVCUukIWzyaLkuetBZslgG8BX3L3V8PkvwH63f2dwOOcbakUH3ubmQ2Y2cDw8HClr5q2qXZmAyzoSHFiNKM1n0SkKUQJFENA4V/1fcC+yfKEF//5wJEIZd8L7HT3L+QT3P2wu4+Fm18Drip1oLvf6+5r3X3tokWLInzV9Ex1CQ+AnnAuhVoVItIMogSKZ4FVZrbCzFLABm
BTUZ5NwC3h55uAJ929bIvCzH6fIKB8uih9acHmh4DtEeo4azLTuPXU0xEECnVoi0gzSFTK4O4ZM7sdeBSIA/e7+1YzuwsYcPdNwH3AN81skKAlsSF/vJntArqAlJl9GLgBOAH8R+BnwPMW3Nb5cjjC6bfM7ENAJizrk1U612lJZ3OYRXtwUd6CiRaFOrRFpPFVDBQA7v4I8EhR2ucKPo8CH53k2P5Jii155XX3O4E7o9RrLqRzOZJRnoNaoDUZo7MloRaFiDQFzcyuIJN1klO47QTBXIq+Be3s0RBZEWkCChQVZLI5ElNsUQAs72lTZ7aINAUFigrSOScZn1qLAmD5gnaGjo5QoU9fRKTuKVBUkMnmSMSm16IYSWc5dErPpRCRxqZAUUE66ySm0aLo62kHYI86tEWkwSlQVJDO5khNp49iQRAotOaTiDQ6BYoKMtNuUbQBmp0tIo1PgaKCTG56fRQdLQl6O7TcuIg0PgWKCtLZ6Y16AoK5FJqdLSINToGigkxuevMoIJxLoRaFiDQ4BYoK0hknMcWZ2Xl9Pe3sPTpCNqe5FCLSuBQoKkjncqQS0/sx9fe2k8m5np8tIg1NgaKCTHb6LYr+hR0AvHbodDWrJCIypxQoKkhPc60ngBVhoNh1WIFCRBqXAkUFmWmu9QSwuLOF9lRcLQoRaWgKFBVMd60nCJYbv7C3g10KFCLSwBQoKpjuWk95Kxa2s+uwhsiKSONSoKhgums95fX3drDnyBnS2VwVayUiMncUKCrI5Gbaouggk3MtDigiDUuBooL0DPoooGDkk/opRKRBRboCmtk6M9thZoNmdkeJ/S1m9lC4f7OZ9YfpvWb2lJmdMrMvFx1zlZm9GB7zJTOzMH2BmT1mZjvD956Zn+b0Zaa51tPGzbvZuHk3z+8+BsDDzw2xcfPualdPRGTWVQwUZhYHvgLcCKwGbjaz1UXZbgWOuvtK4B7g7jB9FPgs8JkSRX8VuA1YFb7Whel3AE+4+yrgiXC7Zmay1hNARypOSyLG4dNjVayViMjciXIFvBoYdPdX3X0ceBBYX5RnPfBA+Plh4HozM3c/7e5PEwSMCWa2FOhy9x978FDpPwc+XKKsBwrS55y7B6vHTnNmNgRDZBfOa+GwHokqIg0qSqBYBuwp2B4K00rmcfcMcBzorVDm0CRlnufu+8Oy9gOLSxVgZreZ2YCZDQwPD0c4janLhIv5JWfQogDonZfi0Cm1KESkMUW5Apb6c7p4OdQoeWaS/62Z3e9197XuvnbRokVTOTSyTDao0kxuPQH0drRw7EyaTE5DZEWk8US5Ag4Bywu2+4B9k+UxswQwHzhSocy+Sco8EN6ayt+iOhihjrMiHV7Yp7uER97CeSkcOHJat59EpPFECRTPAqvMbIWZpYANwKaiPJuAW8LPNwFPhn0PJYW3lE6a2TXhaKdfA/66RFm3FKTPuYkWxQz6KAB657UAqJ9CRBpSolIGd8+Y2e3Ao0AcuN/dt5rZXcCAu28C7gO+aWaDBC2JDfnjzWwX0AWkzOzDwA3uvg34TeAbQBvwd+EL4PPAt83sVmA38NFqnOh0ZMLZ1DO99bRwXgpA/RQi0pAqBgoAd38EeKQo7XMFn0eZ5ILu7v2TpA8Al5dIPwxcH6Ves208e/bW00xW4GhPJehsSXDghAKFiDQezcwuI3/raaajngAWd7Vw8ORo5YwiInVGgaKM/Cilmd56Aljc1crBE2Pk9PxsEWkwChRlpPMtihl2ZgOc19nKeDbHXj0/W0QajAJFGdWaRwFwXlcw8unlAydnXJaIyFxSoCgjPXHraeYtisWdrQC8fODUjMsSEZlLChRlpDPhqKcZLDOe15aK09WaYKdaFCLSYBQoyji71tPMWxQA53W18vJBBQoRaSwKFGWkqzThLm9xZwuDB09p5JOINBQFijLOzqOoXotiNJ1jz9EzVSlPRGQuKFCUMTGPogp9FBDMpQB1aItIY1GgKCNd5RbF4k4NkR
WRxqNAUUa1+yhak3HOn9+qQCEiDUWBooxq91EAvH1JJzveUKAQkcahQFHG2QcXVe/HdOnSLl4ZPsV4Rk+7E5HGoEBRRrUeXFRo9dIu0llnp+ZTiEiDUKAoo9p9FACrz+8CYNu+E1UrU0RkNilQlFHtmdkA/b0dtCXjbN+vFoWINAYFijLyaz1Vax4FQDxmXLykk237j1etTBGR2aRAUUZ6FloUENx+2rbvBO5aykNE6p8CRRmZbI5EzDCrcqBY2sWJ0YweYiQiDSFSoDCzdWa2w8wGzeyOEvtbzOyhcP9mM+sv2HdnmL7DzD4Qpl1sZlsKXifM7NPhvt81s70F+z5YnVOdukzOq/IsimL5Dm31U4hII6gYKMwsDnwFuBFYDdxsZquLst0KHHX3lcA9wN3hsauBDcBlwDrgv5lZ3N13uPsad18DXAWcAf6qoLx78vvd/ZGZneL0pbO5qjyLotglSzox08gnEWkMUa6CVwOD7v6qu48DDwLri/KsBx4IPz8MXG/B/Zr1wIPuPuburwGDYXmFrgdecffXp3sSsyWTnZ0WRXsqwYreDnVoi0hDiBIolgF7CraHwrSSedw9AxwHeiMeuwH4VlHa7Wb2gpndb2Y9pSplZreZ2YCZDQwPD0c4jalLZ3NVnUNR6NLzu9i2Xy0KEal/Ua6Cpf6kLh6uM1messeaWQr4EPA/CvZ/FXgbsAbYD/zXUpVy93vdfa27r120aNHktZ+BdNZJVTlQbNy8m42bd5PO5NhzZIT7n36NjZt3V/U7RESqKcpVcAhYXrDdB+ybLI+ZJYD5wJEIx94IPO/uB/IJ7n7A3bPungO+xltvVc2ZTC43K7eeAM7vbgNg6KhGPolIfYsSKJ4FVpnZirAFsAHYVJRnE3BL+Pkm4EkPJglsAjaEo6JWAKuAZwqOu5mi205mtrRg85eBl6KeTLVlsl7VdZ4KXbCgnZjBa4f0ECMRqW+JShncPWNmtwOPAnHgfnffamZ3AQPuvgm4D/immQ0StCQ2hMduNbNvA9uADPApd88CmFk78H7gN4q+8o/NbA3BLapdJfbPmXQ2V9WVYwu1JuOc393Gq4dOz0r5IiLVUjFQAIRDVB8pSvtcwedR4KOTHPsHwB+USD9D0OFdnP6JKHWaC7M1jyLvooUd/MPgYS05LiJ1TTOzy0hnc1Vd56nYioXzyLqz+8iZWfsOEZGZUqAoI53NVX3UU6H+XvVTiEj9U6AoY7Ym3OW1JOMsUz+FiNQ5BYoy0jmftQl3eSsWzmPoyAgj49lZ/R4RkelSoCgjk82RnKXhsXkXLeog687zu4/O6veIiEyXAkUZs33rCeDCcD7Fj185PKvfIyIyXQoUZczmWk95Lck4fT3t/Gjn7KxXJSIyUwoUZaRzszvqKe/t53XywtBxhk+Ozfp3iYhMlQJFGbO5hEehi5d0AvCjl9WqEJH6o0BRRjo7+6OeAJbOb2XhvBZ+qEAhInVIgaKMTC5HcpY7swFiZvzC2xfxo5eHyeaKV3AXEaktBYoygltPc/Mj+sVLFnF8JM2WPRomKyL1RYGijPHs3LQoAN63chExgx/u0O0nEakvkVaPPVdlZnGZ8WJ/++J+lve0853n97J0fttE+sfefcGcfL+IyGTUophELufknFmfcFfo4iWd7D02wonR9Jx9p4hIJQoUk0jngmdEzFWLAuCSJV0A7Nh/cs6+U0SkEgWKSWSyweijuZhHkXdeVwvd7Um2v3Fizr5TRKQSBYpJTASKOWxRmBmXLOnileFTpLN66p2I1AcFikmMZ/O3nuauRQFw6ZJO0lnnlYN6mJGI1IdIgcLM1pnZDjMbNLM7SuxvMbOHwv2bzay/YN+dYfoOM/tAQfouM3vRzLaY2UBB+gIze8zMdobvPTM7xenJ1KCPAmDFwg5SiRjb31A/hYjUh4rDY80sDnwFeD8wBDxrZpvcfVtBtluBo+6+0sw2AHcDv2pmq4ENwGXA+cDjZvZ2d88/pecX3f
1Q0VfeATzh7p8Pg9IdwO/M4BynZOPm3QAcOT0OwMCuo/gcTpZOxGOsWjyPHW+cwP38uftiEZFJRPlz+Wpg0N1fdfdx4EFgfVGe9cAD4eeHgevNzML0B919zN1fAwbD8sopLOsB4MMR6lh1uXApjTluUABw6ZIuToxm2HdsdO6/XESkSJTL4DJgT8H2UJhWMo+7Z4DjQG+FYx34gZk9Z2a3FeQ5z933h2XtBxZHO5XqyobNiJjNbR8FBPMpYgYv7j0+598tIlIsyszsUlfK4psxk+Upd+y17r7PzBYDj5nZz9z9RxHqE3xhEFxuA7jggurPXs5OtCjmPlB0tCRYtbiTnw4dI5dzYjWog4hIXpQWxRCwvGC7D9g3WR4zSwDzgSPljnX3/PtB4K84e0vqgJktDctaChwsVSl3v9fd17r72kWLFkU4jamZCBQ1aFEArLmgm+MjaTa/dqQm3y8ikhclUDwLrDKzFWaWIuic3lSUZxNwS/j5JuBJd/cwfUM4KmoFsAp4xsw6zKwTwMw6gBuAl0qUdQvw19M7tZnJee1aFBD0U6QSMb77k701+X4RkbyKgSLsc7gdeBTYDnzb3bea2V1m9qEw231Ar5kNAv8fwUgl3H0r8G1gG/B94FPhiKfzgKfN7KfAM8Dfuvv3w7I+D7zfzHYSjLT6fHVOdWryLYpa3fZJJWJcfn4Xj7y4n9F0tvIBIiKzJNLqse7+CPBIUdrnCj6PAh+d5Ng/AP6gKO1V4F2T5D8MXB+lXrOp1reeANYs7+H53cd4YvtB/uU7l9asHiJybtPM7EmMZYIJd6lE7X5EFy3qYHFnC995fqhmdRARUaCYxHgYKFpqGChiZtx0VR9P7TjI0NEzNauHiJzbFCgmMZYJ+gVakvGa1uPj11wIwDf/6fWa1kNEzl0KFJMYq4MWBcCy7jZuWL2Eh57do05tEakJBYpJjGVyxGxun0cxmVve28+xM2k2bSmeviIiMvsUKCYxlsmRSsSwGo56yrvmogVcfF4n3/jHXfhcrlAoIkLE4bHnovFMlpZEbfsn4Oxqtpcu7eK7W/byh4/8jBULO/jYu6u/bImISClqUUxiLJOref9EoTXLu2lPxXl653CtqyIi55j6uRLWmfE6CxSpRIxrLupl+xsnGT45VuvqiMg5pH6uhHUmaFHU/tZToWsu6iURM54eLH7Wk4jI7FGgmMRYJlvTWdmlzGtJcOUFPfxk91EOnVKrQkTmRn1dCetIvfVR5F27ciGZnPNn//BarasiIueI+rsS1omxdK7uWhQAizpbeMey+dz/9C4OntCjUkVk9tXflbBOjNdhH0XeDavPI5PLcc/jO2tdFRE5ByhQlJDJ5si605Kszx9P77wWPv7uC/n2wB4GD56sdXVEpMnV55Wwxuplnady/s11K2lLxrn7+ztqXRURaXKamV1CIwSKR7ce4L1v6+UH2w7wh3+7nf6FHQCasS0iVVe/V8IaGp94aFF99lHkvfdtC+lqTfB3L+3XGlAiMmsUKEqYeBZFHbcoIJit/S8uPY89R0fYuu9ErasjIk2qvq+ENdIIt57yrrigh8WdLTy69Y2J53yLiFRTpCuhma0zsx1mNmhmd5TY32JmD4X7N5tZf8G+O8P0HWb2gTBtuZk9ZWbbzWyrmf12Qf7fNbO9ZrYlfH1w5qc5NWcDRX3fegKIx4x1ly3h8OlxLRgoIrOiYqAwszjwFeBGYDVws5mtLsp2K3DU3VcC9wB3h8euBjYAlwHrgP8WlpcB/p27XwpcA3yqqMx73H1N+HpkRmc4DeMNcusp7+IlnVx+fhePbz/IS3uP17o6ItJkolwJrwYG3f1Vdx8HHgTWF+VZDzwQfn4YuN6CJ/6sBx509zF3fw0YBK529/3u/jyAu58EtgPLZn461TE20ZndGIHCzPjwmmW0t8T59ENb9MhUEamqKFfCZcCegu0h3npRn8jj7hngONAb5djwNtUVwOaC5NvN7AUzu9/MeiLUsa
oaqY8ir70lwU1X9jF48BR3fW+bRkGJSNVEuRKWehZo8VVosjxljzWzecD/BD7t7vlhO18F3gasAfYD/7VkpcxuM7MBMxsYHq7uvfmxdI64GYl44wQKgFXndfIbv3ARGzfv5ktPDNa6OiLSJKJcCYeA5QXbfcC+yfKYWQKYDxwpd6yZJQmCxF+6+3fyGdz9gLtn3T0HfI3g1tdbuPu97r7W3dcuWrQowmlEV49LjEf1Ox+4hJuu6uOex1/m/qe1wqyIzFyUq+GzwCozW2FmKYLO6U1FeTYBt4SfbwKe9ODexyZgQzgqagWwCngm7L+4D9ju7n9SWJCZLS3Y/GXgpame1EyNZ3J1u85TJbGY8fmPvIMPXHYed31vG997oTimi4hMTcUlPNw9Y2a3A48CceB+d99qZncBA+6+ieCi/00zGyRoSWwIj91qZt8GthGMdPqUu2fN7J8BnwBeNLMt4Vf9h3CE0x+b2RqCW1S7gN+o4vlGUq/Poohi4+bdQDBr+2f7T/LpB7ewde8JfufGS2pcMxFpVNYMnZ5r1671gYGBqpS1cfNu7n/6NcazOf6fX3hbVcqslVNjGb76w0HSWefRf/vzLOtuq3WVRKSOmNlz7r62Ur7G/LN5lo1lsg3boig0ryXBr72nn3Q2xyfvf4ZjZ8ZrXSURaUCNfzWcBWOZ+ny63XSc19XKJ665kNcPn+HXv/EsZ8Yzta6SiDSY5rgaVtlYHT/dbjouWjSPL928hi17jvGbf/G8JuSJyJQoUJTQyMNjJ7Pu8qX80Ufewf96eZhbH3iW02NqWYhINM11NawCdw+fl91cP5qNm3eTzcFHr+rjHwcPc+MX/557f/RqraslIg1AT7grksk5OW+s5Tum4ooLekjGYzw0sIc/eWwHbak4H7v6AuKxUpPoRUTUoniLRlznaaouXzaff3PdSs7vbuOz332Jm/70H9l16HStqyUidUotiiJj6fwS483TmV3K4s5Wbr12BVv2HONvXtjHDff8iH/5zqWsvbCHj19zYa2rJyJ1RIGiSKMtMT4TZsYVF/SwYmEHDz8/xF/9ZC8Du45w8ZJO1vYvqHX1RKRONP/VcIrG87eeGnStp+nobk/x69eu4Feu7OP4SJqb/vTH/OZEkGrMAAAMrklEQVRfPKfbUSICqEXxFo30GNRqiplx1YU9vGPZfE6MpvnT//UKj28/wK/+3HJ+5co+1izvJljLUUTONQoURcYa7DGo1ZZKxFg4r4Xfvn4Vj28/wLee2cNf/NNuFnSk+L/fdxE3X72c7vZUraspInNIgaLI+Dkw6imKztYkv3xFHzdevpSt+07w/O6j3P39n/HFJ17mI1f28evX9rNycWetqykic0CBosi5eutpMq3JOFdd2MNVF/aw//gIP37lMN9+dg8bN+9m1eJ5vPdtC/lP/2o1Mc3DEGlaChRF8reezoVRT1O1dH4bH7myjxsuW8Izrx1h82uHeeDHu/j7wWE+9K7zuXblQtYs7ybZYI+QFZHyFCiKjGVyJGKmmcplzGtJcN0li/n5ty/kpb3HeWX4NF98YidfeHwnLYkYq8/v4p3L5vOety3kvSt76WpN1rrKIjIDChRFmmmJ8dmWiMVYs7yHNct7uPHyJbw6fJrXD59m77FRvvXMHh748evEY8YVy7v5+bcv4n2rFvLOvm4FYZEGo0BRpBkXBJwL7akEly+bz+XL5gOQzTm7j5xh58GT7Dxwinsee5k/eexl2pJxrrt0MSsXzWPhvBTnd7dx+bL5nNfVWuMzEJHJKFAUSGdzvDp8iqXz9cjQmYrHjBULO1ixsIMbVsPpsQyDw6cYPHCKp3ce4pEX9lP4EN7O1gRX9y/g8mXzuXRpFysXd3DBgg617kTqgAJFgce2HeDEaIb1a7R8RbV1tCR4V1837+rrBoIWx5nxDEdOj7P32Ah7j46w+8gZntxxkMLHuLen4sxrSbB8QTurFs/jgt52ejtSLOhooa+njeUL2pnXol
9jkdkU6X+Yma0DvgjEga+7++eL9rcAfw5cBRwGftXdd4X77gRuBbLAb7n7o+XKNLMVwIPAAuB54BPuPicPe/7zH++iuz3JxUs0P2C2xWNGZ2uSztYkF/Z2TKSPZbIMnxxj+OQYR86MM5bOMTKe5fCpcX62fx+nx9/6dL72VJzO1gQ97Sn6ezu4aFEHiztb6G5PMb89yfy2JN1twfv8tiQJjcoSmZKKgcLM4sBXgPcDQ8CzZrbJ3bcVZLsVOOruK81sA3A38KtmthrYAFwGnA88bmZvD4+ZrMy7gXvc/UEz+9Ow7K9W42TLefnASf7p1SN84LIlxLRURc20JOL09bTT19Necv94JseZ8QynxoLWyJHT45weyzCWyXFqLMPA60f5wbY3yHnJwwHobEnQ1Zakqy1JeypOeypOWzJ8T8VpSyboaInT056id16K1uTZOTVGsJhiazJGR0uCeS0J2lNxOlIJUokYqUSMRMy03Ik0lSgtiquBQXd/FcDMHgTWA4WBYj3wu+Hnh4EvW/A/ZT3woLuPAa+Z2WBYHqXKNLPtwHXAx8I8D4Tlzkqg2LbvBC/uPcZl58/nLzfvJpWIsfbCntn4KqmS4GKcors9NWkwyeackXSW0fEsZ9JZRsazjKQznBnPMpLfHs8yms5y7Mw4B0/kSGed8WyOdCbHeDbHeCZHmVhTuZ7xGMm4kUzESMZjJGNGPG7ELRh6nYjFgvd4fvvN6a3JGO2pBG2pOO3JOK3JOBODxcIgZARrdMUMYjE7+9kMC9/jMaMlEQS1lkQMJ3iKozvkHHLu5MJtx8nlmDhvD+8Bvunn4Pm3cJ+/ZddEmhccaRjJuJFKxGgJA2oqHp9IS8RimAUtzfyrIxUE7JgZmZyDQzwe/KxiFr5XaQSd+5v/tacS6N39TfndnWzOiRf8wZDO5nCHZDxIy4W/o/l/HzNjLJPl9FiWlkSM9lQcdzg+kubkaIb5bUm62hKMpnPsPTbCqbEM589vZeG8ljmZ7BolUCwD9hRsDwHvniyPu2fM7DjQG6b/U9Gxy8LPpcrsBY65e6ZE/qr7wbY3+MLjOye2P3LlMjp0v7vhxWPGvPCv/enKuTOaznJqLEMm++aLiHvwH38sk2Msk2U8E3zO5pxseJHIvzI5J5vLkcsFZWY9eIJiLue4O2NpP5ueO3vhTofBajzrpDM50tlgxYCZBK9mZMZE4HCAfMDzMCDm8xFc/GMWBC0zJoJpJucTF/JSCoNvYRmOk86eDQrJuJHzs8sAxSz4wyaTDX4P8mnJeGxiBYh8WiIemzgOmBhCni1oGsdj9qZtCP4g+b31l3Hz1RfM6OdYSZT/SaXCVfGPdLI8k6WXuklcLv9bK2V2G3BbuHnKzHaUyjcV9wRvC4FDMy2rhlT/2lL9a6eR6w7TrP/H/vDsLZhpiPSUsiiBYghYXrDdB+ybJM+QmSWA+cCRCseWSj8EdJtZImxVlPouANz9XuDeCPWfEjMbcPe11S53rqj+taX6104j1x3qu/5Rhn88C6wysxVmliLonN5UlGcTcEv4+SbgSQ9u+m0CNphZSziaaRXwzGRlhsc8FZZBWOZfT//0RERkpiq2KMI+h9uBRwmGst7v7lvN7C5gwN03AfcB3ww7q48QXPgJ832boOM7A3zK3bMApcoMv/J3gAfN7PeBn4Rli4hIjVhxb/+5zsxuC29rNSTVv7ZU/9pp5LpDfddfgUJERMrSFFURESlLgaKAma0zsx1mNmhmd9S6PqWY2f1mdtDMXipIW2Bmj5nZzvC9J0w3M/tSeD4vmNmVtas5mNlyM3vKzLab2VYz++0Gq3+rmT1jZj8N6/97YfoKM9sc1v+hcIAG4SCOh8L6bzaz/lrWP8/M4mb2EzP7XrjdMPU3s11m9qKZbTGzgTCtUX5/us3sYTP7Wfh/4D2NUncFipCdXarkRmA1cLMFS5DUm28A64rS7gCecPdVwBPhNgTnsip83cYcLIVSQQb4d+5+KXAN8K
nwZ9wo9R8DrnP3dwFrgHVmdg1nl51ZBRwlWHYGCpa2IZimc3cN6lzKbwPbC7Ybrf6/6O5rCoaSNsrvzxeB77v7JcC7CP4NGqPuwXR+vYD3AI8WbN8J3Fnrek1S137gpYLtHcDS8PNSYEf4+b8DN5fKVw8vgqHP72/E+gPtBItWvptg/k+i+PeIYFTfe8LPiTCf1bjefQQXpOuA7xFMcm2k+u8CFhal1f3vD9AFvFb882uEuru7WhQFSi1VMmvLh1TZee6+HyB8Xxym1+05hbcxrgA200D1D2/bbAEOAo8BrzD5sjNvWtoGyC9tU0tfAP49kF8votyyOfVYfwd+YGbPWbA6AzTG789FwDDwZ+Ftv6+bWQeNUXcFigKRlw9pIHV5TmY2D/ifwKfd/US5rCXSalp/d8+6+xqCv8yvBi4tlS18r6v6m9kvAQfd/bnC5BJZ67L+oWvd/UqCWzOfMrOfL5O3nuqfAK4EvuruVwCnOXubqZR6qrsCRYEoS5XUqwNmthQgfD8YptfdOZlZkiBI/KW7fydMbpj657n7MeCHBH0t3RYsXQNvruNE/e3NS9vUyrXAh8xsF8EzX64jaGE0Sv1x933h+0HgrwiCdSP8/gwBQ+6+Odx+mCBwNELdFSgKRFmqpF4VLqFSuOzJJuDXwhEU1wDH883cWjAzI5hpv93d/6RgV6PUf5GZdYef24B/QdAhOdmyM5MtbVMT7n6nu/e5ez/B7/eT7v5xGqT+ZtZhZp35z8ANwEs0wO+Pu78B7DGzi8Ok6wlWrKj7ugPqzC58AR8EXia47/wfa12fSer4LWA/kCb4q+NWgvvGTwA7w/cFYV4jGMn1CvAisLbGdf9nBM3nF4At4euDDVT/dxIsK/MCwQXqc2H6RQRrmA0C/wNoCdNbw+3BcP9Ftf79KTiXfw58r5HqH9bzp+Fra/7/aAP9/qwBBsLfn+8CPY1Sd83MFhGRsnTrSUREylKgEBGRshQoRESkLAUKEREpS4FCRETKUqAQmaZwNdD/d5rH9pvZx6pdJ5HZoEAhMn3dwLQCBcHCjgoU0hAUKESm7/PA28JnI/xnM/v/zezZ8PkB+WdV/Fy43RrOLN5qZpeHx74vPPbf1vQsRCrQhDuRaQpXwP2eu19uZjcQLHPxGwSzajcBf+zuPzKz3yeY5dxGsN7PH5nZPwc+4+6/VJPKi0xBonIWEYnghvD1k3B7HsFDZ34E3EWwltgo8Fs1qZ3IDChQiFSHAX/k7v+9xL4FBIEjSdCyOD2XFROZKfVRiEzfSaAz/Pwo8OvhszYws2Vmln8Izb3AZ4G/5OzjRAuPFalralGITJO7HzazfzCzl4C/AzYCPw5WU+cU8H+a2Tog4+4bw+ey/6OZXQf8PZAxs58C33D3e2p0GiIVqTNbRETK0q0nEREpS4FCRETKUqAQEZGyFChERKQsBQoRESlLgUJERMpSoBARkbIUKEREpKz/DecErv0c3J59AAAAAElFTkSuQmCC\n",
376 "text/plain": [
377 "<Figure size 432x288 with 1 Axes>"
378 ]
379 },
380 "metadata": {
381 "needs_background": "light"
382 },
383 "output_type": "display_data"
384 }
385 ],
386 "source": [
387 "sns.distplot(sample.text.str.split().str.len());"
388 ]
389 },
390 {
391 "cell_type": "markdown",
392 "metadata": {},
393 "source": [
394 "## Doc2Vec"
395 ]
396 },
397 {
398 "cell_type": "markdown",
399 "metadata": {},
400 "source": [
401 "### Basic text cleaning"
402 ]
403 },
404 {
405 "cell_type": "code",
406 "execution_count": 45,
407 "metadata": {
408 "ExecuteTime": {
409 "end_time": "2018-12-28T02:33:19.647682Z",
410 "start_time": "2018-12-28T02:33:19.644233Z"
411 }
412 },
413 "outputs": [],
414 "source": [
415 "tokenizer = RegexpTokenizer(r'\\w+')\n",
416 "stopword_set = set(stopwords.words('english'))\n",
417 "\n",
418 "def clean(review):\n",
419 "    \"\"\"Tokenize `review` with the module-level tokenizer and return the non-stopword tokens joined by single spaces.\"\"\"\n",
420 "    return ' '.join(word for word in tokenizer.tokenize(review) if word not in stopword_set)"
421 ]
422 },
423 {
424 "cell_type": "code",
425 "execution_count": 46,
426 "metadata": {
427 "ExecuteTime": {
428 "end_time": "2018-12-28T02:33:35.354851Z",
429 "start_time": "2018-12-28T02:33:20.198492Z"
430 }
431 },
432 "outputs": [],
433 "source": [
434 "sample.text = sample.text.str.lower().apply(clean)"
435 ]
436 },
437 {
438 "cell_type": "code",
439 "execution_count": 47,
440 "metadata": {
441 "ExecuteTime": {
442 "end_time": "2018-12-28T02:33:35.368047Z",
443 "start_time": "2018-12-28T02:33:35.355960Z"
444 }
445 },
446 "outputs": [
447 {
448 "data": {
449 "text/html": [
450 "<div>\n",
451 "<style scoped>\n",
452 " .dataframe tbody tr th:only-of-type {\n",
453 " vertical-align: middle;\n",
454 " }\n",
455 "\n",
456 " .dataframe tbody tr th {\n",
457 " vertical-align: top;\n",
458 " }\n",
459 "\n",
460 " .dataframe thead th {\n",
461 " text-align: right;\n",
462 " }\n",
463 "</style>\n",
464 "<table border=\"1\" class=\"dataframe\">\n",
465 " <thead>\n",
466 " <tr style=\"text-align: right;\">\n",
467 " <th></th>\n",
468 " <th>stars</th>\n",
469 " <th>text</th>\n",
470 " </tr>\n",
471 " </thead>\n",
472 " <tbody>\n",
473 " <tr>\n",
474 " <th>3713191</th>\n",
475 " <td>1</td>\n",
476 " <td>called 938 placed order informer ian manager a...</td>\n",
477 " </tr>\n",
478 " <tr>\n",
479 " <th>3632813</th>\n",
480 " <td>3</td>\n",
481 " <td>ok best tip sell stuff buffalo exchange sharin...</td>\n",
482 " </tr>\n",
483 " <tr>\n",
484 " <th>1414414</th>\n",
485 " <td>5</td>\n",
486 " <td>afford rooms well worth money absolutely amazi...</td>\n",
487 " </tr>\n",
488 " <tr>\n",
489 " <th>4609094</th>\n",
490 " <td>3</td>\n",
491 " <td>little bit pricier nw competition peak hours d...</td>\n",
492 " </tr>\n",
493 " <tr>\n",
494 " <th>4996179</th>\n",
495 " <td>4</td>\n",
496 " <td>great pigging comfort food visiting great frie...</td>\n",
497 " </tr>\n",
498 " <tr>\n",
499 " <th>1826950</th>\n",
500 " <td>5</td>\n",
501 " <td>went sun auto fri mar 9th dealt patrick mantan...</td>\n",
502 " </tr>\n",
503 " <tr>\n",
504 " <th>4210188</th>\n",
505 " <td>5</td>\n",
506 " <td>went nail salon must say impressed level custo...</td>\n",
507 " </tr>\n",
508 " <tr>\n",
509 " <th>1354353</th>\n",
510 " <td>5</td>\n",
511 " <td>rita must love custard black cherry little bit...</td>\n",
512 " </tr>\n",
513 " <tr>\n",
514 " <th>2760</th>\n",
515 " <td>1</td>\n",
516 " <td>drittes goa pfaffing erlebt absolut nix unterh...</td>\n",
517 " </tr>\n",
518 " <tr>\n",
519 " <th>1118726</th>\n",
520 " <td>1</td>\n",
521 " <td>visited week ago im finally writing review pla...</td>\n",
522 " </tr>\n",
523 " </tbody>\n",
524 "</table>\n",
525 "</div>"
526 ],
527 "text/plain": [
528 " stars text\n",
529 "3713191 1 called 938 placed order informer ian manager a...\n",
530 "3632813 3 ok best tip sell stuff buffalo exchange sharin...\n",
531 "1414414 5 afford rooms well worth money absolutely amazi...\n",
532 "4609094 3 little bit pricier nw competition peak hours d...\n",
533 "4996179 4 great pigging comfort food visiting great frie...\n",
534 "1826950 5 went sun auto fri mar 9th dealt patrick mantan...\n",
535 "4210188 5 went nail salon must say impressed level custo...\n",
536 "1354353 5 rita must love custard black cherry little bit...\n",
537 "2760 1 drittes goa pfaffing erlebt absolut nix unterh...\n",
538 "1118726 1 visited week ago im finally writing review pla..."
539 ]
540 },
541 "execution_count": 47,
542 "metadata": {},
543 "output_type": "execute_result"
544 }
545 ],
546 "source": [
547 "sample.sample(n=10)"
548 ]
549 },
550 {
551 "cell_type": "code",
552 "execution_count": 48,
553 "metadata": {
554 "ExecuteTime": {
555 "end_time": "2018-12-28T02:33:57.874953Z",
556 "start_time": "2018-12-28T02:33:55.863246Z"
557 }
558 },
559 "outputs": [
560 {
561 "name": "stdout",
562 "output_type": "stream",
563 "text": [
564 "<class 'pandas.core.frame.DataFrame'>\n",
565 "Int64Index: 485681 entries, 52085 to 3365007\n",
566 "Data columns (total 2 columns):\n",
567 "stars 485681 non-null int64\n",
568 "text 485681 non-null object\n",
569 "dtypes: int64(1), object(1)\n",
570 "memory usage: 11.1+ MB\n"
571 ]
572 }
573 ],
574 "source": [
575 "sample = sample[sample.text.str.split().str.len()>10]\n",
576 "sample.info()"
577 ]
578 },
579 {
580 "cell_type": "markdown",
581 "metadata": {},
582 "source": [
583 "### Create sentence stream"
584 ]
585 },
586 {
587 "cell_type": "code",
588 "execution_count": 49,
589 "metadata": {
590 "ExecuteTime": {
591 "end_time": "2018-12-28T02:34:06.903097Z",
592 "start_time": "2018-12-28T02:34:01.100396Z"
593 }
594 },
595 "outputs": [],
596 "source": [
597 "sentences = []\n",
598 "for i, (_, text) in enumerate(sample.values):\n",
599 " sentences.append(TaggedDocument(words=text.split(), tags=[i]))"
600 ]
601 },
602 {
603 "cell_type": "markdown",
604 "metadata": {},
605 "source": [
606 "### Formulate the model"
607 ]
608 },
609 {
610 "cell_type": "code",
611 "execution_count": 50,
612 "metadata": {
613 "ExecuteTime": {
614 "end_time": "2018-12-28T02:34:06.906431Z",
615 "start_time": "2018-12-28T02:34:06.904273Z"
616 }
617 },
618 "outputs": [],
619 "source": [
620 "size=300\n",
621 "window=5\n",
622 "min_count=0\n",
623 "epochs=5\n",
624 "negative=5\n",
625 "dm = 1\n",
626 "dm_concat=0\n",
627 "dbow_words=0\n",
628 "workers = 8"
629 ]
630 },
631 {
632 "cell_type": "code",
633 "execution_count": null,
634 "metadata": {},
635 "outputs": [],
636 "source": [
637 "model = Doc2Vec(documents=sentences,  # passing documents builds the vocabulary and trains immediately\n",
638 "                dm=dm,  # use the configured training mode rather than a hard-coded 1\n",
639 "                vector_size=size,  # `size=` is deprecated and removed in gensim 4.0\n",
640 "                window=window,\n",
641 "                min_count=min_count,\n",
642 "                workers=workers,\n",
643 "                epochs=epochs,\n",
644 "                negative=negative,\n",
645 "                dm_concat=dm_concat,\n",
646 "                dbow_words=dbow_words)"
647 ]
648 },
649 {
650 "cell_type": "code",
651 "execution_count": 51,
652 "metadata": {
653 "ExecuteTime": {
654 "end_time": "2018-12-28T02:37:01.442409Z",
655 "start_time": "2018-12-28T02:34:07.761377Z"
656 }
657 },
658 "outputs": [],
659 "source": [
660 "model = Doc2Vec(documents=sentences,  # passing documents builds the vocabulary and trains immediately\n",
661 "                dm=dm,\n",
662 "                vector_size=size,  # `size=` is deprecated and removed in gensim 4.0\n",
663 "                window=window,\n",
664 "                min_count=min_count,\n",
665 "                workers=workers,\n",
666 "                epochs=epochs,\n",
667 "                negative=negative,\n",
668 "                dm_concat=dm_concat,\n",
669 "                dbow_words=dbow_words)"
670 ]
671 },
672 {
673 "cell_type": "code",
674 "execution_count": 90,
675 "metadata": {
676 "ExecuteTime": {
677 "end_time": "2018-12-28T01:56:03.959942Z",
678 "start_time": "2018-12-28T01:52:41.532880Z"
679 }
680 },
681 "outputs": [],
682 "source": [
683 "model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)"
684 ]
685 },
686 {
687 "cell_type": "code",
688 "execution_count": 52,
689 "metadata": {
690 "ExecuteTime": {
691 "end_time": "2018-12-28T02:37:01.604728Z",
692 "start_time": "2018-12-28T02:37:01.443607Z"
693 }
694 },
695 "outputs": [
696 {
697 "data": {
698 "text/html": [
699 "<div>\n",
700 "<style scoped>\n",
701 " .dataframe tbody tr th:only-of-type {\n",
702 " vertical-align: middle;\n",
703 " }\n",
704 "\n",
705 " .dataframe tbody tr th {\n",
706 " vertical-align: top;\n",
707 " }\n",
708 "\n",
709 " .dataframe thead th {\n",
710 " text-align: right;\n",
711 " }\n",
712 "</style>\n",
713 "<table border=\"1\" class=\"dataframe\">\n",
714 " <thead>\n",
715 " <tr style=\"text-align: right;\">\n",
716 " <th></th>\n",
717 " <th>token</th>\n",
718 " <th>similarity</th>\n",
719 " </tr>\n",
720 " </thead>\n",
721 " <tbody>\n",
722 " <tr>\n",
723 " <th>0</th>\n",
724 " <td>great</td>\n",
725 " <td>0.869434</td>\n",
726 " </tr>\n",
727 " <tr>\n",
728 " <th>1</th>\n",
729 " <td>decent</td>\n",
730 " <td>0.824517</td>\n",
731 " </tr>\n",
732 " <tr>\n",
733 " <th>2</th>\n",
734 " <td>ok</td>\n",
735 " <td>0.759463</td>\n",
736 " </tr>\n",
737 " <tr>\n",
738 " <th>3</th>\n",
739 " <td>bad</td>\n",
740 " <td>0.749852</td>\n",
741 " </tr>\n",
742 " <tr>\n",
743 " <th>4</th>\n",
744 " <td>amazing</td>\n",
745 " <td>0.748687</td>\n",
746 " </tr>\n",
747 " <tr>\n",
748 " <th>5</th>\n",
749 " <td>awesome</td>\n",
750 " <td>0.733886</td>\n",
751 " </tr>\n",
752 " <tr>\n",
753 " <th>6</th>\n",
754 " <td>okay</td>\n",
755 " <td>0.719382</td>\n",
756 " </tr>\n",
757 " <tr>\n",
758 " <th>7</th>\n",
759 " <td>tasty</td>\n",
760 " <td>0.707188</td>\n",
761 " </tr>\n",
762 " <tr>\n",
763 " <th>8</th>\n",
764 " <td>nice</td>\n",
765 " <td>0.702621</td>\n",
766 " </tr>\n",
767 " <tr>\n",
768 " <th>9</th>\n",
769 " <td>delicious</td>\n",
770 " <td>0.692123</td>\n",
771 " </tr>\n",
772 " </tbody>\n",
773 "</table>\n",
774 "</div>"
775 ],
776 "text/plain": [
777 " token similarity\n",
778 "0 great 0.869434\n",
779 "1 decent 0.824517\n",
780 "2 ok 0.759463\n",
781 "3 bad 0.749852\n",
782 "4 amazing 0.748687\n",
783 "5 awesome 0.733886\n",
784 "6 okay 0.719382\n",
785 "7 tasty 0.707188\n",
786 "8 nice 0.702621\n",
787 "9 delicious 0.692123"
788 ]
789 },
790 "execution_count": 52,
791 "metadata": {},
792 "output_type": "execute_result"
793 }
794 ],
795 "source": [
796 "pd.DataFrame(model.wv.most_similar('good'), columns=['token', 'similarity'])  # word neighbours live on model.wv; Doc2Vec.most_similar is deprecated"
797 ]
798 },
799 {
800 "cell_type": "markdown",
801 "metadata": {},
802 "source": [
803 "## Persist Model"
804 ]
805 },
806 {
807 "cell_type": "code",
808 "execution_count": 53,
809 "metadata": {
810 "ExecuteTime": {
811 "end_time": "2018-12-28T02:37:02.197070Z",
812 "start_time": "2018-12-28T02:37:01.605570Z"
813 }
814 },
815 "outputs": [],
816 "source": [
817 "model.save('sample5.model')"
818 ]
819 },
820 {
821 "cell_type": "code",
822 "execution_count": 6,
823 "metadata": {
824 "ExecuteTime": {
825 "end_time": "2018-12-28T00:54:04.864287Z",
826 "start_time": "2018-12-28T00:54:03.581152Z"
827 }
828 },
829 "outputs": [],
830 "source": [
831 "model = Doc2Vec.load('sample5.model')  # reload the file written by model.save in the previous cell"
832 ]
833 },
834 {
835 "cell_type": "markdown",
836 "metadata": {},
837 "source": [
838 "## Evaluate"
839 ]
840 },
841 {
842 "cell_type": "code",
843 "execution_count": 62,
844 "metadata": {
845 "ExecuteTime": {
846 "end_time": "2018-12-28T02:38:50.845013Z",
847 "start_time": "2018-12-28T02:38:50.804633Z"
848 }
849 },
850 "outputs": [],
851 "source": [
852 "y = sample.stars.sub(1)"
853 ]
854 },
855 {
856 "cell_type": "code",
857 "execution_count": 55,
858 "metadata": {
859 "ExecuteTime": {
860 "end_time": "2018-12-28T02:37:03.062359Z",
861 "start_time": "2018-12-28T02:37:02.201730Z"
862 }
863 },
864 "outputs": [],
865 "source": [
866 "X = np.zeros(shape=(len(y), size))\n",
867 "for i in range(len(sample)):\n",
868 " X[i] = model.docvecs[i]"
869 ]
870 },
871 {
872 "cell_type": "code",
873 "execution_count": 56,
874 "metadata": {
875 "ExecuteTime": {
876 "end_time": "2018-12-28T02:37:03.065414Z",
877 "start_time": "2018-12-28T02:37:03.063244Z"
878 }
879 },
880 "outputs": [
881 {
882 "data": {
883 "text/plain": [
884 "(485681, 300)"
885 ]
886 },
887 "execution_count": 56,
888 "metadata": {},
889 "output_type": "execute_result"
890 }
891 ],
892 "source": [
893 "X.shape"
894 ]
895 },
896 {
897 "cell_type": "markdown",
898 "metadata": {},
899 "source": [
900 "### Train-Test Split"
901 ]
902 },
903 {
904 "cell_type": "code",
905 "execution_count": 63,
906 "metadata": {
907 "ExecuteTime": {
908 "end_time": "2018-12-28T02:39:04.085998Z",
909 "start_time": "2018-12-28T02:39:03.631216Z"
910 }
911 },
912 "outputs": [],
913 "source": [
914 "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)"
915 ]
916 },
917 {
918 "cell_type": "code",
919 "execution_count": 64,
920 "metadata": {
921 "ExecuteTime": {
922 "end_time": "2018-12-28T02:39:06.672781Z",
923 "start_time": "2018-12-28T02:39:06.665668Z"
924 }
925 },
926 "outputs": [
927 {
928 "name": "stdout",
929 "output_type": "stream",
930 "text": [
931 "Baseline Score: 20.16%\n"
932 ]
933 }
934 ],
935 "source": [
936 "mode = pd.Series(y_train).mode().iloc[0]\n",
937 "baseline = accuracy_score(y_true=y_test, y_pred=np.full_like(y_test, fill_value=mode))\n",
938 "print(f'Baseline Score: {baseline:.2%}')"
939 ]
940 },
941 {
942 "cell_type": "code",
943 "execution_count": 26,
944 "metadata": {
945 "ExecuteTime": {
946 "end_time": "2018-12-27T23:35:39.040108Z",
947 "start_time": "2018-12-27T23:35:38.953294Z"
948 }
949 },
950 "outputs": [],
951 "source": [
952 "class_weights = class_weight.compute_class_weight(class_weight='balanced',  # keywords: newer scikit-learn rejects positional classes/y\n",
953 " classes=np.unique(y_train),\n",
954 " y=y_train)"
955 ]
956 },
957 {
958 "cell_type": "code",
959 "execution_count": 27,
960 "metadata": {
961 "ExecuteTime": {
962 "end_time": "2018-12-27T23:35:39.240067Z",
963 "start_time": "2018-12-27T23:35:39.237696Z"
964 }
965 },
966 "outputs": [
967 {
968 "data": {
969 "text/plain": [
970 "array([0.52585038, 1.59482003, 2.12184306])"
971 ]
972 },
973 "execution_count": 27,
974 "metadata": {},
975 "output_type": "execute_result"
976 }
977 ],
978 "source": [
979 "class_weights"
980 ]
981 },
982 {
983 "cell_type": "markdown",
984 "metadata": {},
985 "source": [
986 "## LightGBM"
987 ]
988 },
989 {
990 "cell_type": "code",
991 "execution_count": 65,
992 "metadata": {
993 "ExecuteTime": {
994 "end_time": "2018-12-28T02:39:10.223141Z",
995 "start_time": "2018-12-28T02:39:10.217963Z"
996 }
997 },
998 "outputs": [],
999 "source": [
1000 "train_data = lgb.Dataset(data=X_train, label=y_train)\n",
1001 "test_data = train_data.create_valid(X_test, label=y_test)"
1002 ]
1003 },
1004 {
1005 "cell_type": "code",
1006 "execution_count": 66,
1007 "metadata": {
1008 "ExecuteTime": {
1009 "end_time": "2018-12-28T02:39:10.908403Z",
1010 "start_time": "2018-12-28T02:39:10.901251Z"
1011 }
1012 },
1013 "outputs": [],
1014 "source": [
1015 "params = {'objective': 'multiclass',\n",
1016 " 'num_classes': 5}"
1017 ]
1018 },
1019 {
1020 "cell_type": "code",
1021 "execution_count": 67,
1022 "metadata": {
1023 "ExecuteTime": {
1024 "end_time": "2018-12-28T02:42:04.449691Z",
1025 "start_time": "2018-12-28T02:39:11.555708Z"
1026 },
1027 "scrolled": true
1028 },
1029 "outputs": [
1030 {
1031 "name": "stdout",
1032 "output_type": "stream",
1033 "text": [
1034 "[25]\ttraining's multi_logloss: 1.50257\tvalid_1's multi_logloss: 1.51211\n",
1035 "[50]\ttraining's multi_logloss: 1.45251\tvalid_1's multi_logloss: 1.4704\n",
1036 "[75]\ttraining's multi_logloss: 1.41546\tvalid_1's multi_logloss: 1.44103\n",
1037 "[100]\ttraining's multi_logloss: 1.38507\tvalid_1's multi_logloss: 1.41809\n",
1038 "[125]\ttraining's multi_logloss: 1.35921\tvalid_1's multi_logloss: 1.39942\n",
1039 "[150]\ttraining's multi_logloss: 1.33601\tvalid_1's multi_logloss: 1.38295\n",
1040 "[175]\ttraining's multi_logloss: 1.31554\tvalid_1's multi_logloss: 1.36904\n",
1041 "[200]\ttraining's multi_logloss: 1.29656\tvalid_1's multi_logloss: 1.35624\n",
1042 "[225]\ttraining's multi_logloss: 1.27918\tvalid_1's multi_logloss: 1.34486\n",
1043 "[250]\ttraining's multi_logloss: 1.26276\tvalid_1's multi_logloss: 1.33447\n"
1044 ]
1045 }
1046 ],
1047 "source": [
1048 "lgb_model = lgb.train(params=params,\n",
1049 " train_set=train_data,\n",
1050 " num_boost_round=250,\n",
1051 " valid_sets=[train_data, test_data],\n",
1052 " verbose_eval=25)"
1053 ]
1054 },
1055 {
1056 "cell_type": "code",
1057 "execution_count": 72,
1058 "metadata": {
1059 "ExecuteTime": {
1060 "end_time": "2018-12-28T02:43:01.722585Z",
1061 "start_time": "2018-12-28T02:43:00.450410Z"
1062 }
1063 },
1064 "outputs": [],
1065 "source": [
1066 "y_pred = np.argmax(lgb_model.predict(X_test), axis=1)"
1067 ]
1068 },
1069 {
1070 "cell_type": "code",
1071 "execution_count": 88,
1072 "metadata": {
1073 "ExecuteTime": {
1074 "end_time": "2018-12-28T02:48:08.595370Z",
1075 "start_time": "2018-12-28T02:48:08.514152Z"
1076 }
1077 },
1078 "outputs": [],
1079 "source": [
1080 "cm = confusion_matrix(y_true=y_test, y_pred=y_pred)"
1081 ]
1082 },
1083 {
1084 "cell_type": "code",
1085 "execution_count": 99,
1086 "metadata": {
1087 "ExecuteTime": {
1088 "end_time": "2018-12-28T03:56:59.129287Z",
1089 "start_time": "2018-12-28T03:56:58.910377Z"
1090 }
1091 },
1092 "outputs": [
1093 {
1094 "data": {
1095 "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWQAAAD8CAYAAABAWd66AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAIABJREFUeJzs3Xd4k9XbwPHvSdIFZe9dBGQpG0QUUAFllyF7r7KHbGQPUURBlL2RvX4oKIjKElzsPZS9aZltoUmb9Lx/pNSWbm2bNO/94cplcs55ntzHJ71zcp6ltNYIIYRwPIOjAxBCCGEnCVkIIZyEJGQhhHASkpCFEMJJSEIWQggnIQlZCCGchCRkIYRwEpKQhRDCSUhCFkIIJ2FK6TfwqjfT5U4F/HNhD0eHkCJezuPt6BCSnaueiPr33WBHh5AiyhTwVv91HV7l+yV6q4ccm/2f3y85yQhZCCGcRIqPkIUQIlWptDvOlIQshHAtBqOjI/jXJCELIVyLcqpp4SSRhCyEcC1peMoi7UYuhBCxUSrxjwRXpeoqpS4opS4qpUbGUl9DKXVUKWVVSr0fpbycUup3pdQZpdRJpVSrxIQuI2QhhGtJphGyUsoIzAHqADeBQ0qprVrrs1GaXQc6A0NfWPwZ0FFr/bdSKi9wRCm1U2v9OL73lIQshHAtyTeHXAW4qLW+bF+tWgf4ApEJWWt9NaIuPOqCWuu/ojy/rZTyB3IAkpCFEP+PJN9RFvmAG1Fe3wReS+pKlFJVAHfgUkJtZQ5ZCOFalCHRD6WUn1LqcJSHX9Q1xbL2JJ37qZTKA6wEumitwxNqLyNkIYRrScKUhdZ6IbAwjuqbQIEor/MDtxMfhsoIfA+M0Vr/kZhlZIQshHAtSRghJ+AQUEwpVVgp5Q60BrYmKgR7+y3A11rrjYkNXRKyEMK1JFNC1lpbgX7ATuAcsEFrfUYpNUkp1RhAKVVZKXUTaAEsUEqdiVi8JVAD6KyUOh7xKJdQ6DJlIYRwLcbkO3Vaa70d2P5C2bgozw9hn8p4cblVwKqkvp8kZCGEa5FTp4UQwknIqdPJZ/4Hdbi2tieH53WILGv2ZjGOzO/I0+8HUaFYrniXNxgUv89ux+YJvpFly4bX5eDc9kzs9EZk2cg2r9Gw6kvJ34FECA21MLJvR4b6teaDbi1Yv2J+jDY/btvE4O4tGdqzDWMGduXGtcsAnD99nCE9WjGyTwfu3LIfIvk0OIgpI/qiHXw19nFjRvFW9ddp5tsw1nqtNZ9MnULDunV4v2kjzp21T7ddvXKZ1i2a0aJpY04cPwaA1WrFr1tnQkJCUiv8WI0fM4q3a7xO8yax9+nK5Ut0bNeKyuVfYcWyJZHlDx8+pHOHNjRv0pDdu36OLB/Uvzf+/vdSPO74uOrnL1Iynjqd2pwuIa/86Sy+Y7ZEKztz7QGtJ2/jwOmbCS7fz7c8F64/jHz9ik92AKr0WcUbr+QjYzp3cmdJT6Xiufnuj8vJG3wiubm5M/6z+Xy2cB3TF6zh+KHf+OvsqWht3nynLjMWb+CzBWvxbdWJFfNmALBt0yqGjp9Om259+XGbfeftplWLaNq2K8rBHzDfJs2Yt2BxnPUH9v/C9WtX2bbjR8ZNmMyUSRMA2LhhPQM/GMJnX3zJimVLAdiwfi0NG/ni5eWVGqHHqXGTZsydH3efMmXKzPCRo+nYuVu08h+2f0cj36Z8vXodK5bZl9+3dzclSpYmZ874BxUpzVU/f5GS7yiLVOd0Ef16+hYPg8zRyi7ceMjftx4luGy+7N7UrVKYZTtPR5aF2cLxcjehFLibDNjCNWM7vM7klb8le+yJpZTCyysdADarFZvVGuPLOl36f26nZDGHRH7YjUYToaEWQs1mjEYTd2/f4OH9AEqXrZhq8celYqXKZMyUKc76Pbt30ahxE5RSlClbjqCgQAIC/DGZTFjMZswhZkxuJgIDA9m3d
w+NfJukYvSxS6hPWbNl45VXy2AyRZ/9e96n0NBQDAYDVquV1StX0KlLtzjWlHpc9fMXKQ2PkF1qDnl6z7cYvWQ/3l7ukWUXbjzkRkAQv3/VjrW7z1Ekb2aUghOXAhwYKdhsNkb0ac/dWzeo69uSYiVfjdHmh2838N2mVVitVsZPt/+sbNqmCwtmTMHdw4P+Iyfz9YIvaN25d2qH/6/4+98jV+7cka9z5cqN/717tG7TjjEfjiA0NJSx4yeyYN4cevj1cp4R179Qr0EjRg0fwndbv2Hg4GFsWLeGho2bOHzE/5xLf/7+P16gXinVRWu9LDmD+S/qVSmM/+NnHLvoT/VXox+FMmzBvsjnmyb40v/LnxneugplCudg17FrLPvh9IurS3FGo5HPFqzlaXAQ08cP4fqVixQsXDRam7q+Lanr25L9u3awefVi+o2YROGixZk6ewUAZ08eJUu2HGg0MyaPxGQy0bHXB2TOki3V+5MoscwxKqXIkzcvS5avBOD6tWsEBPhTuPBLfDhyGGFhYfTtPxAfn8KpHe1/kiFDBmbPs58AFvjkCcuWLGLGrK+YOH4MQYGBdOjUhbLlyjssPpf+/DnhVERi/ZfIJ8ZVEfX8cOuN3//DWyTe66Xy0rDqS5xf3pWvR9bnrbIFWDqsbrQ2Dau+xNG/7pHO043ShbLR/uPvaftOSbw8HPdDIb13BkqXrcTxQ3FPobzx9nsc/HVvtDKtNZtXL+b99t3Z+PVCWnXqSfVa9dm+ZV0KR/zv5cyVm3t370a+vnfvLjly5ozW5qsvZ9K3/0DWrF5JgwaN6NO3Pwvmzk7tUJPVgvlz6O7Xix3bv6dUqdJMmDyVr2bNcHRYgIt+/tLwlEW8CTniwsqxPU4Bce6Z0Fov1FpX0lpXMhV4PdmDjs245b9StMNiSnReSsdPtrP3xA26Tv8hst5kNNDXtzwzNx8mnYcpcrBmMCjcTan7E+fJ40c8DQ4CwGIxc/Lon+Qr6BOtzZ2b1yOfH/3zAHnyF4xWv/fHbVR47U28M2TEYjGjDAYMBkWoOfr8uzN56+132Lb1G7TWnDxxHG/vDOTI8U9CPnzoIDlz5qJQIR/MISH2PhmNhDhxnxJy7dpVAvz9qVS5SmSflILQ0FCHxeTyn780vFMvoaFhLuA94MU9agpIkb1iK0bUo3qZAmTP6MnFld2ZvPJ3HgWbmdH7bbJn8uJ/E305eTmAxmO2kCdreuYOqkPTcd8kuN5ejcqyatc5QixWTl25j1JwaG4Hdh6+wpOnlpToSpweP7zP7GnjCQ+3obXm9Zq1qVi1BuuWz6PIy6WoXK0mO75dz6mjBzGaTHh7Z6Df8H9+kFjMIez78TvGTJsDQKP32/PZhGGY3NwYNHpqqvYlqhFDB3P40EEeP35EnXdq0Ltvf6xWKwAtW7Wheo2aHPhlHw3r1cHT04tJU/6JVWvNogXzmP75FwA0b9GKUSOGYrNZGT12giO6A8DIYf/06d1aNejd558+tWjVhvv3A2jbqjlPg4NRBgOrV63gf99ux9vbvlNs9pcz6TfgAwDq1W/IoAF9WbPqa/r0G+CwPrnq5y+SEybaxFLxHTuolFoCLNNaH4ilbo3Wum1Cb+BVb6aTHJyYfP5c2MPRIaSIl/N4J9wojXGWQ2OT2993gx0dQoooU8D7P88jePkuSPRWD/m2p1PNW8Q7QtZax3mMTmKSsRBCpDonnBtOLJc67E0IIdLylIUkZCGEa5ERshBCOIe0fEKRJGQhhEuRhCyEEE5CGSQhCyGEU5ARshBCOAlJyEII4SQkIQshhLNIu/lYErIQwrXICFkIIZyEwSBn6gkhhFOQEbIQQjiLtJuPJSELIVyLjJCFEMJJSEIWQggnIadOx+P7mR1T+i1SXc81Rx0dQopY1K6Co0NIdrkzezo6hBSR0UvGUnFJyyPktHt8iBBCxEIplehHItZVVyl1QSl1USk1Mpb6Gkqpo0opq1Lq/RfqOiml/o54dEpM7
PI1K4RwKck1QlZKGYE5QB3gJnBIKbVVa302SrPrQGdg6AvLZgXGA5UADRyJWPbFG0ZHIyNkIYRLScYRchXgotb6stY6FFgH+EZtoLW+qrU+CYS/sOx7wE9a64cRSfgnoG5CbygJWQjhWlQSHvHLB9yI8vpmRFli/KtlZcpCCOFSknLqtFLKD/CLUrRQa73weXUsi+jErvrfLCsJWQjhUpIyhxyRfBfGUX0TKBDldX7gdiJXfRN464Vl9ya0kExZCCFcS/JNWRwCiimlCiul3IHWwNZERrETeFcplUUplQV4N6IsXpKQhRAuJbl26mmtrUA/7In0HLBBa31GKTVJKdU44r0qK6VuAi2ABUqpMxHLPgQmY0/qh4BJEWXxkikLIYRLSc4TQ7TW24HtL5SNi/L8EPbpiNiWXQosTcr7SUIWQriUtHymniRkIYRLkWtZCCGEk5ARshBCOAlJyEII4STScD6WhCyEcC0yQhZCCCdhkJ16QgjhHNLwADltnKkXbrPx0aBOzJk8NEbdz9+uZWLftkwZ0IEvxvbngf8dAO7evMbUwV2YMrAjl8+fAsBms/LF2AGEWsypGn9sWlfKx+puFVnVtSITG5XA3RjzU1SrRHbWdKvI6m72NgAFs3qxrFN5vu5SgVfyZgDAqODLVq/iYXLc5gwNtTCiT0cG92jNwK4tWLd8fow2Z04eZWjPtrSoU4Xf9/0cWX7rxlWG9WrH4B6tuXDmJGDfVhOG9cZiDkm1PsRm6sQxNKhdnfYtfWOtX/31Ujq1aUanNs1o39KX6pVfJfDJYx49ekjvru1p39KXX/bsimw/YnA/AgL8Uyv8WPnfu8uwft3o3rYJPdo1ZcuG1XG2vXDuNPWql2f/np8AuHHtKn27tqZXpxacPX0CAJvVyoiBfpgdvK2eMxhUoh/OJk2MkHd/t4HcBXwwP3sao65A4ZcZNWMp7h6e7NvxP7Ysn0v34ZM5sPMbmnbsTbacedjy9Tx6jnyVX3Zs4bW36+Lu4djb+uTwdqdFxXy0XXIYizWcKb4lqV0yJ9tP34tskz+LJx2rFqTnqhMEWaxkSecGQJNyeZi77wp3npjpU7MwH35zjqbl8/LDGX8s1hcvyZp63NzcmfD5fLy80mG1hjFmYDcqVHmDl0u9GtkmR87c9Bs+ka0bV0Zb9sdt/6N99/7kyJ2XVYu/Ynjp6ezcuomatevj4emV2l2Jpn6jJjRv2ZbJ40fFWt+uY1fadewKwIFf9rB+9ddkzJSZjWtXUa+hL7Xeq8+Q/n7UeLsWB37ZQ/ESpciRI2dqdiEGo9GIX/+hFCtekmdPn9KvW2sqVK5KocJForWz2WwsmfsFFatUiyzb/u1GuvYaSO48eVkybxbjps7guy0bqP1eQzwdvK2ec+kRslKqhFKqllLK+4XyBC+2nBwe3ffn9OHfeKNOo1jri5epGJlgXypemkcP7KMPo8lEqMVCqMWM0WjkWXAQpw4doOrb9VIj7AQZDQoPkwGjAk+TgfvBodHqfcvmYdPR2wRZrAA8ehYGgNWm8TAZ8DQZsYZrvD2MvFk0W7Rk7ghKKby80gH2EZPVao1x8ZacufPiU6RYjJ0uJpOJ0FD7tjIZTTwNDuLw77/w1rsNUyv8OJWrUImMmTIlqu3PP2ynznv1AXufLBYLYaGhKGXAarWyYc1K2nbokpLhJkq27DkoVrwkAOnSp6dAoZe4H8uo/dtNa3nzrdpkzpI1suz535XZbMZkMhEcFMgfv+6jdr3Y/z4dITlv4ZTa4h0hK6UGAH2xX1hjiVJqoNb624jqqcAPKRwfGxd/QdNOfbGEPEuw7a8/fUfpilUBqFm/Ocu/mIw1LJS2fUawff1S6rbo7BQbISA4lDUHb7Cl92tYrDYOXnnMwavR7+xSIIt9tLGgXVkMBsWSA9f448ojNh+9zbiGxXEzGpi282+6vlGIFb9fd0Q3YrDZbAzv3Z67t25Q17clL5d8NeGFgLq+Lfhq2njCQkPp+
cFoNny9iObtujnFtkosc0gIf/x+gMEjRgNQp24DJowezo7vt9JnwGC2bFxH3QaN8fRyjlHkc3fv3OLS3+cpUTr6trofcI/fftnNtC8X8de5M5HljZu1ZvqUMYSFhjJg+FhWL1tAm049nGpbOVEoSZbQlEUPoKLWOlgp5QNsUkr5aK1nkZiL1/1Hpw79SobMWShUtAR/nYr/Ts9/7v2BaxfPM3jqHACy5sjN4I/sz/3v3OTxw/vkzl+IZTMnYguz0qhdD3LlK5jSXYhVBg8T1Ytlp/n8gwRZrHzkW5L3SuVk59l/Rikmg6JAFi/6rD1JzgwezG9XlnZLDnMvyELftfZ51vyZPcnu7c7VB88Y16A4bkbFwv3XuPHIMXN5RqORzxeu5WlwENPGDeH6lYsULFw0weVy5MrDpBn2S9LeuXWDRw8CyFfAh1kfj8VqDaNN597kLVAopcP/Tw7s30uZsuXJmCkzAN4ZMvDZl/MACAx8wqrlS5j62Rd8MnkcQUGBtGnfmVfKlHNkyIQ8e8bk0UPoNWAY6dNH+wHM/FnT6dZ7EEajMVp5ztx5mD57CQC3bl7nwf0AChQqzKeTPiQsLIxOPfqSv6BPanUhVkm5QL2zSShyo9Y6GOz3jsJ+weV6SqkZxJOQlVJ+SqnDSqnD321Y8a+Du3TuJCcPHmB0j2Ys+WwcF04eYdmMCTHanTt+iB82rqD36Gm4ubnHqN+6cgGN2/mxZ9sGqtR8j4Ztu/H9uiRdhClZVfbJzJ0nZh6HhGEL1+z76z6v5ssYrY1/kIVf/n6ALVxz54mZ6w+eRY6an+tZw4eF+6/SsmI+fjzrz+ID1+j2huMTV3rvDLxSrhLHDv2W5GXXLJ1D6y69+X7LOmrUqkerTj3ZsDKu64c7j107d1A7YrriRcsWzaNTNz9+/mE7xUuW5sNxU5g/+4tUjjA6qzWMyaMH88679Xnzrdox6v86f4aPx4+gY/N67N/7E1999hG//bI7WpvlC76iU4++fLNxDW+/24AO3fuwaumC1OpCnJRK/MPZJDRCvquUKqe1Pg4QMVJuiP2ScnH+Ho16Ff7d5x8k9pYnMTTp2JsmHXsD8Nepo/z0zRq6DJ4Qrc2NyxdYM28a/cfPJGPmrDHW8dfpY2TOlp2ceQsQarGglMJgMDr0SIu7gRZK582Ah8mAxRpOpUJZOHc3KFqbX/5+QJ2IHX2ZvEwUyJqOW4//ibl8gUwEBIdy85EZTzcD4RpsWuPp5pjRwZPHjzCZTKT3zoDFYubkkT9p0jpRdz6PdObEEbJmy0ne/AUJtZhRBvu2spgdf1RMfIKDgjh29BDjpnwSo+7G9WvcDwigfMXK/H3hPB6eHqAUoaGhsawpdWitmfHxBAoUeonmrTvG2ubrTTsin382ZSyvvVGDajXeiSw7eeww2XPkJF+BQljMZgxKYTQYsFgcf6SFM02fJFVCCbkjYI1aEHHR5o5KKYd9FW5bvYiCRUtQ9rXqbF42B0tICIs+HQNAluy56DPm0+exsmPDcroPnwzAm+/5smzGBMJtNtr0Huao8Dl7J4g9F+6zonMFrOGav+4F8+2JO/R4sxDn7gZx4OJD/rjyiCqFs7CmW0XCNczee5lA8z+bonO1goz55hwA35y4w8SGJTAaFNN/vOiQPj16cJ/Zn47HZrOhtaZazdpUer0Ga5fNo2jxUlSuVpOL588wbfxQngYHcvj3/axbsYBZSzcC9m21adUShoyzJ7U6DZrxxdTR2Gw2eg6K/QiH1DD+w6EcO3yIx48f06TeO3Tr2de+wxJo+n4rAPbt+ZkqVd+I3KkZ1cI5s/DrOxCAOnXrM3LIADasXUX3Xv1SrxMvOHPyGLt++I7CRYrRu1NLALr07I//Pfshow2btox3ea01a5YvYvTk6QDU923OtImjsNls9B86OmWDT4Q0nI9RWv/rAWyi/JcRsrMa/e2ZhBulQYvaVXB0CMkud2bHHuKYUoLN1oQbpUE+2T3/czqtO
HlPonPOkbFvO1X6ThPHIQshRGKl5RGyJGQhhEtxxjPwEksSshDCpbjyTj0hhEhT0nA+loQshHAtMkIWQggnkYbzsSRkIYRrkZ16QgjhJGTKQgghnIQkZCGEcBJpOB9LQhZCuBYZIQshhJNIw/k4bdzkVAghEis5b3KqlKqrlLqglLqolBoZS72HUmp9RP2fETfyQCnlppRaoZQ6pZQ6p5RK1CULJSELIVyKQalEP+KjlDICc4B6QCmgjVKq1AvNugGPtNZFgZnAtIjyFoCH1vpVoCLQ83myjjf2JPRTCCGcXjLeMaQKcFFrfVlrHQqsA3xfaOMLPL8t0iaglrJPYmsgvVLKBHgBoUBgQm8oCVkI4VKS8a7T+YAbUV7fjCiLtU3EzTueANmwJ+enwB3gOvCZ1vphQm8oCVkI4VIMKvGPqPf/jHj4RVlVbBn7xYvfx9WmCmAD8gKFgSFKqZcSij3Fj7IolTdjwo3SmA/rF3d0CCli2bFbjg4h2bV5JY+jQ0gRbsY0fChBvP77HV6Scup01Pt/xuImUCDK6/zA7Tja3IyYnsgEPATaAj9orcMAf6XUr0Al4HK8sSc6ciGESANUEv4l4BBQTClVWCnlDrQGtr7QZivw/G6+7wO7tf2+eNeBd5RdeqAqcD6hN5TjkIUQLiW5ri2ktbYqpfoBOwEjsFRrfUYpNQk4rLXeCiwBViqlLmIfGbeOWHwOsAw4jX1aY5nW+mRC7ykJWQjhUpLzTD2t9XZg+wtl46I8N2M/xO3F5YJjK0+IJGQhhEtJy2fqSUIWQriUhE74cGaSkIUQLkUuUC+EEE4iDQ+QJSELIVyLTFkIIYSTSLvpWBKyEMLFyAXqhRDCSaThfXqSkIUQrkWOshBCCCchUxZCCOEk0vAAWRKyEMK1yAhZCCGcRNpNx05+PWT/u3cY2KsLHVo0olNLXzatXRmjjdaaWZ9NpW3TenRp05S/zp8F4PrVK/To0JKubZtx+uRxAKxWK4P7dMdsDknVfsQm3GZjxtBuLJk6Itb647/t5tNBHZg+qCOrv5gEgP+t68wc3p3Ph3Th6oXTANhsVhZM/IBQiznVYo+Lp8lAx4p5GfG2D8Pf8qFQlugXGy+SzYspdYsyuEYhBtcoRJ1i2QBI726k3xsFGFrTh1dye0e271I5Lxk9jKnZhRhCQy2MHdCJUb3bMtyvJZtWLoi13R+//MQwv5YM92vJ7E/GAHD7xlVG9+vAqN5t+fus/cqLNpuVqSP7YDE7bnuFhloY1a8jw3q2YXD3lmxYEXufAP745Wda1qnEpQv2v6vzp48z1K81o/p25O4t+92NngYH8dHIftgvA+x4RoNK9MPZOPUI2Wgy0XfQMF4uUYpnT5/So2NLKr1WDZ+XikS2+fO3/dy8fp3V/9vO2dMnmfHJZOYvX8vWLRvx6zeIPHnysWD2TF759Au+3byed+s3wtPTy4G9stu/fRO58hfC/OxpjLqAOzfY/b/V9Jsyl3TeGQh68giAP37aSoN2PcmSMzfbVy3AZ9gUft/5LRVqvou7x3+/08J/1eSVnFwIeMrXR25jVOBmjPl9f+VhCEsORr8zSfl8GTh0I5DjtwLpUbUAp+8GUypXem49sRBosaVW+LFyc3Nn9LR5eHqlw2q1MmlId8pWqkaxkq9Gtrl76zpb1y9nwueLSZ8hI08e22+dtnv7Flp37UeOXHlYt3Q2g0p9ys/fbebNWvXx8HTc9nJzc2f89PmRfRr3QTfKVa7Gy6VejdYu5NlTdnyznmIlXoks+27TaoaM+5SAe7f5cdsmOvb6gM2rFtO0TRenmSpwljj+DaceIWfLnoOXS9jvup0ufXoK+bxEQMC9aG0O7NvDew0ao5Si9KtlCQ4K4sH9AEwmE6FmC2ZzCCaTiaCgQH7bv5f3GjR2RFeiefzAn3NHfqdKrQax1v/583e8Ubcp6bwzAJAhUxYADEYjYaEWwiwWjEYTIU+DOHv4VyrVrJtqscfFw2Tgp
Wxe/Hn9CQA2DWZreKKWtYXbb0lkMhrQWmNQUKNwFvZcSvCekClOKYWnVzoAbFYrNqs1xh/87h3fUKdhC9JnsN+uLFPmrIB9QBFqMWOxmDGaTDwNDuLon/upXjv27Z5aEtMngPXL59O4ZUfc3N0jy4wmE6Ghlsg+3b19k4cP/ClVtmKqxZ+QZLzrdKpLcISslKoCaK31IaVUKaAucD7iws2p5s7tW/x94RylSpeJVn4/4B45c+WOfJ0jZy4C/O/RtEUbpo4fRWhYGENHjWPF4vl06OLnFN+e3y77ioYdemMOeRZrfcBt+0/B2aP7EB4ezrstu1Ci/Gu8UbcZa7/6CJs1jOZ+Q/lp4wpqNe/oFH3Kls6NpxYbrcvlJm9GD24+NvPNGX9CbdF/xhbK4sWQGoV4YrGy7UwA94JDOXYrkHYV8lApfya+PxdANZ/MHL4ZSJjNOX4Ch9tsjO7fgXu3b1KnUQuKRhkxgn2EDDBhcDfCw8Np3r4HZStVo06jFsyfPp6wsDC6DRjFltWLadLaOUaS4TYbI/p04O7tG7zXuAXFSkbv05WL57kfcJeKVauzbeM/U4VNW3dm4cyPcPfwoN+ISaxc+AWtOvVO7fDj5bLXslBKjQfqASal1E/Aa8BeYKRSqrzW+qOUDxGePXvGuBEf0H/wCNJ7e0eri23eSilFrtx5mLVgOQA3b1znQYA/BX0KM2XcSKxhYXTr1Z8ChXxSIfrozh7+De9MWchfpDgXTx+LtU24zcb9OzfpPfFLHj/wZ+7Y/gyduZwsOXLRZ9KXANy/c5PAh/fJma8ga76cgs0aRt3W3cmRt0Cs60xpBgX5Mnmy5bQ/1x+b8S2dk3eKZuWHCw8i29x8YmHKz5cItWlK5ExPl8r5+GTPFczW8MhpDC83A28XzcryQ7doUSYXXm5G9l1+yLVHjptzNRiNfDx3DU+Dg5g5aRg3rl6kgE/RyHqbzca92zcY8+kCHt6/x6Shfkybv47sOXMzZrp9fvbu7Rs8ehhA3gI+zP10HFZrGC069iJP/kIO69P0BfY+fTZhKNevXKRgYXufwsPDWTFvBn2GTYixnE/R4nz01XIAzp48SpZsOdBoZk4ZhdFkomPPQWTOki0VexJTGs7HCU5ZvA/ynz6dAAAgAElEQVS8AdQA+gJNtNaTgPeAVnEtFPXW2iuXLf5PAVqtYYwbMYjadRtQ4506Mepz5MyN/727ka8D/O+RPUfOaG0Wz51Ft1792bx+NXXqNqRLz74sXzTvP8X1b129cIqzh37lo94tWf3FRC6ePsqaWZOjtcmULQelK7+J0WQiW6685MhbgIA7N6O12bF2Ee+16caB7ZupUL0O77Xqyo8blqVmV6J5YrbyxGzl+mN74jx5J4h8maLPk1qs4ZEj5vP+TzEaFOndo++0q/NyNn7++wHl82Xk5hMz60/cpX6JHKnTiQSk985AyTIVOXn492jlWbPnpELVGphMJnLmzkfe/AUjR83PbVg+lxYde7Hz2/W88U5d3u/Qk/+tXpSa4ccqvXcGSpWtyPEofTKHPOPG1UtMHNqTvu0b8fe503w6bnDkjj2wD4T+t3oJzdt1Z9PKRbTs2JMateqxY8s6R3QjGqVUoh/OJqGEbNVa27TWz4BLWutAAK11CBDnBKHWeqHWupLWulKHLt3/dXBaa6ZNHkchn5do1a5TrG3eqPEWO7/fitaaM6dOkN7bm2zZ//kDPn7kENlz5iJ/wUJYzGYMBoXRYMRsccyRFvXb9WTsws2MnreBdoPGU/SVCrQdODZam1eqVOfi6aMAPA18TMCdG2TLlTey/tKZ42TKmoMceQoQajHbP1wGA2GhllTtS1RBFhuPQ8LIkd4NgGLZ03EvKDRamwxRjpgokNkTpeBp6D877bKndyOTh4nLD0JwNyq0BrTG5MC94YGPH/E0OAiAUIuZM8cOkqeAT7Q2larV5NzJIwAEPXnMnZvXyZknX2T9uZNHyJotJ7nzFcRiMaOUA
YPBQKjFMdvrxT6dOnqQfFH6lC69N0s272LOqm3MWbWNYiVfYfikGRQpXiqyzb4fv6PCa2/inSGjvU8GhVIGLE5wtI9RqUQ/nE1Cc8ihSql0EQk5ctZeKZWJeBJycjl14hg/bt/GS0WL0a1tcwB69B3Ivbt3APBt3oqqb9Tgj1/307ZpPTw8vRg57p/Rptaar5cuYOLHnwPQsOn7TBk7EpvNyuCRY2O+oQP9sG4JBYoUp3TlNylergp/nTjEp4M6YDAYaNihD+kzZALsffp589d0GDwRgKp1GrFm1mTCbTaa+Q1xZBfYctqfdhXyYjQoHj4LZd3xu7xeyB7379eeUCZPBqr5ZCY8XBMWrll15Ha05euVyM6O8/cBOHYriC6V81H9pSz8cOF+qvfluccP7zP/8wmE28LROpzXatSmwmvV2fT1fAoXK0nF12tSpuLrnDryJ8P8WmIwGGjbfSAZMmYG7Nvrm7VLGfDhxwC8U68pc6eNxWaz0aX/SIf06dHD+8z5dDzh4fY+vV6jDhWrVmf98vkUebkklarVjHd5i9nMvp++Y/QncwBo2Lwdn08cjsnNjYEfpsosZryc8Gi2RFPxHTuolPLQWsf4GldKZQfyaK1PJfQGdwPDnGPPTDI6dM3xe/9Twt6rjx0dQrJr80oeR4eQItyMaTjrxKNswQz/uWODt55PdM6Z0biEU/2PjHeEHFsyjii/Dzhu2CKEEHFwxrnhxHLqE0OEECKp0vKUhSRkIYRLScMDZEnIQgjXYkrDGVkSshDCpaThfCwJWQjhWlz21GkhhEhr0nA+du6rvQkhRFIZVOIfCVFK1VVKXVBKXVRKxTiTRynloZRaH1H/p1LKJ0pdGaXU70qpM0qpU0qpBK+5KiNkIYRLSa4LzyuljMAcoA5wEziklNqqtT4bpVk34JHWuqhSqjUwDWillDIBq4AOWusTSqlsQFhC7ykjZCGES0nGEXIV4KLW+rLWOhRYB/i+0MYXWBHxfBNQS9nPTHkXOKm1PgGgtX6gtU7wbguSkIUQLkUl5V+UK1NGPPyirCofcCPK65sRZcTWRmttBZ4A2YCXAa2U2qmUOqqUGp6Y2GXKQgjhUpIyY6G1XggsjKM6tjW9eJ2MuNqYgDeBysAzYJdS6ojWeld88cgIWQjhUpJxyuImEPWOD/mB23G1iZg3zgQ8jCjfp7W+H3G1zO1AhQRjT0wHhRAirUjGC9QfAooppQorpdyB1sDWF9psBZ5frP19YLe2X0JzJ1BGKZUuIlHXBM6SAJmyEEK4lFhudv6vaK2tSql+2JOrEViqtT6jlJoEHNZabwWWACuVUhexj4xbRyz7SCk1A3tS18B2rfX3Cb2nJGQhhEtJzjP1Im7mvP2FsnFRnpuBFnEsuwr7oW+JJglZCOFS5PKb8Uj3wk0sXUGlglkcHUKKyOTh5ugQkt2ea655HwX/4ATPMUiTyhYs/p/XkZZPnZYRshDCpRhiPRItbZCELIRwKTJCFkIIJ2FKw5PIkpCFEC5FRshCCOEk5AL1QgjhJNJwPpaELIRwLWn5ehCSkIUQLkWmLIQQwklIQhZCCCeRdtOxJGQhhItJwwNkSchCCNeSiOscOy1JyEIIlyJHWQghhJOQnXpCCOEkZMpCCCGchExZCCGEk0jLI2Sn/jKZNG407771Bq2aNYq33ZnTp3itfGl2/bQTgKtXr9ChdXPatmjCyRPHALBarfTx64I5JCTF406I/727DOrdlY4tG9O5VRM2rYt5261rVy/Tp2s76rxRgXWrlkeWP370kH49OtK5dVP2790VWT56aH/uB/inRvjxCrfZmDSwI19OHBKj7q/Tx5g8sBM9fd/kyK+7I8vv3rzG5EGdmdi/A5fOnwLAZrMyY0x/LGZzqsUel3UfdmLzxN78b3JfvvloQIx6y9Mgfpo3ic2TevPtxwN5eOsqACFBj9n26RA2T+zF1eO/Rbb/ce5Enj5+kFrhxyosJJhDKz5h9ye92T2tD
w+vno9Wf//iKbaPbs3ezwey9/OBXPhxHQCW4Ccc+GoEe6b3486pPyLbH1w6BfMTx/bpOZWEh7Nx6hFyQ98mtGzTlvGjR8bZxmazMfuLz6la7Y3Isi0b19Nv4GDy5M3H7Fkz+HRGeTZvWEf9hr54enmlRujxMhqN9Bk4lJdLlOLZ06f4dWxFpSqv4/NSkcg2GTNmYsDQURzYuzvasrt+3EHdBo15p049hg/sRfW3avHb/r0UK16K7DlypnZXYvh52wby5Pch5NnTGHVZc+Smy6Cx7NyyOlr5vh++oXmnPmTLlYf/LZ9L7w8/Zu/2LVR9uy4enp6pFXq8Ggz5BE/vTLHWHd+xnmz5i1Cn9zge373Bb2vmUH/wJ1w6tI9ir9fmpco1+eHLsfiUq8a1E3+QvUBR0mfOlso9iO7UN4vIWbwClTuNJNwahi3MEqNNtsKleK37uGhlt479QoHK75CvXHV+XzSBPK9W5e6Zg2TKVwTPTI7t03PG/08jZKXU1ykRSGwqVKxMxoyZ422zfu0q3q5dhyxZ//kwmEwmzBYLZrMZk8lEUGAg+/ftoUEj35QOOVGyZc/ByyVKAZAufXoKFS7M/YB70dpkyZqNEqVewWiK/p1pMpqwWCyEhoWilAGr1cqmtato3aFzaoUfp4f3/Tl16FfefLdxrPXZc+Uhf+GiKBX9Y2c0mQgNtRBqMWM0mXgWHMTJgwd4/Z36qRH2f/b4znXyliwLQObcBQh6cI9ngY8wGI1Yw0IJt4ahlCLcZuPMrm8o815zh8YbZn7Gw8tnKPhaHQAMJjfcvLwTtawyGrGFhWKzWlHKQLjNxuVftlLk7WYpGXKSKJX4h7OJd4SslNr6YhHwtlIqM4DWOva/vFTif+8ee3f/zLxFyzl7ZkxkeYvWbRk/ZiRhoaGMGjuRxQvm0qVHT6ecW7pz+xZ/XzhPydJlEtW+Vt36TBk7gp3fb6Vnvw/4dvN63q3fCE9Px4/81y/6gve79MMc8ixJy71dvzlLZ07CGhZGh74j2LZuKfVbdnKi7aXY8cVoUIqS1etRokb0L4qs+V/i6tHfyF30FfyvXCD4oT/PHt2naJW32bN4Ghd/30XlZl04u+87ir5eC5O7Y0f9zx7cxT19Jo6vm8WT21fInL8orzTpgckjelwPr11g72cD8MyUlVKNupIxd0Hyl6/JkdWfc+PwHko16MTV37aTv9LbmNw9HNSbmJRTTkYkTkJTFvmBs8BiQGNPyJWAz1M4rkSZMf1j+g8agtEY/c7WufPkZcES+0D+xvVrBAT441P4JcZ9OJywsDB69R1AIZ/Cjgg5mmfPnjF+5Af0GzyC9N6JG6F4e2fgk5lzAQgKfMLalUuZNO0Lpn80gaCgQFq17UjpMuVSMuxYnTh4gIyZslCoaAkunDqapGWz5czNsI/tffK/fYMnD++TJ78PSz6fiNUahm97P3LnK5gSYSdKo+Gfkz5zNkICH7Nj1odkyl2APC+/Gllftm4Lfl+/gP9N7kvWfD5kK1AEZTDi7pWe9/pPAuzzzCd3bqR2r7HsXzkLy7MgXq3dnFxFSqZ6f3S4jSe3LvFqUz+yFCrOqW8WcXH3JkrUax/ZJlP+ItQZsxiThxf3zh3m0LKPqDVqAW5e6akaMY0R+iyYi3s2U7nzKI5vmE1YSDBFajYhq0+JVO9TVE7zPf4vJDRlUQk4AowGnmit9wIhWut9Wut9cS2klPJTSh1WSh1etmRh8kX7gnNnTjN6xBAa16vF7p9+ZNpHk9i7++dobeZ+9QW9+g5g/ZpV1G3QiJ59+rNowdwUiymxrNYwxo/4gNrvNaDG27X/1TpWLJ5P+y492P3jdl4uUYoRYyaxaN6XyRxp4lw6d5LjB/czsltTFn46lgsnj7D48wlJXs+WlQvwbe/Hrm0beO2td2nctjvb1i5J/oCT4Pl8r1fGzBQqV42Aqxei1bt7padm58E0GzuHml2GYg5+QobsuaK1Ofb9GsrVb82lQ3vJX
rAoNTp+wOFvlqdWF6LxzJQdz0zZyVKoOAB5y1Tj8a3L0dq4eabD5GH/1ZWrZCXCbTYswYHR2vz10zqK1W7JrWO/kDl/Ecq1GsC57StTpxPxMKAS/XA28Y6QtdbhwEyl1MaI/95LaJmI5RYCCwECzeE6OQKNzbc7/km+E8aOonqNt3jrnX+S25HDB8mZMxcFC/lgNodgUAYMBgMWBx9pobXm08njKVj4JVq26/Sv1nHz+jUe3A+gXIXKXPzrAh4eHiilCLXE3DmTGpp16kOzTn0AuHDqKDv/t5ruQyYkaR0XTh0lc7Yc5MpbgFCLGaUMGIxGQi2OO9IizGJG63DcPdMRZjFz6+xRyjdoG62N5VkwJncPjCY3Lhz4gdzFXsXdK31k/ZN7t3j6+CF5Xi7DgxuXMbm5g1LYwkJTuzsAeGbMglfm7AT738Q7Z34C/j5BhlwForUxBz7CI0NmlFI8uv4X6HDc02eIrA8OuI35yUOyF3mFwFuXMbi5o4Bwq2P6FFVaHiEn6igLrfVNoIVSqgEQmFD75DJ6xBCOHD7I48ePaVDnLfx698NqtQLQvGXreJfVWrN00Xw+nj4TgKbNWzJ21DBsNhsjR49P8djjc+rEMX7csY2XihajW7v3AejRZwD37t4FwLd5Sx7cv0/Pzq149vQpShnYtG4lK9Z9Gzm1sXjel3TvbT8Eq9a79RgzbCCb162mS8++julUHL5dtZBCxUpS7rXqXPnrLHOnjrTvtDt0gG9XL2bS3DWAfXt9v2E5PUdMAaBG3SYs/mw8tnAb7XsPc1j8IYGP+Hn+ZMB+SF+RKm9R4JVKnNv3PQAlazbg8Z0b7Fv+GUoZyJynIDU6Doq2jsPfrqCSr/2Lt0jlt/hp3iRO7/6Wio07pG5noni1qR9HVs8g3BZG+qy5Kdd6IFd/2wGAT7V63Dn5K1d/24EyGDG6uVOx/bBoc/rnd6ykRD17/PnK1+Dgsqlc2b+N4nXbOaQ/UaXlU6eV1ik2gAVSdoTsKE8tVkeHkCL+vhfs6BCS3Z+3Hzs6hBThHxzm6BBSxPSGxf9zNt11/n6ic06tEtmdKns79XHIQgiRVGn5KAunPlNPCCGSKjmPQ1ZK1VVKXVBKXVRKxThDTSnloZRaH1H/p1LK54X6gkqpYKXU0MTELglZCOFSVBL+xbsepYzAHKAeUApoo5Qq9UKzbsAjrXVRYCYw7YX6mcCOxMYuCVkI4VIMKvGPBFQBLmqtL2utQ4F1wIun+/oCKyKebwJqqYi9n0qpJsBl4EyiY09sQyGESAsMSiX6EfWciYiHX5RV5QNuRHl9M6KM2Npora3AEyCbUio9MAKYmJTYZaeeEMKlJGWXXtRzJhK5qheP4IirzURgptY6OCmXAJCELIRwKcl4HPJNIOoZM/mB23G0uamUMgGZgIfAa8D7SqlPgcxAuFLKrLWeHd8bSkIWQriUZDzo7RBQTClVGLgFtAbavtBmK9AJ+B14H9it7Sd3VI+MR6kJQHBCyRgkIQshXE0yZWSttVUp1Q/YCRiBpVrrM0qpScBhrfVWYAmwUil1EfvIOP5TiBMgCVkI4VKS89RprfV2YPsLZeOiPDcDLRJYx4TEvp8kZCGES0m75+lJQhZCuJo0nJElIQshXEpavpaFJGQhhEtJw1fflIQshHAtaTgfS0IWQrgW57k5btJJQhZCuJQ0nI9TPiHbXO+GIWRJ7+7oEFJE2QKZHR1CssuSzjW3VZVGMS7N6xKmN0zwZLYEpeF8LCNkIYSLScMZWRKyEMKlyGFvQgjhJGQOWQghnIQkZCGEcBIyZSGEEE5CRshCCOEk0nA+loQshHAxaTgjS0IWQriU5LxAfWqThCyEcClpNx1LQhZCuJo0nJElIQshXIoc9iaEEE4iDU8hS0IWQriWNJyPJSELIVyLXKBeCCGcRBrOxxgcHUB8pkwYTb133qTt+41jrT9y+CC1qlehQ6umdGjVlCUL5gLw6OFD/Lq0p+37jdm35
+fI9sMG9SXA3z9VYo/PuDGjeKv66zTzbRhrvdaaT6ZOoWHdOrzftBHnzp4B4OqVy7Ru0YwWTRtz4vgxAKxWK37dOhMSEpJa4cdp8vjRvPf2G7Ru3ijW+iOHDvL2m5Vp17Ip7Vo2ZfGCOYB9e/Xo3I7WzRuxd/c/22uoE2yv0FALI/t0ZEiP1gzq2oL1y+fHaLNt4yoGdXmfwd1bMWFoLwLu3QHg1o2rDO/VjiE9WnPhzEkAbDYrE4f1xmJO/e01f3w7ru36mMMbP4wsmzqoCcf/N4aD60ex/vMeZPL2inXZ899P5NCGD/lj3UgOrB4eWT5lgC8H149i8eQOkWVtGlSmb5u3UqwfCVFJeDgbp07IDRo1ZeachfG2KVe+IivXb2Hl+i1069kHgB9/+J76jXxZtGItq1YsA2D/vj0UL1mKHDlzpnjcCfFt0ox5CxbHWX9g/y9cv3aVbTt+ZNyEyUyZNAGAjRvWM/CDIXz2xZesWLYUgA3r19KwkS9eXrH/IaWmBo2bMGtuwttr9YYtrN6whe49+wL27dWgkS9Lvl7HqhX2fu3ft4fiJRy/vdzc3Bn/+Xw+X7SOzxau4dih3/jr7KlobQoXLc60eSuZsXg9r9eoxcqFswD4adv/aNe9P0PHf8rWjSsB2Ll1EzVr18fDM/W318ptf+Dbd060sl1/nKdii6lUafUxf1/zZ1jXd+Ncvq7fLKq2/oQ3230KQEZvT6qWLUyVVh9jNBgoXTQvnh5udGhUlQUbf0nRvsQrDWdkp07I5StWImOmTElezmRyw2I2ExYaisGgsFqtrF/zNe07dk2BKJOuYqXK8fZrz+5dNGrcBKUUZcqWIygokIAAf0wmExazGXOIGZObicDAQPbt3UMj3yapGH3cKlSsTMaMSb8NlNFkwmKxEBYaijIYsFqtrF39NR06OX57KaXw8koHgM1qxWa1xvhDfqV85cgEW6zkqzwIsI/qjSYToaEWLBYzRqOJp8FBHP79F2q+G/svo5T269FLPHzyLFrZrj/OY7OFA3Dw1BXy5Ur89gsP17i72Wc9vTzcCLPa+KBTLeau24vVGp58gSeRSsI/Z5OkhKyUelMpNVgpFffXaCo7dfI47Vs2ZVBfPy5f+huA9+o14M/ff2VQXz+69+zL5g1rqdfAF08nGEUmhr//PXLlzh35Oleu3Pjfu0frNu1Y+fVypkwaT/cePVkwbw49/HqlqZ0Yp04ep23LJgzs68eli/btVbdeQ/747QAD+vagRy/79qrf0Hm2l81mY6hfG7o1r0OZilV5ueSrcbbdveNbylepBkBd3xZs27SahTOn0qxtVzZ+vYjm7bo57fbq6Ps6O389G2ud1pptc/vx6+rhdG32BgDBzyx8s+s4f6wbydXbDwgMDqFiqUJ8t/dUrOtILUol/uFs4t2pp5Q6qLWuEvG8B9AX2AKMV0pV0Fp/kgoxxqlEiVJ8s/1n0qVLz2/79zH8g/5s2voD3hkyMOMr+1xfYOATVi5fwiefz2LqpHEEBQbStkNnXi1bzpGhx0/HvDGsUoo8efOyZLn9p+/1a9cICPCncOGX+HDkMMLCwujbfyA+PoVTO9pEK16yFFt37CJduvT8un8fwz/ox+ZtO/HOkIGZsxcAEdtr2WKmzfiSjyaOJSjIvr3KlC3vsLiNRiOfLVzL0+AgPh03hOtXLlKwcNEY7X75aTuX/jrLpBmLAMiRKw+TZtincO7cusHDBwHkK+DDlx+PxWoNo3Xn3uQtUChV+xKX4d3ew2YLZ932Q7HWv9NlJncCnpAjizffze/Hhat3+fXoJWas+JkZK+zz/nPHtWXyvO/p3PR1alctyam/bzFt8c7U7AYABidMtImV0AjZLcpzP6CO1noi8C7QLq6FlFJ+SqnDSqnDy5cuSoYwY5fe25t06dIDUK16TaxWK48fPYrWZunCeXTu1pMff9hOiZKlGDNhCvNmf5FiMSWHnLlyc+/u3cjX9+7djTGX+tWXM+nbf
yBrVq+kQYNG9OnbnwVz//sde1OSd5Tt9UYc22vxgrl06d6TH3d8T4lSpRkz4SPmfeUc2yu9dwZKl6vEsUO/xag7eeRPNq9ZwsjJM3Fzj3mn67VL59CmS2+2b1lH9Vr1aNWpJxtWxj/fnlraNXqN+jVeofPo5XG2uRPwBICAR8Fs3X2SyqV9otWXLZ4fgL+v+dOu4Wu0H7GU0kXzUqRgjpQKOx7JN4mslKqrlLqglLqolIpxq2+llIdSan1E/Z9KKZ+I8jpKqSNKqVMR/30nMZEnlJANSqksSqlsgNJaBwBorZ8C1rgW0lov1FpX0lpX6ty1R2Li+Fce3A9AR4wmz5w+idbhZMr8zxzY9WtXCQjwp0KlyljMZgwGAyhFqMWSYjElh7fefodtW79Ba83JE8fx9s5Ajhz/JOTDhw6SM2cuChXywRwSgjIYMBiNhJjNDow6Yfejbq9TJwnXOsb2uh/gT4VKVTCbzRiUAaUUllDHba8njx/xNDgIAIvFzMkjf5KvgE+0Npf/Ps+CmR8xcvJMMmXJGmMdZ04cIWu2nOTJXxCLxYwyKAwGI6FOsL3qVCvJkM61eX/QAkLMYbG2Sefpjnc6j8jntV8vwZlLt6O1GdenIZPnfY+byYgxYogaHq5J5xnzyymlJdeUhVLKCMwB6gGlgDZKqVIvNOsGPNJaFwVmAtMiyu8DjbTWrwKdgJWJiT2h45AzAUewf5VopVRurfVdpZQ3qbCPcuzIoRw9cpDHjx/T6L236dGrH1ar/UPTrEVrdv/8I//buA6j0YSHpweTP/482vzcgjmz6Nl3IAB16tZnxAf9Wb9mJT1690/p0OM1YuhgDh86yOPHj6jzTg169+2P1Wr/fmvZqg3Va9TkwC/7aFivDp6eXkyaMjVyWa01ixbMY/rn9lFj8xatGDViKDabldFjJziiO5HGjBzCkcP27dXw3bfo0btfZL+aR2yvzRvWYjSZ8PTw4KNPom+vebNn0buffXu9W68Bwwb1Y92ar+nZZ4BD+gPw6MF9Zn86nnCbDa011WrWptLrNVi3bB5FipeicrWarFw4C3NICJ9PGgFA9py5GTllJmDfXptXLWHwOPvsXp0GzZg1dTQ2mw2/QaNStS8rPu5M9YrFyJ7Zm4s/TGby/O0M6/IuHu4mvpvXD4CDp64y4KN15MmRibnj2tK0/zxyZsvA+hn2gZXJaGT9jsP89Nu5yPU2eqsMR85cixxF/3nyKoc2fMjpv29x6q9bqdpHSNbEVAW4qLW+DKCUWgf4AlEn2n2BCRHPNwGzlVJKa30sSpszgKdSykNrHe/oQulY5isTopRKB+TSWl9JqO2jZ7akv4GT83I3OjqEFGEJc9ye8ZRy/cGzhBulQVUaxfj17BJCjs3+z/n0zpPQROecvJk9emKfjn1uodZ6IYBS6n2grta6e8TrDsBrWut+zxsrpU5HtLkZ8fpSRJv7Udq8D/TSWtdOKJ5/daae1voZkGAyFkKI1JaUo1gikm9ck/mxrejFZB9vG6VUaezTGIk6Ms2pj0MWQoikSsbzQm4CBaK8zg/cjquNUsqEfZr3YcTr/NiPSuuotb6UmNglIQshXEoyHod8CCimlCqslHIHWgNbX2izFftOO4D3gd1aa62Uygx8D4zSWv+a2NglIQshXEpynamntbYC/YCdwDlgg9b6jFJqklLq+QV2lgDZlFIXgcHA88n9fkBRYKxS6njEI8HrAPyrnXpJITv10g7ZqZd2yE69uAUEWxOdc3J4m5zqNBK5/KYQwqU4VYZNIknIQgiXYnDGi1QkkiRkIYRLScP5WHbqCSGEs5ARshDCpaTlEbIkZCGES3HGC88nliRkIYRLkRGyEEI4CUnIQgjhJGTKQgghnISMkIUQwkmk4XwsCVkI4WLScEaWhCyEcClp+dTpFL/aW2pSSvk9v/2KK3HFfrlin8A1++WKfXJWrnbqtF/CTdIkV+yXK/YJXLNfrtgnp+RqCVkIIdIsSchCC
OEkXC0hu+o8lyv2yxX7BK7ZL1fsk1NyqZ16QgiRlrnaCFkIIdIsl0jISqmlSil/pdRpR8eSXJRSBePgQ+sAAAJGSURBVJRSe5RS55RSZ5RSAx0dU3JQSnkqpQ4qpU5E9Guio2NKLkopo1LqmFLqO0fHklyUUleVUqci7pp82NHxuDqXmLJQStUAgoH/a9/uWZyI4iiMP6fYQqJiIxJMsZ2NhcqyTToR8WURSwutLC0UC8EvIX4AbUTRZrUSREFFFtYXsqwWrqWFKAQR0W31WGS+QSbcyfX8YMike9L8uXPn5o7tg6V72iCpD/Rtb0jaBYyAs7Y/Fk6biiQBPdvbkhaANeCy7deF06Ym6SqwBOy2vVK6pw2SPgNLtr+XbvkfVLFCtv0K+FG6o022v9neaO5/A1vA/rJV0/PEdvN1obnmflUgaQCcBm6Vbon5VcVArp2kReAw8KZsSTuaR/tNYAw8s13D77oJXAP+lg5pmYGnkkaS8geRGctA7jhJO4FV4IrtX6V72mD7j+1DwABYljTX20ySVoCx7VHplhkY2j4CnAQuNduDMSMZyB3W7LGuAvdsPyzd0zbbP4GXwInCKdMaAmea/dYHwFFJd8smtcP21+ZzDDwClssW1S0DuaOal1+3gS3bN0r3tEXSXkl7mvsdwDHgU9mq6di+bntgexE4Bzy3fb5w1tQk9ZoXykjqAceBak4ydVEVA1nSfWAdOCDpi6SLpZtaMAQuMFltbTbXqdJRLegDLyR9AN4x2UOu5phYZfYBa5LeA2+Bx7afFG6qWhXH3iIialDFCjkiogYZyBERHZGBHBHRERnIEREdkYEcEdERGcgRER2RgRwR0REZyBERHfEPDkQ0fjPnGGcAAAAASUVORK5CYII=\n",
1096 "text/plain": [
1097 "<Figure size 432x288 with 2 Axes>"
1098 ]
1099 },
1100 "metadata": {
1101 "needs_background": "light"
1102 },
1103 "output_type": "display_data"
1104 }
1105 ],
1106 "source": [
1107 "# Heatmap of cm, each cell shown as a share of the matrix total\n",
1108 "sns.heatmap(\n",
1109 "    data=pd.DataFrame(data=cm / np.sum(cm), index=stars, columns=stars),\n",
1110 "    fmt='.1%',\n",
1111 "    cmap='Blues',\n",
1112 "    annot=True)"
1113 ]
1114 },
1115 {
1116 "cell_type": "code",
1117 "execution_count": 81,
1118 "metadata": {
1119 "ExecuteTime": {
1120 "end_time": "2018-12-28T02:45:12.284710Z",
1121 "start_time": "2018-12-28T02:45:12.277127Z"
1122 }
1123 },
1124 "outputs": [
1125 {
1126 "data": {
1127 "text/plain": [
1128 "0.44955063467061984"
1129 ]
1130 },
1131 "execution_count": 81,
1132 "metadata": {},
1133 "output_type": "execute_result"
1134 }
1135 ],
1136 "source": [
1137 "accuracy_score(y_test, y_pred)  # fraction of test labels predicted exactly"
1138 ]
1139 },
1140 {
1141 "cell_type": "code",
1142 "execution_count": 36,
1143 "metadata": {
1144 "ExecuteTime": {
1145 "end_time": "2018-12-28T02:30:21.727266Z",
1146 "start_time": "2018-12-28T02:30:21.428123Z"
1147 }
1148 },
1149 "outputs": [
1150 {
1151 "data": {
1152 "text/plain": [
1153 "0.8614708105573701"
1154 ]
1155 },
1156 "execution_count": 36,
1157 "metadata": {},
1158 "output_type": "execute_result"
1159 }
1160 ],
1161 "source": [
1162 "roc_auc_score(y_true=y_test, y_score=lgb_model.predict(X_test), multi_class='ovr')  # one-vs-rest AUC for multi-column scores; multi_class is only used for multiclass targets"
1163 ]
1164 },
1165 {
1166 "cell_type": "code",
1167 "execution_count": 55,
1168 "metadata": {
1169 "ExecuteTime": {
1170 "end_time": "2018-12-27T23:56:19.836967Z",
1171 "start_time": "2018-12-27T23:56:19.660296Z"
1172 }
1173 },
1174 "outputs": [
1175 {
1176 "data": {
1177 "text/html": [
1178 "<div>\n",
1179 "<style scoped>\n",
1180 " .dataframe tbody tr th:only-of-type {\n",
1181 " vertical-align: middle;\n",
1182 " }\n",
1183 "\n",
1184 " .dataframe tbody tr th {\n",
1185 " vertical-align: top;\n",
1186 " }\n",
1187 "\n",
1188 " .dataframe thead th {\n",
1189 " text-align: right;\n",
1190 " }\n",
1191 "</style>\n",
1192 "<table border=\"1\" class=\"dataframe\">\n",
1193 " <thead>\n",
1194 " <tr style=\"text-align: right;\">\n",
1195 " <th></th>\n",
1196 " <th>0</th>\n",
1197 " <th>1</th>\n",
1198 " <th>2</th>\n",
1199 " </tr>\n",
1200 " </thead>\n",
1201 " <tbody>\n",
1202 " <tr>\n",
1203 " <th>count</th>\n",
1204 " <td>139717.000000</td>\n",
1205 " <td>139717.000000</td>\n",
1206 " <td>139717.000000</td>\n",
1207 " </tr>\n",
1208 " <tr>\n",
1209 " <th>mean</th>\n",
1210 " <td>0.630986</td>\n",
1211 " <td>0.209083</td>\n",
1212 " <td>0.159931</td>\n",
1213 " </tr>\n",
1214 " <tr>\n",
1215 " <th>std</th>\n",
1216 " <td>0.007147</td>\n",
1217 " <td>0.005648</td>\n",
1218 " <td>0.004706</td>\n",
1219 " </tr>\n",
1220 " <tr>\n",
1221 " <th>min</th>\n",
1222 " <td>0.502827</td>\n",
1223 " <td>0.152388</td>\n",
1224 " <td>0.110754</td>\n",
1225 " </tr>\n",
1226 " <tr>\n",
1227 " <th>25%</th>\n",
1228 " <td>0.629275</td>\n",
1229 " <td>0.206945</td>\n",
1230 " <td>0.158686</td>\n",
1231 " </tr>\n",
1232 " <tr>\n",
1233 " <th>50%</th>\n",
1234 " <td>0.630822</td>\n",
1235 " <td>0.208772</td>\n",
1236 " <td>0.160465</td>\n",
1237 " </tr>\n",
1238 " <tr>\n",
1239 " <th>75%</th>\n",
1240 " <td>0.632655</td>\n",
1241 " <td>0.210202</td>\n",
1242 " <td>0.161593</td>\n",
1243 " </tr>\n",
1244 " <tr>\n",
1245 " <th>max</th>\n",
1246 " <td>0.700247</td>\n",
1247 " <td>0.337525</td>\n",
1248 " <td>0.262446</td>\n",
1249 " </tr>\n",
1250 " </tbody>\n",
1251 "</table>\n",
1252 "</div>"
1253 ],
1254 "text/plain": [
1255 " 0 1 2\n",
1256 "count 139717.000000 139717.000000 139717.000000\n",
1257 "mean 0.630986 0.209083 0.159931\n",
1258 "std 0.007147 0.005648 0.004706\n",
1259 "min 0.502827 0.152388 0.110754\n",
1260 "25% 0.629275 0.206945 0.158686\n",
1261 "50% 0.630822 0.208772 0.160465\n",
1262 "75% 0.632655 0.210202 0.161593\n",
1263 "max 0.700247 0.337525 0.262446"
1264 ]
1265 },
1266 "execution_count": 55,
1267 "metadata": {},
1268 "output_type": "execute_result"
1269 }
1270 ],
1271 "source": [
1272 "pd.DataFrame(data=lgb_model.predict(X_test)).describe()  # per-column summary statistics of the model output"
1273 ]
1274 },
1275 {
1276 "cell_type": "markdown",
1277 "metadata": {},
1278 "source": [
1279 "## Random Forest"
1280 ]
1281 },
1282 {
1283 "cell_type": "code",
1284 "execution_count": 28,
1285 "metadata": {
1286 "ExecuteTime": {
1287 "end_time": "2018-12-27T23:40:48.817474Z",
1288 "start_time": "2018-12-27T23:35:53.942018Z"
1289 }
1290 },
1291 "outputs": [
1292 {
1293 "name": "stdout",
1294 "output_type": "stream",
1295 "text": [
1296 "Accuracy: 63.39%\n"
1297 ]
1298 }
1299 ],
1300 "source": [
1301 "# Fixed random_state so the bootstrap/feature sampling, and the accuracy printed below, are reproducible\n",
1302 "rf = RandomForestClassifier(n_jobs=-1, n_estimators=100, random_state=42,\n",
1303 "                            class_weight='balanced_subsample')\n",
1304 "rf.fit(X_train, y_train)\n",
1305 "y_pred = rf.predict(X_test)\n",
1306 "print(f'Accuracy: {accuracy_score(y_true=y_test, y_pred=y_pred):.2%}')"
1307 ]
1308 },
1309 {
1310 "cell_type": "code",
1311 "execution_count": 38,
1312 "metadata": {
1313 "ExecuteTime": {
1314 "end_time": "2018-12-27T23:50:06.311958Z",
1315 "start_time": "2018-12-27T23:50:04.657072Z"
1316 }
1317 },
1318 "outputs": [],
1319 "source": [
1320 "y_pred_prob = rf.predict_proba(X=X_test)  # per-class probabilities from the fitted forest"
1321 ]
1322 },
1323 {
1324 "cell_type": "code",
1325 "execution_count": 39,
1326 "metadata": {
1327 "ExecuteTime": {
1328 "end_time": "2018-12-27T23:50:16.664162Z",
1329 "start_time": "2018-12-27T23:50:16.598883Z"
1330 }
1331 },
1332 "outputs": [
1333 {
1334 "data": {
1335 "text/html": [
1336 "<div>\n",
1337 "<style scoped>\n",
1338 " .dataframe tbody tr th:only-of-type {\n",
1339 " vertical-align: middle;\n",
1340 " }\n",
1341 "\n",
1342 " .dataframe tbody tr th {\n",
1343 " vertical-align: top;\n",
1344 " }\n",
1345 "\n",
1346 " .dataframe thead th {\n",
1347 " text-align: right;\n",
1348 " }\n",
1349 "</style>\n",
1350 "<table border=\"1\" class=\"dataframe\">\n",
1351 " <thead>\n",
1352 " <tr style=\"text-align: right;\">\n",
1353 " <th></th>\n",
1354 " <th>0</th>\n",
1355 " <th>1</th>\n",
1356 " <th>2</th>\n",
1357 " </tr>\n",
1358 " </thead>\n",
1359 " <tbody>\n",
1360 " <tr>\n",
1361 " <th>count</th>\n",
1362 " <td>139717.000000</td>\n",
1363 " <td>139717.000000</td>\n",
1364 " <td>139717.000000</td>\n",
1365 " </tr>\n",
1366 " <tr>\n",
1367 " <th>mean</th>\n",
1368 " <td>0.635541</td>\n",
1369 " <td>0.207648</td>\n",
1370 " <td>0.156812</td>\n",
1371 " </tr>\n",
1372 " <tr>\n",
1373 " <th>std</th>\n",
1374 " <td>0.048588</td>\n",
1375 " <td>0.041110</td>\n",
1376 " <td>0.036620</td>\n",
1377 " </tr>\n",
1378 " <tr>\n",
1379 " <th>min</th>\n",
1380 " <td>0.190000</td>\n",
1381 " <td>0.050000</td>\n",
1382 " <td>0.030000</td>\n",
1383 " </tr>\n",
1384 " <tr>\n",
1385 " <th>25%</th>\n",
1386 " <td>0.600000</td>\n",
1387 " <td>0.180000</td>\n",
1388 " <td>0.130000</td>\n",
1389 " </tr>\n",
1390 " <tr>\n",
1391 " <th>50%</th>\n",
1392 " <td>0.640000</td>\n",
1393 " <td>0.210000</td>\n",
1394 " <td>0.160000</td>\n",
1395 " </tr>\n",
1396 " <tr>\n",
1397 " <th>75%</th>\n",
1398 " <td>0.670000</td>\n",
1399 " <td>0.230000</td>\n",
1400 " <td>0.180000</td>\n",
1401 " </tr>\n",
1402 " <tr>\n",
1403 " <th>max</th>\n",
1404 " <td>0.860000</td>\n",
1405 " <td>0.740000</td>\n",
1406 " <td>0.340000</td>\n",
1407 " </tr>\n",
1408 " </tbody>\n",
1409 "</table>\n",
1410 "</div>"
1411 ],
1412 "text/plain": [
1413 " 0 1 2\n",
1414 "count 139717.000000 139717.000000 139717.000000\n",
1415 "mean 0.635541 0.207648 0.156812\n",
1416 "std 0.048588 0.041110 0.036620\n",
1417 "min 0.190000 0.050000 0.030000\n",
1418 "25% 0.600000 0.180000 0.130000\n",
1419 "50% 0.640000 0.210000 0.160000\n",
1420 "75% 0.670000 0.230000 0.180000\n",
1421 "max 0.860000 0.740000 0.340000"
1422 ]
1423 },
1424 "execution_count": 39,
1425 "metadata": {},
1426 "output_type": "execute_result"
1427 }
1428 ],
1429 "source": [
1430 "pd.DataFrame(data=y_pred_prob).describe()  # per-column summary statistics of y_pred_prob"
1431 ]
1432 },
1433 {
1434 "cell_type": "code",
1435 "execution_count": 36,
1436 "metadata": {
1437 "ExecuteTime": {
1438 "end_time": "2018-12-27T23:49:06.348646Z",
1439 "start_time": "2018-12-27T23:49:06.325988Z"
1440 }
1441 },
1442 "outputs": [
1443 {
1444 "data": {
1445 "text/plain": [
1446 "0 139715\n",
1447 "1 2\n",
1448 "dtype: int64"
1449 ]
1450 },
1451 "execution_count": 36,
1452 "metadata": {},
1453 "output_type": "execute_result"
1454 }
1455 ],
1456 "source": [
1457 "pd.Series(data=y_pred).value_counts()  # number of predictions per distinct label"
1458 ]
1459 },
1460 {
1461 "cell_type": "code",
1462 "execution_count": 32,
1463 "metadata": {
1464 "ExecuteTime": {
1465 "end_time": "2018-12-27T23:47:01.765414Z",
1466 "start_time": "2018-12-27T23:47:01.758140Z"
1467 }
1468 },
1469 "outputs": [
1470 {
1471 "data": {
1472 "text/plain": [
1473 "0 354263\n",
1474 "1 116809\n",
1475 "2 87796\n",
1476 "dtype: int64"
1477 ]
1478 },
1479 "execution_count": 32,
1480 "metadata": {},
1481 "output_type": "execute_result"
1482 }
1483 ],
1484 "source": [
1485 "pd.Series(data=y_train).value_counts()  # number of training rows per distinct label"
1486 ]
1487 },
1488 {
1489 "cell_type": "code",
1490 "execution_count": 33,
1491 "metadata": {
1492 "ExecuteTime": {
1493 "end_time": "2018-12-27T23:47:24.456819Z",
1494 "start_time": "2018-12-27T23:47:24.444497Z"
1495 }
1496 },
1497 "outputs": [
1498 {
1499 "data": {
1500 "text/plain": [
1501 "0.6338956605137528"
1502 ]
1503 },
1504 "execution_count": 33,
1505 "metadata": {},
1506 "output_type": "execute_result"
1507 }
1508 ],
1509 "source": [
1510 "np.mean(y_test == 0)  # share of test labels equal to 0"
1511 ]
1512 },
1513 {
1514 "cell_type": "code",
1515 "execution_count": 29,
1516 "metadata": {
1517 "ExecuteTime": {
1518 "end_time": "2018-12-27T23:40:48.919212Z",
1519 "start_time": "2018-12-27T23:40:48.819056Z"
1520 }
1521 },
1522 "outputs": [
1523 {
1524 "data": {
1525 "text/plain": [
1526 "array([[88564, 2, 0],\n",
1527 " [29202, 0, 0],\n",
1528 " [21949, 0, 0]])"
1529 ]
1530 },
1531 "execution_count": 29,
1532 "metadata": {},
1533 "output_type": "execute_result"
1534 }
1535 ],
1536 "source": [
1537 "confusion_matrix(y_test, y_pred)  # rows: true labels, columns: predicted labels"
1538 ]
1539 },
1540 {
1541 "cell_type": "markdown",
1542 "metadata": {},
1543 "source": [
1544 "## Logistic Regression"
1545 ]
1546 },
1547 {
1548 "cell_type": "markdown",
1549 "metadata": {},
1550 "source": [
1551 "### Binary Classification"
1552 ]
1553 },
1554 {
1555 "cell_type": "code",
1556 "execution_count": 44,
1557 "metadata": {
1558 "ExecuteTime": {
1559 "end_time": "2018-12-28T01:29:09.839140Z",
1560 "start_time": "2018-12-28T01:29:04.044264Z"
1561 }
1562 },
1563 "outputs": [
1564 {
1565 "name": "stdout",
1566 "output_type": "stream",
1567 "text": [
1568 "Accuracy: 50.05%\n"
1569 ]
1570 }
1571 ],
1572 "source": [
1573 "# Logistic regression with scikit-learn's default hyperparameters\n",
1574 "lr = LogisticRegression().fit(X_train, y_train)  # fit() returns the estimator itself\n",
1575 "y_pred = lr.predict(X_test)\n",
1576 "print(f'Accuracy: {accuracy_score(y_true=y_test, y_pred=y_pred):.2%}')"
1577 ]
1578 },
1579 {
1580 "cell_type": "markdown",
1581 "metadata": {},
1582 "source": [
1583 "### Multinomial Classification"
1584 ]
1585 },
1586 {
1587 "cell_type": "code",
1588 "execution_count": 100,
1589 "metadata": {
1590 "ExecuteTime": {
1591 "end_time": "2018-12-28T04:17:30.953069Z",
1592 "start_time": "2018-12-28T04:17:16.299083Z"
1593 }
1594 },
1595 "outputs": [
1596 {
1597 "name": "stdout",
1598 "output_type": "stream",
1599 "text": [
1600 "Accuracy: 34.60%\n"
1601 ]
1602 }
1603 ],
1604 "source": [
1605 "# Multinomial logistic regression; class_weight='balanced' weights classes inversely to their frequency\n",
1606 "lr = LogisticRegression(class_weight='balanced', solver='lbfgs', multi_class='multinomial').fit(X_train, y_train)\n",
1607 "y_pred = lr.predict(X_test)\n",
1608 "print(f'Accuracy: {accuracy_score(y_true=y_test, y_pred=y_pred):.2%}')"
1609 ]
1610 },
1611 {
1612 "cell_type": "code",
1613 "execution_count": 101,
1614 "metadata": {
1615 "ExecuteTime": {
1616 "end_time": "2018-12-28T04:18:01.235296Z",
1617 "start_time": "2018-12-28T04:18:01.179436Z"
1618 }
1619 },
1620 "outputs": [
1621 {
1622 "data": {
1623 "text/plain": [
1624 "array([[ 7624, 3105, 1792, 2075, 4929],\n",
1625 " [ 4957, 5365, 3126, 2027, 4111],\n",
1626 " [ 3142, 3631, 4546, 3171, 5017],\n",
1627 " [ 2301, 1756, 3307, 4580, 7430],\n",
1628 " [ 1742, 888, 1363, 3653, 11499]])"
1629 ]
1630 },
1631 "execution_count": 101,
1632 "metadata": {},
1633 "output_type": "execute_result"
1634 }
1635 ],
1636 "source": [
1637 "confusion_matrix(y_test, y_pred)  # rows: true labels, columns: predicted labels"
1638 ]
1639 },
1640 {
1641 "cell_type": "code",
1642 "execution_count": null,
1643 "metadata": {},
1644 "outputs": [],
1645 "source": []
1646 }
1647 ],
1648 "metadata": {
1649 "kernelspec": {
1650 "display_name": "Python 3",
1651 "language": "python",
1652 "name": "python3"
1653 },
1654 "language_info": {
1655 "codemirror_mode": {
1656 "name": "ipython",
1657 "version": 3
1658 },
1659 "file_extension": ".py",
1660 "mimetype": "text/x-python",
1661 "name": "python",
1662 "nbconvert_exporter": "python",
1663 "pygments_lexer": "ipython3",
1664 "version": "3.6.8"
1665 },
1666 "toc": {
1667 "base_numbering": 1,
1668 "nav_menu": {},
1669 "number_sections": true,
1670 "sideBar": true,
1671 "skip_h1_title": false,
1672 "title_cell": "Table of Contents",
1673 "title_sidebar": "Contents",
1674 "toc_cell": false,
1675 "toc_position": {},
1676 "toc_section_display": true,
1677 "toc_window_display": false
1678 }
1679 },
1680 "nbformat": 4,
1681 "nbformat_minor": 2
1682 }