ml-finance-python
Python scripts for machine learning in finance
git clone https://9o.is/git/ml-finance-python.git
d2v_test.py
(2015B)
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 __author__ = 'Stefan Jansen'
4
5 from pathlib import Path
6 import numpy as np
7 import pandas as pd
8 from gensim.models import Doc2Vec
9 from gensim.models.doc2vec import TaggedDocument
10 import logging
11 from random import shuffle
12 from sklearn.model_selection import train_test_split
13 from sklearn.linear_model import LogisticRegression
14 from sklearn.ensemble import RandomForestClassifier
15 from sklearn.metrics import confusion_matrix, accuracy_score
16
# Seed NumPy's global random number generator so runs are repeatable.
np.random.seed(42)
# Keep wide DataFrame reprs from being wrapped across multiple lines.
pd.set_option('display.expand_frame_repr', False)

# Write log records of level DEBUG and above to test.log, with a
# time-of-day timestamp in front of each record.
logging.basicConfig(
    level=logging.DEBUG,
    filename='test.log',
    datefmt='%H:%M:%S',
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)

# Review sample; train_model() unpacks each row as (stars, text) and the
# classification code below reads the 'stars' column as the label.
df = pd.read_csv('yelp_sample.csv')
27
28
def train_model():
    """Train a Doc2Vec model on the reviews in ``df`` and save it to disk.

    Every row of the module-level ``df`` becomes one ``TaggedDocument``: the
    words are the whitespace-split review text and the single tag is the
    row's index label. The model is trained in ten passes of ``model.epochs``
    (one) epoch each, reshuffling the documents before every pass, and is
    then written to ``test.model``.
    """
    # Unpacking each row as (stars, text) requires df to have exactly these
    # two columns, in this order.
    sentences = [
        TaggedDocument(words=text.split(), tags=[i])
        for i, (_stars, text) in df.iterrows()
    ]

    print('start training')
    model = Doc2Vec(vector_size=300, window=5, min_count=5, workers=8, epochs=1)
    print('build vocab')
    model.build_vocab(sentences)

    print('keep training')

    # NOTE(review): gensim recommends a single train() call with epochs=N;
    # calling train() repeatedly appears to restart the learning-rate
    # schedule on every pass. Kept as-is to preserve the original procedure.
    for epoch in range(10):
        print(epoch, end=' ', flush=True)
        shuffle(sentences)
        # `model.epochs` replaces the legacy `model.iter` attribute, matching
        # the `epochs=` keyword used in the constructor above.
        model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)

    # Word-similarity queries live on the KeyedVectors object (`model.wv`).
    print(model.wv.most_similar('good'))
    # save model
    model.save('test.model')
49
50
# Load the model written by train_model(); test.model must already exist.
model = Doc2Vec.load('test.model')

# Feature matrix: one row per review. The width is taken from the loaded
# model instead of a hard-coded 300, so it cannot drift from the training
# configuration.
X = np.zeros(shape=(len(df), model.vector_size))
for i in range(len(df)):
    # Integer lookup returns the vector of the document tagged `i`
    # (train_model tags each document with its row index).
    X[i] = model[i]

# Labels: the star rating of each review, as float64 like the original
# zero-initialised array. df has the default 0..n-1 index from read_csv,
# so positional order matches the document tags.
y = np.asarray(df['stars'], dtype=np.float64)
57
58
# Hold out 20% of the reviews for evaluation; fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE: the logistic regression is instantiated but never fitted or used.
log = LogisticRegression()
rf = RandomForestClassifier(n_jobs=-1, n_estimators=100)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Majority-class baseline: always predict the most frequent training label.
# Series.mode() returns a Series (several values on ties), so take the first
# entry to get a scalar fill value.
mode = pd.Series(y_train).mode().iloc[0]

# Accuracy of the random forest on the held-out reviews.
print(accuracy_score(y_true=y_test, y_pred=y_pred))
# Accuracy of the baseline, scored against the true labels (the original
# compared the baseline to the model's predictions instead).
print(accuracy_score(y_true=y_test, y_pred=np.full_like(y_test, fill_value=mode)))
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
69 print(confusion_matrix(y_true=y_test, y_pred=y_pred))