ml-finance-python

python scripts for finance machine learning

git clone https://9o.is/git/ml-finance-python.git

d2v_test.py

(2015B)


      1 #!/usr/bin/env python
      2 # -*- coding: utf-8 -*-
      3 __author__ = 'Stefan Jansen'
      4 
      5 from pathlib import Path
      6 import numpy as np
      7 import pandas as pd
      8 from gensim.models import Doc2Vec
      9 from gensim.models.doc2vec import TaggedDocument
     10 import logging
     11 from random import shuffle
     12 from sklearn.model_selection import train_test_split
     13 from sklearn.linear_model import LogisticRegression
     14 from sklearn.ensemble import RandomForestClassifier
     15 from sklearn.metrics import confusion_matrix, accuracy_score
     16 
     17 pd.set_option('display.expand_frame_repr', False)
     18 np.random.seed(42)
     19 
     20 logging.basicConfig(
     21         filename='test.log',
     22         level=logging.DEBUG,
     23         format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
     24         datefmt='%H:%M:%S')
     25 
     26 df = pd.read_csv('yelp_sample.csv')
     27 
     28 
     29 def train_model():
     30     sentences = []
     31     for i, (stars, text) in df.iterrows():
     32         sentences.append(TaggedDocument(words=text.split(), tags=[i]))
     33 
     34     print('start training')
     35     model = Doc2Vec(vector_size=300, window=5, min_count=5, workers=8, epochs=1)
     36     print('build vocab')
     37     model.build_vocab(sentences)
     38 
     39     print('keep training')
     40 
     41     for epoch in range(10):
     42         print(epoch, end=' ', flush=True)
     43         shuffle(sentences)
     44         model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
     45 
     46     print(model.most_similar('good'))
     47     # save model
     48     model.save('test.model')
     49 
     50 
     51 model = Doc2Vec.load('test.model')
     52 X = np.zeros(shape=(len(df), 300))
     53 y = np.zeros(shape=len(df))
     54 for i in range(len(df)):
     55     X[i] = model[i]
     56     y[i] = df.loc[i, 'stars']
     57 
     58 
     59 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
     60 
     61 log = LogisticRegression()
     62 rf = RandomForestClassifier(n_jobs=-1, n_estimators=100)
     63 rf.fit(X_train, y_train)
     64 y_pred = rf.predict(X_test)
     65 
     66 mode = pd.Series(y_train).mode()
     67 print(accuracy_score(y_true=y_test, y_pred=y_pred))
     68 print(accuracy_score(y_true=np.full_like(y_test, fill_value=mode), y_pred=y_pred))
     69 print(confusion_matrix(y_true=y_test, y_pred=y_pred))