ml-finance-python

Python scripts for machine learning in finance

git clone https://9o.is/git/ml-finance-python.git

word2vec.py

(2821B)


      1 # coding: utf-8
      2 
      3 from pathlib import Path
      4 from argparse import ArgumentParser
      5 from time import time
      6 import pandas as pd
      7 import numpy as np
      8 
      9 from gensim.models import Word2Vec
     10 from gensim.models.word2vec import LineSentence
     11 
     12 np.random.seed(42)
     13 
     14 LANGUAGES = ['en', 'es']
     15 
     16 SOURCES = ['ted', 'euro']
     17 SOURCE_LABELS = ['Ted', 'Europarliament']
     18 source_dict = dict(zip(SOURCES, SOURCE_LABELS))
     19 
     20 parser = ArgumentParser(description='Run Keras word2vec model')
     21 parser.add_argument('-l', '--language', choices=LANGUAGES, help='language', default='es')
     22 parser.add_argument('-s', '--source', choices=SOURCES, help='data source', default='euro')
     23 parser.add_argument('-m', '--model', type=int, choices=[1, 2, 3], help='model', default=1)
     24 
     25 args = parser.parse_args()
     26 LANGUAGE = args.language
     27 MODEL = 'ngrams_{}'.format(args.model)
     28 SOURCE = source_dict[args.source]
     29 
     30 PROJECT_DIR = Path('/home/stefan/projects/odsc_2018/word2vec-translation')
     31 
     32 print('\nLanguage: {} | Source: {} | Model: {}'.format(LANGUAGE, SOURCE, MODEL))
     33 
     34 
     35 def get_accuracy(acc, detail=False):
     36     results = [[c['section'], len(c['correct']), len(c['incorrect'])] for c in acc]
     37     results = pd.DataFrame(results, columns=['category', 'correct', 'incorrect'])
     38     results['average'] = results.correct.div(results[['correct', 'incorrect']].sum(1))
     39     results.sort_values('average', ascending=False)
     40     if detail:
     41         print(results)
     42     return results.iloc[-1, 1:].tolist()
     43 
     44 
     45 ANALOGIES_PATH = PROJECT_DIR / 'data' / 'analogies' / 'analogies-{}.txt'.format(LANGUAGE)
     46 gensim_path = PROJECT_DIR / 'gensim' / SOURCE / LANGUAGE / MODEL
     47 if not gensim_path.exists():
     48     gensim_path.mkdir(parents=True, exist_ok=True)
     49 
     50 sentence_path = PROJECT_DIR / 'vocab' / SOURCE / LANGUAGE / 'ngrams_{}.txt'.format(1)
     51 sentences = LineSentence(str(sentence_path))
     52 start = time()
     53 model = Word2Vec(sentences,
     54                  sg=1,
     55                  size=300,
     56                  window=5,
     57                  min_count=5,
     58                  negative=10,
     59                  workers=8,
     60                  iter=1,
     61                  alpha=0.05)
     62 
     63 print('Duration: {:,.1f}s'.format(time() - start))
     64 
     65 model.wv.save(str(gensim_path / 'word_vectors.bin'))
     66 
     67 acc = get_accuracy(model.wv.accuracy(str(ANALOGIES_PATH), case_insensitive=True))
     68 print('Base Accuracy: Correct {:,d} | Wrong {:,d} | Avg {:,.2%}\n'.format(*acc))
     69 
     70 accuracies = [acc]
     71 for i in range(1, 11):
     72     start = time()
     73     model.train(sentences, epochs=1, total_examples=model.corpus_count)
     74     accuracies.append(get_accuracy(model.wv.accuracy(str(ANALOGIES_PATH))))
     75     print('{} | Duration: {:,.1f} | Accuracy: {:.2%} '.format(i, time() - start, accuracies[-1][-1]))
     76 
     77 pd.DataFrame(accuracies, columns=['correct', 'wrong', 'average']).to_csv(gensim_path / 'accuracies.csv', index=False)
     78 model.wv.save(str(gensim_path / 'word_vectors_final.bin'))