ml-finance-python

Python scripts for machine learning in finance

git clone https://9o.is/git/ml-finance-python.git

word2vec_wiki.py

(2450B)


      1 # coding: utf-8
      2 
      3 from pathlib import Path
      4 from argparse import ArgumentParser
      5 from time import time
      6 import pandas as pd
      7 import numpy as np
      8 
      9 from gensim.models import Word2Vec
     10 from gensim.models.word2vec import LineSentence
     11 
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(42)

# Language codes iterated by combine_files(); each one names a sub-directory
# of sentence files (presumably English and Spanish Wikipedia -- confirm).
LANGUAGES = ['en', 'es']
     15 
     16 
     17 def combine_files():
     18     for language in LANGUAGES:
     19         source_dir = DATA_DIR / language / 'sentences'
     20         target_file = Path('wiki', language, 'wiki.txt')
     21         with target_file.open('a') as target:
     22             for source in source_dir.glob('*.txt'):
     23                 for line in source.open('r'):
     24                     target.write(line)
     25 
     26 
     27 def get_accuracy(acc, detail=False):
     28     results = [[c['section'], len(c['correct']), len(c['incorrect'])] for c in acc]
     29     results = pd.DataFrame(results, columns=['category', 'correct', 'incorrect'])
     30     results['average'] = results.correct.div(results[['correct', 'incorrect']].sum(1))
     31     results.sort_values('average', ascending=False)
     32     if detail:
     33         print(results)
     34     return results.iloc[-1, 1:].tolist()
     35 
     36 language = 'es'
     37 PROJECT_DIR = Path('/home/stefan/projects/odsc_2018/word2vec-translation')
     38 ANALOGIES_PATH = PROJECT_DIR / 'data' / 'analogies' / 'analogies-{}.txt'.format(language)
     39 
     40 gensim_path = Path('wiki', language)
     41 if not gensim_path.exists():
     42     gensim_path.mkdir(parents=True, exist_ok=True)
     43 
     44 sentence_path = gensim_path / 'wiki.txt'
     45 sentences = LineSentence(str(sentence_path))
     46 start = time()
     47 model = Word2Vec(sentences,
     48                  sg=1,
     49                  size=300,
     50                  window=5,
     51                  min_count=5,
     52                  negative=10,
     53                  workers=8,
     54                  iter=1,
     55                  alpha=0.05)
     56 
     57 print('Duration: {:,.1f}s'.format(time() - start))
     58 
     59 model.wv.save(str(gensim_path / 'word_vectors.bin'))
     60 
     61 acc = get_accuracy(model.wv.accuracy(str(ANALOGIES_PATH), case_insensitive=True))
     62 print('Base Accuracy: Correct {:,d} | Wrong {:,d} | Avg {:,.2%}\n'.format(*acc))
     63 
     64 accuracies = [acc]
     65 for i in range(1, 11):
     66     start = time()
     67     model.train(sentences, epochs=1, total_examples=model.corpus_count)
     68     accuracies.append(get_accuracy(model.wv.accuracy(str(ANALOGIES_PATH))))
     69     print('{} | Duration: {:,.1f} | Accuracy: {:.2%} '.format(i, time() - start, accuracies[-1][-1]))
     70 
     71 pd.DataFrame(accuracies, columns=['correct', 'wrong', 'average']).to_csv(gensim_path / 'accuracies.csv', index=False)
     72 model.wv.save(str(gensim_path / 'word_vectors_final.bin'))