ml-finance-python
Python scripts for finance machine learning
git clone https://9o.is/git/ml-finance-python.git
word2vec_wiki.py
(2450B)
1 # coding: utf-8
2
3 from pathlib import Path
4 from argparse import ArgumentParser
5 from time import time
6 import pandas as pd
7 import numpy as np
8
9 from gensim.models import Word2Vec
10 from gensim.models.word2vec import LineSentence
11
# Seed NumPy's global random generator.
# NOTE(review): Word2Vec takes its own `seed` argument and is trained with
# several workers further down -- confirm this call is enough for the
# reproducibility that is intended.
np.random.seed(42)

# Language codes whose sentence files combine_files() merges.
LANGUAGES = ['en', 'es']
15
16
def combine_files():
    """Append every per-language sentence file to one corpus file.

    For each language in ``LANGUAGES``, all ``*.txt`` files found in
    ``DATA_DIR / <language> / 'sentences'`` are appended to
    ``wiki/<language>/wiki.txt``.  The target is opened in append mode, so
    running this function twice duplicates the corpus.

    NOTE(review): ``DATA_DIR`` is not defined in the visible part of this
    file; it has to exist at module level before this function is called.
    """
    for language in LANGUAGES:
        source_dir = DATA_DIR / language / 'sentences'
        target_file = Path('wiki', language, 'wiki.txt')
        # The output folder is not guaranteed to exist for every language.
        target_file.parent.mkdir(parents=True, exist_ok=True)
        with target_file.open('a') as target:
            for source in source_dir.glob('*.txt'):
                # Close each source file as soon as it has been copied instead
                # of leaving the handle open until garbage collection.
                with source.open('r') as lines:
                    target.writelines(lines)
25
26
def get_accuracy(acc, detail=False):
    """Summarise per-section analogy results and return the last section's figures.

    Args:
        acc: Iterable of mappings, each with the keys ``'section'``,
            ``'correct'`` and ``'incorrect'``; the latter two must support
            ``len()``.
        detail: When true, print the per-section table ordered by
            descending accuracy.

    Returns:
        ``[correct, incorrect, average]`` for the last entry of ``acc``
        (presumably the overall ``'total'`` section -- confirm against the
        producer of ``acc``).
    """
    results = [[c['section'], len(c['correct']), len(c['incorrect'])] for c in acc]
    results = pd.DataFrame(results, columns=['category', 'correct', 'incorrect'])
    results['average'] = results.correct.div(results[['correct', 'incorrect']].sum(1))
    if detail:
        # sort_values() returns a new frame, so sort only the printed copy:
        # the row order used for the return value below must stay as given.
        print(results.sort_values('average', ascending=False))
    return results.iloc[-1, 1:].tolist()
35
# ---------------------------------------------------------------------------
# Training script: fit Word2Vec on one language's corpus, then keep training
# one epoch at a time while tracking analogy accuracy after every pass.
# ---------------------------------------------------------------------------

# Language trained in this run; selects both the corpus and the analogy file.
language = 'es'
PROJECT_DIR = Path('/home/stefan/projects/odsc_2018/word2vec-translation')
ANALOGIES_PATH = PROJECT_DIR / 'data' / 'analogies' / 'analogies-{}.txt'.format(language)

# Folder holding the corpus and receiving every output of this run.
gensim_path = Path('wiki', language)
if not gensim_path.exists():
    gensim_path.mkdir(parents=True, exist_ok=True)

# Corpus file, streamed by LineSentence.
sentence_path = gensim_path / 'wiki.txt'
sentences = LineSentence(str(sentence_path))

# Initial single pass over the corpus.
# NOTE(review): `size` and `iter` are the keyword names this call relies on;
# confirm they match the installed gensim release.
start = time()
model = Word2Vec(
    sentences,
    sg=1,
    size=300,
    window=5,
    min_count=5,
    negative=10,
    workers=8,
    iter=1,
    alpha=0.05,
)

print('Duration: {:,.1f}s'.format(time() - start))

# Vectors after the first pass.
model.wv.save(str(gensim_path / 'word_vectors.bin'))

# Baseline analogy score: [correct, wrong, average].
acc = get_accuracy(model.wv.accuracy(str(ANALOGIES_PATH), case_insensitive=True))
print('Base Accuracy: Correct {:,d} | Wrong {:,d} | Avg {:,.2%}\n'.format(*acc))

# Ten further passes, re-scoring the analogies after each one. The reported
# duration covers both the training pass and the evaluation that follows it.
accuracies = [acc]
for i in range(1, 11):
    start = time()
    model.train(sentences, total_examples=model.corpus_count, epochs=1)
    epoch_acc = get_accuracy(model.wv.accuracy(str(ANALOGIES_PATH)))
    accuracies.append(epoch_acc)
    print('{} | Duration: {:,.1f} | Accuracy: {:.2%} '.format(i, time() - start, epoch_acc[-1]))

# Persist the accuracy history and the final vectors.
history = pd.DataFrame(accuracies, columns=['correct', 'wrong', 'average'])
history.to_csv(gensim_path / 'accuracies.csv', index=False)
model.wv.save(str(gensim_path / 'word_vectors_final.bin'))