ml-finance-python
Python scripts for machine learning in finance
git clone https://9o.is/git/ml-finance-python.git
word2vec.py
(2821B)
1 # coding: utf-8
2
3 from pathlib import Path
4 from argparse import ArgumentParser
5 from time import time
6 import pandas as pd
7 import numpy as np
8
9 from gensim.models import Word2Vec
10 from gensim.models.word2vec import LineSentence
11
# Seed NumPy's global random number generator.
np.random.seed(42)

# Values accepted by the --language option.
LANGUAGES = ['en', 'es']

# Short source keys accepted by --source, and the labels they map to.
SOURCES = ['ted', 'euro']
SOURCE_LABELS = ['Ted', 'Europarliament']
source_dict = dict(zip(SOURCES, SOURCE_LABELS))

# Default project root; pass -p/--project-dir to run from another location.
DEFAULT_PROJECT_DIR = Path('/home/stefan/projects/odsc_2018/word2vec-translation')

# The script trains with gensim (see the imports), so the description says so.
parser = ArgumentParser(description='Run gensim word2vec model')
parser.add_argument('-l', '--language', choices=LANGUAGES, help='language', default='es')
parser.add_argument('-s', '--source', choices=SOURCES, help='data source', default='euro')
parser.add_argument('-m', '--model', type=int, choices=[1, 2, 3], help='model', default=1)
parser.add_argument('-p', '--project-dir', type=Path, default=DEFAULT_PROJECT_DIR,
                    help='project root directory')

args = parser.parse_args()
LANGUAGE = args.language
# Name shared by this run's output directory, e.g. 'ngrams_2'.
MODEL = 'ngrams_{}'.format(args.model)
# Label (not the short key) is what appears in the on-disk paths.
SOURCE = source_dict[args.source]

PROJECT_DIR = args.project_dir

print('\nLanguage: {} | Source: {} | Model: {}'.format(LANGUAGE, SOURCE, MODEL))
33
34
def get_accuracy(acc, detail=False):
    """Summarise analogy-test results and return the last section's figures.

    Args:
        acc: Iterable of per-section mappings, each with a 'section' name
            and 'correct' / 'incorrect' collections (only their lengths
            are used).
        detail: When true, print the per-section table ordered by average
            accuracy, best first.

    Returns:
        ``[correct, incorrect, average]`` for the last entry of ``acc``.
        NOTE(review): presumably the aggregate 'total' section that gensim
        appends last -- confirm against the gensim version in use.
    """
    rows = [[c['section'], len(c['correct']), len(c['incorrect'])] for c in acc]
    results = pd.DataFrame(rows, columns=['category', 'correct', 'incorrect'])
    results['average'] = results.correct.div(results[['correct', 'incorrect']].sum(1))
    if detail:
        # sort_values returns a new frame, so sort only the printed copy;
        # `results` keeps input order because the return reads its last row.
        print(results.sort_values('average', ascending=False))
    return results.iloc[-1, 1:].tolist()
43
44
# Analogy questions used to score the embeddings for the chosen language.
ANALOGIES_PATH = PROJECT_DIR / 'data' / 'analogies' / 'analogies-{}.txt'.format(LANGUAGE)
analogies_file = str(ANALOGIES_PATH)

# Output directory for this source / language / model combination.
gensim_path = PROJECT_DIR / 'gensim' / SOURCE / LANGUAGE / MODEL
# exist_ok=True makes this a no-op when the directory is already there,
# so no separate exists() check is needed.
gensim_path.mkdir(parents=True, exist_ok=True)

# Train on the corpus that matches the selected model. The file name was
# previously fixed to 'ngrams_1.txt', so --model only changed the output
# directory and never the training data.
sentence_path = PROJECT_DIR / 'vocab' / SOURCE / LANGUAGE / '{}.txt'.format(MODEL)
sentences = LineSentence(str(sentence_path))

# NOTE(review): `size`, `iter` and `wv.accuracy` look like the pre-4.0 gensim
# API -- confirm the installed gensim version before upgrading.
start = time()
model = Word2Vec(sentences,
                 sg=1,
                 size=300,
                 window=5,
                 min_count=5,
                 negative=10,
                 workers=8,
                 iter=1,
                 alpha=0.05)

print('Duration: {:,.1f}s'.format(time() - start))

# Snapshot of the vectors after the initial training pass.
model.wv.save(str(gensim_path / 'word_vectors.bin'))

acc = get_accuracy(model.wv.accuracy(analogies_file, case_insensitive=True))
print('Base Accuracy: Correct {:,d} | Wrong {:,d} | Avg {:,.2%}\n'.format(*acc))

# Keep training one epoch at a time, recording analogy accuracy after each.
accuracies = [acc]
for i in range(1, 11):
    start = time()
    model.train(sentences, epochs=1, total_examples=model.corpus_count)
    # Same case handling as the base evaluation above, stated explicitly.
    epoch_acc = get_accuracy(model.wv.accuracy(analogies_file, case_insensitive=True))
    accuracies.append(epoch_acc)
    print('{} | Duration: {:,.1f} | Accuracy: {:.2%} '.format(i, time() - start, accuracies[-1][-1]))

# Persist the accuracy history (base run first) and the final vectors.
pd.DataFrame(accuracies, columns=['correct', 'wrong', 'average']).to_csv(gensim_path / 'accuracies.csv', index=False)
model.wv.save(str(gensim_path / 'word_vectors_final.bin'))