ml-finance-python
Python scripts for machine learning in finance
git clone https://9o.is/git/ml-finance-python.git
eval_vecs.py
(1884B)
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 __author__ = 'Stefan Jansen'
4
5 from pathlib import Path
6 import numpy as np
7 import pandas as pd
8
# Keep wide DataFrames on a single printed line instead of wrapping their
# columns across several stacked blocks.
pd.set_option('display.expand_frame_repr', False)
# Seed NumPy's global random generator so any random draws repeat across runs.
np.random.seed(42)
11
12 from pathlib import Path
13 from time import time
14 import warnings
15 from collections import Counter
16 import logging
17
18 import numpy as np
19 import pandas as pd
20
21 from gensim.models import Word2Vec, KeyedVectors
22 from gensim.models.word2vec import LineSentence
23
# Analogy question file, resolved relative to the parent of the current
# working directory: ../data/analogies/analogies-en.txt
ANALOGIES_PATH = Path.cwd().parent / 'data' / 'analogies' / 'analogies-en.txt'


def eval_analogies(w2v, max_vocab=15000):
    """Score word vectors on the analogy questions in ``ANALOGIES_PATH``.

    Parameters
    ----------
    w2v
        Either a model that exposes its vectors as ``w2v.wv`` or the vectors
        object itself. Whichever is used must provide an
        ``accuracy(questions, restrict_vocab=..., case_insensitive=...)``
        method returning an iterable of mappings with the keys ``'section'``,
        ``'correct'`` and ``'incorrect'``.
    max_vocab : int, default 15000
        Forwarded unchanged as ``restrict_vocab``.

    Returns
    -------
    pandas.DataFrame
        One row per returned section with the columns ``category``,
        ``correct`` and ``incorrect`` (item counts) and ``average``
        (``correct / (correct + incorrect)``). Missing values, such as the
        0/0 ratio of a section without any counted question, become 0.
    """
    # The script in this file passes bare KeyedVectors, so do not insist on a
    # `.wv` wrapper: use it when present, otherwise the object itself.
    vectors = getattr(w2v, 'wv', w2v)

    # NOTE(review): `accuracy` looks like the older gensim analogy API; newer
    # releases appear to offer `evaluate_word_analogies` instead - confirm
    # against the gensim version this project pins before upgrading.
    # The path is passed as a plain string, matching how this file hands
    # paths to gensim's loader.
    sections = vectors.accuracy(str(ANALOGIES_PATH),
                                restrict_vocab=max_vocab,
                                case_insensitive=True)

    counts = pd.DataFrame([[section['section'],
                            len(section['correct']),
                            len(section['incorrect'])]
                           for section in sections],
                          columns=['category', 'correct', 'incorrect'])
    answered = counts.correct.add(counts.incorrect)
    return counts.assign(average=counts.correct.div(answered)).fillna(0)
37
38
# Evaluate every binary word2vec file in models/trial_5 on the analogy task,
# print a one-line summary per model, and store the per-category accuracies
# and the total number of counted questions as CSV files next to the models.
model_path = Path('models', 'trial_5')
accuracies = pd.DataFrame()
totals = {}
# glob() yields files in arbitrary order; sort so the columns of
# accuracies.csv and the rows of totals.csv come out the same on every run.
for model_file in sorted(model_path.glob('*.bin')):
    # File stems must look like '<prefix>_<size>' with exactly one underscore;
    # any other shape raises ValueError here.
    _, size = model_file.stem.split('_')
    model = KeyedVectors.load_word2vec_format(model_file.as_posix(),
                                              binary=True,
                                              unicode_errors='ignore')
    accuracy = eval_analogies(model).set_index('category')
    # Number of questions counted in the 'total' row (correct + incorrect).
    # int() accepts both NumPy and built-in integers.
    total = int(accuracy.loc['total', ['correct', 'incorrect']].sum())
    totals[size] = total
    print(size, '\t', f"{total:,d} | {accuracy.loc['total', 'average']:.2%}")

    # One column per model, indexed by analogy category.
    accuracies[size] = accuracy.average
totals = pd.Series(totals)
print(totals)
totals.to_csv(model_path / 'totals.csv')
accuracies.to_csv(model_path / 'accuracies.csv')