ml-finance-python
Python scripts for machine learning in finance
git clone https://9o.is/git/ml-finance-python.git
vis_experiments.py
(2096B)
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 __author__ = 'Stefan Jansen'
4
5 from pathlib import Path
6 from itertools import zip_longest
7 import numpy as np
8 import pandas as pd
9 from gensim.models import LdaModel, LdaMulticore
10 from gensim.matutils import Sparse2Corpus
11 from scipy import sparse
12 from itertools import product
13 from time import time
14 from gensim.corpora import Dictionary
15 import pyLDAvis
16 from pyLDAvis.gensim import prepare
17
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(seed=42)
19
20
def format_time(t):
    """Format a duration in seconds as a zero-padded ``HH:MM:SS`` string.

    The duration is rounded to whole seconds *before* it is split into
    hours, minutes and seconds. Rounding each field separately while
    formatting can produce invalid fields such as ``00:00:60`` for 59.6s.

    Args:
        t: Elapsed time in seconds (int or float).

    Returns:
        The duration formatted as ``HH:MM:SS``. Each field is padded to at
        least two digits; the hours field grows beyond two digits if needed.
    """
    total_seconds = int(round(t))
    minutes, s = divmod(total_seconds, 60)
    h, m = divmod(minutes, 60)
    return f'{h:02d}:{m:02d}:{s:02d}'
25
26
# Directory read by the experiment loop, and a directory created here if absent.
experiment_path, vis_path = Path('experiments'), Path('ldavis')
if not vis_path.exists():
    vis_path.mkdir(exist_ok=True)

# Document-term-matrix settings; dtm_params holds every
# (min_df, max_df, binary) combination, with binary varying fastest.
min_dfs = [.001, .005, .01]
max_dfs = [.1, .25, .5, 1.0]
binarys = [True, False]
dtm_params = [
    (lower, upper, flag)
    for lower in min_dfs
    for upper in max_dfs
    for flag in binarys
]
37
# Topic counts and number of passes; both are path components of the saved
# LDA models that get loaded and visualized below.
topics = [3, 5, 7, 10, 15, 20, 25, 50]
passes = 1
start = time()  # script start; per-model timing below uses its own timer
for min_df, max_df, binary in dtm_params:

    print(min_df, max_df, binary)

    # One directory per document-term-matrix configuration.
    vocab_path = experiment_path / str(min_df) / str(max_df) / str(int(binary))
    try:
        dtm = sparse.load_npz(vocab_path / 'dtm.npz')
        # read_csv's `squeeze=True` was deprecated in pandas 1.4 and removed
        # in 2.0; squeezing the resulting frame works on old and new versions.
        tokens = pd.read_csv(vocab_path / 'tokens.csv',
                             header=None).squeeze('columns')
    except FileNotFoundError:
        # Skip configurations whose inputs are not on disk.
        print('missing')
        continue
    corpus = Sparse2Corpus(dtm, documents_columns=False)
    id2word = tokens.to_dict()
    dictionary = Dictionary.from_corpus(corpus, id2word)

    for num_topics in topics:
        print(num_topics, end=' ')
        model_path = vocab_path / str(num_topics) / str(passes) / 'lda'
        if not model_path.exists():
            continue
        lda = LdaModel.load(model_path.as_posix())

        model_start = time()
        vis = prepare(lda, corpus, dictionary, mds='tsne')
        terms = vis.topic_info
        terms = terms[terms.Category != 'Default']

        # model_path is the saved model *file* (it is what LdaModel.load
        # reads), so nothing can be created beneath it; write the outputs
        # into the directory that contains the model instead.
        out_dir = model_path.parent
        pyLDAvis.save_html(vis, (out_dir / 'ldavis.html').as_posix())
        terms.to_csv(out_dir / 'relevant_terms.csv', index=False)

        print(format_time(time() - model_start))