ml-finance-python

Python scripts for finance machine learning
git clone https://9o.is/git/ml-finance-python.git
vis_experiments.py

(2096B)
      1 #!/usr/bin/env python
      2 # -*- coding: utf-8 -*-
      3 __author__ = 'Stefan Jansen'
      4 
      5 from pathlib import Path
      6 from itertools import zip_longest
      7 import numpy as np
      8 import pandas as pd
      9 from gensim.models import LdaModel, LdaMulticore
     10 from gensim.matutils import Sparse2Corpus
     11 from scipy import sparse
     12 from itertools import product
     13 from time import time
     14 from gensim.corpora import Dictionary
     15 import pyLDAvis
     16 from pyLDAvis.gensim import prepare
     17 
     18 np.random.seed(42)
     19 
     20 
     21 def format_time(t):
     22     m_, s = divmod(t, 60)
     23     h, m = divmod(m_, 60)
     24     return f'{h:>02.0f}:{m:>02.0f}:{s:>02.0f}'
     25 
     26 
     27 experiment_path = Path('experiments')
     28 vis_path = Path('ldavis')
     29 if not vis_path.exists():
     30     vis_path.mkdir(exist_ok=True)
     31 
     32 # dtm params
     33 min_dfs = [.001, .005, .01]
     34 max_dfs = [.1, .25, .5, 1.0]
     35 binarys = [True, False]
     36 dtm_params = list(product(*[min_dfs, max_dfs, binarys]))
     37 
     38 topics = [3, 5, 7, 10, 15, 20, 25, 50]
     39 passes = 1
     40 start = time()
     41 for i, (min_df, max_df, binary) in enumerate(dtm_params, 1):
     42 
     43     print(min_df, max_df, binary)
     44 
     45     vocab_path = experiment_path / str(min_df) / str(max_df) / str(int(binary))
     46     try:
     47         dtm = sparse.load_npz(vocab_path / f'dtm.npz')
     48         tokens = pd.read_csv(vocab_path / f'tokens.csv', header=None, squeeze=True)
     49     except FileNotFoundError:
     50         print('missing')
     51         continue
     52     corpus = Sparse2Corpus(dtm, documents_columns=False)
     53     id2word = tokens.to_dict()
     54     dictionary = Dictionary.from_corpus(corpus, id2word)
     55 
     56     for num_topics in topics:
     57         print(num_topics, end=' ')
     58         model_path = vocab_path / str(num_topics) / str(passes) / 'lda'
     59         if model_path.exists():
     60             lda = LdaModel.load(model_path.as_posix())
     61         else:
     62             continue
     63         start = time()
     64         vis = prepare(lda, corpus, dictionary, mds='tsne')
     65         terms = vis.topic_info
     66         terms = terms[terms.Category != 'Default']
     67         pyLDAvis.save_html(vis, (model_path / 'ldavis.html').as_posix())
     68         terms.to_csv(model_path / 'relevant_terms.csv', index=False)
     69         duration = time() - start
     70         print(format_time(duration))