ml-finance-python

Python scripts for machine learning in finance
git clone https://9o.is/git/ml-finance-python.git
run_experiments.py

(6043B)
      1 #!/usr/bin/env python
      2 # -*- coding: utf-8 -*-
      3 __author__ = 'Stefan Jansen'
      4 
      5 from pathlib import Path
      6 from itertools import zip_longest
      7 import numpy as np
      8 import pandas as pd
      9 from sklearn.feature_extraction.text import CountVectorizer
     10 from sklearn.model_selection import train_test_split
     11 from gensim.models import LdaModel, LdaMulticore
     12 from gensim.matutils import Sparse2Corpus
     13 from scipy import sparse
     14 from itertools import product
     15 from random import shuffle
     16 from time import time
     17 import logging
     18 
     19 pd.set_option('display.expand_frame_repr', False)
     20 np.random.seed(42)
     21 
     22 logging.basicConfig(
     23         filename='gensim.log',
     24         level=logging.DEBUG,
     25         format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
     26         datefmt='%H:%M:%S')
     27 
     28 
     29 def format_time(t):
     30     m_, s = divmod(t, 60)
     31     h, m = divmod(m_, 60)
     32     return f'{h:>02.0f}:{m:>02.0f}:{s:>02.0f}'
     33 
     34 
     35 # experiment setup
     36 cols = ['vocab_size', 'test_vocab', 'min_df', 'max_df', 'binary', 'num_topics', 'passes', 'perplexity']
     37 experiment_path = Path('experiments')
     38 
     39 docs = Path('clean_reviews.txt').read_text().split('\n')
     40 shuffle(docs)
     41 print('\n', len(docs))
     42 # train_docs, test_docs = train_test_split(docs, test_size=.1)
     43 
     44 # dtm params
     45 min_dfs = [.001, .005, .01]
     46 max_dfs = [.1, .25, .5, 1.0]
     47 binarys = [True, False]
     48 dtm_params = list(product(*[min_dfs, max_dfs, binarys]))
     49 n = len(dtm_params)
     50 shuffle(dtm_params)
     51 
     52 topicss = [3, 5, 7, 10, 15, 20, 25, 50]
     53 passess = [1]
     54 model_params = list(product(*[topicss, passess]))
     55 
     56 # corpus = id2word = train_corpus = train_tokens = test_corpus = vocab_size = test_vocab = None
     57 start = time()
     58 for i, (min_df, max_df, binary) in enumerate(dtm_params, 1):
     59     print(min_df, max_df, binary)
     60 
     61     vocab_path = experiment_path / str(min_df) / str(max_df) / str(int(binary))
     62     coherence_path = vocab_path / 'coherence.csv'
     63     perplexity_path = vocab_path / 'result.csv'
     64     if all([coherence_path.exists(), perplexity_path.exists()]):
     65         continue
     66     if not vocab_path.exists():
     67         vocab_path.mkdir(exist_ok=True, parents=True)
     68     dtm_path = vocab_path / f'dtm.npz'
     69     token_path = vocab_path / f'tokens.csv'
     70     start = time()
     71     if all([dtm_path.exists() and token_path.exists()]):
     72         print('Loading vectorized docs')
     73         dtm = sparse.load_npz(vocab_path / f'dtm.npz')
     74         tokens = pd.read_csv(vocab_path / f'tokens.csv', header=None, squeeze=True)
     75         print('Loading done', format_time(time() - start))
     76     else:
     77         print('Vectorizing docs')
     78         vectorizer = CountVectorizer(min_df=min_df,
     79                                      max_df=max_df,
     80                                      binary=binary)
     81         dtm = vectorizer.fit_transform(docs)
     82         tokens = pd.Series(vectorizer.get_feature_names())
     83         sparse.save_npz(dtm_path, dtm)
     84         tokens.to_csv(token_path, index=False)
     85         print('Vectorizing done', format_time(time() - start))
     86 
     87     corpus = Sparse2Corpus(dtm, documents_columns=False)
     88     id2word = tokens.to_dict()
     89     vocab_size = len(tokens)
     90 
     91     assert vocab_size == dtm.shape[1], print(dtm.shape, vocab_size)
     92 
     93     train_dtm, test_dtm = train_test_split(dtm, test_size=.1)
     94     assert vocab_size == train_dtm.shape[1] == test_dtm.shape[1], \
     95         print(vocab_size, train_dtm.shape[1], test_dtm.shape[1])
     96     assert train_dtm.shape[0] + test_dtm.shape[0] == dtm.shape[0]
     97     train_corpus = Sparse2Corpus(train_dtm, documents_columns=False)
     98     test_corpus = Sparse2Corpus(test_dtm, documents_columns=False)
     99     timing = []
    100     for workers in [8, 16]:
    101         for num_topics in [10, 50]:
    102             print('start', workers, num_topics, end=' ')
    103             start = time()
    104             lda = LdaMulticore(corpus=train_corpus,
    105                                num_topics=num_topics,
    106                                id2word=id2word,
    107                                chunksize=1000,
    108                                passes=1,
    109                                eval_every=None,
    110                                workers=workers,
    111                                random_state=42)
    112             duration = time() - start
    113             test_perplexity = 2 ** (-lda.log_perplexity(test_corpus))
    114             timing.append([workers, num_topics, duration, test_perplexity])
    115             print(format_time(duration), test_perplexity)
    116             pd.DataFrame(timing, columns=['workers',
    117                                           'num_topics',
    118                                           'duration',
    119                                           'test_perplexity']).to_csv(f'timings_{workers}.csv', index=False)
    120     exit()
    121 
    122     test_vocab = test_dtm.count_nonzero()
    123     perplexity, coherence = [], []
    124     for num_topics, passes in model_params:
    125         model_path = vocab_path / str(num_topics) / str(passes)
    126         if not model_path.exists():
    127             model_path.mkdir(exist_ok=True, parents=True)
    128         print((num_topics, passes), end=' ', flush=True)
    129         lda = LdaMulticore(corpus=train_corpus,
    130                            num_topics=num_topics,
    131                            id2word=id2word,
    132                            passes=passes,
    133                            eval_every=None,
    134                            workers=72,
    135                            random_state=42)
    136         test_perplexity = 2 ** (-lda.log_perplexity(test_corpus))
    137         lda.update(corpus=test_corpus)
    138         lda.save((model_path / 'lda').resolve().as_posix())
    139 
    140         topic_coherence = lda.top_topics(corpus=corpus, coherence='u_mass', topn=20)
    141         coherence.append([c[1] for c in topic_coherence])
    142 
    143         perplexity.append([vocab_size, test_vocab, min_df, max_df,
    144                            binary, num_topics, passes, test_perplexity])
    145 
    146     elapsed = time() - start
    147     print(f'\nDone: {i / n:.2%} | Duration: {format_time(elapsed)} | To Go: {format_time(elapsed / i * (n - i))}\n')
    148     perplexity = pd.DataFrame(perplexity, columns=cols).sort_values('perplexity')
    149     print(perplexity)
    150     perplexity.to_csv(perplexity_path, index=False)
    151     pd.DataFrame((_ for _ in zip_longest(*coherence))).to_csv(coherence_path, index=False)