ml-finance-python
python scripts for finance machine learning
git clone https://9o.is/git/ml-finance-python.git
run_experiments.py
(6043B)
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 __author__ = 'Stefan Jansen'
4
5 from pathlib import Path
6 from itertools import zip_longest
7 import numpy as np
8 import pandas as pd
9 from sklearn.feature_extraction.text import CountVectorizer
10 from sklearn.model_selection import train_test_split
11 from gensim.models import LdaModel, LdaMulticore
12 from gensim.matutils import Sparse2Corpus
13 from scipy import sparse
14 from itertools import product
15 from random import shuffle
16 from time import time
17 import logging
18
# Keep wide DataFrames on a single row when printed.
pd.options.display.expand_frame_repr = False
# Seed NumPy's global random number generator.
np.random.seed(42)

# Write log records of level DEBUG and above to gensim.log.
logging.basicConfig(filename='gensim.log',
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                    datefmt='%H:%M:%S',
                    level=logging.DEBUG)
27
28
def format_time(t):
    """Format a duration given in seconds as a zero-padded ``HH:MM:SS`` string.

    The duration is rounded to whole seconds *before* it is split into
    hours, minutes and seconds. Rounding each field on its own lets a
    fractional part of .5 or more render as ``60`` seconds (for example
    59.6 -> ``00:00:60``) instead of carrying into the next unit.

    Args:
        t: Duration in seconds (int or float).

    Returns:
        The duration as ``HH:MM:SS``; the hours field grows beyond two
        digits when needed.
    """
    total_seconds = int(round(t))
    minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return f'{hours:02d}:{minutes:02d}:{seconds:02d}'
33
34
# Experiment setup: column names of the per-vocabulary results table and the
# root folder under which all outputs are stored.
cols = [
    'vocab_size', 'test_vocab', 'min_df', 'max_df',
    'binary', 'num_topics', 'passes', 'perplexity',
]
experiment_path = Path('experiments')

# The input file is split on newlines into documents, which are then
# shuffled in place.
docs = Path('clean_reviews.txt').read_text().split('\n')
shuffle(docs)
print('\n', len(docs))

# Document-term-matrix grid: every (min_df, max_df, binary) combination,
# visited in random order.
min_dfs = [.001, .005, .01]
max_dfs = [.1, .25, .5, 1.0]
binarys = [True, False]
dtm_params = list(product(min_dfs, max_dfs, binarys))
n = len(dtm_params)
shuffle(dtm_params)

# Model grid: every (num_topics, passes) combination.
topicss = [3, 5, 7, 10, 15, 20, 25, 50]
passess = [1]
model_params = list(product(topicss, passess))
55
56 # corpus = id2word = train_corpus = train_tokens = test_corpus = vocab_size = test_vocab = None
# Wall-clock reference for the whole parameter grid. The per-step timers
# below use their own names so the progress estimate at the end of each
# iteration is measured from here rather than from the last model fit.
start = time()
for i, (min_df, max_df, binary) in enumerate(dtm_params, 1):
    print(min_df, max_df, binary)

    vocab_path = experiment_path / str(min_df) / str(max_df) / str(int(binary))
    coherence_path = vocab_path / 'coherence.csv'
    perplexity_path = vocab_path / 'result.csv'
    # Skip combinations whose two result files already exist.
    if coherence_path.exists() and perplexity_path.exists():
        continue
    vocab_path.mkdir(exist_ok=True, parents=True)
    dtm_path = vocab_path / 'dtm.npz'
    token_path = vocab_path / 'tokens.csv'

    step_start = time()
    if dtm_path.exists() and token_path.exists():
        print('Loading vectorized docs')
        dtm = sparse.load_npz(dtm_path)
        # The token file is header-less with one token per row. Read the
        # tokens as plain strings so entries such as "nan" or "null" are not
        # turned into missing values.
        tokens = pd.read_csv(token_path,
                             header=None,
                             dtype=str,
                             na_filter=False).iloc[:, 0]
        print('Loading done', format_time(time() - step_start))
    else:
        print('Vectorizing docs')
        vectorizer = CountVectorizer(min_df=min_df,
                                     max_df=max_df,
                                     binary=binary)
        dtm = vectorizer.fit_transform(docs)
        # get_feature_names() was removed from recent scikit-learn releases;
        # prefer its replacement and fall back for older installations.
        feature_names = getattr(vectorizer, 'get_feature_names_out', None)
        if feature_names is None:
            feature_names = vectorizer.get_feature_names
        tokens = pd.Series(feature_names())
        sparse.save_npz(dtm_path, dtm)
        # Write without a header row so the file matches the header=None
        # read above regardless of the pandas default.
        tokens.to_csv(token_path, index=False, header=False)
        print('Vectorizing done', format_time(time() - step_start))

    corpus = Sparse2Corpus(dtm, documents_columns=False)
    id2word = tokens.to_dict()
    vocab_size = len(tokens)

    assert vocab_size == dtm.shape[1], \
        f'token count {vocab_size} does not match dtm shape {dtm.shape}'

    # Hold out 10% of the documents for the perplexity evaluation.
    train_dtm, test_dtm = train_test_split(dtm, test_size=.1)
    assert vocab_size == train_dtm.shape[1] == test_dtm.shape[1], \
        f'column mismatch: {vocab_size}, {train_dtm.shape[1]}, {test_dtm.shape[1]}'
    assert train_dtm.shape[0] + test_dtm.shape[0] == dtm.shape[0]
    train_corpus = Sparse2Corpus(train_dtm, documents_columns=False)
    test_corpus = Sparse2Corpus(test_dtm, documents_columns=False)

    # Timing benchmark: training duration and test perplexity for a few
    # worker / topic counts.
    timing = []
    for workers in [8, 16]:
        for num_topics in [10, 50]:
            print('start', workers, num_topics, end=' ')
            fit_start = time()
            lda = LdaMulticore(corpus=train_corpus,
                               num_topics=num_topics,
                               id2word=id2word,
                               chunksize=1000,
                               passes=1,
                               eval_every=None,
                               workers=workers,
                               random_state=42)
            duration = time() - fit_start
            test_perplexity = 2 ** (-lda.log_perplexity(test_corpus))
            timing.append([workers, num_topics, duration, test_perplexity])
            print(format_time(duration), test_perplexity)
        # NOTE(review): the original indentation was lost; writing once per
        # worker count is assumed because the file name depends on `workers`.
        # `timing` is cumulative, so later files also hold earlier rows.
        pd.DataFrame(timing, columns=['workers',
                                      'num_topics',
                                      'duration',
                                      'test_perplexity']).to_csv(f'timings_{workers}.csv', index=False)
    # NOTE(review): the script stops after the first benchmark, so everything
    # below in this loop body is currently unreachable. Remove this line to
    # run the full experiment grid. `raise SystemExit` replaces the
    # interactive-only exit() helper and ends the run the same way.
    raise SystemExit

    # Number of non-zero entries in the held-out matrix.
    test_vocab = test_dtm.count_nonzero()
    perplexity, coherence = [], []
    for num_topics, passes in model_params:
        model_path = vocab_path / str(num_topics) / str(passes)
        model_path.mkdir(exist_ok=True, parents=True)
        print((num_topics, passes), end=' ', flush=True)
        lda = LdaMulticore(corpus=train_corpus,
                           num_topics=num_topics,
                           id2word=id2word,
                           passes=passes,
                           eval_every=None,
                           workers=72,
                           random_state=42)
        # Evaluate on the held-out documents before the model is updated
        # with them.
        test_perplexity = 2 ** (-lda.log_perplexity(test_corpus))
        lda.update(corpus=test_corpus)
        lda.save((model_path / 'lda').resolve().as_posix())

        # Keep only the coherence score of each returned topic.
        topic_coherence = lda.top_topics(corpus=corpus, coherence='u_mass', topn=20)
        coherence.append([c[1] for c in topic_coherence])

        perplexity.append([vocab_size, test_vocab, min_df, max_df,
                           binary, num_topics, passes, test_perplexity])

    # Progress report for the outer grid, measured from the overall start.
    elapsed = time() - start
    print(f'\nDone: {i / n:.2%} | Duration: {format_time(elapsed)} | To Go: {format_time(elapsed / i * (n - i))}\n')
    perplexity = pd.DataFrame(perplexity, columns=cols).sort_values('perplexity')
    print(perplexity)
    perplexity.to_csv(perplexity_path, index=False)
    # zip_longest pads models with fewer topics, so the rows line up by topic
    # position and each model gets one column.
    pd.DataFrame(list(zip_longest(*coherence))).to_csv(coherence_path, index=False)