ml-finance-python
Python scripts for machine learning in finance
git clone https://9o.is/git/ml-finance-python.git
run_experiments.py
(4885B)
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 __author__ = 'Stefan Jansen'
4
5 from pathlib import Path
6 import numpy as np
7 import pandas as pd
8 from sklearn.feature_extraction.text import CountVectorizer
9 from sklearn.model_selection import train_test_split
10 from gensim.models import LdaModel
11 from gensim.matutils import Sparse2Corpus
12 from scipy import sparse
13 from itertools import product
14 from random import shuffle
15 from time import time
16 import spacy
17 import logging
18
# Print wide DataFrames on a single line instead of wrapping their columns.
pd.options.display.expand_frame_repr = False

# Seed NumPy's global random number generator.
np.random.seed(42)

# NOTE(review): `nlp` is not referenced anywhere else in the visible source,
# and the 'en' shortcut name is presumably only resolvable by older spaCy
# releases - confirm whether this load is still needed.
nlp = spacy.load('en')

# Send log records from every logger (DEBUG and above) to a local file,
# stamped with the time of day only.
_log_settings = {
    'filename': 'gensim.log',
    'level': logging.DEBUG,
    'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    'datefmt': '%H:%M:%S',
}
logging.basicConfig(**_log_settings)
28
29
def format_time(t):
    """Format a duration given in seconds as a zero-padded ``HH:MM:SS`` string.

    The duration is rounded to whole seconds *before* it is split into
    fields. Rounding each field separately would let fractional seconds of
    59.5 or more render as an out-of-range ``:60`` (e.g. ``00:00:60`` for
    59.6) instead of carrying into the minutes.

    Args:
        t: Elapsed time in seconds (int or float).

    Returns:
        The duration as ``HH:MM:SS``. Hours are padded to at least two
        digits and grow wider for durations of 100 hours or more.
    """
    total_seconds = int(round(t))
    minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return f'{hours:02d}:{minutes:02d}:{seconds:02d}'
34
35
# Input corpus: every line of this file is treated as one document
# (presumably already cleaned/tokenised upstream - verify against the
# script that produces it).
clean_text = Path('clean_text.txt')

# Experiment bookkeeping: column names of the per-configuration results
# table, and the root directory below which all artifacts are written.
cols = [
    'vocab_size',
    'test_vocab',
    'min_df',
    'max_df',
    'binary',
    'num_topics',
    'passes',
    'perplexity',
]
experiment_path = Path('experiments')

# Load the documents and hold out 10% of them for perplexity evaluation.
clean_docs = clean_text.read_text().split('\n')

print('\n', len(clean_docs))
train_docs, test_docs = train_test_split(clean_docs, test_size=.1)

# Document-term-matrix grid: every combination of minimum document
# frequency, maximum document frequency and binary-vs-count weighting,
# visited in random order.
min_dfs = [50, 100, 250, 500]
max_dfs = [.1, .25, .5, 1.0]
binarys = [True, False]
dtm_params = list(product(min_dfs, max_dfs, binarys))
n = len(dtm_params)
shuffle(dtm_params)

# LDA grid: every combination of topic count and number of training passes.
topicss = [3, 5, 7, 10, 15, 20, 25, 50]
passess = [1, 25]
model_params = list(product(topicss, passess))

# Placeholders for the per-configuration state that the main loop fills in.
corpus = None
id2word = None
train_corpus = None
train_tokens = None
test_corpus = None
vocab_size = None
test_vocab = None

# Reference point for the progress / time-remaining report.
start = time()
def _feature_names(fitted_vectorizer):
    """Return the fitted vocabulary as a list, across scikit-learn versions.

    ``get_feature_names`` was deprecated in scikit-learn 1.0 and removed in
    1.2 in favour of ``get_feature_names_out``; prefer the new method and
    fall back to the old one on releases that predate it.
    """
    getter = getattr(fitted_vectorizer, 'get_feature_names_out', None)
    if getter is None:
        getter = fitted_vectorizer.get_feature_names
    return list(getter())


# For every document-term-matrix configuration: vectorise the corpus, train
# one LDA model per (num_topics, passes) pair, and record held-out
# perplexity and per-topic coherence below that configuration's directory.
for i, (min_df, max_df, binary) in enumerate(dtm_params, 1):
    print(min_df, max_df, binary)
    result = []

    vocab_path = experiment_path / str(min_df) / str(max_df) / str(int(binary))
    # A directory that already exists means this configuration was run (or
    # at least started) before, so it is skipped.
    # NOTE(review): an interrupted run leaves its directory behind and is
    # never retried - presumably the directory doubles as a claim marker;
    # confirm before changing this.
    if vocab_path.exists():
        continue
    vocab_path.mkdir(exist_ok=True, parents=True)

    vectorizer = CountVectorizer(min_df=min_df,
                                 max_df=max_df,
                                 binary=binary)

    # Train/test matrices share the vocabulary fitted on the training docs.
    train_dtm = vectorizer.fit_transform(train_docs)
    train_corpus = Sparse2Corpus(train_dtm, documents_columns=False)
    train_tokens = _feature_names(vectorizer)
    # Identical for every model of this configuration, so build it once.
    train_id2word = pd.Series(train_tokens).to_dict()

    test_dtm = vectorizer.transform(test_docs)
    test_corpus = Sparse2Corpus(test_dtm, documents_columns=False)
    # NOTE(review): count_nonzero() counts the non-zero (document, term)
    # cells of the test matrix, not the number of distinct terms - confirm
    # that this is the quantity meant by 'test_vocab'.
    test_vocab = test_dtm.count_nonzero()

    # Re-fit the same vectorizer on the full corpus; this replaces the
    # train-only vocabulary, which is why the train/test matrices are built
    # first.
    dtm = vectorizer.fit_transform(clean_docs)
    sparse.save_npz(vocab_path / 'dtm.npz', dtm)
    tokens = _feature_names(vectorizer)
    vocab_size = len(tokens)
    pd.Series(tokens).to_csv(vocab_path / 'tokens.csv', index=False)

    id2word = pd.Series(tokens).to_dict()
    corpus = Sparse2Corpus(dtm, documents_columns=False)

    coherence = pd.DataFrame()
    for num_topics, passes in model_params:
        model_path = vocab_path / str(num_topics) / str(passes)
        model_path.mkdir(exist_ok=True, parents=True)
        print((num_topics, passes), end=' ', flush=True)

        # Model on the full corpus: saved to disk and scored for coherence.
        lda = LdaModel(corpus=corpus,
                       num_topics=num_topics,
                       id2word=id2word,
                       passes=passes,
                       eval_every=None,
                       random_state=42)
        model_file = (model_path / 'lda').resolve().as_posix()
        lda.save(model_file)

        # Model on the training split only: used for held-out perplexity.
        train_lda = LdaModel(corpus=train_corpus,
                             num_topics=num_topics,
                             id2word=train_id2word,
                             passes=passes,
                             eval_every=None,
                             random_state=42)
        test_perplexity = 2 ** (-train_lda.log_perplexity(test_corpus))

        # One column of u_mass coherence scores per model; the second item
        # of each top_topics entry is the topic's score.
        topic_scores = pd.Series([topic[1] for topic in
                                  lda.top_topics(corpus=corpus,
                                                 coherence='u_mass',
                                                 topn=20)])
        coherence = pd.concat(
            [coherence, topic_scores.to_frame((num_topics, passes))],
            axis=1)

        result.append([vocab_size,
                       test_vocab,
                       min_df,
                       max_df,
                       binary,
                       num_topics,
                       passes,
                       test_perplexity])

    elapsed = time() - start
    print(f'\nDone: {i / n:.2%} | Duration: {format_time(elapsed)} | To Go: {format_time(elapsed / i * (n - i))}\n')
    results = pd.DataFrame(result, columns=cols).sort_values('perplexity')
    print(results.head(10))
    results.to_csv(vocab_path / 'perplexity.csv', index=False)
    coherence.to_csv(vocab_path / 'coherence.csv', index=False)