ml-finance-python
python scripts for finance machine learning
git clone https://9o.is/git/ml-finance-python.git
word2vec.py
(8552B)
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 __author__ = 'Stefan Jansen'
4
5 from pathlib import Path
6 from collections import Counter
7 from argparse import ArgumentParser
8
9 import pandas as pd
10 import numpy as np
11 from scipy.spatial.distance import cdist
12 from keras.models import Model
13 from keras.layers import Input, Dense, Reshape, Dot
14 from keras.layers.embeddings import Embedding
15 from keras.preprocessing.sequence import skipgrams
16 from keras.preprocessing import sequence
17 from keras.callbacks import Callback, TensorBoard
18
# Fix NumPy's global RNG so the sampled validation words are reproducible.
np.random.seed(42)

LANGUAGES = ['en', 'es']

# Short CLI names and the directory labels they map to.
SOURCES = ['ted', 'euro']
SOURCE_LABELS = ['Ted', 'Europarliament']
source_dict = dict(zip(SOURCES, SOURCE_LABELS))

parser = ArgumentParser(description='Run Keras word2vec model')
parser.add_argument('-l', '--language', choices=LANGUAGES, help='language', default='en')
parser.add_argument('-s', '--source', choices=SOURCES, help='data source', default='euro')
# argparse validates `choices` after type conversion; without type=int the
# command-line value stays a string ('2') and never matches the integer choices.
parser.add_argument('-m', '--model', type=int, choices=[1, 2, 3], help='model', default=1)

args = parser.parse_args()
LANGUAGE = args.language
MODEL = 'ngrams_{}'.format(args.model)
SOURCE = source_dict[args.source]

# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
PROJECT_DIR = Path('/home/stefan/projects/odsc_2018/word2vec-translation')
38
39
def get_vocab_stats():
    """Tabulate how much of the corpus is covered as the vocabulary grows.

    Reads the ``'<LANGUAGE>/vocab'`` table from ``vocab/<SOURCE>/vocab.h5`` and
    groups its ``count`` column by distinct value, highest value first.

    Returns:
        DataFrame with one row per distinct count value and the columns
        ``word_count`` (the count value), ``freq`` (rows having that value),
        ``n_words`` (``word_count * freq``), ``corpus_share`` (``n_words`` as a
        fraction of the total), ``coverage`` (running sum of ``corpus_share``)
        and ``vocab_size`` (running sum of ``freq``).
    """
    # Pass the full path: `.name` would keep only 'vocab.h5' and silently drop
    # the 'vocab/<SOURCE>' directory components.
    with pd.HDFStore(str(Path('vocab', SOURCE, 'vocab.h5'))) as store:
        df = store['{}/vocab'.format(LANGUAGE)]

    wc = df['count'].value_counts().sort_index(ascending=False).reset_index()
    wc.columns = ['word_count', 'freq']
    wc['n_words'] = wc.word_count.mul(wc.freq)

    wc['corpus_share'] = wc.n_words.div(wc.n_words.sum())
    wc['coverage'] = wc.corpus_share.cumsum()
    wc['vocab_size'] = wc.freq.cumsum()
    return wc
52
53
54 # wc = get_vocab_stats()
55 # print('# words: {:,d}'.format(wc.n_words.sum()))
56 # print(wc.loc[:, ['word_count', 'freq', 'n_words', 'vocab_size', 'coverage']].tail(10))
57
### Model Settings
MIN_FREQ = 5  # tokens occurring fewer times are encoded as 'UNK' by build_data
WINDOW_SIZE = 5  # window_size passed to skipgrams
EMBEDDING_SIZE = 300  # output_dim of the embedding layer
EPOCHS = 1
BATCH_SIZE = 100

PATH = Path('.', SOURCE, LANGUAGE, MODEL)
TB_PATH = PATH / 'tensorboard'
# exist_ok=True already makes this a no-op when the directory is present, so
# a separate exists() check is redundant (and racy).
TB_PATH.mkdir(parents=True, exist_ok=True)

VALID_SIZE = 15  # Random set of words to evaluate similarity on.
VALID_WINDOW = 250  # Evaluation samples from most frequent words
NN = 10  # Nearest neighbors for evaluation

# VALID_SIZE distinct ids drawn from range(VALID_WINDOW).
valid_examples = np.random.choice(VALID_WINDOW, VALID_SIZE, replace=False)
75
76
def build_data(language, ngrams=1):
    """Encode the corpus as a list of integer token ids.

    Reads ``PROJECT_DIR/vocab/<SOURCE>/<language>/ngrams_<ngrams>.txt``, splits
    it on whitespace and keeps every token that occurs at least ``MIN_FREQ``
    times. Ids are assigned in ``Counter.most_common()`` order, so id 0 is the
    most frequent token. Every other word is encoded as -1 (``'UNK'``).

    Args:
        language: language sub-directory to read from.
        ngrams: suffix of the ``ngrams_<n>.txt`` file to load.

    Returns:
        Tuple ``(data, token_to_id, id_to_token)``: the encoded corpus and the
        two lookup dicts, both of which include the ``'UNK'`` / -1 entry.
    """
    path = PROJECT_DIR / 'vocab' / SOURCE / language / 'ngrams_{}.txt'.format(ngrams)
    words = path.read_text().split()

    # Only the tokens are needed; filtering directly also avoids the opaque
    # unpacking error the zip(*...) form raised when nothing reaches MIN_FREQ.
    tokens = [token for token, count in Counter(words).most_common() if count >= MIN_FREQ]

    id_to_token = dict(enumerate(tokens))
    id_to_token[-1] = 'UNK'
    token_to_id = {token: idx for idx, token in id_to_token.items()}
    data = [token_to_id.get(word, -1) for word in words]
    return data, token_to_id, id_to_token
89
90
# Load the n-gram corpus selected with --model. MODEL (and thus the output
# directory) is derived from args.model, so the data must come from the same
# ngrams_<n>.txt file instead of always using the unigram file.
data, token_to_id, id_to_token = build_data(LANGUAGE, ngrams=args.model)

# token_to_id also contains the 'UNK' entry, which is not a vocabulary word.
vocab_size = len(token_to_id) - 1
94
95
def save_meta(d):
    """Write ``TB_PATH/meta.tsv``: one token per line, ordered by token id.

    Args:
        d: iterable of token ids (the encoded corpus). Negative ids (unknown
            words) and ids without an entry in ``id_to_token`` are left out.
    """
    s = pd.Series(d).value_counts().reset_index()
    s.columns = ['id', 'count']
    s['token'] = s.id.map(id_to_token)
    # header=False: recent pandas versions write the Series name as a header
    # row by default, which would shift every label by one line. A
    # single-column projector metadata file is expected to have no header.
    s[s.id >= 0].sort_values('id').token.dropna().to_csv(TB_PATH / 'meta.tsv', index=False, header=False)


# Written up front so the TensorBoard callback can reference the file.
save_meta(data)
104
105
# #### Process Analogies
def get_analogies(lang):
    """Load the analogy test file for ``lang`` as a table of word quadruples.

    The file ``../data/analogies/analogies-<lang>.txt`` is read line by line.
    Lines starting with ``':'`` name a category that applies to the lines
    after it; every other line is split on whitespace into four words.

    Args:
        lang: language code used in the file name.

    Returns:
        Tuple ``(analogies, categories)``. ``analogies`` is a DataFrame with
        the integer category code in ``cats`` and the four words in columns
        ``a``, ``b``, ``c``, ``d``. ``categories`` is a Series of category
        names, positioned by their code.
    """
    path = Path('..', 'data', 'analogies', 'analogies-{}.txt'.format(lang))
    # Select the single column explicitly; read_csv's squeeze=True option was
    # removed in pandas 2.0.
    lines = pd.read_csv(path, header=None, names=['analogies'])['analogies']

    is_category = lines.str.startswith(':')
    # Carry each category header forward over the analogy lines below it.
    cats = lines.where(is_category).ffill().str.strip(':').str.strip().to_frame('cats')

    analogies = lines[~is_category].str.split(expand=True)
    analogies.columns = list('abcd')
    analogies = cats.merge(analogies, left_index=True, right_index=True)
    # The original referenced an undefined `df` here (NameError).
    analogies['cats'], idx = pd.factorize(analogies.cats)
    return analogies, pd.Series(idx)
117
118
# NOTE(review): the analogy language is fixed to 'en' even when LANGUAGE is
# 'es' - confirm this is intended.
analogies, categories = get_analogies('en')
# Map only the four word columns to ids. The 'cats' column holds integer
# category codes; passing it through token_to_id turns it into NaN, so the
# dropna() below would discard every row (and leave five columns to unpack).
analogies_id = analogies[list('abcd')].apply(lambda x: x.map(token_to_id))

# Keep only analogies whose four words are all in the vocabulary.
test_set = analogies_id.dropna().astype(int)
a, b, c, actual = test_set.values.T
actual = actual.reshape(-1, 1)  # column vector, compared against ranked neighbors
n_analogies = len(actual)

sampling_table = sequence.make_sampling_table(vocab_size)

# NOTE(review): `data` encodes unknown words as -1 and the most frequent token
# as 0, while Keras' skipgrams documents index 0 as a non-word that is skipped.
# Confirm the id scheme matches what skipgrams and the embedding layer expect.
couples, labels = skipgrams(sequence=data,
                            vocabulary_size=vocab_size,
                            window_size=WINDOW_SIZE,
                            sampling_table=sampling_table,
                            negative_samples=1.0,
                            shuffle=True)

target_word, context_word = np.array(couples, dtype=np.int32).T
labels = np.array(labels, dtype=np.int8)
del couples  # the Python list of pairs is no longer needed once converted

# Persist the vocabulary and the analogy test set next to the model output.
with pd.HDFStore(PATH / 'data.h5') as store:
    store.put('id_to_token', pd.Series(id_to_token))
    store.put('analogies', test_set)
143
144
def model_graph():
    """Assemble the skip-gram training model and its cosine-similarity twin.

    Both models take a (target id, context id) pair and share one embedding
    layer. The training model feeds the dot product of the two embedding
    vectors through a single sigmoid unit. The validation model outputs the
    normalized dot product of the same two vectors.

    Returns:
        Tuple ``(model, valid_model, embedding)``: the training model, the
        validation model and the shared embedding layer.
    """
    # One scalar token id per input.
    word_in = Input((1,), name='target_input')
    ctx_in = Input((1,), name='context_input')

    # A single embedding matrix serves both the target and the context word.
    shared = Embedding(input_dim=vocab_size,
                       output_dim=EMBEDDING_SIZE,
                       input_length=1,
                       name='embedding_layer')

    def as_column(token_input, layer_name):
        # Look up the vector, then reshape it to (EMBEDDING_SIZE, 1).
        embedded = shared(token_input)
        return Reshape((EMBEDDING_SIZE, 1), name=layer_name)(embedded)

    word_vec = as_column(word_in, 'target_embedding')
    ctx_vec = as_column(ctx_in, 'context_embedding')

    # Unnormalized similarity, flattened to one value per pair.
    raw_similarity = Dot(axes=1)([word_vec, ctx_vec])
    raw_similarity = Reshape((1,), name='similarity')(raw_similarity)

    # Sigmoid unit on top of the similarity score.
    probability = Dense(units=1, activation='sigmoid', name='output')(raw_similarity)
    trainer = Model(inputs=[word_in, ctx_in], outputs=probability)

    # Same inputs and vectors, but scored by cosine similarity.
    cosine = Dot(normalize=True,
                 axes=1,
                 name='cosine_similarity')([word_vec, ctx_vec])
    validator = Model(inputs=[word_in, ctx_in], outputs=cosine)

    return trainer, validator, shared
180
181
train_model, valid_model, embedding = model_graph()
train_model.compile(loss='binary_crossentropy', optimizer='rmsprop')

# Model.summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line after each summary.
train_model.summary()
valid_model.summary()
187
188
#### Evaluation: Nearest Neighbors & Analogies
class EvalCallback(Callback):
    """Print nearest neighbors at the start/end of training and analogy accuracy per epoch.

    Relies on module-level state: ``valid_examples``, ``id_to_token``,
    ``vocab_size``, ``valid_model``, ``embedding`` and the analogy id arrays
    ``a``, ``b``, ``c``, ``actual`` with ``n_analogies``.
    """

    def on_train_begin(self, logs=None):
        self._print_nearest_neighbors()

    def on_train_end(self, logs=None):
        self._print_nearest_neighbors()

    def on_epoch_end(self, epoch, logs=None):
        print('\n\tAnalogy Accuracy:\n\t\t', end='')
        print(self.test_analogies())

    def _print_nearest_neighbors(self):
        """Print the NN most similar tokens for each sampled validation word."""
        print('\n\t{} nearest neighbors:'.format(NN))
        for word_id in valid_examples:
            sim = self._get_similiarity(word_id).reshape(-1)
            # Skip rank 0, which is expected to be the query word itself.
            nearest = (-sim).argsort()[1:NN + 1]
            neighbors = [id_to_token[idx] for idx in nearest]
            print('\t\t{}: {}'.format(id_to_token[word_id], ', '.join(neighbors)))

    @staticmethod
    def test_analogies():
        """Return a formatted string with top-1/5/10 analogy accuracy.

        For each analogy the vector ``c + b - a`` is compared to every
        embedding by cosine distance. An analogy counts as a top-k hit when
        the expected id ranks among the k closest.
        """
        embeddings = embedding.get_weights()[0]
        target = embeddings[c] + embeddings[b] - embeddings[a]
        # Row i lists all token ids ordered from closest to farthest.
        neighbors = np.argsort(cdist(target, embeddings, metric='cosine'))
        # Column index of the expected id, i.e. its rank for each analogy.
        # NOTE(review): the input words a, b, c are not excluded from the
        # candidates - confirm that is the intended evaluation protocol.
        match_id = np.argwhere(neighbors == actual)[:, 1]
        return '\n\t\t'.join(['Top {}: {:.2%}'.format(i, (match_id < i).sum() / n_analogies) for i in [1, 5, 10]])

    @staticmethod
    def _get_similiarity(valid_word_idx):
        """Score ``valid_word_idx`` against every vocabulary id with ``valid_model``."""
        target = np.full(shape=vocab_size, fill_value=valid_word_idx)
        context = np.arange(vocab_size)
        return valid_model.predict([target, context])
226
227
# Prints nearest neighbors and analogy accuracy while training.
evaluation = EvalCallback()

# ##### Tensorboard Callback
# Logs go to TB_PATH; the metadata file is the token list written by save_meta.
tensorboard = TensorBoard(
    log_dir=str(TB_PATH),
    histogram_freq=0,
    batch_size=32,
    write_graph=True,
    embeddings_freq=100,
    embeddings_metadata=str((TB_PATH / 'meta.tsv').resolve()),
)

# Train on the (target, context) id pairs against their 0/1 labels.
loss = train_model.fit(
    x=[target_word, context_word],
    y=labels,
    shuffle=True,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=[evaluation, tensorboard],
)

# Save the trained model in the run's output directory.
train_model.save(str(PATH / 'skipgram_model.h5'))