ml-finance-python
python scripts for finance machine learning
git clone https://9o.is/git/ml-finance-python.git
word2vec.py
(19434B)
1 """Multi-threaded word2vec unbatched skip-gram model.
2
3 Trains the model described in:
4 (Mikolov et al.) Efficient Estimation of Word Representations in Vector Space, ICLR 2013.
5 http://arxiv.org/abs/1301.3781
6 Training pairs are generated by a custom skip-gram op, which needs to be compiled.
7 The model is trained by minimising an NCE loss over each batch with gradient descent.
8
9 The key ops used are:
10 * skipgram custom op that does input processing.
11 * tf.nn.nce_loss, which computes the noise-contrastive loss for a batch of examples.
12
13 """
14
15 # from __future__ import absolute_import
16 # from __future__ import division
17 # from __future__ import print_function
18
19 import sys
20 import threading
21 from math import sqrt
22 from os import environ
23 from pathlib import Path
24 from time import time, sleep
25
26 import numpy as np
27 import pandas as pd
28
29 environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
30
31 import tensorflow as tf
32 from six.moves import range # pylint: disable=redefined-builtin
33 from tensorflow.contrib.tensorboard.plugins import projector
34
35
# All project paths are resolved relative to the directory the script is
# launched from.
PROJECT_DIR = Path.cwd().resolve()
VOCAB_DIR = PROJECT_DIR / 'vocab'
DATA_DIR = PROJECT_DIR / 'data'

# Load the compiled custom ops (provides the skip-gram input op).
word2vec = tf.load_op_library(
    str(PROJECT_DIR / 'tensorflow' / 'word2vec_ops.so'))
42
# Command line options.
flags = tf.app.flags

# -- input selection
flags.DEFINE_string('language', 'en', 'Document language.')
flags.DEFINE_string('file', 'ngrams_1', 'Input doc.')
flags.DEFINE_string('source', 'TED', 'Data source.')

# -- model and optimisation
flags.DEFINE_integer('epochs_to_train', 1, 'Number of epochs to train. ')
flags.DEFINE_integer('embedding_size', 200, 'The embedding dimension size.')
flags.DEFINE_float('starter_lr', 0.05, 'Initial learning rate.')
flags.DEFINE_float('target_lr', 0.05, 'Final learning rate.')
flags.DEFINE_integer(
    'num_neg_samples', 10, 'Negative samples per training example.')
flags.DEFINE_integer(
    'batch_size', 500,
    'No. of training examples each step processes (no minibatching).')
flags.DEFINE_integer(
    'concurrent_steps', 8, 'The number of concurrent training steps.')

# -- vocabulary and sampling
flags.DEFINE_integer(
    'window_size', 5,
    "No of words to predict to the left and right of target.")
flags.DEFINE_integer(
    'min_count', 5,
    'Minimum no of occurrences for word to enter vocabulary.')
flags.DEFINE_float(
    'subsample', 1e-3,
    'Words with higher frequency will be randomly down-sampled. 0 to disable.')
flags.DEFINE_boolean(
    'custom_freq', False, 'Use language-specific subsample threshold.')

# -- visualisation
flags.DEFINE_integer(
    'words_to_project', 10000, 'Words to project using Tensorboard.')

FLAGS = flags.FLAGS
61
62
def time_diff(t):
    """Return the time elapsed since ``t`` as zero-padded clock fields.

    Args:
        t: a start timestamp as returned by ``time()``.

    Returns:
        A list ``[hours, minutes, seconds]`` of strings, each left-padded
        with zeros to at least two characters.
    """
    # Truncate to whole seconds *before* splitting into fields. Formatting a
    # fractional seconds value with '.0f' rounds, so 59.7s used to be shown
    # as '60' instead of '59'.
    elapsed = int(time() - t)
    minutes, seconds = divmod(elapsed, 60)
    hours, minutes = divmod(minutes, 60)
    return ['{:0>2d}'.format(x) for x in (hours, minutes, seconds)]
67
68
class Options(object):
    """Snapshot of the command line flags plus the paths derived from them.

    Creating an instance also creates the TensorBoard output directory
    (and its parents) if it does not exist yet.
    """

    def __init__(self):
        # Custom subsample thresholds to filter out 0.1% most frequent words.
        self.freq_map = dict(en=0.003, es=0.014)

        # Input selection.
        self.lang = FLAGS.language
        self.file = FLAGS.file
        self.source = FLAGS.source
        self.train_data = Path(VOCAB_DIR, self.source, self.lang, self.file + '.txt')

        # Model and optimisation hyper-parameters.
        self.emb_dim = FLAGS.embedding_size
        self.num_samples = FLAGS.num_neg_samples
        self.starter_lr = FLAGS.starter_lr
        self.target_lr = FLAGS.target_lr
        self.epochs_to_train = FLAGS.epochs_to_train
        self.concurrent_steps = FLAGS.concurrent_steps
        self.batch_size = FLAGS.batch_size
        self.window_size = FLAGS.window_size
        self.min_count = FLAGS.min_count

        # Subsampling: either the per-language threshold or the flag value.
        self.custom_freq = FLAGS.custom_freq
        self.subsample = self.freq_map[self.lang] if self.custom_freq else FLAGS.subsample

        self.words_to_project = FLAGS.words_to_project

        # One output directory per hyper-parameter combination.
        run_parts = (self.emb_dim,
                     self.num_samples,
                     int(self.starter_lr * 100),
                     int(self.target_lr * 100),
                     self.batch_size,
                     self.window_size,
                     self.min_count)
        run_name = '{}_{}_{}_{}_{}_{}_{}'.format(*run_parts)
        self.save_path = Path(PROJECT_DIR, self.source, self.lang, self.file, run_name)

        self.tensor_board_path = self.save_path / 'tensorboard'
        if not self.tensor_board_path.exists():
            self.tensor_board_path.mkdir(parents=True, exist_ok=True)

        analogy_file = 'analogies-{}.txt'.format(self.lang)
        self.analogy_path = DATA_DIR / 'analogies' / analogy_file
100
101
class Word2Vec(object):
    """Skip-gram word2vec model trained with an NCE loss.

    Constructing an instance builds the training and evaluation graphs,
    initialises all variables and writes the vocabulary to disk.

    Args:
        options: the ``Options`` instance for this run. ``build_graph`` adds
            ``vocab_words``, ``vocab_counts``, ``words_per_epoch`` and
            ``vocab_size`` to it.
        session: the session used for every ``run`` call; it is expected to be
            the default session, because variables are initialised through
            ``tf.global_variables_initializer().run()``.
    """

    def __init__(self, options, session):
        self._options = options
        self._session = session
        self._word2id = {}
        self._id2word = []
        self.accuracies = []  # one analogy accuracy per call to eval()
        self.build_graph()
        self.build_eval_graph()
        self.save_vocab()

    def read_analogies(self):
        """Read the analogy question file for the configured language.

        Lines starting with ``:`` are ignored. Every other line is
        lower-cased, split on whitespace and mapped to word ids; a line is
        counted as skipped unless it consists of exactly four known words.

        The questions are stored in ``self._analogy_questions`` as an
        ``[n, 4]`` int32 array (``[0, 4]`` when nothing could be used);
        nothing is returned.
        """
        questions = []
        questions_skipped = 0
        # The file is read as bytes so that tokens can be looked up directly
        # in self._word2id, whose keys come from the skip-gram op's output.
        with open(self._options.analogy_path, 'rb') as analogy_f:
            for line in analogy_f:
                if line.startswith(b':'):  # Skip comments.
                    continue
                words = line.strip().lower().split()
                ids = [self._word2id.get(w.strip()) for w in words]
                if None in ids or len(ids) != 4:
                    questions_skipped += 1
                else:
                    questions.append(np.array(ids))
        print('\nEval analogy file: ', self._options.analogy_path.stem)
        print('Questions: ', len(questions))
        print('Skipped: ', questions_skipped)
        # reshape keeps the array two-dimensional even when it is empty.
        self._analogy_questions = np.array(questions, dtype=np.int32).reshape(-1, 4)

    def build_graph(self):
        """Build the training graph and read the vocabulary from the data file."""
        opts = self._options

        # The training data. A text file.
        with tf.name_scope('input'):
            (words, counts, words_per_epoch, current_epoch, total_words_processed,
             center_words, target_words) = word2vec.skipgram_word2vec(
                 filename=str(opts.train_data),
                 batch_size=opts.batch_size,
                 window_size=opts.window_size,
                 min_count=opts.min_count,
                 subsample=opts.subsample)

        opts.vocab_words, opts.vocab_counts, opts.words_per_epoch = self._session.run(
            [words, counts, words_per_epoch])
        opts.vocab_size = len(opts.vocab_words)

        print('Data file: ', opts.file)
        print('Vocab size: {:,} + UNK'.format(opts.vocab_size - 1))
        print('Words per epoch: {:,}'.format(opts.words_per_epoch))

        self._id2word = opts.vocab_words
        self._word2id = {w: i for i, w in enumerate(self._id2word)}

        # Input words embedding: [vocab_size, emb_dim]
        with tf.name_scope('embedding'):
            embeddings = tf.Variable(
                tf.random_uniform([opts.vocab_size, opts.emb_dim], -0.5 / opts.emb_dim, 0.5 / opts.emb_dim),
                name='vectors')
            tf.summary.histogram('histogram', embeddings)

        with tf.name_scope('output'):
            embed = tf.nn.embedding_lookup(embeddings, center_words)
            with tf.name_scope('weights'):
                nce_weights = tf.Variable(
                    tf.truncated_normal([opts.vocab_size, opts.emb_dim], stddev=1.0 / sqrt(opts.emb_dim)),
                    name='weights')
                tf.summary.histogram('histogram', nce_weights)
            with tf.name_scope('biases'):
                nce_biases = tf.Variable(tf.zeros([opts.vocab_size]), name='biases')
                tf.summary.histogram('histogram', nce_biases)

        with tf.name_scope('nce_loss'):
            loss = tf.reduce_mean(
                tf.nn.nce_loss(weights=nce_weights,
                               biases=nce_biases,
                               labels=tf.reshape(target_words, shape=(-1, 1)),
                               inputs=embed,
                               num_sampled=opts.num_samples,
                               num_classes=opts.vocab_size), name='loss')
            # Smoothed loss used for progress reporting and the summary.
            ema = tf.train.ExponentialMovingAverage(decay=0.99)
            update_loss_ema = ema.apply([loss])
            loss_ema = ema.average(loss)
            tf.summary.scalar('exp_moving_avg', loss_ema)

        with tf.name_scope('train'):
            global_step = tf.Variable(0, name='global_step', trainable=False)
            words_to_train = float(opts.words_per_epoch * opts.epochs_to_train)
            # Linear decay with the number of words processed.
            # NOTE(review): target_lr acts as a floor on the decay
            # *multiplier*, so the smallest rate is starter_lr * target_lr,
            # while the flag help text calls target_lr the final learning
            # rate - confirm which one is intended.
            learning_rate = opts.starter_lr * tf.maximum(
                opts.target_lr,
                1.0 - tf.cast(total_words_processed, tf.float32) / words_to_train)
            tf.summary.scalar('learning_rate', learning_rate)

            train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(
                loss=loss, global_step=global_step)

        self._update_loss_ema = update_loss_ema
        self._loss_ema = loss_ema
        self._embeddings = embeddings
        self._learning_rate = learning_rate
        self._train_step = train_step
        self.global_step = global_step
        self._epoch = current_epoch
        self._words = total_words_processed

    def save_vocab(self):
        """Save the vocabulary to a file so the model can be reloaded.

        Writes a word/count table to ``results.h5`` under the run's save path
        and the first ``words_to_project`` words to ``metadata.tsv`` in the
        TensorBoard directory.
        """
        opts = self._options

        vocab = []
        for i in range(opts.vocab_size):
            vocab.append([tf.compat.as_text(opts.vocab_words[i]).encode('utf-8'), opts.vocab_counts[i]])
        vocab = pd.DataFrame(vocab, columns=['word', 'count'])
        vocab.word = vocab.word.astype(str)

        with pd.HDFStore(str(opts.save_path / 'results.h5')) as store:
            store.put('/'.join([opts.lang, 'vocab']), vocab, format='t')

        # Drop the first character and any surrounding single quotes from the
        # stringified words (presumably the b'...' wrapper produced by
        # astype(str) on bytes - verify) before writing the projector labels.
        meta = vocab.word.str[1:].str.strip("'").iloc[:opts.words_to_project]
        meta.to_csv(opts.tensor_board_path / 'metadata.tsv', header=None, sep='\t', index=False)

    def build_eval_graph(self):
        """Build the evaluation graph, summary writers and saver.

        Also initialises all variables created so far.
        """
        opts = self._options

        # Each analogy task is to predict the 4th word (d) given three
        # words: a, b, c. E.g., a=italy, b=rome, c=france, we should
        # predict d=paris.
        with tf.name_scope('eval'):
            # The eval feeds three vectors of word ids for a, b, c, each of
            # which is of size N, where N is the number of analogies we want to
            # evaluate in one batch.
            analogy_a = tf.placeholder(dtype=tf.int32, name='analogy_a')  # [N]
            analogy_b = tf.placeholder(dtype=tf.int32, name='analogy_b')  # [N]
            analogy_c = tf.placeholder(dtype=tf.int32, name='analogy_c')  # [N]

            # Normalized word embeddings of shape [vocab_size, emb_dim].
            nemb = tf.nn.l2_normalize(self._embeddings, 1)

            # Each row of a_emb, b_emb, c_emb is a word's embedding vector.
            # They all have the shape [N, emb_dim]
            a_emb = tf.gather(nemb, analogy_a, name='analogy_a_emb')  # a's embs
            b_emb = tf.gather(nemb, analogy_b, name='analogy_b_emb')  # b's embs
            c_emb = tf.gather(nemb, analogy_c, name='analogy_c_emb')  # c's embs

            # We expect that d's embedding vectors on the unit hyper-sphere is
            # near: c_emb + (b_emb - a_emb), which has the shape [N, emb_dim].
            target = c_emb + (b_emb - a_emb)

            # Compute cosine distance between each pair of target and vocab.
            # dist has shape [N, vocab_size].
            dist = tf.matmul(target, nemb, transpose_b=True, name='target_dist')

            # For each question (row in dist), find the top 4 words.
            _, pred_idx = tf.nn.top_k(dist, 4)

            # Nodes for computing neighbors for a given word according to
            # their cosine distance.
            nearby_word = tf.placeholder(dtype=tf.int32, name='nearby_word')  # word id
            nearby_emb = tf.gather(nemb, nearby_word, name='nearby_emb')
            nearby_dist = tf.matmul(nearby_emb, nemb, transpose_b=True, name='nearby_dist')
            nearby_val, nearby_idx = tf.nn.top_k(nearby_dist, min(1000, opts.vocab_size), name='k_nn')

        # Nodes in the construct graph which are used by training and
        # evaluation to run/feed/fetch.
        self._analogy_a = analogy_a
        self._analogy_b = analogy_b
        self._analogy_c = analogy_c
        self._analogy_pred_idx = pred_idx
        self._nearby_word = nearby_word
        self._nearby_val = nearby_val
        self._nearby_idx = nearby_idx

        self._merged = tf.summary.merge_all()
        self._train_writer = tf.summary.FileWriter(str(opts.tensor_board_path / 'train'), graph=self._session.graph)
        self._test_writer = tf.summary.FileWriter(str(opts.tensor_board_path / 'test'), graph=self._session.graph)

        # Properly initialize all variables.
        tf.global_variables_initializer().run()

        self.saver = tf.train.Saver()

    def _train_thread_body(self):
        """Run training steps until the epoch counter changes."""
        initial_epoch, = self._session.run([self._epoch])
        while True:
            _, _, epoch = self._session.run([self._update_loss_ema, self._train_step, self._epoch])
            if epoch != initial_epoch:
                break

    def train(self):
        """Train the model for one epoch.

        Starts ``concurrent_steps`` worker threads and, roughly every two
        seconds, writes the merged summaries and prints a progress line until
        the epoch counter changes; then waits for the workers to finish.
        """
        opts = self._options

        initial_epoch, initial_words = self._session.run([self._epoch, self._words])

        workers = []
        for _ in range(opts.concurrent_steps):
            t = threading.Thread(target=self._train_thread_body)
            t.start()
            workers.append(t)

        last_words, last_time, start = initial_words, time(), time()
        rates = []
        while True:
            sleep(2)  # Reports our progress once a while.
            epoch, step, words, loss_ema, summary = self._session.run(
                [self._epoch, self.global_step, self._words, self._loss_ema, self._merged])
            self._train_writer.add_summary(summary=summary, global_step=step)

            now = time()
            rate = (words - last_words) / (now - last_time)
            last_words, last_time = words, now
            rates.append(rate)

            h, m, s = time_diff(start)
            # The reported words/sec is the mean over all samples so far.
            print(
                '\r{}:{}:{}\tEpoch {:>2}\tStep {:>8,}\twords/sec: {:>8,.0f}\tloss: {:>8.4f}'.format(
                    h, m, s, epoch, step, sum(rates) / len(rates), loss_ema), end='')
            sys.stdout.flush()

            if epoch != initial_epoch:
                break

        for t in workers:
            t.join()

    def _predict(self, analogy):
        """Predict the top 4 answers for analogy questions.

        Args:
            analogy: a 2-D array whose first three columns hold the word ids
                of a, b and c.

        Returns:
            The fetched value of the top-4 prediction index tensor.
        """
        idx, = self._session.run([self._analogy_pred_idx], {
            self._analogy_a: analogy[:, 0],
            self._analogy_b: analogy[:, 1],
            self._analogy_c: analogy[:, 2]
        })
        return idx

    def eval(self):
        """Evaluate analogy questions and report accuracy.

        A question counts as correct when the first prediction that is not one
        of its three question words equals the expected fourth word. The
        accuracy is appended to ``self.accuracies``, written to the test
        summary writer and printed. When there are no questions at all, NaN is
        recorded and nothing is written to the summary.

        Raises:
            AttributeError: if ``read_analogies`` has not been called.
        """
        # How many questions we get right at precision@1.
        correct = 0

        try:
            total = self._analogy_questions.shape[0]
        except AttributeError:
            raise AttributeError('Need to read analogy questions.')

        if total == 0:
            # Every question was skipped (or the file was empty): accuracy is
            # undefined, and dividing by total would raise ZeroDivisionError.
            self.accuracies.append(float('nan'))
            print('\n\tEval: no analogy questions to evaluate.')
            return

        start = 0
        while start < total:
            limit = start + 2500  # evaluate in chunks of 2500 questions
            sub = self._analogy_questions[start:limit, :]
            idx = self._predict(sub)
            start = limit
            for question in range(sub.shape[0]):
                for j in range(4):
                    if idx[question, j] == sub[question, 3]:
                        # Bingo! We predicted correctly. E.g., [italy, rome, france, paris].
                        correct += 1
                        break
                    elif idx[question, j] in sub[question, :3]:
                        # We need to skip words already in the question.
                        continue
                    else:
                        # The correct label is not the precision@1
                        break

        accuracy = correct / total
        summary = tf.Summary(value=[tf.Summary.Value(tag='accuracies', simple_value=accuracy), ])
        step = self.global_step.eval(self._session)
        self._test_writer.add_summary(summary, global_step=step)

        self.accuracies.append(accuracy)

        print('\n\tEval {:4d}/{}\t\taccuracy = {:.1%}'.format(correct, total, accuracy))

    def analogy(self, w0, w1, w2):
        """Predict word w3 as in w0:w1 vs w2:w3.

        Prints the first of the top 4 predictions that is not one of the
        inputs, or 'unknown' when all of them are. Words missing from the
        vocabulary are looked up as id 0.
        """
        # NOTE(review): vocabulary keys look like bytes (see read_analogies),
        # so str arguments would all fall back to id 0 - confirm with callers.
        wid = np.array([[self._word2id.get(w, 0) for w in [w0, w1, w2]]])
        idx = self._predict(wid)
        for c in [self._id2word[i] for i in idx[0, :]]:
            if c not in [w0, w1, w2]:
                print(c)
                # Return instead of break, so that 'unknown' is only printed
                # when no candidate was found.
                return
        print('unknown')

    def nearby(self, words, num=20):
        """Print the ``num`` nearest vocabulary entries for each given word.

        Words missing from the vocabulary are looked up as id 0.
        """
        ids = np.array([self._word2id.get(x, 0) for x in words])
        vals, idx = self._session.run(
            [self._nearby_val, self._nearby_idx], {self._nearby_word: ids})
        for i, word in enumerate(words):
            print("\n%s\n=====================================" % (word))
            for (neighbor, distance) in zip(idx[i, :num], vals[i, :num]):
                print("%-20s %6.4f" % (self._id2word[neighbor], distance))
397
398
def main(_):
    """Train a word2vec model, evaluate it and export the results.

    After training, the model checkpoint, a TensorBoard projector checkpoint
    for the first ``words_to_project`` embeddings, and the normalised
    embeddings, per-epoch accuracies and hyper-parameters (in ``results.h5``)
    are written under the run's save path.
    """
    opts = Options()
    with tf.Graph().as_default(), tf.Session() as session:
        with tf.device('/cpu:0'):
            model = Word2Vec(opts, session)
            model.read_analogies()  # Read analogy questions
            for _epoch in range(opts.epochs_to_train):
                print()
                model.train()  # Process one epoch
                model.eval()  # Eval analogies.

            # Checkpoint paths are passed as str, like the FileWriter paths.
            model.saver.save(session, str(opts.tensor_board_path / 'model.ckpt'),
                             global_step=model.global_step)
            final_embeddings = tf.nn.l2_normalize(model._embeddings, 1).eval(session=session)

            writer = tf.summary.FileWriter(str(opts.tensor_board_path), session.graph)

            embedding_var = tf.Variable(final_embeddings[:opts.words_to_project], name='embedding_result')
            # Only the new variable is initialised. Running the global
            # initializer here would re-initialise every trained variable
            # in the session.
            session.run(embedding_var.initializer)

            config = projector.ProjectorConfig()
            embedded = config.embeddings.add()
            embedded.tensor_name = embedding_var.name
            embedded.metadata_path = str((opts.tensor_board_path / 'metadata.tsv').resolve())
            projector.visualize_embeddings(writer, config)
            writer.close()  # flush the graph event file to disk

            saver_embed = tf.train.Saver([embedding_var])
            saver_embed.save(session, str(opts.tensor_board_path / 'skip-gram.ckpt'), 1)

            params = dict(emb_dim=opts.emb_dim,
                          num_samples=opts.num_samples,
                          starter_lr=opts.starter_lr,
                          target_lr=opts.target_lr,
                          epochs_to_train=opts.epochs_to_train,
                          window_size=opts.window_size,
                          min_count=opts.min_count,
                          subsample=opts.subsample)

            with pd.HDFStore(str(opts.save_path / 'results.h5')) as store:
                store.put('/'.join([opts.lang, 'embeddings']), pd.DataFrame(final_embeddings))
                store.put('/'.join([opts.lang, 'accuracies']), pd.Series(model.accuracies))
                store.put('/'.join([opts.lang, 'params']), pd.Series(params))


if __name__ == "__main__":
    tf.app.run()