ml-finance-python

python scripts for finance machine learning

git clone https://9o.is/git/ml-finance-python.git

word2vec.py

(19434B)


      1 """Multi-threaded word2vec unbatched skip-gram model.
      2 
      3 Trains the model described in:
      4 (Mikolov, et. al.) Efficient Estimation of Word Representations in Vector Space ICLR 2013.
      5 http://arxiv.org/abs/1301.3781
      6 This model does true SGD (i.e. no minibatching). To do this efficiently, custom
      7 ops are used to sequentially process data within a 'batch'. These need to be compiled.
      8 
      9 The key ops used are:
     10 * skipgram custom op that does input processing.
     11 * neg_train custom op that efficiently calculates and applies the gradient using true SGD.
     12 
     13 """
     14 
     15 # from __future__ import absolute_import
     16 # from __future__ import division
     17 # from __future__ import print_function
     18 
     19 import sys
     20 import threading
     21 from math import sqrt
     22 from os import environ
     23 from pathlib import Path
     24 from time import time, sleep
     25 
     26 import numpy as np
     27 import pandas as pd
     28 
     29 environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
     30 
     31 import tensorflow as tf
     32 from six.moves import range  # pylint: disable=redefined-builtin
     33 from tensorflow.contrib.tensorboard.plugins import projector
     34 
     35 
     36 PROJECT_DIR = Path().cwd().resolve()
     37 VOCAB_DIR = PROJECT_DIR / 'vocab'
     38 DATA_DIR = PROJECT_DIR / 'data'
     39 
# Load custom ops
# The compiled shared library is looked up at <PROJECT_DIR>/tensorflow/word2vec_ops.so.
# It provides the skipgram_word2vec op that Word2Vec.build_graph uses for input processing.
word2vec = tf.load_op_library(str(PROJECT_DIR / 'tensorflow' / 'word2vec_ops.so'))

# Define command line options
# language / source / file select the training text: VOCAB_DIR/<source>/<language>/<file>.txt (see Options).
flags = tf.app.flags
flags.DEFINE_string('language', 'en', 'Document language.')
flags.DEFINE_string('file', 'ngrams_1', 'Input doc.')
flags.DEFINE_string('source', 'TED', 'Data source.')
flags.DEFINE_integer('epochs_to_train', 1, 'Number of epochs to train. ')
flags.DEFINE_integer('embedding_size', 200, 'The embedding dimension size.')
flags.DEFINE_float('starter_lr', 0.05, 'Initial learning rate.')
# NOTE(review): Word2Vec.build_graph computes starter_lr * max(target_lr, 1 - progress), i.e. target_lr
# is a floor on the decay multiplier rather than an absolute final rate - confirm the intended meaning.
flags.DEFINE_float('target_lr', 0.05, 'Final learning rate.')
flags.DEFINE_integer('num_neg_samples', 10, 'Negative samples per training example.')
flags.DEFINE_integer('batch_size', 500, 'No. of training examples each step processes (no minibatching).')
flags.DEFINE_integer('concurrent_steps', 8, 'The number of concurrent training steps.')
flags.DEFINE_integer('window_size', 5, "No of words to predict to the left and right of target.")
flags.DEFINE_integer('min_count', 5, 'Minimum no of occurrences for word to enter vocabulary.')
flags.DEFINE_float('subsample', 1e-3, 'Words with higher frequency will be randomly down-sampled. 0 to disable.')
# When custom_freq is set, Options replaces --subsample with a per-language value from its freq_map.
flags.DEFINE_boolean('custom_freq', False, 'Use language-specific subsample threshold.')
flags.DEFINE_integer('words_to_project', 10000, 'Words to project using Tensorboard.')
FLAGS = flags.FLAGS
     61 
     62 
     63 def time_diff(t):
     64     m, s = divmod(time() - t, 60)
     65     h, m = divmod(m, 60)
     66     return ['{:0>2.0f}'.format(x) for x in [h, m, s]]
     67 
     68 
     69 class Options(object):
     70     """Flags to options used by word2vec model."""
     71 
     72     def __init__(self):
     73         self.freq_map = dict(en=0.003,
     74                              es=0.014)  # custom subsample thresholds to filter out 0.1% most frequent words
     75         self.lang = FLAGS.language
     76         self.file = FLAGS.file
     77         self.source = FLAGS.source
     78         self.emb_dim = FLAGS.embedding_size
     79         self.train_data = Path(VOCAB_DIR, self.source, self.lang, self.file + '.txt')
     80         self.num_samples = FLAGS.num_neg_samples
     81         self.starter_lr = FLAGS.starter_lr
     82         self.target_lr = FLAGS.target_lr
     83         self.epochs_to_train = FLAGS.epochs_to_train
     84         self.concurrent_steps = FLAGS.concurrent_steps
     85         self.batch_size = FLAGS.batch_size
     86         self.window_size = FLAGS.window_size
     87         self.min_count = FLAGS.min_count
     88         self.custom_freq = FLAGS.custom_freq
     89         if self.custom_freq:
     90             self.subsample = self.freq_map[self.lang]
     91         else:
     92             self.subsample = FLAGS.subsample
     93         self.words_to_project = FLAGS.words_to_project
     94         self.save_path = Path(PROJECT_DIR, self.source, self.lang, self.file , '{}_{}_{}_{}_{}_{}_{}'.format(self.emb_dim, self.num_samples, int(self.starter_lr * 100), int(self.target_lr * 100), self.batch_size, self.window_size, self.min_count))
     95         self.tensor_board_path = self.save_path / 'tensorboard'
     96         if not self.tensor_board_path.exists():
     97             self.tensor_board_path.mkdir(parents=True, exist_ok=True)
     98 
     99         self.analogy_path = DATA_DIR / 'analogies' / 'analogies-{}.txt'.format(self.lang)
    100 
    101 
    102 class Word2Vec(object):
    103     """Word2Vec model (Skipgram)."""
    104 
    105     def __init__(self, options, session):
    106         self._options = options
    107         self._session = session
    108         self._word2id = {}
    109         self._id2word = []
    110         self.build_graph()
    111         self.build_eval_graph()
    112         self.save_vocab()
    113         self.accuracies = []
    114 
    115     def read_analogies(self):
    116         """Reads through the analogy question file.
    117 
    118         Returns:
    119           questions: a [n, 4] numpy array containing the analogy question's word ids.
    120           questions_skipped: questions skipped due to unknown words.
    121         """
    122         questions = []
    123         questions_skipped = 0
    124         with open(self._options.analogy_path, 'rb') as analogy_f:
    125             for line in analogy_f:
    126                 if line.startswith(b':'):  # Skip comments.
    127                     continue
    128                 words = line.strip().lower().split()
    129                 ids = [self._word2id.get(w.strip()) for w in words]
    130                 if None in ids or len(ids) != 4:
    131                     questions_skipped += 1
    132                 else:
    133                     questions.append(np.array(ids))
    134         print('\nEval analogy file: ', self._options.analogy_path.stem)
    135         print('Questions: ', len(questions))
    136         print('Skipped: ', questions_skipped)
    137         self._analogy_questions = np.array(questions, dtype=np.int32)
    138 
    139     def build_graph(self):
    140         """Build the model graph."""
    141         opts = self._options
    142 
    143         # The training data. A text file.
    144         with tf.name_scope('input'):
    145             words, counts, words_per_epoch, current_epoch, total_words_processed, center_words, target_words = \
    146                 word2vec.skipgram_word2vec(filename=str(opts.train_data),
    147                                            batch_size=opts.batch_size,
    148                                            window_size=opts.window_size,
    149                                            min_count=opts.min_count,
    150                                            subsample=opts.subsample)
    151 
    152         opts.vocab_words, opts.vocab_counts, opts.words_per_epoch = self._session.run([words, counts, words_per_epoch])
    153         opts.vocab_size = len(opts.vocab_words)
    154 
    155         print('Data file: ', opts.file)
    156         print('Vocab size: {:,} + UNK'.format(opts.vocab_size - 1))
    157         print('Words per epoch: {:,}'.format(opts.words_per_epoch))
    158 
    159         self._id2word = opts.vocab_words
    160         self._word2id = {w: i for i, w in enumerate(self._id2word)}
    161 
    162         # Input words embedding: [vocab_size, emb_dim]
    163         with tf.name_scope('embedding'):
    164             embeddings = tf.Variable(
    165                     tf.random_uniform([opts.vocab_size, opts.emb_dim], -0.5 / opts.emb_dim, 0.5 / opts.emb_dim),
    166                     name='vectors')
    167             tf.summary.histogram('histogram', embeddings)
    168 
    169         with tf.name_scope('output'):
    170             embed = tf.nn.embedding_lookup(embeddings, center_words)
    171             with tf.name_scope('weights'):
    172                 nce_weights = tf.Variable(
    173                         tf.truncated_normal([opts.vocab_size, opts.emb_dim], stddev=1.0 / sqrt(opts.emb_dim)),
    174                         name='weights')
    175                 tf.summary.histogram('histogram', nce_weights)
    176             with tf.name_scope('biases'):
    177                 nce_biases = tf.Variable(tf.zeros([opts.vocab_size]), name='biases')
    178                 tf.summary.histogram('histogram', nce_biases)
    179 
    180         with tf.name_scope('nce_loss'):
    181             loss = tf.reduce_mean(
    182                     tf.nn.nce_loss(weights=nce_weights,
    183                                    biases=nce_biases,
    184                                    labels=tf.reshape(target_words, shape=(-1, 1)),
    185                                    inputs=embed,
    186                                    num_sampled=opts.num_samples,
    187                                    num_classes=opts.vocab_size), name='loss')
    188             ema = tf.train.ExponentialMovingAverage(decay=0.99)
    189             update_loss_ema = ema.apply([loss])
    190             loss_ema = ema.average(loss)
    191             tf.summary.scalar('exp_moving_avg', loss_ema)
    192 
    193         with tf.name_scope('train'):
    194             global_step = tf.Variable(0, name='global_step', trainable=False)
    195             words_to_train = float(opts.words_per_epoch * opts.epochs_to_train)
    196             learning_rate = opts.starter_lr * tf.maximum(opts.target_lr, 1.0 - tf.cast(total_words_processed,
    197                                                                                        tf.float32) / words_to_train)
    198             tf.summary.scalar('learning_rate', learning_rate)
    199 
    200             train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss=loss, global_step=global_step)
    201 
    202         self._update_loss_ema = update_loss_ema
    203         self._loss_ema = loss_ema
    204         self._embeddings = embeddings
    205         self._learning_rate = learning_rate
    206         self._train_step = train_step
    207         self.global_step = global_step
    208         self._epoch = current_epoch
    209         self._words = total_words_processed
    210 
    211     def save_vocab(self):
    212         """Save the vocabulary to a file so the model can be reloaded."""
    213         opts = self._options
    214 
    215         vocab = []
    216         for i in range(opts.vocab_size):
    217             vocab.append([tf.compat.as_text(opts.vocab_words[i]).encode('utf-8'), opts.vocab_counts[i]])
    218         vocab = pd.DataFrame(vocab, columns=['word', 'count'])
    219         vocab.word = vocab.word.astype(str)
    220 
    221         with pd.HDFStore(opts.save_path / 'results.h5') as store:
    222             store.put('/'.join([opts.lang, 'vocab']), vocab, format='t')
    223 
    224         meta = vocab.word.str[1:].str.strip("'").iloc[:opts.words_to_project]
    225         meta.to_csv(opts.tensor_board_path / 'metadata.tsv', header=None, sep='\t', index=False)
    226 
    227     def build_eval_graph(self):
    228         """Build the evaluation graph."""
    229         # Eval graph
    230         opts = self._options
    231 
    232         # Each analogy task is to predict the 4th word (d) given three
    233         # words: a, b, c.  E.g., a=italy, b=rome, c=france, we should
    234         # predict d=paris.
    235         with tf.name_scope('eval'):
    236             # The eval feeds three vectors of word ids for a, b, c, each of
    237             # which is of size N, where N is the number of analogies we want to
    238             # evaluate in one batch.
    239             analogy_a = tf.placeholder(dtype=tf.int32, name='analogy_a')  # [N]
    240             analogy_b = tf.placeholder(dtype=tf.int32, name='analogy_b')  # [N]
    241             analogy_c = tf.placeholder(dtype=tf.int32, name='analogy_c')  # [N]
    242 
    243             # Normalized word embeddings of shape [vocab_size, emb_dim].
    244             nemb = tf.nn.l2_normalize(self._embeddings, 1)
    245 
    246             # Each row of a_emb, b_emb, c_emb is a word's embedding vector.
    247             # They all have the shape [N, emb_dim]
    248             a_emb = tf.gather(nemb, analogy_a, name='analogy_a_emb')  # a's embs
    249             b_emb = tf.gather(nemb, analogy_b, name='analogy_a_emb')  # b's embs
    250             c_emb = tf.gather(nemb, analogy_c, name='analogy_a_emb')  # c's embs
    251 
    252             # We expect that d's embedding vectors on the unit hyper-sphere is
    253             # near: c_emb + (b_emb - a_emb), which has the shape [N, emb_dim].
    254             target = c_emb + (b_emb - a_emb)
    255 
    256             # Compute cosine distance between each pair of target and vocab.
    257             # dist has shape [N, vocab_size].
    258             dist = tf.matmul(target, nemb, transpose_b=True, name='target_dist')
    259 
    260             # For each question (row in dist), find the top 4 words.
    261             _, pred_idx = tf.nn.top_k(dist, 4)
    262 
    263             # Nodes for computing neighbors for a given word according to
    264             # their cosine distance.
    265             nearby_word = tf.placeholder(dtype=tf.int32, name='nearby_word')  # word id
    266             nearby_emb = tf.gather(nemb, nearby_word, name='nearby_emb')
    267             nearby_dist = tf.matmul(nearby_emb, nemb, transpose_b=True, name='nearby_dist')
    268             nearby_val, nearby_idx = tf.nn.top_k(nearby_dist, min(1000, opts.vocab_size), name='k_nn')
    269 
    270         # Nodes in the construct graph which are used by training and
    271         # evaluation to run/feed/fetch.
    272         self._analogy_a = analogy_a
    273         self._analogy_b = analogy_b
    274         self._analogy_c = analogy_c
    275         self._analogy_pred_idx = pred_idx
    276         self._nearby_word = nearby_word
    277         self._nearby_val = nearby_val
    278         self._nearby_idx = nearby_idx
    279 
    280         self._merged = tf.summary.merge_all()
    281         self._train_writer = tf.summary.FileWriter(str(opts.tensor_board_path / 'train'), graph=self._session.graph)
    282         self._test_writer = tf.summary.FileWriter(str(opts.tensor_board_path / 'test'), graph=self._session.graph)
    283 
    284         # Properly initialize all variables.
    285         tf.global_variables_initializer().run()
    286 
    287         self.saver = tf.train.Saver()
    288 
    289     def _train_thread_body(self):
    290         initial_epoch, = self._session.run([self._epoch])
    291         while True:
    292             _, _, epoch = self._session.run([self._update_loss_ema, self._train_step, self._epoch])
    293             if epoch != initial_epoch:
    294                 break
    295 
    296     def train(self):
    297         """Train the model."""
    298         opts = self._options
    299 
    300         initial_epoch, initial_words = self._session.run([self._epoch, self._words])
    301 
    302         workers = []
    303         for _ in range(opts.concurrent_steps):
    304             t = threading.Thread(target=self._train_thread_body)
    305             t.start()
    306             workers.append(t)
    307 
    308         last_words, last_time, start = initial_words, time(), time()
    309         rates = []
    310         while True:
    311             sleep(2)  # Reports our progress once a while.
    312             epoch, step, words, loss_ema, summary = self._session.run(
    313                     [self._epoch, self.global_step, self._words, self._loss_ema, self._merged])
    314             self._train_writer.add_summary(summary=summary, global_step=step)
    315 
    316             now = time()
    317             last_words, last_time, rate = words, now, (words - last_words) / (now - last_time)
    318             rates.append(rate)
    319 
    320             h, m, s = time_diff(start)
    321             print(
    322                     '\r{}:{}:{}\tEpoch {:>2}\tStep {:>8,}\twords/sec: {:>8,.0f}\tloss: {:>8.4f}'.format(
    323                             h, m, s, epoch, step, sum(rates) / len(rates), loss_ema), end='')
    324             sys.stdout.flush()
    325 
    326             if epoch != initial_epoch:
    327                 break
    328 
    329         for t in workers:
    330             t.join()
    331 
    332     def _predict(self, analogy):
    333         """Predict the top 4 answers for analogy questions."""
    334         idx, = self._session.run([self._analogy_pred_idx], {
    335             self._analogy_a: analogy[:, 0],
    336             self._analogy_b: analogy[:, 1],
    337             self._analogy_c: analogy[:, 2]
    338         })
    339         return idx
    340 
    341     def eval(self):
    342         """Evaluate analogy questions and reports accuracy."""
    343 
    344         # How many questions we get right at precision@1.
    345         correct = 0
    346 
    347         try:
    348             total = self._analogy_questions.shape[0]
    349         except AttributeError:
    350             raise AttributeError('Need to read analogy questions.')
    351 
    352         start = 0
    353         while start < total:
    354             limit = start + 2500
    355             sub = self._analogy_questions[start:limit, :]
    356             idx = self._predict(sub)
    357             start = limit
    358             for question in range(sub.shape[0]):
    359                 for j in range(4):
    360                     if idx[question, j] == sub[question, 3]:
    361                         # Bingo! We predicted correctly. E.g., [italy, rome, france, paris].
    362                         correct += 1
    363                         break
    364                     elif idx[question, j] in sub[question, :3]:
    365                         # We need to skip words already in the question.
    366                         continue
    367                     else:
    368                         # The correct label is not the precision@1
    369                         break
    370         summary = tf.Summary(value=[tf.Summary.Value(tag='accuracies', simple_value=correct / total), ])
    371         step = self.global_step.eval(self._session)
    372         self._test_writer.add_summary(summary, global_step=step)
    373 
    374         self.accuracies.append(correct / total)
    375 
    376         print('\n\tEval {:4d}/{}\t\taccuracy = {:.1%}'.format(correct, total, correct / total))
    377 
    378     def analogy(self, w0, w1, w2):
    379         """Predict word w3 as in w0:w1 vs w2:w3."""
    380         wid = np.array([[self._word2id.get(w, 0) for w in [w0, w1, w2]]])
    381         idx = self._predict(wid)
    382         for c in [self._id2word[i] for i in idx[0, :]]:
    383             if c not in [w0, w1, w2]:
    384                 print(c)
    385                 break
    386         print('unknown')
    387 
    388     def nearby(self, words, num=20):
    389         """Prints out nearby words given a list of words."""
    390         ids = np.array([self._word2id.get(x, 0) for x in words])
    391         vals, idx = self._session.run(
    392                 [self._nearby_val, self._nearby_idx], {self._nearby_word: ids})
    393         for i in range(len(words)):
    394             print("\n%s\n=====================================" % (words[i]))
    395             for (neighbor, distance) in zip(idx[i, :num], vals[i, :num]):
    396                 print("%-20s %6.4f" % (self._id2word[neighbor], distance))
    397 
    398 
    399 def main(_):
    400     """Train a word2vec model."""
    401     opts = Options()
    402     with tf.Graph().as_default(), tf.Session() as session:
    403         with tf.device('/cpu:0'):
    404             model = Word2Vec(opts, session)
    405             model.read_analogies()  # Read analogy questions
    406         for e in range(opts.epochs_to_train):
    407             print()
    408             model.train()  # Process one epoch
    409             model.eval()  # Eval analogies.
    410 
    411 
    412         model.saver.save(session, opts.tensor_board_path / 'model.ckpt', global_step=model.global_step)
    413         final_embeddings = tf.nn.l2_normalize(model._embeddings, 1).eval(session=session)
    414 
    415         writer = tf.summary.FileWriter(str(opts.tensor_board_path), session.graph)
    416 
    417         embedding_var = tf.Variable(final_embeddings[:opts.words_to_project], name='embedding_result')
    418         session.run(embedding_var.initializer)
    419 
    420         config = projector.ProjectorConfig()
    421         embedded = config.embeddings.add()
    422         embedded.tensor_name = embedding_var.name
    423         embedded.metadata_path = str((opts.tensor_board_path / 'metadata.tsv').resolve())
    424         projector.visualize_embeddings(writer, config)
    425 
    426         tf.global_variables_initializer().run()
    427         saver_embed = tf.train.Saver([embedding_var])
    428         saver_embed.save(session, opts.tensor_board_path / 'skip-gram.ckpt', 1)
    429 
    430         params = dict(emb_dim=opts.emb_dim,
    431                       num_samples=opts.num_samples,
    432                       starter_lr=opts.starter_lr,
    433                       target_lr=opts.target_lr,
    434                       epochs_to_train=opts.epochs_to_train,
    435                       window_size=opts.window_size,
    436                       min_count=opts.min_count,
    437                       subsample=opts.subsample)
    438 
    439         with pd.HDFStore(opts.save_path / 'results.h5') as store:
    440             store.put('/'.join([opts.lang, 'embeddings']), pd.DataFrame(final_embeddings))
    441             store.put('/'.join([opts.lang, 'accuracies']), pd.Series(model.accuracies))
    442             store.put('/'.join([opts.lang, 'params']), pd.Series(params))
    443 
    444 
# Script entry point: tf.app.run() parses the command line flags and then calls main().
if __name__ == "__main__":
    tf.app.run()