ml-finance-python

Python scripts for machine learning in finance

git clone https://9o.is/git/ml-finance-python.git

word2vec.py

(8552B)


      1 #!/usr/bin/env python
      2 # -*- coding: utf-8 -*-
      3 __author__ = 'Stefan Jansen'
      4 
      5 from pathlib import Path
      6 from collections import Counter
      7 from argparse import ArgumentParser
      8 
      9 import pandas as pd
     10 import numpy as np
     11 from scipy.spatial.distance import cdist
     12 from keras.models import Model
     13 from keras.layers import Input, Dense, Reshape, Dot
     14 from keras.layers.embeddings import Embedding
     15 from keras.preprocessing.sequence import skipgrams
     16 from keras.preprocessing import sequence
     17 from keras.callbacks import Callback, TensorBoard
     18 
     19 np.random.seed(42)
     20 
     21 LANGUAGES = ['en', 'es']
     22 
     23 SOURCES = ['ted', 'euro']
     24 SOURCE_LABELS = ['Ted', 'Europarliament']
     25 source_dict = dict(zip(SOURCES, SOURCE_LABELS))
     26 
     27 parser = ArgumentParser(description='Run Keras word2vec model')
     28 parser.add_argument('-l', '--language', choices=LANGUAGES, help='language', default='en')
     29 parser.add_argument('-s', '--source', choices=SOURCES, help='data source', default='euro')
     30 parser.add_argument('-m', '--model', choices=[1, 2, 3], help='model', default=1)
     31 
     32 args = parser.parse_args()
     33 LANGUAGE = args.language
     34 MODEL = 'ngrams_{}'.format(args.model)
     35 SOURCE = source_dict[args.source]
     36 
     37 PROJECT_DIR = Path('/home/stefan/projects/odsc_2018/word2vec-translation')
     38 
     39 
     40 def get_vocab_stats():
     41     with pd.HDFStore(Path('vocab', SOURCE, 'vocab.h5').name) as store:
     42         df = store['{}/vocab'.format(LANGUAGE)]
     43 
     44     wc = df['count'].value_counts().sort_index(ascending=False).reset_index()
     45     wc.columns = ['word_count', 'freq']
     46     wc['n_words'] = wc.word_count.mul(wc.freq)
     47 
     48     wc['corpus_share'] = wc.n_words.div(wc.n_words.sum())
     49     wc['coverage'] = wc.corpus_share.cumsum()
     50     wc['vocab_size'] = wc.freq.cumsum()
     51     return wc
     52 
     53 
     54 # wc = get_vocab_stats()
     55 # print('# words: {:,d}'.format(wc.n_words.sum()))
     56 # print(wc.loc[:, ['word_count', 'freq', 'n_words', 'vocab_size', 'coverage']].tail(10))
     57 
     58 ### Model Settings
     59 MIN_FREQ = 5
     60 WINDOW_SIZE = 5
     61 EMBEDDING_SIZE = 300
     62 EPOCHS = 1
     63 BATCH_SIZE = 100
     64 
     65 PATH = Path('.', SOURCE, LANGUAGE, MODEL)
     66 TB_PATH = PATH / 'tensorboard'
     67 if not TB_PATH.exists():
     68     TB_PATH.mkdir(parents=True, exist_ok=True)
     69 
     70 VALID_SIZE = 15  # Random set of words to evaluate similarity on.
     71 VALID_WINDOW = 250  # Evaluation samples from most frequent words
     72 NN = 10  # Nearest neighbors for evaluation
     73 
     74 valid_examples = np.random.choice(VALID_WINDOW, VALID_SIZE, replace=False)
     75 
     76 
     77 def build_data(language, ngrams=1):
     78     path = PROJECT_DIR / 'vocab' / SOURCE / language / 'ngrams_{}.txt'.format(ngrams)
     79     words = path.read_text().split()
     80 
     81     token_counts = [t for t in Counter(words).most_common() if t[1] >= MIN_FREQ]
     82     tokens, counts = list(zip(*token_counts))
     83 
     84     id_to_token = pd.Series(tokens).to_dict()
     85     id_to_token.update({-1: 'UNK'})
     86     token_to_id = {t: i for i, t in id_to_token.items()}
     87     data = [token_to_id.get(word, -1) for word in words]
     88     return data, token_to_id, id_to_token
     89 
     90 
     91 data, token_to_id, id_to_token = build_data(LANGUAGE, ngrams=1)
     92 
     93 vocab_size = len(token_to_id) - 1
     94 
     95 
     96 def save_meta(d):
     97     s = pd.Series(d).value_counts().reset_index()
     98     s.columns = ['id', 'count']
     99     s['token'] = s.id.map(id_to_token)
    100     s[s.id >= 0].sort_values('id').token.dropna().to_csv(TB_PATH / 'meta.tsv', index=False)
    101 
    102 
    103 save_meta(data)
    104 
    105 
    106 # #### Process Analogies
    107 def get_analogies(lang):
    108     analogies = pd.read_csv(Path('..', 'data', 'analogies', 'analogies-{}.txt'.format(lang)),
    109                             header=None, names=['analogies'], squeeze=True)
    110     cats = analogies.apply(lambda x: x if x.startswith(':') else np.nan).ffill().str.strip(':').str.strip().to_frame(
    111             'cats')
    112     analogies = analogies[~analogies.str.startswith(':')].str.split(expand=True)
    113     analogies.columns = list('abcd')
    114     analogies = cats.merge(analogies, left_index=True, right_index=True)
    115     df['cats'], idx = pd.factorize(df.cats)
    116     return analogies, pd.Series(idx)
    117 
    118 
    119 analogies, categories = get_analogies('en')
    120 analogies_id = analogies.apply(lambda x: x.map(token_to_id))
    121 
    122 test_set = analogies_id.dropna().astype(int)
    123 a, b, c, actual = test_set.values.T
    124 actual = actual.reshape(-1, 1)
    125 n_analogies = len(actual)
    126 
    127 sampling_table = sequence.make_sampling_table(vocab_size)
    128 
    129 couples, labels = skipgrams(sequence=data,
    130                             vocabulary_size=vocab_size,
    131                             window_size=WINDOW_SIZE,
    132                             sampling_table=sampling_table,
    133                             negative_samples=1.0,
    134                             shuffle=True)
    135 
    136 target_word, context_word = np.array(couples, dtype=np.int32).T
    137 labels = np.array(labels, dtype=np.int8)
    138 del couples
    139 
    140 with pd.HDFStore(PATH / 'data.h5') as store:
    141     store.put('id_to_token', pd.Series(id_to_token))
    142     store.put('analogies', test_set)
    143 
    144 
    145 def model_graph():
    146     #### Scalar Input Variables
    147     input_target = Input((1,), name='target_input')
    148     input_context = Input((1,), name='context_input')
    149 
    150     #### Shared Embedding Layer
    151     embedding = Embedding(input_dim=vocab_size,
    152                           output_dim=EMBEDDING_SIZE,
    153                           input_length=1,
    154                           name='embedding_layer')
    155 
    156     #### Select Embedding Vectors
    157     target = embedding(input_target)
    158     target = Reshape((EMBEDDING_SIZE, 1), name='target_embedding')(target)
    159 
    160     context = embedding(input_context)
    161     context = Reshape((EMBEDDING_SIZE, 1), name='context_embedding')(context)
    162 
    163     #### Compute Similarity (not normalized)
    164     dot_product = Dot(axes=1)([target, context])
    165     dot_product = Reshape((1,), name='similarity')(dot_product)
    166 
    167     #### Sigmoid Output Layer
    168     output = Dense(units=1, activation='sigmoid', name='output')(dot_product)
    169 
    170     # #### Training Model
    171     model = Model(inputs=[input_target, input_context], outputs=output)
    172 
    173     # Validation Model (Cosine Similarity)
    174     similarity = Dot(normalize=True,
    175                      axes=1,
    176                      name='cosine_similarity')([target, context])
    177     valid_model = Model(inputs=[input_target, input_context], outputs=similarity)
    178 
    179     return model, valid_model, embedding
    180 
    181 
    182 train_model, valid_model, embedding = model_graph()
    183 train_model.compile(loss='binary_crossentropy', optimizer='rmsprop')
    184 
    185 print(train_model.summary())
    186 print(valid_model.summary())
    187 
    188 
    189 #### Evaluation: Nearest Neighors & Analogies
    190 class EvalCallback(Callback):
    191     def on_train_begin(self, logs=None):
    192         print('\n\t{} nearest neighbors:'.format(NN))
    193         for i in range(VALID_SIZE):
    194             valid_word = id_to_token[valid_examples[i]]
    195             sim = self._get_similiarity(valid_examples[i]).reshape(-1)
    196             nearest = (-sim).argsort()[1:NN + 1]
    197             neighbors = [id_to_token[nearest[n]] for n in range(NN)]
    198             print('\t\t{}: {}'.format(valid_word, ', '.join(neighbors)))
    199 
    200     def on_train_end(self, logs=None):
    201         print('\n\t{} nearest neighbors:'.format(NN))
    202         for i in range(VALID_SIZE):
    203             valid_word = id_to_token[valid_examples[i]]
    204             sim = self._get_similiarity(valid_examples[i]).reshape(-1)
    205             nearest = (-sim).argsort()[1:NN + 1]
    206             neighbors = [id_to_token[nearest[n]] for n in range(NN)]
    207             print('\t\t{}: {}'.format(valid_word, ', '.join(neighbors)))
    208 
    209     def on_epoch_end(self, eppch, logs=None):
    210         print('\n\tAnalogy Accuracy:\n\t\t', end='')
    211         print(self.test_analogies())
    212 
    213     @staticmethod
    214     def test_analogies():
    215         embeddings = embedding.get_weights()[0]
    216         target = embeddings[c] + embeddings[b] - embeddings[a]
    217         neighbors = np.argsort(cdist(target, embeddings, metric='cosine'))
    218         match_id = np.argwhere(neighbors == actual)[:, 1]
    219         return '\n\t\t'.join(['Top {}: {:.2%}'.format(i, (match_id < i).sum() / n_analogies) for i in [1, 5, 10]])
    220 
    221     @staticmethod
    222     def _get_similiarity(valid_word_idx):
    223         target = np.full(shape=vocab_size, fill_value=valid_word_idx)
    224         context = np.arange(vocab_size)
    225         return valid_model.predict([target, context])
    226 
    227 
    228 evaluation = EvalCallback()
    229 
    230 # ##### Tensorboard Callback
    231 tensorboard = TensorBoard(log_dir=str(TB_PATH), histogram_freq=0,
    232                           batch_size=32, write_graph=True,
    233                           embeddings_freq=100,
    234                           embeddings_metadata=str((TB_PATH / 'meta.tsv').resolve()))
    235 
    236 loss = train_model.fit(x=[target_word, context_word], y=labels,
    237                        shuffle=True,
    238                        batch_size=BATCH_SIZE, epochs=EPOCHS,
    239                        callbacks=[evaluation, tensorboard])
    240 
    241 train_model.save(str(PATH / 'skipgram_model.h5'))