ml-finance-python
Python scripts for machine learning in finance
git clone https://9o.is/git/ml-finance-python.git
word2vec-sentiments.py
(3308B)
1 # gensim modules
2 from gensim import utils
3 from gensim.models.doc2vec import TaggedDocument
4 from gensim.models import Doc2Vec
5
6 # random
7 import random
8
9 # numpy
10 import numpy
11
12 # classifier
13 from sklearn.linear_model import LogisticRegression
14
15 import logging
16 import sys
17
# Logging setup: the root logger emits INFO and above to stdout, each record
# rendered as "<time> - <logger name> - <level> - <message>".
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Stream handler bound to stdout, carrying the formatter above.
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)

# getLogger() with no name returns the root logger.
log = logging.getLogger()
log.setLevel(logging.INFO)
log.addHandler(ch)
26
class TaggedLineSentence(object):
    """Corpus of tagged documents read line-by-line from several text files.

    Every line of every source file becomes one ``TaggedDocument`` whose
    words are the whitespace-separated tokens of the line and whose single
    tag is ``'<prefix>_<line index>'``; the line index restarts at 0 for
    each file.
    """

    def __init__(self, sources):
        """Remember *sources* and check that no tag prefix is reused.

        Args:
            sources: Mapping of file path -> tag prefix.

        Raises:
            ValueError: If two files map to the same prefix, since their
                documents would then receive colliding tags. ``ValueError``
                is a subclass of ``Exception``, so callers catching
                ``Exception`` are unaffected.
        """
        self.sources = sources
        # Filled in by to_array(); None means the files were not read yet.
        self.sentences = None

        seen_prefixes = set()
        for prefix in sources.values():
            if prefix in seen_prefixes:
                raise ValueError('Non-unique prefix encountered')
            seen_prefixes.add(prefix)

    def __iter__(self):
        """Yield one ``TaggedDocument`` per line, streaming from the files."""
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield TaggedDocument(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])

    def to_array(self):
        """Read every source file into memory.

        Returns:
            The list of all tagged documents, also stored on
            ``self.sentences``. Each call re-reads the files.
        """
        # Same documents, in the same order, as iterating over the instance.
        self.sentences = list(self)
        return self.sentences

    def sentences_perm(self):
        """Return a freshly shuffled copy of the loaded documents.

        The files are loaded on first use if ``to_array()`` has not been
        called yet. ``self.sentences`` itself is never reordered.
        """
        if self.sentences is None:
            self.to_array()
        shuffled = list(self.sentences)
        random.shuffle(shuffled)
        return shuffled
58
59
# Dimensionality of the document vectors; used both for the model and for
# the feature matrices handed to the classifier.
VECTOR_SIZE = 100
# Number of documents looked up per class (positive / negative) per split.
DOCS_PER_CLASS = 12500
# Number of shuffled training passes over the corpus.
EPOCHS = 10


def _labelled_doc_vectors(model, pos_prefix, neg_prefix, per_class, dim):
    """Build the feature matrix and label vector for one data split.

    Rows ``[0, per_class)`` hold the vectors tagged ``pos_prefix + str(i)``
    and are labelled 1; rows ``[per_class, 2 * per_class)`` hold the vectors
    tagged ``neg_prefix + str(i)`` and are labelled 0.

    Returns:
        Tuple ``(arrays, labels)`` of numpy arrays with ``2 * per_class``
        rows each.
    """
    arrays = numpy.zeros((2 * per_class, dim))
    labels = numpy.zeros(2 * per_class)
    for i in range(per_class):
        arrays[i] = model.docvecs[pos_prefix + str(i)]
        arrays[per_class + i] = model.docvecs[neg_prefix + str(i)]
    labels[:per_class] = 1
    return arrays, labels


log.info('source load')
# File path -> tag prefix. Note that the test files are part of the corpus
# the Doc2Vec model is trained on; their vectors are looked up below.
sources = {
    'test-neg.txt': 'TEST_NEG',
    'test-pos.txt': 'TEST_POS',
    'train-neg.txt': 'TRAIN_NEG',
    'train-pos.txt': 'TRAIN_POS',
    'train-unsup.txt': 'TRAIN_UNS',
}

log.info('TaggedDocument')
sentences = TaggedLineSentence(sources)

log.info('D2V')
# NOTE(review): `size=`, `model.docvecs` and `train()` without
# `total_examples`/`epochs` look like an older gensim API -- confirm the
# gensim version this script is pinned to before upgrading.
model = Doc2Vec(min_count=1, window=10, size=VECTOR_SIZE, sample=1e-4, negative=5, workers=7)
model.build_vocab(sentences.to_array())

log.info('Epoch')
for epoch in range(EPOCHS):
    log.info('EPOCH: {}'.format(epoch))
    # A fresh shuffle of the documents for every pass.
    model.train(sentences.sentences_perm())

log.info('Model Save')
model.save('./imdb.d2v')
# Continue with the model as reloaded from disk.
model = Doc2Vec.load('./imdb.d2v')

log.info('Sentiment')
train_arrays, train_labels = _labelled_doc_vectors(
    model, 'TRAIN_POS_', 'TRAIN_NEG_', DOCS_PER_CLASS, VECTOR_SIZE)

log.info(train_labels)

test_arrays, test_labels = _labelled_doc_vectors(
    model, 'TEST_POS_', 'TEST_NEG_', DOCS_PER_CLASS, VECTOR_SIZE)

log.info('Fitting')
classifier = LogisticRegression()
classifier.fit(train_arrays, train_labels)

# Mean accuracy of the classifier on the test split.
log.info(classifier.score(test_arrays, test_labels))