ml-finance-python
python scripts for finance machine learning
git clone https://9o.is/git/ml-finance-python.git
ml_algo.py
(17465B)
1 # https://www.quantopian.com/posts/machine-learning-alpha-with-risk-constraints
2 # https://www.quantopian.com/posts/machine-learning-on-quantopian-part-3-building-an-algorithm?utm_campaign=machine-learning-on-quantopian-part-3-building-an-algorithm&utm_medium=email&utm_source=forums
3 from collections import OrderedDict
4 from time import time
5
6 import pandas as pd
7 import numpy as np
8 from sklearn import ensemble, preprocessing, metrics, linear_model
9
10 from quantopian.algorithm import (
11 attach_pipeline,
12 date_rules,
13 order_optimal_portfolio,
14 pipeline_output,
15 record,
16 schedule_function,
17 set_commission,
18 set_slippage,
19 time_rules,
20 )
21 import quantopian.optimize as opt
22 from quantopian.pipeline import Pipeline
23 from quantopian.pipeline.classifiers.fundamentals import Sector as _Sector
24 from quantopian.pipeline.data import Fundamentals
25 from quantopian.pipeline.data.builtin import USEquityPricing
26 from quantopian.pipeline.factors import (
27 CustomFactor,
28 Returns,
29 MACDSignal,
30 )
31 from quantopian.pipeline.filters import QTradableStocksUS
32 from quantopian.pipeline.experimental import risk_loading_pipeline
33 from zipline.utils.numpy_utils import (
34 repeat_first_axis,
35 repeat_last_axis,
36 )
37
38 # If you have eventvestor, it's a good idea to screen out aquisition targets
39 # Comment out & ~IsAnnouncedAcqTarget() as well. You can also run this over
40 # the free period.
41 # from quantopian.pipeline.filters.eventvestor import IsAnnouncedAcqTarget
42
43 # Will be split 50% long and 50% short
44 N_STOCKS_TO_TRADE = 500
45
46 # Number of days to train the classifier on, easy to run out of memory here
47 ML_TRAINING_WINDOW = 252
48
49 # train on returns over N days into the future
50 PRED_N_FORWARD_DAYS = 5
51
52 # How often to trade, for daily, set to date_rules.every_day()
53 TRADE_FREQ = date_rules.week_start(days_offset=1) #date_rules.every_day()
54
55 class Sector(_Sector):
56 window_safe = True
57
58
59 class MeanReversion1M(CustomFactor):
60 inputs = (Returns(window_length=21),)
61 window_length = 252
62
63 def compute(self, today, assets, out, monthly_rets):
64 np.divide(
65 monthly_rets[-1] - np.nanmean(monthly_rets, axis=0),
66 np.nanstd(monthly_rets, axis=0),
67 out=out,
68 )
69
70
71 class MoneyflowVolume5d(CustomFactor):
72 inputs = (USEquityPricing.close, USEquityPricing.volume)
73
74 # we need one more day to get the direction of the price on the first
75 # day of our desired window of 5 days
76 window_length = 6
77
78 def compute(self, today, assets, out, close_extra, volume_extra):
79 # slice off the extra row used to get the direction of the close
80 # on the first day
81 close = close_extra[1:]
82 volume = volume_extra[1:]
83
84 dollar_volume = close * volume
85 denominator = dollar_volume.sum(axis=0)
86
87 difference = np.diff(close_extra, axis=0)
88 direction = np.where(difference > 0, 1, -1)
89 numerator = (direction * dollar_volume).sum(axis=0)
90
91 np.divide(numerator, denominator, out=out)
92
93
94 class PriceOscillator(CustomFactor):
95 inputs = (USEquityPricing.close,)
96 window_length = 252
97
98 def compute(self, today, assets, out, close):
99 four_week_period = close[-20:]
100 np.divide(
101 np.nanmean(four_week_period, axis=0),
102 np.nanmean(close, axis=0),
103 out=out,
104 )
105 out -= 1
106
107
108 class Trendline(CustomFactor):
109 inputs = [USEquityPricing.close]
110 window_length = 252
111
112 _x = np.arange(window_length)
113 _x_var = np.var(_x)
114
115 def compute(self, today, assets, out, close):
116 x_matrix = repeat_last_axis(
117 (self.window_length - 1) / 2 - self._x,
118 len(assets),
119 )
120
121 y_bar = np.nanmean(close, axis=0)
122 y_bars = repeat_first_axis(y_bar, self.window_length)
123 y_matrix = close - y_bars
124
125 np.divide(
126 (x_matrix * y_matrix).sum(axis=0) / self._x_var,
127 self.window_length,
128 out=out,
129 )
130
131
132 class Volatility3M(CustomFactor):
133 inputs = [Returns(window_length=2)]
134 window_length = 63
135
136 def compute(self, today, assets, out, rets):
137 np.nanstd(rets, axis=0, out=out)
138
139
140 class AdvancedMomentum(CustomFactor):
141 inputs = (USEquityPricing.close, Returns(window_length=126))
142 window_length = 252
143
144 def compute(self, today, assets, out, prices, returns):
145 np.divide(
146 (
147 (prices[-21] - prices[-252]) / prices[-252] -
148 prices[-1] - prices[-21]
149 ) / prices[-21],
150 np.nanstd(returns, axis=0),
151 out=out,
152 )
153
154
155 asset_growth_3m = Returns(
156 inputs=[Fundamentals.total_assets],
157 window_length=63,
158 )
159 asset_to_equity_ratio = (
160 Fundamentals.total_assets.latest /
161 Fundamentals.common_stock_equity.latest
162 )
163 capex_to_cashflows = (
164 Fundamentals.capital_expenditure.latest /
165 Fundamentals.free_cash_flow.latest
166 )
167
168 ebitda_yield = (
169 (Fundamentals.ebitda.latest * 4) /
170 USEquityPricing.close.latest
171 )
172 ebita_to_assets = (
173 (Fundamentals.ebit.latest * 4) /
174 Fundamentals.total_assets.latest
175 )
176 return_on_total_invest_capital = Fundamentals.roic.latest
177 mean_reversion_1m = MeanReversion1M()
178 macd_signal_10d = MACDSignal(
179 fast_period=12,
180 slow_period=26,
181 signal_period=10,
182 )
183 moneyflow_volume_5d = MoneyflowVolume5d()
184 net_income_margin = Fundamentals.net_margin.latest
185 operating_cashflows_to_assets = (
186 (Fundamentals.operating_cash_flow.latest * 4) /
187 Fundamentals.total_assets.latest
188 )
189 price_momentum_3m = Returns(window_length=63)
190 price_oscillator = PriceOscillator()
191 trendline = Trendline()
192 returns_39w = Returns(window_length=215)
193 volatility_3m = Volatility3M()
194 advanced_momentum = AdvancedMomentum()
195
196
197 features = {
198 'Asset Growth 3M': asset_growth_3m,
199 'Asset to Equity Ratio': asset_to_equity_ratio,
200 'Capex to Cashflows': capex_to_cashflows,
201 'EBIT to Assets': ebita_to_assets,
202 'EBITDA Yield': ebitda_yield,
203 'MACD Signal Line': macd_signal_10d,
204 'Mean Reversion 1M': mean_reversion_1m,
205 'Moneyflow Volume 5D': moneyflow_volume_5d,
206 'Net Income Margin': net_income_margin,
207 'Operating Cashflows to Assets': operating_cashflows_to_assets,
208 'Price Momentum 3M': price_momentum_3m,
209 'Price Oscillator': price_oscillator,
210 'Return on Invest Capital': return_on_total_invest_capital,
211 '39 Week Returns': returns_39w,
212 'Trendline': trendline,
213 'Volatility 3m': volatility_3m,
214 'Advanced Momentum': advanced_momentum,
215 }
216
217
218 def shift_mask_data(features,
219 labels,
220 n_forward_days,
221 lower_percentile,
222 upper_percentile):
223 """Align features to the labels ``n_forward_days`` into the future and
224 return the discrete, flattened features and masked labels.
225
226 Parameters
227 ----------
228 features : np.ndarray
229 A 3d array of (days, assets, feature).
230 labels : np.ndarray
231 The labels to predict.
232 n_forward_days : int
233 How many days into the future are we predicting?
234 lower_percentile : float
235 The lower percentile in the range [0, 100].
236 upper_percentile : float
237 The upper percentile in the range [0, 100].
238
239 Returns
240 -------
241 selected_features : np.ndarray
242 The flattened features that are not masked out.
243 selected_labels : np.ndarray
244 The labels that are not masked out.
245 """
246
247 # Slice off rolled elements
248 shift_by = n_forward_days + 1
249 aligned_features = features[:-shift_by]
250 aligned_labels = labels[shift_by:]
251
252 cutoffs = np.nanpercentile(
253 aligned_labels,
254 [lower_percentile, upper_percentile],
255 axis=1,
256 )
257 discrete_labels = np.select(
258 [
259 aligned_labels <= cutoffs[0, :, np.newaxis],
260 aligned_labels >= cutoffs[1, :, np.newaxis],
261 ],
262 [-1, 1],
263 )
264
265 # flatten the features per day
266 flattened_features = aligned_features.reshape(
267 -1,
268 aligned_features.shape[-1],
269 )
270
271 # Drop stocks that did not move much, meaning they are in between
272 # ``lower_percentile`` and ``upper_percentile``.
273 mask = discrete_labels != 0
274
275 selected_features = flattened_features[mask.ravel()]
276 selected_labels = discrete_labels[mask]
277
278 return selected_features, selected_labels
279
280
281 class ML(CustomFactor):
282 """
283 """
284 train_on_weekday = 1
285
286 def __init__(self, *args, **kwargs):
287 CustomFactor.__init__(self, *args, **kwargs)
288
289 self._imputer = preprocessing.Imputer()
290 self._scaler = preprocessing.MinMaxScaler()
291 self._classifier = linear_model.SGDClassifier(penalty='elasticnet')
292 self.trained = False
293 #ensemble.AdaBoostClassifier(
294 # random_state=1337,
295 # n_estimators=50,
296 #)
297
298 def _compute(self, *args, **kwargs):
299 ret = CustomFactor._compute(self, *args, **kwargs)
300
301 # reset the day counter so that we will begin training at the start of
302 # the next _compute call
303 self._day_counter = -1
304
305 return ret
306
307 def _train_model(self, today, returns, inputs):
308 log.info('training model for window starting on: {}', today)
309
310 imputer = self._imputer
311 scaler = self._scaler
312 classifier = self._classifier
313
314 features, labels = shift_mask_data(
315 np.dstack(inputs),
316 returns,
317 n_forward_days=PRED_N_FORWARD_DAYS,
318 lower_percentile=30,
319 upper_percentile=70,
320 )
321 features = scaler.fit_transform(imputer.fit_transform(features))
322
323 start = time()
324 classifier.fit(features, labels)
325 log.info('training took {} secs', time() - start)
326 self.trained = True
327
328 def _maybe_train_model(self, today, returns, inputs):
329 if (today.weekday() == self.train_on_weekday) or not self.trained:
330 self._train_model(today, returns, inputs)
331
332 def compute(self, today, assets, out, returns, *inputs):
333 # inputs is a list of factors, for example, assume we have 2 alpha
334 # signals, 3 stocks, and a lookback of 2 days. Each element in the
335 # inputs list will be data of one signal, so len(inputs) == 2. Then
336 # each element will contain a 2-D array of shape [time x stocks]. For
337 # example:
338 # inputs[0]:
339 # [[1, 3, 2], # factor 1 rankings of day t-1 for 3 stocks
340 # [3, 2, 1]] # factor 1 rankings of day t for 3 stocks
341 # inputs[1]:
342 # [[2, 3, 1], # factor 2 rankings of day t-1 for 3 stocks
343 # [1, 2, 3]] # factor 2 rankings of day t for 3 stocks
344 self._maybe_train_model(today, returns, inputs)
345
346 # Predict
347 # Get most recent factor values (inputs always has the full history)
348 last_factor_values = np.vstack([input_[-1] for input_ in inputs]).T
349 last_factor_values = self._imputer.transform(last_factor_values)
350 last_factor_values = self._scaler.transform(last_factor_values)
351
352 # Predict the probability for each stock going up
353 # (column 2 of the output of .predict_proba()) and
354 # return it via assignment to out.
355 #out[:] = self._classifier.predict_proba(last_factor_values)[:, 1]
356 out[:] = self._classifier.predict(last_factor_values)
357
358
359 def make_ml_pipeline(universe, window_length=21, n_forward_days=5):
360 pipeline_columns = OrderedDict()
361
362 # ensure that returns is the first input
363 pipeline_columns['Returns'] = Returns(
364 inputs=(USEquityPricing.open,),
365 mask=universe, window_length=n_forward_days + 1,
366 )
367
368 # rank all the factors and put them after returns
369 pipeline_columns.update({
370 k: v.rank(mask=universe) for k, v in features.items()
371 })
372
373 # Create our ML pipeline factor. The window_length will control how much
374 # lookback the passed in data will have.
375 pipeline_columns['ML'] = ML(
376 inputs=pipeline_columns.values(),
377 window_length=window_length + 1,
378 mask=universe,
379 )
380
381 pipeline_columns['Sector'] = Sector()
382
383 return Pipeline(screen=universe, columns=pipeline_columns)
384
385
386 def initialize(context):
387 """
388 Called once at the start of the algorithm.
389 """
390 set_slippage(slippage.FixedSlippage(spread=0.00))
391 set_commission(commission.PerShare(cost=0, min_trade_cost=0))
392
393 schedule_function(
394 rebalance,
395 TRADE_FREQ,
396 time_rules.market_open(minutes=1),
397 )
398
399 # Record tracking variables at the end of each day.
400 schedule_function(
401 record_vars,
402 date_rules.every_day(),
403 time_rules.market_close(),
404 )
405
406 # Set up universe, alphas and ML pipline
407 context.universe = QTradableStocksUS()
408 # if you are using IsAnnouncedAcqTarget, uncomment the next line
409 # context.universe &= IsAnnouncedAcqTarget()
410
411 ml_pipeline = make_ml_pipeline(
412 context.universe,
413 n_forward_days=PRED_N_FORWARD_DAYS,
414 window_length=ML_TRAINING_WINDOW,
415 )
416 # Create our dynamic stock selector.
417 attach_pipeline(ml_pipeline, 'alpha_model')
418 # Add the risk pipeline
419 attach_pipeline(risk_loading_pipeline(), 'risk_factors')
420
421 context.past_predictions = {}
422 context.hold_out_accuracy = 0
423 context.hold_out_log_loss = 0
424 context.hold_out_returns_spread_bps = 0
425
426
427 def evaluate_and_shift_hold_out(output, context):
428 # Look at past predictions to evaluate classifier accuracy on hold-out data
429 # A day has passed, shift days and drop old ones
430 context.past_predictions = {
431 k - 1: v
432 for k, v in context.past_predictions.iteritems()
433 if k > 0
434 }
435
436 if 0 in context.past_predictions:
437 # Past predictions for the current day exist, so we can use todays'
438 # n-back returns to evaluate them
439 raw_returns = output['Returns']
440 raw_predictions = context.past_predictions[0]
441
442 # Join to match up equities
443 returns, predictions = raw_returns.align(raw_predictions, join='inner')
444
445 # Binarize returns
446 returns_binary = returns > returns.median()
447 predictions_binary = predictions > 0.5
448
449 # Compute performance metrics
450 context.hold_out_accuracy = metrics.accuracy_score(
451 returns_binary.values,
452 predictions_binary.values,
453 )
454 context.hold_out_log_loss = metrics.log_loss(
455 returns_binary.values,
456 predictions.values,
457 )
458 long_rets = returns[predictions_binary == 1].mean()
459 short_rets = returns[predictions_binary == 0].mean()
460 context.hold_out_returns_spread_bps = (long_rets - short_rets) * 10000
461
462 # Store current predictions
463 context.past_predictions[PRED_N_FORWARD_DAYS] = context.predicted_probs
464
465
466 def before_trading_start(context, data):
467 """
468 Called every day before market open.
469 """
470 output = pipeline_output('alpha_model')
471 context.predicted_probs = output['ML']
472 context.predicted_probs.index.rename(['date', 'equity'], inplace=True)
473
474 context.risk_loadings = pipeline_output('risk_factors')
475
476 evaluate_and_shift_hold_out(output, context)
477
478 # These are the securities that we are interested in trading each day.
479 context.security_list = context.predicted_probs.index
480
481
482 def rebalance(context, data):
483 """
484 Execute orders according to our schedule_function() timing.
485 """
486
487 predictions = context.predicted_probs
488
489 # Filter out stocks that can not be traded
490 predictions = predictions.loc[data.can_trade(predictions.index)]
491 # Select top and bottom N stocks
492 n_long_short = min(N_STOCKS_TO_TRADE // 2, len(predictions) // 2)
493 predictions_top_bottom = pd.concat([
494 predictions.nlargest(n_long_short),
495 predictions.nsmallest(n_long_short),
496 ])
497
498 # If classifier predicts many identical values, the top might contain
499 # duplicate stocks
500 predictions_top_bottom = predictions_top_bottom.iloc[
501 ~predictions_top_bottom.index.duplicated()
502 ]
503
504 # predictions are probabilities ranging from 0 to 1
505 predictions_top_bottom = (predictions_top_bottom - 0.5) * 2
506
507 # pull in the risk factor loadings
508 risk_loadings = context.risk_loadings
509
510 # Setup Optimization Objective
511 # Factor-weighted portfolio
512 objective = opt.TargetWeights(predictions_top_bottom)
513
514 # Setup Optimization Constraints
515 constrain_gross_leverage = opt.MaxGrossExposure(1.0)
516 constrain_pos_size = opt.PositionConcentration.with_equal_bounds(
517 -0.02,
518 +0.02,
519 )
520 market_neutral = opt.DollarNeutral()
521
522 if predictions_top_bottom.index.duplicated().any():
523 log.debug(predictions_top_bottom.head())
524
525 risk_neutral = opt.experimental.RiskModelExposure(
526 risk_model_loadings=risk_loadings
527 )
528
529 # Run the optimization. This will calculate new portfolio weights and
530 # manage moving our portfolio toward the target.
531 order_optimal_portfolio(
532 objective=objective,
533 constraints=[
534 constrain_gross_leverage,
535 constrain_pos_size,
536 market_neutral,
537 risk_neutral
538 ],
539 )
540
541
542 def record_vars(context, data):
543 """
544 Plot variables at the end of each day.
545 """
546 record(
547 leverage=context.account.leverage,
548 hold_out_accuracy=context.hold_out_accuracy,
549 hold_out_log_loss=context.hold_out_log_loss,
550 hold_out_returns_spread_bps=context.hold_out_returns_spread_bps,
551 )
552
553
554 def handle_data(context, data):
555 pass