ml-finance-python

python scripts for finance machine learning
git clone https://9o.is/git/ml-finance-python.git
ml_algo.py

(17465B)
      1 # https://www.quantopian.com/posts/machine-learning-alpha-with-risk-constraints
      2 # https://www.quantopian.com/posts/machine-learning-on-quantopian-part-3-building-an-algorithm?utm_campaign=machine-learning-on-quantopian-part-3-building-an-algorithm&utm_medium=email&utm_source=forums
      3 from collections import OrderedDict
      4 from time import time
      5 
      6 import pandas as pd
      7 import numpy as np
      8 from sklearn import ensemble, preprocessing, metrics, linear_model
      9 
     10 from quantopian.algorithm import (
     11     attach_pipeline,
     12     date_rules,
     13     order_optimal_portfolio,
     14     pipeline_output,
     15     record,
     16     schedule_function,
     17     set_commission,
     18     set_slippage,
     19     time_rules,
     20 )
     21 import quantopian.optimize as opt
     22 from quantopian.pipeline import Pipeline
     23 from quantopian.pipeline.classifiers.fundamentals import Sector as _Sector
     24 from quantopian.pipeline.data import Fundamentals
     25 from quantopian.pipeline.data.builtin import USEquityPricing
     26 from quantopian.pipeline.factors import (
     27     CustomFactor,
     28     Returns,
     29     MACDSignal,
     30 )
     31 from quantopian.pipeline.filters import QTradableStocksUS
     32 from quantopian.pipeline.experimental import risk_loading_pipeline
     33 from zipline.utils.numpy_utils import (
     34     repeat_first_axis,
     35     repeat_last_axis,
     36 )
     37 
     38 # If you have eventvestor, it's a good idea to screen out aquisition targets
     39 # Comment out & ~IsAnnouncedAcqTarget() as well. You can also run this over
     40 # the free period.
     41 # from quantopian.pipeline.filters.eventvestor import IsAnnouncedAcqTarget
     42 
     43 # Will be split 50% long and 50% short
     44 N_STOCKS_TO_TRADE = 500
     45 
     46 # Number of days to train the classifier on, easy to run out of memory here
     47 ML_TRAINING_WINDOW = 252
     48 
     49 # train on returns over N days into the future
     50 PRED_N_FORWARD_DAYS = 5
     51 
     52 # How often to trade, for daily, set to date_rules.every_day()
     53 TRADE_FREQ = date_rules.week_start(days_offset=1) #date_rules.every_day()
     54 
     55 class Sector(_Sector):
     56     window_safe = True
     57 
     58 
     59 class MeanReversion1M(CustomFactor):
     60     inputs = (Returns(window_length=21),)
     61     window_length = 252
     62 
     63     def compute(self, today, assets, out, monthly_rets):
     64         np.divide(
     65             monthly_rets[-1] - np.nanmean(monthly_rets, axis=0),
     66             np.nanstd(monthly_rets, axis=0),
     67             out=out,
     68         )
     69 
     70 
     71 class MoneyflowVolume5d(CustomFactor):
     72     inputs = (USEquityPricing.close, USEquityPricing.volume)
     73 
     74     # we need one more day to get the direction of the price on the first
     75     # day of our desired window of 5 days
     76     window_length = 6
     77 
     78     def compute(self, today, assets, out, close_extra, volume_extra):
     79         # slice off the extra row used to get the direction of the close
     80         # on the first day
     81         close = close_extra[1:]
     82         volume = volume_extra[1:]
     83 
     84         dollar_volume = close * volume
     85         denominator = dollar_volume.sum(axis=0)
     86 
     87         difference = np.diff(close_extra, axis=0)
     88         direction = np.where(difference > 0, 1, -1)
     89         numerator = (direction * dollar_volume).sum(axis=0)
     90 
     91         np.divide(numerator, denominator, out=out)
     92 
     93 
     94 class PriceOscillator(CustomFactor):
     95     inputs = (USEquityPricing.close,)
     96     window_length = 252
     97     
     98     def compute(self, today, assets, out, close):
     99         four_week_period = close[-20:]
    100         np.divide(
    101             np.nanmean(four_week_period, axis=0),
    102             np.nanmean(close, axis=0),
    103             out=out,
    104         )
    105         out -= 1
    106 
    107 
    108 class Trendline(CustomFactor):
    109     inputs = [USEquityPricing.close]
    110     window_length = 252
    111 
    112     _x = np.arange(window_length)
    113     _x_var = np.var(_x)
    114 
    115     def compute(self, today, assets, out, close):
    116         x_matrix = repeat_last_axis(
    117             (self.window_length - 1) / 2 - self._x,
    118             len(assets),
    119         )
    120 
    121         y_bar = np.nanmean(close, axis=0)
    122         y_bars = repeat_first_axis(y_bar, self.window_length)
    123         y_matrix = close - y_bars
    124 
    125         np.divide(
    126             (x_matrix * y_matrix).sum(axis=0) / self._x_var,
    127             self.window_length,
    128             out=out,
    129         )
    130 
    131 
    132 class Volatility3M(CustomFactor):
    133     inputs = [Returns(window_length=2)]
    134     window_length = 63
    135 
    136     def compute(self, today, assets, out, rets):
    137         np.nanstd(rets, axis=0, out=out)
    138 
    139 
    140 class AdvancedMomentum(CustomFactor):
    141     inputs = (USEquityPricing.close, Returns(window_length=126))
    142     window_length = 252
    143 
    144     def compute(self, today, assets, out, prices, returns):
    145         np.divide(
    146             (
    147                 (prices[-21] - prices[-252]) / prices[-252] -
    148                 prices[-1] - prices[-21]
    149             ) / prices[-21],
    150             np.nanstd(returns, axis=0),
    151             out=out,
    152         )
    153 
    154 
    155 asset_growth_3m = Returns(
    156     inputs=[Fundamentals.total_assets],
    157     window_length=63,
    158 )
    159 asset_to_equity_ratio = (
    160     Fundamentals.total_assets.latest /
    161     Fundamentals.common_stock_equity.latest
    162 )
    163 capex_to_cashflows = (
    164     Fundamentals.capital_expenditure.latest /
    165     Fundamentals.free_cash_flow.latest
    166 )
    167 
    168 ebitda_yield = (
    169     (Fundamentals.ebitda.latest * 4) /
    170     USEquityPricing.close.latest
    171 )
    172 ebita_to_assets = (
    173     (Fundamentals.ebit.latest * 4) /
    174     Fundamentals.total_assets.latest
    175 )
    176 return_on_total_invest_capital = Fundamentals.roic.latest
    177 mean_reversion_1m = MeanReversion1M()
    178 macd_signal_10d = MACDSignal(
    179     fast_period=12,
    180     slow_period=26,
    181     signal_period=10,
    182 )
    183 moneyflow_volume_5d = MoneyflowVolume5d()
    184 net_income_margin = Fundamentals.net_margin.latest
    185 operating_cashflows_to_assets = (
    186     (Fundamentals.operating_cash_flow.latest * 4) /
    187     Fundamentals.total_assets.latest
    188 )
    189 price_momentum_3m = Returns(window_length=63)
    190 price_oscillator = PriceOscillator()
    191 trendline = Trendline()
    192 returns_39w = Returns(window_length=215)
    193 volatility_3m = Volatility3M()
    194 advanced_momentum = AdvancedMomentum()
    195 
    196 
    197 features = {
    198     'Asset Growth 3M': asset_growth_3m,
    199     'Asset to Equity Ratio': asset_to_equity_ratio,
    200     'Capex to Cashflows': capex_to_cashflows,
    201     'EBIT to Assets': ebita_to_assets,
    202     'EBITDA Yield': ebitda_yield,
    203     'MACD Signal Line': macd_signal_10d,
    204     'Mean Reversion 1M': mean_reversion_1m,
    205     'Moneyflow Volume 5D': moneyflow_volume_5d,
    206     'Net Income Margin': net_income_margin,
    207     'Operating Cashflows to Assets': operating_cashflows_to_assets,
    208     'Price Momentum 3M': price_momentum_3m,
    209     'Price Oscillator': price_oscillator,
    210     'Return on Invest Capital': return_on_total_invest_capital,
    211     '39 Week Returns': returns_39w,
    212     'Trendline': trendline,
    213     'Volatility 3m': volatility_3m,
    214     'Advanced Momentum': advanced_momentum,
    215 }
    216 
    217 
    218 def shift_mask_data(features,
    219                     labels,
    220                     n_forward_days,
    221                     lower_percentile,
    222                     upper_percentile):
    223     """Align features to the labels ``n_forward_days`` into the future and
    224     return the discrete, flattened features and masked labels.
    225 
    226     Parameters
    227     ----------
    228     features : np.ndarray
    229         A 3d array of (days, assets, feature).
    230     labels : np.ndarray
    231         The labels to predict.
    232     n_forward_days : int
    233         How many days into the future are we predicting?
    234     lower_percentile : float
    235         The lower percentile in the range [0, 100].
    236     upper_percentile : float
    237         The upper percentile in the range [0, 100].
    238 
    239     Returns
    240     -------
    241     selected_features : np.ndarray
    242         The flattened features that are not masked out.
    243     selected_labels : np.ndarray
    244         The labels that are not masked out.
    245     """
    246 
    247     # Slice off rolled elements
    248     shift_by = n_forward_days + 1
    249     aligned_features = features[:-shift_by]
    250     aligned_labels = labels[shift_by:]
    251 
    252     cutoffs = np.nanpercentile(
    253         aligned_labels,
    254         [lower_percentile, upper_percentile],
    255         axis=1,
    256     )
    257     discrete_labels = np.select(
    258         [
    259             aligned_labels <= cutoffs[0, :, np.newaxis],
    260             aligned_labels >= cutoffs[1, :, np.newaxis],
    261         ],
    262         [-1, 1],
    263     )
    264 
    265     # flatten the features per day
    266     flattened_features = aligned_features.reshape(
    267         -1,
    268         aligned_features.shape[-1],
    269     )
    270 
    271     # Drop stocks that did not move much, meaning they are in between
    272     # ``lower_percentile`` and ``upper_percentile``.
    273     mask = discrete_labels != 0
    274 
    275     selected_features = flattened_features[mask.ravel()]
    276     selected_labels = discrete_labels[mask]
    277 
    278     return selected_features, selected_labels
    279 
    280 
    281 class ML(CustomFactor):
    282     """
    283     """
    284     train_on_weekday = 1
    285 
    286     def __init__(self, *args, **kwargs):
    287         CustomFactor.__init__(self, *args, **kwargs)
    288 
    289         self._imputer = preprocessing.Imputer()
    290         self._scaler = preprocessing.MinMaxScaler()
    291         self._classifier = linear_model.SGDClassifier(penalty='elasticnet')
    292         self.trained = False
    293         #ensemble.AdaBoostClassifier(
    294         #    random_state=1337,
    295         #    n_estimators=50,
    296         #)
    297 
    298     def _compute(self, *args, **kwargs):
    299         ret = CustomFactor._compute(self, *args, **kwargs)
    300 
    301         # reset the day counter so that we will begin training at the start of
    302         # the next _compute call
    303         self._day_counter = -1
    304 
    305         return ret
    306 
    307     def _train_model(self, today, returns, inputs):
    308         log.info('training model for window starting on: {}', today)
    309 
    310         imputer = self._imputer
    311         scaler = self._scaler
    312         classifier = self._classifier
    313 
    314         features, labels = shift_mask_data(
    315             np.dstack(inputs),
    316             returns,
    317             n_forward_days=PRED_N_FORWARD_DAYS,
    318             lower_percentile=30,
    319             upper_percentile=70,
    320         )
    321         features = scaler.fit_transform(imputer.fit_transform(features))
    322 
    323         start = time()
    324         classifier.fit(features, labels)
    325         log.info('training took {} secs', time() - start)
    326         self.trained = True
    327 
    328     def _maybe_train_model(self, today, returns, inputs):
    329         if (today.weekday() == self.train_on_weekday) or not self.trained:
    330             self._train_model(today, returns, inputs)
    331 
    332     def compute(self, today, assets, out, returns, *inputs):
    333         # inputs is a list of factors, for example, assume we have 2 alpha
    334         # signals, 3 stocks, and a lookback of 2 days. Each element in the
    335         # inputs list will be data of one signal, so len(inputs) == 2. Then
    336         # each element will contain a 2-D array of shape [time x stocks]. For
    337         # example:
    338         # inputs[0]:
    339         # [[1, 3, 2], # factor 1 rankings of day t-1 for 3 stocks
    340         #  [3, 2, 1]] # factor 1 rankings of day t for 3 stocks
    341         # inputs[1]:
    342         # [[2, 3, 1], # factor 2 rankings of day t-1 for 3 stocks
    343         #  [1, 2, 3]] # factor 2 rankings of day t for 3 stocks
    344         self._maybe_train_model(today, returns, inputs)
    345 
    346         # Predict
    347         # Get most recent factor values (inputs always has the full history)
    348         last_factor_values = np.vstack([input_[-1] for input_ in inputs]).T
    349         last_factor_values = self._imputer.transform(last_factor_values)
    350         last_factor_values = self._scaler.transform(last_factor_values)
    351 
    352         # Predict the probability for each stock going up
    353         # (column 2 of the output of .predict_proba()) and
    354         # return it via assignment to out.
    355         #out[:] = self._classifier.predict_proba(last_factor_values)[:, 1]
    356         out[:] = self._classifier.predict(last_factor_values)
    357 
    358 
    359 def make_ml_pipeline(universe, window_length=21, n_forward_days=5):
    360     pipeline_columns = OrderedDict()
    361 
    362     # ensure that returns is the first input
    363     pipeline_columns['Returns'] = Returns(
    364         inputs=(USEquityPricing.open,),
    365         mask=universe, window_length=n_forward_days + 1,
    366     )
    367 
    368     # rank all the factors and put them after returns
    369     pipeline_columns.update({
    370         k: v.rank(mask=universe) for k, v in features.items()
    371     })
    372 
    373     # Create our ML pipeline factor. The window_length will control how much
    374     # lookback the passed in data will have.
    375     pipeline_columns['ML'] = ML(
    376         inputs=pipeline_columns.values(),
    377         window_length=window_length + 1,
    378         mask=universe,
    379     )
    380 
    381     pipeline_columns['Sector'] = Sector()
    382 
    383     return Pipeline(screen=universe, columns=pipeline_columns)
    384 
    385 
    386 def initialize(context):
    387     """
    388     Called once at the start of the algorithm.
    389     """
    390     set_slippage(slippage.FixedSlippage(spread=0.00))
    391     set_commission(commission.PerShare(cost=0, min_trade_cost=0))
    392 
    393     schedule_function(
    394         rebalance,
    395         TRADE_FREQ,
    396         time_rules.market_open(minutes=1),
    397     )
    398 
    399     # Record tracking variables at the end of each day.
    400     schedule_function(
    401         record_vars,
    402         date_rules.every_day(),
    403         time_rules.market_close(),
    404     )
    405 
    406     # Set up universe, alphas and ML pipline
    407     context.universe = QTradableStocksUS()
    408     # if you are using IsAnnouncedAcqTarget, uncomment the next line
    409     # context.universe &= IsAnnouncedAcqTarget()
    410 
    411     ml_pipeline = make_ml_pipeline(
    412         context.universe,
    413         n_forward_days=PRED_N_FORWARD_DAYS,
    414         window_length=ML_TRAINING_WINDOW,
    415     )
    416     # Create our dynamic stock selector.
    417     attach_pipeline(ml_pipeline, 'alpha_model')
    418     # Add the risk pipeline
    419     attach_pipeline(risk_loading_pipeline(), 'risk_factors')
    420 
    421     context.past_predictions = {}
    422     context.hold_out_accuracy = 0
    423     context.hold_out_log_loss = 0
    424     context.hold_out_returns_spread_bps = 0
    425 
    426 
    427 def evaluate_and_shift_hold_out(output, context):
    428     # Look at past predictions to evaluate classifier accuracy on hold-out data
    429     # A day has passed, shift days and drop old ones
    430     context.past_predictions = {
    431         k - 1: v
    432         for k, v in context.past_predictions.iteritems()
    433         if k > 0
    434     }
    435 
    436     if 0 in context.past_predictions:
    437         # Past predictions for the current day exist, so we can use todays'
    438         # n-back returns to evaluate them
    439         raw_returns = output['Returns']
    440         raw_predictions = context.past_predictions[0]
    441 
    442         # Join to match up equities
    443         returns, predictions = raw_returns.align(raw_predictions, join='inner')
    444 
    445         # Binarize returns
    446         returns_binary = returns > returns.median()
    447         predictions_binary = predictions > 0.5
    448 
    449         # Compute performance metrics
    450         context.hold_out_accuracy = metrics.accuracy_score(
    451             returns_binary.values,
    452             predictions_binary.values,
    453         )
    454         context.hold_out_log_loss = metrics.log_loss(
    455             returns_binary.values,
    456             predictions.values,
    457         )
    458         long_rets = returns[predictions_binary == 1].mean()
    459         short_rets = returns[predictions_binary == 0].mean()
    460         context.hold_out_returns_spread_bps = (long_rets - short_rets) * 10000
    461 
    462     # Store current predictions
    463     context.past_predictions[PRED_N_FORWARD_DAYS] = context.predicted_probs
    464 
    465 
    466 def before_trading_start(context, data):
    467     """
    468     Called every day before market open.
    469     """
    470     output = pipeline_output('alpha_model')
    471     context.predicted_probs = output['ML']
    472     context.predicted_probs.index.rename(['date', 'equity'], inplace=True)
    473     
    474     context.risk_loadings = pipeline_output('risk_factors')
    475 
    476     evaluate_and_shift_hold_out(output, context)
    477 
    478     # These are the securities that we are interested in trading each day.
    479     context.security_list = context.predicted_probs.index
    480 
    481 
    482 def rebalance(context, data):
    483     """
    484     Execute orders according to our schedule_function() timing.
    485     """ 
    486         
    487     predictions = context.predicted_probs
    488 
    489     # Filter out stocks that can not be traded
    490     predictions = predictions.loc[data.can_trade(predictions.index)]
    491     # Select top and bottom N stocks
    492     n_long_short = min(N_STOCKS_TO_TRADE // 2, len(predictions) // 2)
    493     predictions_top_bottom = pd.concat([
    494         predictions.nlargest(n_long_short),
    495         predictions.nsmallest(n_long_short),
    496     ])
    497 
    498     # If classifier predicts many identical values, the top might contain
    499     # duplicate stocks
    500     predictions_top_bottom = predictions_top_bottom.iloc[
    501         ~predictions_top_bottom.index.duplicated()
    502     ]
    503 
    504     # predictions are probabilities ranging from 0 to 1
    505     predictions_top_bottom = (predictions_top_bottom - 0.5) * 2
    506 
    507     # pull in the risk factor loadings
    508     risk_loadings = context.risk_loadings
    509     
    510     # Setup Optimization Objective
    511     # Factor-weighted portfolio
    512     objective = opt.TargetWeights(predictions_top_bottom)
    513 
    514     # Setup Optimization Constraints
    515     constrain_gross_leverage = opt.MaxGrossExposure(1.0)
    516     constrain_pos_size = opt.PositionConcentration.with_equal_bounds(
    517         -0.02,
    518         +0.02,
    519     )
    520     market_neutral = opt.DollarNeutral()
    521 
    522     if predictions_top_bottom.index.duplicated().any():
    523         log.debug(predictions_top_bottom.head())
    524     
    525     risk_neutral = opt.experimental.RiskModelExposure(
    526         risk_model_loadings=risk_loadings
    527     )
    528 
    529     # Run the optimization. This will calculate new portfolio weights and
    530     # manage moving our portfolio toward the target.
    531     order_optimal_portfolio(
    532         objective=objective,
    533         constraints=[
    534             constrain_gross_leverage,
    535             constrain_pos_size,
    536             market_neutral,
    537             risk_neutral
    538         ],
    539     )
    540 
    541 
    542 def record_vars(context, data):
    543     """
    544     Plot variables at the end of each day.
    545     """
    546     record(
    547         leverage=context.account.leverage,
    548         hold_out_accuracy=context.hold_out_accuracy,
    549         hold_out_log_loss=context.hold_out_log_loss,
    550         hold_out_returns_spread_bps=context.hold_out_returns_spread_bps,
    551     )
    552 
    553 
    554 def handle_data(context, data):
    555     pass