ml-finance-python

python scripts for finance machine learning
git clone https://9o.is/git/ml-finance-python.git
edhec_risk_kit_203.py

(19208B)
      1 import pandas as pd
      2 import numpy as np
      3 
      4 def get_ffme_returns():
      5     """
      6     Load the Fama-French Dataset for the returns of the Top and Bottom Deciles by MarketCap
      7     """
      8     me_m = pd.read_csv("data/Portfolios_Formed_on_ME_monthly_EW.csv",
      9                        header=0, index_col=0, na_values=-99.99)
     10     rets = me_m[['Lo 10', 'Hi 10']]
     11     rets.columns = ['SmallCap', 'LargeCap']
     12     rets = rets/100
     13     rets.index = pd.to_datetime(rets.index, format="%Y%m").to_period('M')
     14     return rets
     15 
     16 def get_fff_returns():
     17     """
     18     Load the Fama-French Research Factor Monthly Dataset
     19     """
     20     rets = pd.read_csv("data/F-F_Research_Data_Factors_m.csv",
     21                        header=0, index_col=0, na_values=-99.99)/100
     22     rets.index = pd.to_datetime(rets.index, format="%Y%m").to_period('M')
     23     return rets
     24 
     25 
     26 def get_hfi_returns():
     27     """
     28     Load and format the EDHEC Hedge Fund Index Returns
     29     """
     30     hfi = pd.read_csv("data/edhec-hedgefundindices.csv",
     31                       header=0, index_col=0, parse_dates=True)
     32     hfi = hfi/100
     33     hfi.index = hfi.index.to_period('M')
     34     return hfi
     35 
     36 def get_ind_file(filetype, ew=False):
     37     """
     38     Load and format the Ken French 30 Industry Portfolios files
     39     """
     40     known_types = ["returns", "nfirms", "size"]
     41     if filetype not in known_types:
     42         raise ValueError(f"filetype must be one of:{','.join(known_types)}")
     43     if filetype is "returns":
     44         name = "ew_rets" if ew else "vw_rets"
     45         divisor = 100
     46     elif filetype is "nfirms":
     47         name = "nfirms"
     48         divisor = 1
     49     elif filetype is "size":
     50         name = "size"
     51         divisor = 1
     52                          
     53     ind = pd.read_csv(f"data/ind30_m_{name}.csv", header=0, index_col=0)/divisor
     54     ind.index = pd.to_datetime(ind.index, format="%Y%m").to_period('M')
     55     ind.columns = ind.columns.str.strip()
     56     return ind
     57 
     58 def get_ind_returns(ew=False):
     59     """
     60     Load and format the Ken French 30 Industry Portfolios Value Weighted Monthly Returns
     61     """
     62     return get_ind_file("returns", ew=ew)
     63 
     64 def get_ind_nfirms():
     65     """
     66     Load and format the Ken French 30 Industry Portfolios Average number of Firms
     67     """
     68     return get_ind_file("nfirms")
     69 
     70 def get_ind_size():
     71     """
     72     Load and format the Ken French 30 Industry Portfolios Average size (market cap)
     73     """
     74     return get_ind_file("size")
     75 
     76                          
     77 def get_total_market_index_returns():
     78     """
     79     Load the 30 industry portfolio data and derive the returns of a capweighted total market index
     80     """
     81     ind_nfirms = get_ind_nfirms()
     82     ind_size = get_ind_size()
     83     ind_return = get_ind_returns()
     84     ind_mktcap = ind_nfirms * ind_size
     85     total_mktcap = ind_mktcap.sum(axis=1)
     86     ind_capweight = ind_mktcap.divide(total_mktcap, axis="rows")
     87     total_market_return = (ind_capweight * ind_return).sum(axis="columns")
     88     return total_market_return
     89                          
     90 def skewness(r):
     91     """
     92     Alternative to scipy.stats.skew()
     93     Computes the skewness of the supplied Series or DataFrame
     94     Returns a float or a Series
     95     """
     96     demeaned_r = r - r.mean()
     97     # use the population standard deviation, so set dof=0
     98     sigma_r = r.std(ddof=0)
     99     exp = (demeaned_r**3).mean()
    100     return exp/sigma_r**3
    101 
    102 
    103 def kurtosis(r):
    104     """
    105     Alternative to scipy.stats.kurtosis()
    106     Computes the kurtosis of the supplied Series or DataFrame
    107     Returns a float or a Series
    108     """
    109     demeaned_r = r - r.mean()
    110     # use the population standard deviation, so set dof=0
    111     sigma_r = r.std(ddof=0)
    112     exp = (demeaned_r**4).mean()
    113     return exp/sigma_r**4
    114 
    115 
    116 def compound(r):
    117     """
    118     returns the result of compounding the set of returns in r
    119     """
    120     return np.expm1(np.log1p(r).sum())
    121 
    122                          
    123 def annualize_rets(r, periods_per_year):
    124     """
    125     Annualizes a set of returns
    126     We should infer the periods per year
    127     but that is currently left as an exercise
    128     to the reader :-)
    129     """
    130     compounded_growth = (1+r).prod()
    131     n_periods = r.shape[0]
    132     return compounded_growth**(periods_per_year/n_periods)-1
    133 
    134 
    135 def annualize_vol(r, periods_per_year):
    136     """
    137     Annualizes the vol of a set of returns
    138     We should infer the periods per year
    139     but that is currently left as an exercise
    140     to the reader :-)
    141     """
    142     return r.std()*(periods_per_year**0.5)
    143 
    144 
    145 def sharpe_ratio(r, riskfree_rate, periods_per_year):
    146     """
    147     Computes the annualized sharpe ratio of a set of returns
    148     """
    149     # convert the annual riskfree rate to per period
    150     rf_per_period = (1+riskfree_rate)**(1/periods_per_year)-1
    151     excess_ret = r - rf_per_period
    152     ann_ex_ret = annualize_rets(excess_ret, periods_per_year)
    153     ann_vol = annualize_vol(r, periods_per_year)
    154     return ann_ex_ret/ann_vol
    155 
    156 
    157 import scipy.stats
    158 def is_normal(r, level=0.01):
    159     """
    160     Applies the Jarque-Bera test to determine if a Series is normal or not
    161     Test is applied at the 1% level by default
    162     Returns True if the hypothesis of normality is accepted, False otherwise
    163     """
    164     if isinstance(r, pd.DataFrame):
    165         return r.aggregate(is_normal)
    166     else:
    167         statistic, p_value = scipy.stats.jarque_bera(r)
    168         return p_value > level
    169 
    170 
    171 def drawdown(return_series: pd.Series):
    172     """Takes a time series of asset returns.
    173        returns a DataFrame with columns for
    174        the wealth index, 
    175        the previous peaks, and 
    176        the percentage drawdown
    177     """
    178     wealth_index = 1000*(1+return_series).cumprod()
    179     previous_peaks = wealth_index.cummax()
    180     drawdowns = (wealth_index - previous_peaks)/previous_peaks
    181     return pd.DataFrame({"Wealth": wealth_index, 
    182                          "Previous Peak": previous_peaks, 
    183                          "Drawdown": drawdowns})
    184 
    185 
    186 def semideviation(r):
    187     """
    188     Returns the semideviation aka negative semideviation of r
    189     r must be a Series or a DataFrame, else raises a TypeError
    190     """
    191     if isinstance(r, pd.Series):
    192         is_negative = r < 0
    193         return r[is_negative].std(ddof=0)
    194     elif isinstance(r, pd.DataFrame):
    195         return r.aggregate(semideviation)
    196     else:
    197         raise TypeError("Expected r to be a Series or DataFrame")
    198 
    199 
    200 def var_historic(r, level=5):
    201     """
    202     Returns the historic Value at Risk at a specified level
    203     i.e. returns the number such that "level" percent of the returns
    204     fall below that number, and the (100-level) percent are above
    205     """
    206     if isinstance(r, pd.DataFrame):
    207         return r.aggregate(var_historic, level=level)
    208     elif isinstance(r, pd.Series):
    209         return -np.percentile(r, level)
    210     else:
    211         raise TypeError("Expected r to be a Series or DataFrame")
    212 
    213 
    214 def cvar_historic(r, level=5):
    215     """
    216     Computes the Conditional VaR of Series or DataFrame
    217     """
    218     if isinstance(r, pd.Series):
    219         is_beyond = r <= var_historic(r, level=level)
    220         return -r[is_beyond].mean()
    221     elif isinstance(r, pd.DataFrame):
    222         return r.aggregate(cvar_historic, level=level)
    223     else:
    224         raise TypeError("Expected r to be a Series or DataFrame")
    225 
    226 
    227 from scipy.stats import norm
    228 def var_gaussian(r, level=5, modified=False):
    229     """
    230     Returns the Parametric Gauusian VaR of a Series or DataFrame
    231     If "modified" is True, then the modified VaR is returned,
    232     using the Cornish-Fisher modification
    233     """
    234     # compute the Z score assuming it was Gaussian
    235     z = norm.ppf(level/100)
    236     if modified:
    237         # modify the Z score based on observed skewness and kurtosis
    238         s = skewness(r)
    239         k = kurtosis(r)
    240         z = (z +
    241                 (z**2 - 1)*s/6 +
    242                 (z**3 -3*z)*(k-3)/24 -
    243                 (2*z**3 - 5*z)*(s**2)/36
    244             )
    245     return -(r.mean() + z*r.std(ddof=0))
    246 
    247 
    248 def portfolio_return(weights, returns):
    249     """
    250     Computes the return on a portfolio from constituent returns and weights
    251     weights are a numpy array or Nx1 matrix and returns are a numpy array or Nx1 matrix
    252     """
    253     return weights.T @ returns
    254 
    255 
    256 def portfolio_vol(weights, covmat):
    257     """
    258     Computes the vol of a portfolio from a covariance matrix and constituent weights
    259     weights are a numpy array or N x 1 maxtrix and covmat is an N x N matrix
    260     """
    261     return (weights.T @ covmat @ weights)**0.5
    262 
    263 
    264 def plot_ef2(n_points, er, cov):
    265     """
    266     Plots the 2-asset efficient frontier
    267     """
    268     if er.shape[0] != 2 or er.shape[0] != 2:
    269         raise ValueError("plot_ef2 can only plot 2-asset frontiers")
    270     weights = [np.array([w, 1-w]) for w in np.linspace(0, 1, n_points)]
    271     rets = [portfolio_return(w, er) for w in weights]
    272     vols = [portfolio_vol(w, cov) for w in weights]
    273     ef = pd.DataFrame({
    274         "Returns": rets, 
    275         "Volatility": vols
    276     })
    277     return ef.plot.line(x="Volatility", y="Returns", style=".-")
    278 
    279 
    280 from scipy.optimize import minimize
    281 
    282 def minimize_vol(target_return, er, cov):
    283     """
    284     Returns the optimal weights that achieve the target return
    285     given a set of expected returns and a covariance matrix
    286     """
    287     n = er.shape[0]
    288     init_guess = np.repeat(1/n, n)
    289     bounds = ((0.0, 1.0),) * n # an N-tuple of 2-tuples!
    290     # construct the constraints
    291     weights_sum_to_1 = {'type': 'eq',
    292                         'fun': lambda weights: np.sum(weights) - 1
    293     }
    294     return_is_target = {'type': 'eq',
    295                         'args': (er,),
    296                         'fun': lambda weights, er: target_return - portfolio_return(weights,er)
    297     }
    298     weights = minimize(portfolio_vol, init_guess,
    299                        args=(cov,), method='SLSQP',
    300                        options={'disp': False},
    301                        constraints=(weights_sum_to_1,return_is_target),
    302                        bounds=bounds)
    303     return weights.x
    304 
    305 
    306 def tracking_error(r_a, r_b):
    307     """
    308     Returns the Tracking Error between the two return series
    309     """
    310     return np.sqrt(((r_a - r_b)**2).sum())
    311 
    312                          
    313 def msr(riskfree_rate, er, cov):
    314     """
    315     Returns the weights of the portfolio that gives you the maximum sharpe ratio
    316     given the riskfree rate and expected returns and a covariance matrix
    317     """
    318     n = er.shape[0]
    319     init_guess = np.repeat(1/n, n)
    320     bounds = ((0.0, 1.0),) * n # an N-tuple of 2-tuples!
    321     # construct the constraints
    322     weights_sum_to_1 = {'type': 'eq',
    323                         'fun': lambda weights: np.sum(weights) - 1
    324     }
    325     def neg_sharpe(weights, riskfree_rate, er, cov):
    326         """
    327         Returns the negative of the sharpe ratio
    328         of the given portfolio
    329         """
    330         r = portfolio_return(weights, er)
    331         vol = portfolio_vol(weights, cov)
    332         return -(r - riskfree_rate)/vol
    333     
    334     weights = minimize(neg_sharpe, init_guess,
    335                        args=(riskfree_rate, er, cov), method='SLSQP',
    336                        options={'disp': False},
    337                        constraints=(weights_sum_to_1,),
    338                        bounds=bounds)
    339     return weights.x
    340 
    341 
    342 def gmv(cov):
    343     """
    344     Returns the weights of the Global Minimum Volatility portfolio
    345     given a covariance matrix
    346     """
    347     n = cov.shape[0]
    348     return msr(0, np.repeat(1, n), cov)
    349 
    350 
    351 def optimal_weights(n_points, er, cov):
    352     """
    353     Returns a list of weights that represent a grid of n_points on the efficient frontier
    354     """
    355     target_rs = np.linspace(er.min(), er.max(), n_points)
    356     weights = [minimize_vol(target_return, er, cov) for target_return in target_rs]
    357     return weights
    358 
    359 
    360 def plot_ef(n_points, er, cov, style='.-', legend=False, show_cml=False, riskfree_rate=0, show_ew=False, show_gmv=False):
    361     """
    362     Plots the multi-asset efficient frontier
    363     """
    364     weights = optimal_weights(n_points, er, cov)
    365     rets = [portfolio_return(w, er) for w in weights]
    366     vols = [portfolio_vol(w, cov) for w in weights]
    367     ef = pd.DataFrame({
    368         "Returns": rets, 
    369         "Volatility": vols
    370     })
    371     ax = ef.plot.line(x="Volatility", y="Returns", style=style, legend=legend)
    372     if show_cml:
    373         ax.set_xlim(left = 0)
    374         # get MSR
    375         w_msr = msr(riskfree_rate, er, cov)
    376         r_msr = portfolio_return(w_msr, er)
    377         vol_msr = portfolio_vol(w_msr, cov)
    378         # add CML
    379         cml_x = [0, vol_msr]
    380         cml_y = [riskfree_rate, r_msr]
    381         ax.plot(cml_x, cml_y, color='green', marker='o', linestyle='dashed', linewidth=2, markersize=10)
    382     if show_ew:
    383         n = er.shape[0]
    384         w_ew = np.repeat(1/n, n)
    385         r_ew = portfolio_return(w_ew, er)
    386         vol_ew = portfolio_vol(w_ew, cov)
    387         # add EW
    388         ax.plot([vol_ew], [r_ew], color='goldenrod', marker='o', markersize=10)
    389     if show_gmv:
    390         w_gmv = gmv(cov)
    391         r_gmv = portfolio_return(w_gmv, er)
    392         vol_gmv = portfolio_vol(w_gmv, cov)
    393         # add EW
    394         ax.plot([vol_gmv], [r_gmv], color='midnightblue', marker='o', markersize=10)
    395         
    396         return ax
    397 
    398                          
    399 def run_cppi(risky_r, safe_r=None, m=3, start=1000, floor=0.8, riskfree_rate=0.03, drawdown=None):
    400     """
    401     Run a backtest of the CPPI strategy, given a set of returns for the risky asset
    402     Returns a dictionary containing: Asset Value History, Risk Budget History, Risky Weight History
    403     """
    404     # set up the CPPI parameters
    405     dates = risky_r.index
    406     n_steps = len(dates)
    407     account_value = start
    408     floor_value = start*floor
    409     peak = account_value
    410     if isinstance(risky_r, pd.Series): 
    411         risky_r = pd.DataFrame(risky_r, columns=["R"])
    412 
    413     if safe_r is None:
    414         safe_r = pd.DataFrame().reindex_like(risky_r)
    415         safe_r.values[:] = riskfree_rate/12 # fast way to set all values to a number
    416     # set up some DataFrames for saving intermediate values
    417     account_history = pd.DataFrame().reindex_like(risky_r)
    418     risky_w_history = pd.DataFrame().reindex_like(risky_r)
    419     cushion_history = pd.DataFrame().reindex_like(risky_r)
    420     floorval_history = pd.DataFrame().reindex_like(risky_r)
    421     peak_history = pd.DataFrame().reindex_like(risky_r)
    422 
    423     for step in range(n_steps):
    424         if drawdown is not None:
    425             peak = np.maximum(peak, account_value)
    426             floor_value = peak*(1-drawdown)
    427         cushion = (account_value - floor_value)/account_value
    428         risky_w = m*cushion
    429         risky_w = np.minimum(risky_w, 1)
    430         risky_w = np.maximum(risky_w, 0)
    431         safe_w = 1-risky_w
    432         risky_alloc = account_value*risky_w
    433         safe_alloc = account_value*safe_w
    434         # recompute the new account value at the end of this step
    435         account_value = risky_alloc*(1+risky_r.iloc[step]) + safe_alloc*(1+safe_r.iloc[step])
    436         # save the histories for analysis and plotting
    437         cushion_history.iloc[step] = cushion
    438         risky_w_history.iloc[step] = risky_w
    439         account_history.iloc[step] = account_value
    440         floorval_history.iloc[step] = floor_value
    441         peak_history.iloc[step] = peak
    442     risky_wealth = start*(1+risky_r).cumprod()
    443     backtest_result = {
    444         "Wealth": account_history,
    445         "Risky Wealth": risky_wealth, 
    446         "Risk Budget": cushion_history,
    447         "Risky Allocation": risky_w_history,
    448         "m": m,
    449         "start": start,
    450         "floor": floor,
    451         "risky_r":risky_r,
    452         "safe_r": safe_r,
    453         "drawdown": drawdown,
    454         "peak": peak_history,
    455         "floor": floorval_history
    456     }
    457     return backtest_result
    458 
    459 
    460 def summary_stats(r, riskfree_rate=0.03):
    461     """
    462     Return a DataFrame that contains aggregated summary stats for the returns in the columns of r
    463     """
    464     ann_r = r.aggregate(annualize_rets, periods_per_year=12)
    465     ann_vol = r.aggregate(annualize_vol, periods_per_year=12)
    466     ann_sr = r.aggregate(sharpe_ratio, riskfree_rate=riskfree_rate, periods_per_year=12)
    467     dd = r.aggregate(lambda r: drawdown(r).Drawdown.min())
    468     skew = r.aggregate(skewness)
    469     kurt = r.aggregate(kurtosis)
    470     cf_var5 = r.aggregate(var_gaussian, modified=True)
    471     hist_cvar5 = r.aggregate(cvar_historic)
    472     return pd.DataFrame({
    473         "Annualized Return": ann_r,
    474         "Annualized Vol": ann_vol,
    475         "Skewness": skew,
    476         "Kurtosis": kurt,
    477         "Cornish-Fisher VaR (5%)": cf_var5,
    478         "Historic CVaR (5%)": hist_cvar5,
    479         "Sharpe Ratio": ann_sr,
    480         "Max Drawdown": dd
    481     })
    482 
    483                          
    484 def gbm(n_years = 10, n_scenarios=1000, mu=0.07, sigma=0.15, steps_per_year=12, s_0=100.0, prices=True):
    485     """
    486     Evolution of Geometric Brownian Motion trajectories, such as for Stock Prices through Monte Carlo
    487     :param n_years:  The number of years to generate data for
    488     :param n_paths: The number of scenarios/trajectories
    489     :param mu: Annualized Drift, e.g. Market Return
    490     :param sigma: Annualized Volatility
    491     :param steps_per_year: granularity of the simulation
    492     :param s_0: initial value
    493     :return: a numpy array of n_paths columns and n_years*steps_per_year rows
    494     """
    495     # Derive per-step Model Parameters from User Specifications
    496     dt = 1/steps_per_year
    497     n_steps = int(n_years*steps_per_year) + 1
    498     # the standard way ...
    499     # rets_plus_1 = np.random.normal(loc=mu*dt+1, scale=sigma*np.sqrt(dt), size=(n_steps, n_scenarios))
    500     # without discretization error ...
    501     rets_plus_1 = np.random.normal(loc=(1+mu)**dt, scale=(sigma*np.sqrt(dt)), size=(n_steps, n_scenarios))
    502     rets_plus_1[0] = 1
    503     ret_val = s_0*pd.DataFrame(rets_plus_1).cumprod() if prices else rets_plus_1-1
    504     return ret_val
    505 
    506                          
    507 import statsmodels.api as sm
    508 def regress(dependent_variable, explanatory_variables, alpha=True):
    509     """
    510     Runs a linear regression to decompose the dependent variable into the explanatory variables
    511     returns an object of type statsmodel's RegressionResults on which you can call
    512        .summary() to print a full summary
    513        .params for the coefficients
    514        .tvalues and .pvalues for the significance levels
    515        .rsquared_adj and .rsquared for quality of fit
    516     """
    517     if alpha:
    518         explanatory_variables = explanatory_variables.copy()
    519         explanatory_variables["Alpha"] = 1
    520     
    521     lm = sm.OLS(dependent_variable, explanatory_variables).fit()
    522     return lm
    523 
    524 def portfolio_tracking_error(weights, ref_r, bb_r):
    525     """
    526     returns the tracking error between the reference returns
    527     and a portfolio of building block returns held with given weights
    528     """
    529     return tracking_error(ref_r, (weights*bb_r).sum(axis=1))
    530                          
    531 def style_analysis(dependent_variable, explanatory_variables):
    532     """
    533     Returns the optimal weights that minimizes the Tracking error between
    534     a portfolio of the explanatory variables and the dependent variable
    535     """
    536     n = explanatory_variables.shape[1]
    537     init_guess = np.repeat(1/n, n)
    538     bounds = ((0.0, 1.0),) * n # an N-tuple of 2-tuples!
    539     # construct the constraints
    540     weights_sum_to_1 = {'type': 'eq',
    541                         'fun': lambda weights: np.sum(weights) - 1
    542     }
    543     solution = minimize(portfolio_tracking_error, init_guess,
    544                        args=(dependent_variable, explanatory_variables,), method='SLSQP',
    545                        options={'disp': False},
    546                        constraints=(weights_sum_to_1,),
    547                        bounds=bounds)
    548     weights = pd.Series(solution.x, index=explanatory_variables.columns)
    549     return weights
    550 
    551     
    552