ml-finance-python

Python scripts for machine learning in finance

git clone https://9o.is/git/ml-finance-python.git

get_data.py

(4271B)


      1 #!/usr/bin/env python
      2 # -*- coding: utf-8 -*-
      3 __author__ = 'Stefan Jansen'
      4 
      5 import pandas as pd
      6 import pandas_datareader.data as web
      7 
      8 pd.set_option('display.expand_frame_repr', False)
      9 
     10 
     11 def get_wiki_prices():
     12     """source: https://www.quandl.com/api/v3/datatables/WIKI/PRICES?qopts.export=true&api_key=<API_KEY>
     13         Download and rename to wiki_prices.csv
     14     """
     15 
     16     df = pd.read_csv('wiki_prices.csv',
     17                      parse_dates=['date'],
     18                      index_col=['date', 'ticker'],
     19                      infer_datetime_format=True)
     20 
     21     print(df.info(null_counts=True))
     22     with pd.HDFStore('assets.h5') as store:
     23         store.put('quandl/wiki/prices', df)
     24 
     25 
     26 def get_wiki_constitutents():
     27     """source: https://www.quandl.com/api/v3/databases/WIKI/codes?api_key=<API_KEY>
     28         Download and rename to wiki_stocks.csv
     29     """
     30     df = pd.read_csv('wiki_stocks.csv', header=None)
     31     df = pd.concat([df[0].str.split('/', expand=True)[1].str.strip(),
     32                     df[1].str.split('(', expand=True)[0].str.strip()], axis=1)
     33     df.columns = ['symbol', 'name']
     34     print(df.info(null_counts=True))
     35     with pd.HDFStore('assets.h5') as store:
     36         store.put('quandl/wiki/prices', df)
     37 
     38 
     39 def get_sp500_prices():
     40     """Download historical S&P 500 prices from stooq"""
     41     df = pd.read_csv('https://stooq.com/q/d/l/?s=^spx&i=d', parse_dates=['Date'])
     42     df = df.rename(columns=str.lower).set_index('date')
     43     print(df.info())
     44     with pd.HDFStore('assets.h5') as store:
     45         store.put('sp500/prices', df)
     46 
     47 
     48 def get_sp500_constituents():
     49     """Download current S&P 500 constituents from Wikipedia"""
     50     df = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies', header=0)[0]
     51     df.columns = ['ticker', 'name', 'sec_filings', 'gics_sector', 'gics_sub_industry',
     52                   'location', 'first_added', 'cik', 'founded']
     53     df = df.drop('sec_filings', axis=1).set_index('ticker')
     54     print(df.info())
     55     with pd.HDFStore('assets.h5') as store:
     56         store.put('sp500/stocks', df)
     57 
     58 
     59 def get_nasdaq_companies():
     60     """Download list of companies traded on NASDAQ, AMEX and NYSE"""
     61     url = 'https://www.nasdaq.com/screening/companies-by-name.aspx?letter=0&exchange={}&render=download'
     62     exchanges = ['NASDAQ', 'AMEX', 'NYSE']
     63     df = pd.concat([pd.read_csv(url.format(ex)) for ex in exchanges]).dropna(how='all', axis=1)
     64     df = df.rename(columns=str.lower).set_index('symbol').drop('summary quote', axis=1)
     65     print(df.info())
     66     with pd.HDFStore('assets.h5') as store:
     67         store.put('us_equities/stocks', df)
     68 
     69 
     70 def get_fred():
     71     """Download bond index data from FRED"""
     72     securities = {'BAMLCC0A0CMTRIV'   : 'US Corp Master TRI',
     73                   'BAMLHYH0A0HYM2TRIV': 'US High Yield TRI',
     74                   'BAMLEMCBPITRIV'    : 'Emerging Markets Corporate Plus TRI',
     75                   'GOLDAMGBD228NLBM'  : 'Gold (London, USD)',
     76                   'DGS10'             : '10-Year Treasury CMR',
     77                   }
     78 
     79     df = web.DataReader(name=list(securities.keys()), data_source='fred', start=2000)
     80     df = df.rename(columns=securities).dropna(how='all').resample('B').mean()
     81 
     82     with pd.HDFStore('assets.h5') as store:
     83         store.put('fred/assets', df)
     84 
     85 
     86 def get_treasury_index():
     87     name = 'S&P U.S. Treasury Bond Current 10-Year Index'
     88     df = pd.read_excel('treasury_10y.xls')
     89     df.Data = pd.to_datetime(df.Date)
     90     return df.set_index('Date').Index.resample('B').mean().to_frame('Treasury Index')
     91 
     92 
     93 def get_bcom():
     94     bcom = pd.read_csv('BCOM.csv', parse_dates=['Date'])
     95     return bcom.set_index('Date').Price.resample('B').mean().to_frame('BCOM')
     96 
     97 
     98 def get_stock_sample():
     99     data_dir = Path('..', '00_data')
    100     with pd.HDFStore(str(data_dir / 'assets.h5')) as store:
    101         df = store.get(join('quandl', 'wiki', 'prices'))
    102         close = df.adj_close.unstack().loc[str(start):str(end)]
    103         open = df.adj_open.unstack().loc[str(start):str(end)]
    104 
    105     nobs = close.count()
    106     close = close.loc[:, nobs[nobs == nobs.quantile(.9)].index]
    107     print(close.info(null_counts=True))
    108     open = open.loc[:, close.columns]
    109     print(open.info())
    110     with pd.HDFStore('alpha_factors.h5') as store:
    111         store.put('prices/open', open)
    112         store.put('prices/close', close)