ml-finance-python
python scripts for finance machine learning
git clone https://9o.is/git/ml-finance-python.git
get_data.py
(4271B)
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 __author__ = 'Stefan Jansen'
4
5 import pandas as pd
6 import pandas_datareader.data as web
7
# Keep wide DataFrames on a single line when printed instead of wrapping
# their columns across several blocks.
pd.options.display.expand_frame_repr = False
9
10
def get_wiki_prices():
    """Load the Quandl WIKI price export and persist it to ``assets.h5``.

    source: https://www.quandl.com/api/v3/datatables/WIKI/PRICES?qopts.export=true&api_key=<API_KEY>
    Download and rename to wiki_prices.csv

    Reads ``wiki_prices.csv`` from the current working directory, indexes the
    rows by ``(date, ticker)``, prints a column summary and stores the frame
    under the HDF5 key ``quandl/wiki/prices``.
    """
    # ``infer_datetime_format`` is intentionally not passed: it is deprecated
    # since pandas 2.0, where a consistent format is inferred by default.
    df = pd.read_csv('wiki_prices.csv',
                     parse_dates=['date'],
                     index_col=['date', 'ticker'])

    # ``DataFrame.info`` writes its own report and returns None, so it is not
    # wrapped in print(). ``show_counts`` replaces the ``null_counts`` keyword
    # that was removed in pandas 2.0.
    df.info(show_counts=True)
    with pd.HDFStore('assets.h5') as store:
        store.put('quandl/wiki/prices', df)
24
25
def get_wiki_constitutents():
    """Build the WIKI symbol/name table and persist it to ``assets.h5``.

    source: https://www.quandl.com/api/v3/databases/WIKI/codes?api_key=<API_KEY>
    Download and rename to wiki_stocks.csv

    Reads the header-less ``wiki_stocks.csv`` from the current working
    directory, derives a two-column frame (``symbol``, ``name``), prints a
    column summary and stores the frame under the HDF5 key
    ``quandl/wiki/stocks``.
    """
    raw = pd.read_csv('wiki_stocks.csv', header=None)

    # Column 0: keep the part after the first '/' (presumably codes shaped
    # like ``WIKI/<SYMBOL>`` -- confirm against the downloaded file).
    symbol = raw[0].str.split('/', expand=True)[1].str.strip()
    # Column 1: keep the text before the first '(' as the company name.
    name = raw[1].str.split('(', expand=True)[0].str.strip()

    df = pd.concat([symbol, name], axis=1)
    df.columns = ['symbol', 'name']

    # ``info`` prints by itself and returns None; ``show_counts`` replaces the
    # ``null_counts`` keyword removed in pandas 2.0.
    df.info(show_counts=True)
    with pd.HDFStore('assets.h5') as store:
        # This table used to be written to 'quandl/wiki/prices', which
        # silently replaced the price data saved by get_wiki_prices().
        store.put('quandl/wiki/stocks', df)
37
38
def get_sp500_prices():
    """Download historical S&P 500 prices from stooq.

    Fetches the daily ``^spx`` CSV, lower-cases the column names, indexes the
    rows by ``date``, prints a column summary and stores the frame under the
    HDF5 key ``sp500/prices`` in ``assets.h5``.
    """
    df = pd.read_csv('https://stooq.com/q/d/l/?s=^spx&i=d', parse_dates=['Date'])
    df = df.rename(columns=str.lower).set_index('date')

    # ``DataFrame.info`` prints its report and returns None; wrapping it in
    # print() only appended a stray "None" line to the output.
    df.info()
    with pd.HDFStore('assets.h5') as store:
        store.put('sp500/prices', df)
46
47
def get_sp500_constituents():
    """Download current S&P 500 constituents from Wikipedia.

    Takes the first HTML table on the page, applies fixed column names, drops
    the ``sec_filings`` column, indexes the rows by ``ticker``, prints a
    column summary and stores the frame under the HDF5 key ``sp500/stocks``
    in ``assets.h5``.

    Raises:
        ValueError: if the downloaded table does not have the number of
            columns this function was written for.
    """
    url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    columns = ['ticker', 'name', 'sec_filings', 'gics_sector', 'gics_sub_industry',
               'location', 'first_added', 'cik', 'founded']

    df = pd.read_html(url, header=0)[0]

    # The page layout is outside our control; fail with an explicit message
    # rather than the generic length-mismatch error from the assignment.
    if len(df.columns) != len(columns):
        raise ValueError(
            'Expected {} columns in the constituents table, found {}: {}'.format(
                len(columns), len(df.columns), list(df.columns)))

    df.columns = columns
    df = df.drop('sec_filings', axis=1).set_index('ticker')

    # ``DataFrame.info`` prints its report and returns None; wrapping it in
    # print() only appended a stray "None" line to the output.
    df.info()
    with pd.HDFStore('assets.h5') as store:
        store.put('sp500/stocks', df)
57
58
def get_nasdaq_companies():
    """Download list of companies traded on NASDAQ, AMEX and NYSE.

    Downloads one CSV per exchange, concatenates them, drops columns that are
    entirely empty, lower-cases the column names, indexes the rows by
    ``symbol`` and removes the ``summary quote`` column. The result is
    summarised on stdout and stored under the HDF5 key ``us_equities/stocks``
    in ``assets.h5``.
    """
    url = 'https://www.nasdaq.com/screening/companies-by-name.aspx?letter=0&exchange={}&render=download'
    exchanges = ['NASDAQ', 'AMEX', 'NYSE']

    frames = [pd.read_csv(url.format(ex)) for ex in exchanges]
    df = pd.concat(frames).dropna(how='all', axis=1)
    df = df.rename(columns=str.lower).set_index('symbol').drop('summary quote', axis=1)

    # ``DataFrame.info`` prints its report and returns None; wrapping it in
    # print() only appended a stray "None" line to the output.
    df.info()
    with pd.HDFStore('assets.h5') as store:
        store.put('us_equities/stocks', df)
68
69
def get_fred():
    """Fetch a fixed set of FRED series and store them in ``assets.h5``.

    The series are requested through pandas-datareader, their columns are
    renamed from FRED ids to readable labels, rows in which every series is
    missing are dropped, and the result is resampled to business-day
    frequency using the mean. The frame is saved under the HDF5 key
    ``fred/assets``.
    """
    # FRED series id -> column label used in the stored frame.
    labels = {
        'BAMLCC0A0CMTRIV': 'US Corp Master TRI',
        'BAMLHYH0A0HYM2TRIV': 'US High Yield TRI',
        'BAMLEMCBPITRIV': 'Emerging Markets Corporate Plus TRI',
        'GOLDAMGBD228NLBM': 'Gold (London, USD)',
        'DGS10': '10-Year Treasury CMR',
    }

    raw = web.DataReader(name=list(labels), data_source='fred', start=2000)
    renamed = raw.rename(columns=labels)
    non_empty = renamed.dropna(how='all')
    df = non_empty.resample('B').mean()

    with pd.HDFStore('assets.h5') as store:
        store.put('fred/assets', df)
84
85
def get_treasury_index():
    """Return the index series from ``treasury_10y.xls`` at business-day frequency.

    The workbook is read from the current working directory and must provide
    ``Date`` and ``Index`` columns. Presumably it holds the "S&P U.S. Treasury
    Bond Current 10-Year Index" (the label the original code carried in an
    unused local) -- confirm against the data file.

    Returns:
        A one-column DataFrame named ``Treasury Index``, indexed by ``Date``
        and resampled to business days using the mean.
    """
    df = pd.read_excel('treasury_10y.xls')

    # The conversion used to be assigned to ``df.Data`` (a typo), which only
    # set an attribute on the frame and left the ``Date`` column unconverted.
    df['Date'] = pd.to_datetime(df['Date'])

    # Bracket access avoids any confusion between the 'Index' column and the
    # frame's own ``.index`` attribute.
    return df.set_index('Date')['Index'].resample('B').mean().to_frame('Treasury Index')
91
92
def get_bcom():
    """Read ``BCOM.csv`` and return its ``Price`` column at business-day frequency.

    The file is read from the current working directory with ``Date`` parsed
    as datetimes.

    Returns:
        A one-column DataFrame named ``BCOM``, indexed by ``Date``, holding
        the mean price of each business-day bin.
    """
    raw = pd.read_csv('BCOM.csv', parse_dates=['Date'])
    prices = raw.set_index('Date')['Price']
    per_business_day = prices.resample('B').mean()
    return per_business_day.to_frame('BCOM')
96
97
def get_stock_sample(start=None, end=None):
    """Extract a sample of adjusted open/close prices into ``alpha_factors.h5``.

    Loads the ``quandl/wiki/prices`` table from ``../00_data/assets.h5``,
    pivots ``adj_close`` and ``adj_open`` so that the innermost index level
    becomes the columns (presumably one column per ticker -- confirm against
    the stored frame), restricts the rows to ``start``..``end`` and keeps only
    the columns whose number of non-missing closes equals the 0.9 quantile of
    those counts. The result is written under the HDF5 keys ``prices/open``
    and ``prices/close``.

    Args:
        start: first label of the row slice; converted with ``str()``.
            ``None`` (default) leaves the slice open at the beginning.
        end: last label of the row slice; converted with ``str()``.
            ``None`` (default) leaves the slice open at the end.
    """
    # Imported locally because the module does not import Path at file level.
    from pathlib import Path

    # ``start``/``end`` used to be read as undefined globals (NameError).
    first = None if start is None else str(start)
    last = None if end is None else str(end)

    # NOTE(review): the other functions in this file write 'assets.h5' to the
    # working directory, while this one reads it from '../00_data' -- confirm
    # the intended layout.
    data_dir = Path('..', '00_data')
    with pd.HDFStore(str(data_dir / 'assets.h5')) as store:
        # Literal key instead of the undefined ``join(...)`` helper; HDF5 keys
        # always use '/' regardless of platform.
        df = store.get('quandl/wiki/prices')

    close = df.adj_close.unstack().loc[first:last]
    # Named ``open_prices`` so the builtin ``open`` is not shadowed.
    open_prices = df.adj_open.unstack().loc[first:last]

    nobs = close.count()
    # NOTE(review): this keeps only columns whose count is exactly equal to
    # the 0.9 quantile; an interpolated quantile may match no column at all.
    close = close.loc[:, nobs[nobs == nobs.quantile(.9)].index]

    # ``info`` prints by itself and returns None; ``show_counts`` replaces the
    # ``null_counts`` keyword removed in pandas 2.0.
    close.info(show_counts=True)
    open_prices = open_prices.loc[:, close.columns]
    open_prices.info()

    with pd.HDFStore('alpha_factors.h5') as store:
        store.put('prices/open', open_prices)
        store.put('prices/close', close)