ml-finance-python
Python scripts for machine learning in finance
git clone https://9o.is/git/ml-finance-python.git
sa_selenium.py
(3595B)
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 __author__ = 'Stefan Jansen'
4
5 import re
6 from pathlib import Path
7 from random import random
8 from time import sleep
9 from urllib.parse import urljoin
10
11 import pandas as pd
12 from bs4 import BeautifulSoup
13 from furl import furl
14 from selenium import webdriver
15
# Root directory for scraped output; parsed transcripts are written below
# 'parsed/<symbol>' inside it.
transcript_path = Path('transcripts')


def store_result(meta, participants, content):
    """Write one parsed transcript as CSV files in its per-symbol directory.

    The target directory is ``transcript_path / 'parsed' / meta['symbol']``
    and is created on demand. Files with the same names are overwritten.

    Args:
        meta: Mapping of call metadata. Must contain the key 'symbol', which
            names the output directory. Saved as a Series to 'earnings.csv'.
        participants: Rows of ``[type, name]``, saved to 'participants.csv'.
        content: Rows of ``[speaker, q&a, content]``, saved to 'content.csv'.

    Raises:
        KeyError: If ``meta`` has no 'symbol' entry.
    """
    path = transcript_path / 'parsed' / meta['symbol']
    # exist_ok=True already makes this idempotent, so no separate exists()
    # check is needed (that check was also open to a create/check race).
    path.mkdir(parents=True, exist_ok=True)
    pd.DataFrame(content, columns=['speaker', 'q&a', 'content']).to_csv(path / 'content.csv', index=False)
    pd.DataFrame(participants, columns=['type', 'name']).to_csv(path / 'participants.csv', index=False)
    pd.Series(meta).to_csv(path / 'earnings.csv')
26
27
# Three dash-separated two-digit groups in the page title; parse_html stores
# them, in order, as month, day and year.
_DATE_PATTERN = re.compile(r'(\d{2})-(\d{2})-(\d{2})')
# A standalone quarter token such as 'Q1'.
_QUARTER_PATTERN = re.compile(r'(\bQ\d\b)')
# Section headings that introduce a participant list instead of spoken text.
_SPEAKER_TYPES = ('Executives', 'Analysts')


def _paragraphs_after(header):
    """Return the text of each <p> sibling following *header*, stopping at the next header."""
    texts = []
    for sibling in header.find_next_siblings('p'):
        # A paragraph that contains <strong> starts the next section.
        if sibling.find('strong'):
            break
        texts.append(sibling.text)
    return texts


def parse_html(html):
    """Extract metadata, participants and spoken content from a transcript page.

    Args:
        html: Page source of a single transcript article.

    Returns:
        ``None`` when the page has no ``<h1 itemprop="headline">``, when the
        headline has no non-empty "(SYMBOL)" part, or when there is no
        ``<div class="title">``. Otherwise a tuple
        ``(meta, participants, content)``:

        * ``meta``: dict with 'company' and 'symbol' taken from the headline,
          plus 'month', 'day', 'year' (ints) and 'quarter' when the title
          matches the respective pattern.
        * ``participants``: list of ``[section heading, paragraph text]`` for
          sections whose heading contains 'Executives' or 'Analysts'.
        * ``content``: list of ``[heading text, qa flag, joined paragraphs]``
          for every other section; the flag is 0 until a heading starting
          with 'question-and' (case-insensitive) is seen, then 1.
    """
    soup = BeautifulSoup(html, 'lxml')

    meta, participants, content = {}, [], []
    h1 = soup.find('h1', itemprop='headline')
    if h1 is None:
        return None
    # The headline is expected to look like "Company Name (SYMBOL) ...".
    # Without both parentheses, index-based slicing would produce a garbage
    # symbol that later becomes a directory name, so treat the page as
    # unparseable instead.
    company, opening, remainder = h1.text.partition('(')
    symbol, closing, _ = remainder.partition(')')
    if not (opening and closing and symbol.strip()):
        return None
    meta['company'] = company.strip()
    meta['symbol'] = symbol

    title = soup.find('div', class_='title')
    if title is None:
        return None
    title = title.text
    print(title)
    match = _DATE_PATTERN.search(title)
    if match:
        month, day, year = match.groups()
        meta['month'] = int(month)
        meta['day'] = int(day)
        meta['year'] = int(year)

    match = _QUARTER_PATTERN.search(title)
    if match:
        meta['quarter'] = match.group(0)

    qa = 0
    # Every element that directly wraps a <strong> tag is treated as a
    # section header.
    for header in [strong.parent for strong in soup.find_all('strong')]:
        text = header.text.strip()
        lowered = text.lower()
        if lowered.startswith('copyright'):
            continue
        if lowered.startswith('question-and'):
            # Everything after this heading is flagged as Q&A.
            qa = 1
            continue
        paragraphs = _paragraphs_after(header)
        if any(kind in text for kind in _SPEAKER_TYPES):
            participants.extend([text, name] for name in paragraphs)
        else:
            content.append([header.text, qa, '\n'.join(paragraphs)])
    return meta, participants, content
81
82
# Site root (note the trailing slash) and the link text that identifies
# transcript articles on a listing page.
SA_URL = 'https://seekingalpha.com/'
TRANSCRIPT = re.compile('Earnings Call Transcript')

# Walk the paginated transcript listing, starting at page 1, until a page
# yields no transcript links; fetch, parse and store each linked article.
page = 1
driver = webdriver.Firefox()
try:
    while True:
        print(f'Page: {page}')
        # SA_URL already ends with '/', so join a relative path rather than
        # interpolating it (which produced 'https://seekingalpha.com//...').
        driver.get(urljoin(SA_URL, f'earnings/earnings-call-transcripts/{page}'))
        page += 1
        soup = BeautifulSoup(driver.page_source, 'lxml')
        links = soup.find_all(name='a', string=TRANSCRIPT)
        if not links:
            # No transcript links on this page: stop crawling.
            break
        for link in links:
            transcript_url = link.attrs.get('href')
            if not transcript_url:
                # Without an href, urljoin would fall back to the site root
                # and the homepage would be fetched as an article.
                continue
            absolute_url = urljoin(SA_URL, transcript_url)
            # Add the query parameter part=single to the article URL.
            article_url = furl(absolute_url).add({'part': 'single'})
            driver.get(article_url.url)
            result = parse_html(driver.page_source)
            if result is not None:
                meta, participants, content = result
                # Store the article URL itself, not the BeautifulSoup tag,
                # so earnings.csv holds a plain link.
                meta['link'] = absolute_url
                store_result(meta, participants, content)
            # Pause between 4 and 6 seconds after every article request.
            sleep(5 + (random() - .5) * 2)
finally:
    # Always shut the browser session down, even if the crawl fails midway.
    driver.quit()