ml-finance-python

Python scripts for machine learning in finance

git clone https://9o.is/git/ml-finance-python.git

sa_selenium.py

(3595B)


      1 #!/usr/bin/env python
      2 # -*- coding: utf-8 -*-
      3 __author__ = 'Stefan Jansen'
      4 
      5 import re
      6 from pathlib import Path
      7 from random import random
      8 from time import sleep
      9 from urllib.parse import urljoin
     10 
     11 import pandas as pd
     12 from bs4 import BeautifulSoup
     13 from furl import furl
     14 from selenium import webdriver
     15 
     16 transcript_path = Path('transcripts')
     17 
     18 
     19 def store_result(meta, participants, content):
     20     path = transcript_path / 'parsed' / meta['symbol']
     21     if not path.exists():
     22         path.mkdir(parents=True, exist_ok=True)
     23     pd.DataFrame(content, columns=['speaker', 'q&a', 'content']).to_csv(path / 'content.csv', index=False)
     24     pd.DataFrame(participants, columns=['type', 'name']).to_csv(path / 'participants.csv', index=False)
     25     pd.Series(meta).to_csv(path / 'earnings.csv')
     26 
     27 
     28 def parse_html(html):
     29     date_pattern = re.compile(r'(\d{2})-(\d{2})-(\d{2})')
     30     quarter_pattern = re.compile(r'(\bQ\d\b)')
     31     soup = BeautifulSoup(html, 'lxml')
     32 
     33     meta, participants, content = {}, [], []
     34     h1 = soup.find('h1', itemprop='headline')
     35     if h1 is None:
     36         return
     37     h1 = h1.text
     38     meta['company'] = h1[:h1.find('(')].strip()
     39     meta['symbol'] = h1[h1.find('(') + 1:h1.find(')')]
     40 
     41     title = soup.find('div', class_='title')
     42     if title is None:
     43         return
     44     title = title.text
     45     print(title)
     46     match = date_pattern.search(title)
     47     if match:
     48         m, d, y = match.groups()
     49         meta['month'] = int(m)
     50         meta['day'] = int(d)
     51         meta['year'] = int(y)
     52 
     53     match = quarter_pattern.search(title)
     54     if match:
     55         meta['quarter'] = match.group(0)
     56 
     57     qa = 0
     58     speaker_types = ['Executives', 'Analysts']
     59     for header in [p.parent for p in soup.find_all('strong')]:
     60         text = header.text.strip()
     61         if text.lower().startswith('copyright'):
     62             continue
     63         elif text.lower().startswith('question-and'):
     64             qa = 1
     65             continue
     66         elif any([type in text for type in speaker_types]):
     67             for participant in header.find_next_siblings('p'):
     68                 if participant.find('strong'):
     69                     break
     70                 else:
     71                     participants.append([text, participant.text])
     72         else:
     73             p = []
     74             for participant in header.find_next_siblings('p'):
     75                 if participant.find('strong'):
     76                     break
     77                 else:
     78                     p.append(participant.text)
     79             content.append([header.text, qa, '\n'.join(p)])
     80     return meta, participants, content
     81 
     82 
     83 SA_URL = 'https://seekingalpha.com/'
     84 TRANSCRIPT = re.compile('Earnings Call Transcript')
     85 
     86 next_page = True
     87 page = 1
     88 driver = webdriver.Firefox()
     89 while next_page:
     90     print(f'Page: {page}')
     91     url = f'{SA_URL}/earnings/earnings-call-transcripts/{page}'
     92     driver.get(urljoin(SA_URL, url))
     93     response = driver.page_source
     94     page += 1
     95     soup = BeautifulSoup(response, 'lxml')
     96     links = soup.find_all(name='a', string=TRANSCRIPT)
     97     if len(links) == 0:
     98         next_page = False
     99     else:
    100         for link in links:
    101             transcript_url = link.attrs.get('href')
    102             article_url = furl(urljoin(SA_URL, transcript_url)).add({'part': 'single'})
    103             driver.get(article_url.url)
    104             html = driver.page_source
    105             result = parse_html(html)
    106             if result is not None:
    107                 meta, participants, content = result
    108                 meta['link'] = link
    109                 store_result(meta, participants, content)
    110             sleep(5 + (random() - .5) * 2)
    111 
    112 driver.close()
    113 # pd.Series(articles).to_csv('articles.csv')