ml-finance-python
Python scripts for machine learning in finance
git clone https://9o.is/git/ml-finance-python.git
middlewares.py
(4517B)
1 # -*- coding: utf-8 -*-
2
3 # Define here the models for your spider middleware
4 #
5 # See documentation in:
6 # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7
8 from scrapy import signals
9 from random import choice
10 from scrapy.exceptions import NotConfigured
11
12
class OpentableSpiderMiddleware(object):
    """Spider middleware that passes everything through unchanged.

    Not all hooks need to be defined: if a method is missing, Scrapy acts
    as if the spider middleware does not modify the passed objects. The
    hooks below are kept as explicit no-ops so they are easy to extend.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and subscribe it to the spider_opened signal.

        Scrapy uses this method as the factory for the middleware instance.
        """
        middleware = cls()
        crawler.signals.connect(
            middleware.spider_opened,
            signal=signals.spider_opened,
        )
        return middleware

    def process_spider_input(self, response, spider):
        """Handle a response on its way into the spider.

        Called for each response that goes through the spider middleware
        and into the spider. Should return None or raise an exception;
        this implementation always returns None.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Yield every element of ``result`` unchanged.

        Called with the results returned from the spider after it has
        processed the response. Must return an iterable of Request, dict
        or Item objects.
        """
        for item in result:
            yield item

    def process_spider_exception(self, response, exception, spider):
        """Decline to handle the exception by returning None.

        Called when a spider or a process_spider_input() method (from
        another spider middleware) raises an exception. Should return
        either None or an iterable of Response, dict or Item objects.
        """
        return None

    def process_start_requests(self, start_requests, spider):
        """Yield every start request unchanged.

        Called with the start requests of the spider. Works like
        process_spider_output(), except that no response is associated.
        Must return only requests (not items).
        """
        for request in start_requests:
            yield request

    def spider_opened(self, spider):
        """Log the name of the spider that has just been opened."""
        # Pass the name as a logging argument so the message is only
        # formatted when the record is actually emitted.
        spider.logger.info('Spider opened: %s', spider.name)
59
60
class OpentableDownloaderMiddleware(object):
    """Downloader middleware that passes everything through unchanged.

    Not all hooks need to be defined: if a method is missing, Scrapy acts
    as if the downloader middleware does not modify the passed objects.
    The hooks below are kept as explicit no-ops so they are easy to extend.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and subscribe it to the spider_opened signal.

        Scrapy uses this method as the factory for the middleware instance.
        """
        middleware = cls()
        crawler.signals.connect(
            middleware.spider_opened,
            signal=signals.spider_opened,
        )
        return middleware

    def process_request(self, request, spider):
        """Let the request continue through the downloader untouched.

        Called for each request that goes through the downloader
        middleware. Must either:
        - return None: continue processing this request
        - or return a Response object
        - or return a Request object
        - or raise IgnoreRequest: process_exception() methods of
          installed downloader middleware will be called

        This implementation always returns None.
        """
        return None

    def process_response(self, request, response, spider):
        """Return the downloaded response unchanged.

        Called with the response returned from the downloader.
        Must either:
        - return a Response object
        - return a Request object
        - or raise IgnoreRequest
        """
        return response

    def process_exception(self, request, exception, spider):
        """Decline to handle the exception by returning None.

        Called when a download handler or a process_request() (from
        another downloader middleware) raises an exception. Must either:
        - return None: continue processing this exception
        - return a Response object: stops process_exception() chain
        - return a Request object: stops process_exception() chain
        """
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that has just been opened."""
        # Pass the name as a logging argument so the message is only
        # formatted when the record is actually emitted.
        spider.logger.info('Spider opened: %s', spider.name)
106
107
class RotateUserAgentMiddleware(object):
    """Rotate user-agent for each request.

    Rotation is off by default and is switched on per spider through a
    truthy ``rotate_user_agent`` attribute on the spider. The candidate
    user agents come from the ``USER_AGENT_CHOICES`` setting.
    """

    def __init__(self, user_agents):
        """Store the candidate user agents as a list.

        ``user_agents`` may be any iterable of strings, a single string,
        or None. A lone string is treated as one user agent; without this,
        ``random.choice`` would pick a single character from it.
        """
        self.enabled = False
        if isinstance(user_agents, (str, bytes)):
            user_agents = [user_agents]
        # Materialise as a list: random.choice needs an indexable sequence,
        # so sets or generators would otherwise fail at request time.
        self.user_agents = list(user_agents or ())

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the crawler settings.

        Raises NotConfigured when ``USER_AGENT_CHOICES`` is missing or
        empty. Otherwise the new instance is connected to the
        spider_opened signal and returned.
        """
        user_agents = crawler.settings.get('USER_AGENT_CHOICES', [])

        if not user_agents:
            raise NotConfigured("USER_AGENT_CHOICES not set or empty")

        middleware = cls(user_agents)
        crawler.signals.connect(
            middleware.spider_opened,
            signal=signals.spider_opened,
        )

        return middleware

    def spider_opened(self, spider):
        """Take the enabled flag from the spider's ``rotate_user_agent``.

        The current value is kept when the spider does not define the
        attribute.
        """
        self.enabled = getattr(spider, 'rotate_user_agent', self.enabled)

    def process_request(self, request, spider):
        """Overwrite the request's user-agent header with a random choice.

        Does nothing when rotation is disabled or there are no candidates.
        Always returns None.
        """
        if not self.enabled or not self.user_agents:
            return None

        request.headers['user-agent'] = choice(self.user_agents)
        return None