"""
This modules implements the CrawlSpider which is the recommended spider to use
for scraping typical web sites that requires crawling pages.
See documentation in docs/topics/spiders.rst
"""
import copy
from typing import Sequence
from scrapy.http import Request, HtmlResponse
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Spider
from scrapy.utils.spider import iterate_spider_output
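

# Default hooks used by Rule when no process_links / process_request callable
# is supplied, plus a helper that resolves string callback names to spider methods.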
def _identity(x):
    return x


def _identity_process_request(request, response):
    return request


def _get_method(method, spider):
    if callable(method):
        return method
    elif isinstance(method, str):
        return getattr(spider, method, None)


_default_link_extractor = LinkExtractor()
class Rule:
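    """A crawling rule: couples a LinkExtractor with the callback, errback and
    processing hooks that decide how its extracted links are handled."""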
    def __init__(
        self,
        link_extractor=None,
        callback=None,
        cb_kwargs=None,
        follow=None,
        process_links=None,
        process_request=None,
        errback=None,
    ):
        self.link_extractor = link_extractor or _default_link_extractor
        self.callback = callback
        self.errback = errback
        self.cb_kwargs = cb_kwargs or {}
        self.process_links = process_links or _identity
        self.process_request = process_request or _identity_process_request
        self.follow = follow if follow is not None else not callback
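
    # Resolve string callback/errback/hook names into bound spider methods.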
    def _compile(self, spider):
        self.callback = _get_method(self.callback, spider)
        self.errback = _get_method(self.errback, spider)
        self.process_links = _get_method(self.process_links, spider)
        self.process_request = _get_method(self.process_request, spider)
class CrawlSpider(Spider):
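    """Spider that crawls a site by following links according to a set of Rules.

    Subclasses define ``rules`` and the callbacks they reference. A minimal
    illustrative sketch (the name, URL and pattern are placeholders)::

        class MySpider(CrawlSpider):
            name = "example"
            start_urls = ["https://example.com"]
            rules = (
                Rule(LinkExtractor(allow=r"/items/"), callback="parse_item"),
            )

            def parse_item(self, response):
                yield {"url": response.url, "title": response.css("title::text").get()}
    """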
    rules: Sequence[Rule] = ()

    def __init__(self, *a, **kw):
        super().__init__(*a, **kw)
        self._compile_rules()
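
    # Default callback used when a request specifies none (notably the start
    # requests): handle the response with parse_start_url() and then follow
    # links according to the rules.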
    def _parse(self, response, **kwargs):
        return self._parse_response(
            response=response,
            callback=self.parse_start_url,
            cb_kwargs=kwargs,
            follow=True,
        )
    def parse_start_url(self, response, **kwargs):
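        """Override to process responses for the start URLs; must return an
        iterable of items and/or Requests (the default returns nothing)."""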
        return []
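
    # Hook for post-processing the output of a response callback before it is
    # yielded; the default returns the results unchanged.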
    def process_results(self, response, results):
        return results
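
    # Build the Request for an extracted link, storing the rule index and the
    # link text in the request meta so _callback/_errback can recover the rule.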
    def _build_request(self, rule_index, link):
        return Request(
            url=link.url,
            callback=self._callback,
            errback=self._errback,
            meta=dict(rule=rule_index, link_text=link.text),
        )
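
    # Extract links from an HTML response with each rule's link extractor,
    # de-duplicate them across rules, and yield the resulting requests after
    # passing them through the rule's process_links and process_request hooks.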
    def _requests_to_follow(self, response):
        if not isinstance(response, HtmlResponse):
            return
        seen = set()
        for rule_index, rule in enumerate(self._rules):
            links = [lnk for lnk in rule.link_extractor.extract_links(response)
                     if lnk not in seen]
            for link in rule.process_links(links):
                seen.add(link)
                request = self._build_request(rule_index, link)
                yield rule.process_request(request, response)
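
    # Per-request dispatchers: look up the Rule recorded in the request meta
    # and invoke its callback or errback.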
    def _callback(self, response):
        rule = self._rules[response.meta['rule']]
        return self._parse_response(response, rule.callback, rule.cb_kwargs, rule.follow)

    def _errback(self, failure):
        rule = self._rules[failure.request.meta['rule']]
        return self._handle_failure(failure, rule.errback)
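
    # Run the callback (if any) on the response, pass its output through
    # process_results, then yield follow-up requests when both the rule and
    # the CRAWLSPIDER_FOLLOW_LINKS setting allow following.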
    def _parse_response(self, response, callback, cb_kwargs, follow=True):
        if callback:
            cb_res = callback(response, **cb_kwargs) or ()
            cb_res = self.process_results(response, cb_res)
            for request_or_item in iterate_spider_output(cb_res):
                yield request_or_item
        if follow and self._follow_links:
            for request_or_item in self._requests_to_follow(response):
                yield request_or_item
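
    # Pass a failure to the rule's errback, if one is set, and yield whatever
    # it returns.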
    def _handle_failure(self, failure, errback):
        if errback:
            results = errback(failure) or ()
            for request_or_item in iterate_spider_output(results):
                yield request_or_item
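
    # Copy the class-level rules and resolve their string callback names into
    # bound methods, so the shared Rule objects are never mutated.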
    def _compile_rules(self):
        self._rules = []
        for rule in self.rules:
            self._rules.append(copy.copy(rule))
            self._rules[-1]._compile(self)
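
    # Honour the CRAWLSPIDER_FOLLOW_LINKS setting (True by default), which
    # allows link following to be disabled globally.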
    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        spider._follow_links = crawler.settings.getbool('CRAWLSPIDER_FOLLOW_LINKS', True)
        return spider