import sys
import logging
from abc import ABCMeta, abstractmethod

from scrapy.utils.python import to_unicode

logger = logging.getLogger(__name__)

def decode_robotstxt(robotstxt_body, spider, to_native_str_type=False):
    try:
        if to_native_str_type:
            robotstxt_body = to_unicode(robotstxt_body)
        else:
            robotstxt_body = robotstxt_body.decode('utf-8')
    except UnicodeDecodeError:
        # If we found garbage or robots.txt in an encoding other than UTF-8,
        # disregard it. Switch to the 'allow all' state.
        logger.warning(
            "Failure while parsing robots.txt. File either contains garbage or "
            "is in an encoding other than UTF-8, treating it as an empty file.",
            exc_info=sys.exc_info(),
            extra={'spider': spider},
        )
        robotstxt_body = ''
    return robotstxt_body
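
# Illustrative behaviour (a sketch, not part of the module): a well-formed
# body decodes to text, while undecodable bytes fall back to the empty
# "allow all" body:
#
#     >>> decode_robotstxt(b"User-agent: *\nDisallow: /admin", spider=None)
#     'User-agent: *\nDisallow: /admin'
#     >>> decode_robotstxt(b"\xff\xfe garbage", spider=None)
#     ''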

class RobotParser(metaclass=ABCMeta):
    @classmethod
    @abstractmethod
    def from_crawler(cls, crawler, robotstxt_body):
        """Parse the content of a robots.txt_ file as bytes. This must be a
        class method. It must return a new instance of the parser backend.

        :param crawler: crawler which made the request
        :type crawler: :class:`~scrapy.crawler.Crawler` instance

        :param robotstxt_body: content of a robots.txt_ file.
        :type robotstxt_body: bytes
        """
        pass

    @abstractmethod
    def allowed(self, url, user_agent):
        """Return ``True`` if ``user_agent`` is allowed to crawl ``url``,
        otherwise return ``False``.

        :param url: Absolute URL
        :type url: str

        :param user_agent: User agent
        :type user_agent: str
        """
        pass
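
# A minimal sketch of a custom backend (hypothetical, for illustration only):
# any class implementing this interface can be selected through the
# ROBOTSTXT_PARSER setting by its import path.
#
#     class AllowAllRobotParser(RobotParser):
#         """Backend that treats every robots.txt as permissive."""
#
#         def __init__(self, robotstxt_body, spider):
#             self.spider = spider
#
#         @classmethod
#         def from_crawler(cls, crawler, robotstxt_body):
#             spider = None if not crawler else crawler.spider
#             return cls(robotstxt_body, spider)
#
#         def allowed(self, url, user_agent):
#             return True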

class PythonRobotParser(RobotParser):
    def __init__(self, robotstxt_body, spider):
        from urllib.robotparser import RobotFileParser
        self.spider = spider
        robotstxt_body = decode_robotstxt(robotstxt_body, spider, to_native_str_type=True)
        self.rp = RobotFileParser()
        self.rp.parse(robotstxt_body.splitlines())

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        user_agent = to_unicode(user_agent)
        url = to_unicode(url)
        return self.rp.can_fetch(user_agent, url)
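
# Illustrative stand-alone use of the stdlib-backed parser (a sketch; the
# crawler argument may be None outside a running crawl):
#
#     parser = PythonRobotParser.from_crawler(None, b"User-agent: *\nDisallow: /private")
#     parser.allowed("https://example.com/private", "mybot")  # -> False
#     parser.allowed("https://example.com/public", "mybot")   # -> True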

class ReppyRobotParser(RobotParser):
    def __init__(self, robotstxt_body, spider):
        from reppy.robots import Robots
        self.spider = spider
        self.rp = Robots.parse('', robotstxt_body)

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        return self.rp.allowed(url, user_agent)

class RerpRobotParser(RobotParser):
    def __init__(self, robotstxt_body, spider):
        from robotexclusionrulesparser import RobotExclusionRulesParser
        self.spider = spider
        self.rp = RobotExclusionRulesParser()
        robotstxt_body = decode_robotstxt(robotstxt_body, spider)
        self.rp.parse(robotstxt_body)

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        user_agent = to_unicode(user_agent)
        url = to_unicode(url)
        return self.rp.is_allowed(user_agent, url)

class ProtegoRobotParser(RobotParser):
    def __init__(self, robotstxt_body, spider):
        from protego import Protego
        self.spider = spider
        robotstxt_body = decode_robotstxt(robotstxt_body, spider)
        self.rp = Protego.parse(robotstxt_body)

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        user_agent = to_unicode(user_agent)
        url = to_unicode(url)
        return self.rp.can_fetch(url, user_agent)
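
# Note that Protego's ``can_fetch`` takes ``(url, user_agent)``, the reverse
# of ``RobotFileParser.can_fetch``. Protego is the default backend in recent
# Scrapy versions; to pick another one, point the ROBOTSTXT_PARSER setting at
# the backend's import path, e.g. in settings.py:
#
#     ROBOTSTXT_OBEY = True
#     ROBOTSTXT_PARSER = "scrapy.robotstxt.PythonRobotParser"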