# encoding=utf-8
'''Robots.txt exclusion directives.'''
import gettext
import logging
from wpull.thirdparty import robotexclusionrulesparser
from wpull.url import URLInfo
_logger = logging.getLogger(__name__)
_ = gettext.gettext
class RobotsTxtPool(object):
    '''Pool of robots.txt parsers.'''
    def __init__(self):
        self._parsers = {}

    def has_parser(self, url_info: URLInfo):
        '''Return whether a parser has been created for the URL.'''
        key = self.url_info_key(url_info)
        return key in self._parsers

    def can_fetch(self, url_info: URLInfo, user_agent: str):
        '''Return whether the URL can be fetched.'''
        key = self.url_info_key(url_info)
        # Assumes load_robots_txt() was already called for this key;
        # raises KeyError otherwise.
        parser = self._parsers[key]
        return parser.is_allowed(user_agent, url_info.url)

    def load_robots_txt(self, url_info: URLInfo, text: str):
        '''Load the robots.txt file.'''
        key = self.url_info_key(url_info)
        parser = robotexclusionrulesparser.RobotExclusionRulesParser()
        parser.parse(text)
        self._parsers[key] = parser

    @classmethod
    def url_info_key(cls, url_info: URLInfo) -> tuple:
        '''Return the parser cache key: the URL's scheme, hostname, and port.'''
        return url_info.scheme, url_info.hostname, url_info.port