Source code for wpull.protocol.http.robots

# encoding=utf-8
'''Robots.txt file logistics.'''
import contextlib
import gettext
import logging
import os

import asyncio

from wpull.backport.logging import BraceMessage as __
import wpull.body
from wpull.errors import ServerError, ProtocolError
from wpull.protocol.http.request import Request, Response
from wpull.protocol.http.web import WebClient
from wpull.robotstxt import RobotsTxtPool
from wpull.url import URLInfo
import wpull.util


_logger = logging.getLogger(__name__)
_ = gettext.gettext


class NotInPoolError(Exception):
    '''The URL is not in the pool.'''


class RobotsTxtChecker(object):
    '''Robots.txt file fetcher and checker.

    Args:
        web_client: Web Client.
        robots_txt_pool: Robots.txt Pool.
    '''
    def __init__(self, web_client: WebClient=None,
                 robots_txt_pool: RobotsTxtPool=None):
        self._web_client = web_client or WebClient()
        self._robots_txt_pool = robots_txt_pool or RobotsTxtPool()

    @property
    def web_client(self) -> WebClient:
        '''Return the WebClient.'''
        return self._web_client

    @property
    def robots_txt_pool(self) -> RobotsTxtPool:
        '''Return the RobotsTxtPool.'''
        return self._robots_txt_pool

    def can_fetch_pool(self, request: Request):
        '''Return whether the request can be fetched based on the pool.'''
        url_info = request.url_info
        user_agent = request.fields.get('User-agent', '')

        if self._robots_txt_pool.has_parser(url_info):
            return self._robots_txt_pool.can_fetch(url_info, user_agent)
        else:
            raise NotInPoolError()

    @asyncio.coroutine
    def fetch_robots_txt(self, request: Request, file=None):
        '''Fetch the robots.txt file for the request.

        Coroutine.
        '''
        url_info = request.url_info
        url = URLInfo.parse('{0}://{1}/robots.txt'.format(
            url_info.scheme, url_info.hostname_with_port)).url

        if not file:
            file = wpull.body.new_temp_file(os.getcwd(), hint='robots')

        with contextlib.closing(file):
            request = Request(url)
            session = self._web_client.session(request)

            while not session.done():
                # The session may span several requests (e.g. redirects);
                # truncate the file each time so only the latest response
                # body is kept.
                wpull.util.truncate_file(file.name)

                try:
                    response = yield from session.start()
                    yield from session.download(file=file)
                except ProtocolError:
                    self._accept_as_blank(url_info)
                    return

            status_code = response.status_code

            if 500 <= status_code <= 599:
                raise ServerError('Server returned error for robots.txt.')

            if status_code == 200:
                self._read_content(response, url_info)
            else:
                self._accept_as_blank(url_info)

    @asyncio.coroutine
    def can_fetch(self, request: Request, file=None) -> bool:
        '''Return whether the request can be fetched.

        Args:
            request: Request.
            file: A file object to where the robots.txt contents are written.

        Coroutine.
        '''
        try:
            return self.can_fetch_pool(request)
        except NotInPoolError:
            pass

        yield from self.fetch_robots_txt(request, file=file)

        return self.can_fetch_pool(request)

    def _read_content(self, response: Response, original_url_info: URLInfo):
        '''Read the response and parse the contents into the pool.'''
        data = response.body.read(4096)
        url_info = original_url_info

        try:
            self._robots_txt_pool.load_robots_txt(url_info, data)
        except ValueError:
            _logger.warning(__(
                _('Failed to parse {url} for robots exclusion rules. '
                  'Ignoring.'),
                url=url_info.url
            ))
            self._accept_as_blank(url_info)
        else:
            _logger.debug(__('Got a good robots.txt for {0}.', url_info.url))

    def _accept_as_blank(self, url_info: URLInfo):
        '''Mark the URL as OK in the pool.'''
        _logger.debug(__('Got empty robots.txt for {0}.', url_info.url))
        self._robots_txt_pool.load_robots_txt(url_info, '')
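

# Usage sketch (an illustrative addition, not part of the original module):
# pre-load robots.txt rules into the pool and query them through the checker
# without any network traffic. The URL, rules and User-agent below are
# hypothetical, and ``url_info`` is populated by hand here, which is assumed
# to mirror what wpull's normal request preparation would do.
if __name__ == '__main__':
    pool = RobotsTxtPool()
    checker = RobotsTxtChecker(robots_txt_pool=pool)

    example_request = Request('http://example.com/secret/page.html')
    example_request.url_info = URLInfo.parse(example_request.url)
    example_request.fields['User-agent'] = 'ExampleBot/1.0'

    try:
        # Raises NotInPoolError on the first call: nothing is cached yet.
        print(checker.can_fetch_pool(example_request))
    except NotInPoolError:
        # Load rules by hand instead of letting fetch_robots_txt() download
        # them from the network.
        pool.load_robots_txt(
            example_request.url_info,
            'User-agent: *\nDisallow: /secret/\n')
        # Prints False: '/secret/' is disallowed for all user agents.
        print(checker.can_fetch_pool(example_request))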