'''Sitemap scraper'''
import gettext
import logging
import wpull.util
from wpull.backport.logging import StyleAdapter
from wpull.document.sitemap import SitemapReader
from wpull.document.util import detect_response_encoding
from wpull.pipeline.item import LinkType
from wpull.scraper.base import BaseExtractiveScraper, LinkContext, ScrapeResult
# Alias for marking translatable strings (no translations installed here).
_ = gettext.gettext
# Module logger wrapped so log calls can use str.format-style placeholders.
_logger = StyleAdapter(logging.getLogger(__name__))
class SitemapScraper(SitemapReader, BaseExtractiveScraper):
    '''Scrape Sitemaps.'''

    def __init__(self, html_parser, encoding_override=None):
        '''
        Args:
            html_parser: Parser instance passed through to
                :class:`.document.sitemap.SitemapReader`.
            encoding_override (str): If given, use this encoding instead
                of detecting it from the response.
        '''
        super().__init__(html_parser)
        self._encoding_override = encoding_override

    def scrape(self, request, response, link_type=None):
        '''Extract links from a sitemap response.

        Args:
            request: The request whose URL is used as the base URL.
            response: The response containing the sitemap document body.
            link_type: If given and not :attr:`LinkType.sitemap`, the
                document is skipped.

        Returns:
            ScrapeResult: The discovered link contexts and the encoding
            used, or ``None`` when the document is not a supported
            sitemap or is excluded by *link_type*.
        '''
        if not self.is_supported(request=request, response=response):
            return
        if link_type and link_type != LinkType.sitemap:
            return

        base_url = request.url_info.url
        encoding = self._encoding_override \
            or detect_response_encoding(response)

        link_contexts = set()

        try:
            # Rewind the body before parsing and restore the original
            # file offset afterwards.
            with wpull.util.reset_file_offset(response.body):
                link_iter = self.iter_processed_links(
                    response.body, encoding, base_url)

                for link in link_iter:
                    link_contexts.add(LinkContext(link, linked=True))
        except (UnicodeError, self._html_parser.parser_error) as error:
            # Best-effort: a malformed or mis-encoded sitemap is logged
            # and skipped rather than aborting the crawl.
            _logger.warning(
                _('Failed to read document at ‘{url}’: {error}'),
                url=request.url_info.url, error=error
            )

        return ScrapeResult(link_contexts, encoding)