Source code for wpull.scraper.css

'''Stylesheet scraper.'''

import gettext
import logging

import wpull.util
from wpull.backport.logging import BraceMessage as __
from wpull.document.css import CSSReader
from wpull.document.util import detect_response_encoding
from wpull.pipeline.item import LinkType
from wpull.scraper.base import BaseTextStreamScraper, LinkContext, ScrapeResult

# Short alias for gettext.gettext; wraps user-facing message strings below.
_ = gettext.gettext
# Module-level logger, named after this module via __name__.
_logger = logging.getLogger(__name__)


class CSSScraper(CSSReader, BaseTextStreamScraper):
    '''Scraper that extracts links from CSS stylesheet documents.'''

    def __init__(self, encoding_override=None):
        '''Create the scraper.

        Args:
            encoding_override: When truthy, this encoding is used by
                :meth:`scrape` instead of the one detected from the
                response.
        '''
        super().__init__()
        self._encoding_override = encoding_override

    def iter_processed_text(self, file, encoding=None, base_url=None):
        '''Yield ``(text, is_link)`` pairs from the parent implementation.

        Non-link fragments are passed through with ``is_link`` set to
        ``False``. Link fragments are passed through only when they are
        shorter than 500 characters; longer link fragments are not yielded
        at all.
        '''
        fragments = super().iter_processed_text(
            file, encoding=encoding, base_url=base_url
        )

        for fragment, fragment_is_link in fragments:
            if not fragment_is_link:
                yield (fragment, False)
            elif len(fragment) < 500:
                yield (fragment, fragment_is_link)

    def scrape(self, request, response, link_type=None):
        '''Collect the links found in a CSS response body.

        Args:
            request: Request whose ``url_info.url`` serves as the base URL.
            response: Response whose ``body`` is read for links.
            link_type: Optional filter. When given and not equal to
                ``LinkType.css``, nothing is scraped.

        Returns:
            ``None`` when the document is not supported or is excluded by
            ``link_type``. Otherwise a ``ScrapeResult`` built from the set of
            ``LinkContext`` objects found and the encoding that was used.

        A ``UnicodeError`` raised while reading is logged as a warning;
        links gathered before the error are still returned.
        '''
        unsupported = not self.is_supported(request=request, response=response)
        if unsupported or (link_type and link_type != LinkType.css):
            return None

        found_links = set()
        base_url = request.url_info.url

        encoding = self._encoding_override
        if not encoding:
            encoding = detect_response_encoding(response)

        try:
            # NOTE(review): reset_file_offset presumably restores the body's
            # file position once the block exits -- confirm in wpull.util.
            with wpull.util.reset_file_offset(response.body):
                processed_links = self.iter_processed_links(
                    response.body, encoding, base_url, context=True
                )

                for url, css_context in processed_links:
                    # Links with the 'import' context are treated as
                    # stylesheets; everything else is treated as media.
                    found_type = (
                        LinkType.css if css_context == 'import'
                        else LinkType.media
                    )
                    found_links.add(
                        LinkContext(url, inline=True, link_type=found_type)
                    )
        except UnicodeError as error:
            _logger.warning(__(
                _('Failed to read document at ‘{url}’: {error}'),
                url=request.url_info.url, error=error
            ))

        return ScrapeResult(found_links, encoding)