# Source code for wpull.document.css

'''Stylesheet reader.'''
import codecs
import io
import re

from wpull.document.base import BaseDocumentDetector, BaseTextStreamReader, \
    VeryFalse
from wpull.regexstream import RegexStream
import wpull.string
import wpull.util


class CSSReader(BaseDocumentDetector, BaseTextStreamReader):
    '''Cascading Stylesheet Document Reader.

    Provides heuristics for deciding whether a URL, request, response or
    file looks like CSS, and a text iterator that separates ``url(...)``
    and ``@import`` references from the surrounding stylesheet text.

    The detector methods return ``True`` when the document looks like
    CSS, ``VeryFalse`` when it clearly is not, and ``None`` when no
    conclusion is reached.
    '''
    # Matches ``url(...)``. Group 1 is the optional opening quote (which
    # must be repeated as the closing quote) and group 2 is the link,
    # limited to 500 characters.
    URL_PATTERN = r'''url\(\s*(['"]?)(.{1,500}?)(?:\1)\s*\)'''
    # Matches ``@import ...;``. Its single group is the link; in the
    # combined URL_REGEX below it becomes group 3.
    IMPORT_URL_PATTERN = r'''@import\s*(?:url\()?['"]?([^\s'")]{1,500}).*?;'''
    # Alternation of the two patterns above: groups 1-2 come from
    # URL_PATTERN and group 3 from IMPORT_URL_PATTERN.
    URL_REGEX = re.compile(r'{}|{}'.format(URL_PATTERN, IMPORT_URL_PATTERN))
    # NOTE(review): the two constants below are not referenced in this
    # class; presumably they are read by the stream-reading machinery
    # elsewhere -- confirm before changing them.
    BUFFER_SIZE = 1048576
    STREAM_REWIND = 4096

    @classmethod
    def is_url(cls, url_info):
        '''Return whether the document is likely to be CSS.

        Args:
            url_info: Object whose ``path`` attribute is the URL path.

        Returns:
            ``True`` if ``.css`` occurs anywhere in the lowercased path,
            otherwise ``None``.
        '''
        if '.css' in url_info.path.lower():
            return True

        return None

    @classmethod
    def is_request(cls, request):
        '''Return whether the document is likely to be CSS.

        Args:
            request: Object whose ``url_info`` attribute is passed on to
                :meth:`is_url`.

        Returns:
            The result of :meth:`is_url` for the request's URL.
        '''
        return cls.is_url(request.url_info)

    @classmethod
    def is_response(cls, response):
        '''Return whether the document is likely to be CSS.

        Args:
            response: Object providing ``fields`` (a mapping queried for
                the ``content-type`` entry) and ``body``.

        Returns:
            ``True`` if the content type mentions ``css``. If the content
            type mentions ``html`` and the body is truthy, the result of
            :meth:`is_file` on the body. Otherwise ``None``.
        '''
        # Read the header once; both checks below use the same value.
        content_type = response.fields.get('content-type', '').lower()

        if 'css' in content_type:
            return True

        # Stylesheet mistakenly served as HTML
        if response.body and 'html' in content_type:
            return cls.is_file(response.body)

        return None

    @classmethod
    def is_file(cls, file):
        '''Return whether the file is likely CSS.

        Args:
            file: File object handed to ``wpull.util.peek_file``.

        Returns:
            ``VeryFalse`` if the peeked data contains ``<html``,
            ``True`` if it contains a typical CSS token (``@import ``,
            ``color:``, ``background*:`` or ``font*:``), otherwise
            ``None``.
        '''
        # Lowercased so that the markers below match case-insensitively.
        peeked_data = wpull.string.printable_bytes(
            wpull.util.peek_file(file)).lower()

        # An HTML document may embed CSS, so rule HTML out first.
        if b'<html' in peeked_data:
            return VeryFalse

        if re.search(br'@import |color:|background[a-z-]*:|font[a-z-]*:',
                     peeked_data):
            return True

        return None

    def iter_text(self, file, encoding=None):
        '''Iterate the stylesheet text, tagging link matches.

        Args:
            file: A text stream (``io.TextIOBase``), used as is, or a
                binary file, which is wrapped in a decoding reader.
            encoding (str): Codec name used to decode a binary file.
                Falls back to ``latin1`` when not given.

        Yields:
            tuple: ``(text, kind)`` where ``text`` is the segment
            reported by ``RegexStream`` and ``kind`` is ``'import'`` for
            an ``@import`` match, ``'url'`` for a ``url(...)`` match, or
            ``False`` for text that did not match ``URL_REGEX``.
        '''
        if isinstance(file, io.TextIOBase):
            stream = file
        else:
            stream = codecs.getreader(encoding or 'latin1')(file)

        regex_stream = RegexStream(stream, self.URL_REGEX)

        for match, text in regex_stream.stream():
            if match:
                # Group 3 belongs to IMPORT_URL_PATTERN, so it is only
                # set when the ``@import`` alternative matched.
                yield (text, 'import' if match.group(3) else 'url')
            else:
                yield (text, False)