Source code for wpull.document.javascript

import codecs
import re
import io

from wpull.document.base import BaseTextStreamReader, \
    BaseDocumentDetector, VeryFalse
from wpull.regexstream import RegexStream
import wpull.string
import wpull.util


class JavaScriptReader(BaseDocumentDetector, BaseTextStreamReader):
    '''JavaScript Document Reader.

    Provides heuristics for deciding whether a URL, request, response or
    file is likely to be JavaScript, and scans document text for quoted
    strings matching :attr:`URL_PATTERN`.
    '''
    # Pattern based from https://github.com/internetarchive/heritrix3/
    # blob/ffd248f7800dbd4bff1cf8afaa57a0a3e945ed85/modules/src/
    # main/java/org/archive/modules/extractor/ExtractorJS.java
    #
    # Group 1: the opening quote, optionally preceded by up to 8 backslashes.
    # Group 2: the quoted candidate; either ``http(s)://`` followed by up to
    #          500 non-quote characters, or up to 500 characters that are
    #          neither whitespace nor quotes.
    # The candidate must be closed by the same text that group 1 matched.
    URL_PATTERN = r'''(\\{0,8}['"])(https?://[^'"]{1,500}|[^\s'"]{1,500})(?:\1)'''
    URL_REGEX = re.compile(URL_PATTERN)

    # NOTE(review): neither constant is referenced inside this class;
    # presumably they are read by callers or base classes. Confirm before
    # removing.
    BUFFER_SIZE = 1048576
    STREAM_REWIND = 4096

    @classmethod
    def is_url(cls, url_info):
        '''Return whether the document is likely to be JS.

        Args:
            url_info: An object whose ``path`` attribute is the URL path
                as a string.

        Returns:
            bool: True if the lowercased path contains ``.js``.
        '''
        # Loose substring test: it also accepts paths such as ``.json``
        # or ``.jsp``.
        return '.js' in url_info.path.lower()

    @classmethod
    def is_request(cls, request):
        '''Return whether the document is likely to be JS.

        Args:
            request: An object whose ``url_info`` attribute is accepted by
                :meth:`is_url`.

        Returns:
            bool: The result of :meth:`is_url` for the request's URL.
        '''
        return cls.is_url(request.url_info)

    @classmethod
    def is_response(cls, response):
        '''Return whether the document is likely to be JS.

        Args:
            response: An object with a ``fields`` mapping of header values
                and a ``body`` attribute.

        Returns:
            True if the ``content-type`` header mentions ``javascript``.
            If the header mentions ``html`` and the response has a body,
            the result of :meth:`is_file` on that body (which may be
            ``VeryFalse``). Otherwise False.
        '''
        content_type = response.fields.get('content-type', '').lower()

        if 'javascript' in content_type:
            return True

        # script mistakenly served as HTML
        if response.body and 'html' in content_type:
            return cls.is_file(response.body)

        return False

    @classmethod
    def is_file(cls, file):
        '''Return whether the file is likely JS.

        Args:
            file: A file object handed to ``wpull.util.peek_file``.

        Returns:
            ``VeryFalse`` if the peeked data contains ``<html``, True if it
            contains one of a few common JavaScript tokens, otherwise
            False.
        '''
        # The data is lowercased so that the checks below are
        # case-insensitive.
        peeked_data = wpull.string.printable_bytes(
            wpull.util.peek_file(file)).lower()

        if b'<html' in peeked_data:
            return VeryFalse

        return bool(
            re.search(br'var|function|settimeout|jquery\(', peeked_data)
        )

    def iter_text(self, file, encoding=None):
        '''Iterate over the document text, flagging pattern matches.

        Args:
            file: A file object containing the document. A text stream
                (``io.TextIOBase``) is used directly; anything else is
                wrapped in a decoding reader.
            encoding (str): The encoding used to decode a non-text file.
                Falls back to ``latin1`` when not given.

        Returns:
            iterator: Tuples of ``(text, is_match)`` where ``is_match`` is
            True if ``RegexStream`` reported a match of :attr:`URL_REGEX`
            for that piece of text.
        '''
        if isinstance(file, io.TextIOBase):
            stream = file
        else:
            stream = codecs.getreader(encoding or 'latin1')(file)

        regex_stream = RegexStream(stream, self.URL_REGEX)

        for match, text in regex_stream.stream():
            yield (text, bool(match))

    def read_links(self, file, encoding=None):
        '''Return a list of links found in the document.

        Args:
            file: A file object containing the document.
            encoding (str): The encoding of the document.

        Returns:
            list: str
        '''
        return [
            text for text, is_link in self.iter_text(file, encoding)
            if is_link
        ]