Source code for wpull.document.javascript

import codecs
import re
import io

from wpull.document.base import BaseTextStreamReader, \
    BaseDocumentDetector, VeryFalse
from wpull.regexstream import RegexStream
import wpull.string
import wpull.util


class JavaScriptReader(BaseDocumentDetector, BaseTextStreamReader):
    '''Detector and text reader for JavaScript documents.

    The ``is_*`` class methods are heuristics: each returns ``True`` when
    the input looks like JavaScript and ``None`` when the heuristic does
    not match. :meth:`is_file` may additionally return ``VeryFalse``.
    '''

    # Pattern based from https://github.com/internetarchive/heritrix3/
    # blob/ffd248f7800dbd4bff1cf8afaa57a0a3e945ed85/modules/src/
    # main/java/org/archive/modules/extractor/ExtractorJS.java
    #
    # Group 1 is the opening quote (optionally preceded by up to eight
    # backslashes), group 2 is the quoted candidate text, and the same
    # opening sequence must close the string.
    URL_PATTERN = r'''(\\{0,8}['"])(https?://[^'"]{1,500}|[^\s'"]{1,500})(?:\1)'''
    URL_REGEX = re.compile(URL_PATTERN)

    # NOTE(review): neither constant is read inside this class; presumably
    # they are consumed by the stream-reading base class -- confirm.
    BUFFER_SIZE = 1048576
    STREAM_REWIND = 4096

    @classmethod
    def is_url(cls, url_info):
        '''Return ``True`` if the URL path contains ``.js``.

        The check is a case-insensitive substring test on
        ``url_info.path``. ``None`` is returned when it does not match.
        '''
        lowered_path = url_info.path.lower()

        if '.js' in lowered_path:
            return True

        return None

    @classmethod
    def is_request(cls, request):
        '''Return whether the request's URL is likely to point at JS.

        Delegates to :meth:`is_url` using ``request.url_info``.
        '''
        return cls.is_url(request.url_info)

    @classmethod
    def is_response(cls, response):
        '''Return whether the response is likely to be JS.

        A ``content-type`` field containing ``javascript`` is accepted
        outright. When the field contains ``html`` and the response has a
        body, the decision is handed to :meth:`is_file`. Otherwise
        ``None`` is returned.
        '''
        content_type = response.fields.get('content-type', '').lower()

        if 'javascript' in content_type:
            return True

        # script mistakenly served as HTML
        if response.body and 'html' in content_type:
            return cls.is_file(response.body)

        return None

    @classmethod
    def is_file(cls, file):
        '''Return whether the file is likely JS.

        The peeked data of the file is lowercased and inspected:
        ``VeryFalse`` is returned if it contains ``<html``, ``True`` if it
        contains a common JavaScript token, and ``None`` otherwise.
        '''
        sample = wpull.util.peek_file(file)
        sample = wpull.string.printable_bytes(sample).lower()

        if b'<html' in sample:
            return VeryFalse

        looks_like_script = re.search(
            br'var|function|settimeout|jquery\(', sample)

        if looks_like_script:
            return True

        return None

    def iter_text(self, file, encoding=None):
        '''Yield ``(text, is_match)`` tuples read from the file.

        Args:
            file: A text stream, or a binary file object to be decoded.
            encoding: Codec name used to decode a binary file. ``latin1``
                is used when it is not given.

        Each pair produced by ``RegexStream.stream()`` for
        :attr:`URL_REGEX` is re-emitted as the text followed by the
        truthiness of its match.
        '''
        already_text = isinstance(file, io.TextIOBase)
        text_stream = (
            file if already_text
            else codecs.getreader(encoding or 'latin1')(file)
        )

        scanner = RegexStream(text_stream, self.URL_REGEX)

        for found, segment in scanner.stream():
            yield (segment, bool(found))