Source code for wpull.document.htmlparse.lxml_

'''Parsing using lxml and libxml2.'''
import io

import lxml.html

from wpull.collections import EmptyFrozenDict, FrozenDict
from wpull.document.htmlparse.base import BaseParser
from wpull.document.htmlparse.element import Element, Comment
from wpull.document.xml import XMLDetector
import wpull.util


class HTMLParserTarget(object):
    '''Parser target that converts lxml parse events into element objects.

    Text that follows a start tag is collected in ``buffer``; text that
    follows an end tag (the "tail") is collected in ``tail_buffer``. Whenever
    the next event arrives, the pending buffer is wrapped in an ``Element``
    and handed to the callback.

    Args:
        callback: A callback function. The function should accept one
            argument from :mod:`.document.htmlparse.element`.
    '''
    # NOTE: If we ever support Python 2 again, byte strings may be
    # returned from lxml
    def __init__(self, callback):
        self.callback = callback
        self.tag = None
        self.attrib = None
        self.buffer = None
        self.tail_buffer = None

    def _emit_text(self, tag):
        '''Report pending element text under ``tag`` and drop the buffer.'''
        pending = self.buffer

        if not pending:
            return

        self.callback(
            Element(tag, self.attrib, pending.getvalue(), None, False)
        )
        self.buffer = None

    def _emit_tail(self):
        '''Report pending tail text for the current tag and drop the buffer.'''
        pending = self.tail_buffer

        if not pending:
            return

        self.callback(
            Element(self.tag, EmptyFrozenDict(), None, pending.getvalue(),
                    True)
        )
        self.tail_buffer = None

    def start(self, tag, attrib):
        '''Handle an opening tag.

        Anything still buffered belongs to the previous tag, so it is
        reported before the new tag's state replaces it.
        '''
        self._emit_text(self.tag)
        self._emit_tail()

        self.tag = tag
        self.attrib = FrozenDict(attrib)
        self.buffer = io.StringIO()

    def data(self, data):
        '''Append character data to whichever buffer is currently active.'''
        for active in (self.buffer, self.tail_buffer):
            if active:
                active.write(data)

    def end(self, tag):
        '''Handle a closing tag.

        Pending text is reported under the closing ``tag``; pending tail
        text is reported under the previously recorded tag. A fresh tail
        buffer is then opened for text following this closing tag.
        '''
        self._emit_text(tag)
        self._emit_tail()

        self.tail_buffer = io.StringIO()
        self.tag = tag

    def comment(self, text):
        '''Report a comment node through the callback.'''
        self.callback(Comment(text))

    def close(self):
        '''Report any remaining buffered text and return ``True``.'''
        self._emit_text(self.tag)
        self._emit_tail()

        return True
class HTMLParser(BaseParser):
    '''HTML document parser.

    This reader uses lxml as the parser.
    '''
    # Number of bytes requested from the file for each ``feed()`` call.
    BUFFER_SIZE = 131072

    @property
    def parser_error(self):
        '''The exception class raised by lxml: ``lxml.etree.LxmlError``.'''
        return lxml.etree.LxmlError

    def parse(self, file, encoding=None):
        '''Parse the document, choosing the parser type automatically.

        Args:
            file: A file object containing the document.
            encoding (str): The encoding of the document.

        Returns:
            iterator: Each item is an element from
            :mod:`.document.htmlparse.element`
        '''
        parser_type = self.detect_parser_type(file, encoding=encoding)

        if parser_type == 'xhtml':
            # Use the HTML parser because there exists XHTML soup
            parser_type = 'html'

        for element in self.parse_lxml(file, encoding=encoding,
                                       parser_type=parser_type):
            yield element

    def parse_lxml(self, file, encoding=None, target_class=HTMLParserTarget,
                   parser_type='html'):
        '''Return an iterator of elements found in the document.

        Args:
            file: A file object containing the document.
            encoding (str): The encoding of the document.
            target_class: A class to be used for target parsing.
            parser_type (str): The type of parser to use. Accepted values:
                ``html``, ``xhtml``, ``xml``.

        Returns:
            iterator: Each item is an element from
            :mod:`.document.htmlparse.element`
        '''
        if encoding:
            # Fall back to latin1 when lxml rejects every spelling of the
            # requested encoding.
            lxml_encoding = to_lxml_encoding(encoding) or 'latin1'
        else:
            lxml_encoding = encoding

        # The target appends to this list during feed(); the list is
        # drained after every chunk so elements are yielded incrementally.
        elements = []
        callback_func = elements.append

        target = target_class(callback_func)

        if parser_type == 'html':
            parser = lxml.html.HTMLParser(
                encoding=lxml_encoding, target=target
            )
        elif parser_type == 'xhtml':
            parser = lxml.html.XHTMLParser(
                encoding=lxml_encoding, target=target, recover=True
            )
        else:
            parser = lxml.etree.XMLParser(
                encoding=lxml_encoding, target=target, recover=True
            )

        if parser_type == 'html':
            # XXX: Force libxml2 to do full read in case of early "</html>"
            # See https://github.com/chfoo/wpull/issues/104
            # See https://bugzilla.gnome.org/show_bug.cgi?id=727935
            #
            # ``str.encode(None)`` raises TypeError, so when the caller did
            # not supply an encoding the prefix is fed as plain ASCII bytes.
            if encoding:
                html_prefix = '<html>'.encode(encoding)
            else:
                html_prefix = b'<html>'

            for _ in range(3):
                parser.feed(html_prefix)

        while True:
            data = file.read(self.BUFFER_SIZE)

            if not data:
                break

            parser.feed(data)

            for element in elements:
                yield element

            del elements[:]

        # close() lets the target flush whatever it still has buffered.
        parser.close()

        for element in elements:
            yield element

    @classmethod
    def parse_doctype(cls, file, encoding=None):
        '''Get the doctype from the document.

        The document is parsed with a recovering XML parser from the bytes
        returned by ``wpull.util.peek_file(file)``.

        Returns:
            str, None: The doctype string; ``None`` if the parsed tree has
            no root element or lxml raised an error.
        '''
        if encoding:
            lxml_encoding = to_lxml_encoding(encoding) or 'latin1'
        else:
            lxml_encoding = encoding

        try:
            parser = lxml.etree.XMLParser(encoding=lxml_encoding, recover=True)
            tree = lxml.etree.parse(
                io.BytesIO(wpull.util.peek_file(file)), parser=parser
            )

            if tree.getroot() is not None:
                return tree.docinfo.doctype
        except lxml.etree.LxmlError:
            # Best effort: an unparsable document simply has no doctype.
            pass

    @classmethod
    def detect_parser_type(cls, file, encoding=None):
        '''Get the suitable parser type for the document.

        Returns:
            str: ``xml`` when no doctype was found and ``XMLDetector``
            reports the file as XML, ``xhtml`` when the doctype mentions
            ``XHTML``, otherwise ``html``.
        '''
        is_xml = XMLDetector.is_file(file)
        doctype = cls.parse_doctype(file, encoding=encoding) or ''

        if not doctype and is_xml:
            return 'xml'

        if 'XHTML' in doctype:
            return 'xhtml'

        return 'html'
def to_lxml_encoding(encoding):
    '''Find a spelling of the encoding name that lxml accepts.

    The name is tried as given, then with hyphens removed, then with
    underscores removed as well.

    Returns:
        str, None: The first accepted spelling, or ``None`` if lxml
        rejects all of them.
    '''
    # XXX: Workaround lxml not liking utf-16-le
    candidate = encoding

    # Each entry is the character to strip if the current spelling is
    # rejected; ``None`` marks the final attempt.
    for strip_char in ('-', '_', None):
        try:
            lxml.html.HTMLParser(encoding=candidate)
        except LookupError:
            if strip_char is None:
                return None

            candidate = candidate.replace(strip_char, '')
        else:
            return candidate