Source code for wpull.document.htmlparse.html5lib_

'''Parsing using html5lib python.'''
import html5lib.constants
import html5lib.tokenizer
import io
import os.path

from wpull.collections import FrozenDict, EmptyFrozenDict
from wpull.document.htmlparse.base import BaseParser
from wpull.document.htmlparse.element import Comment, Doctype, Element


# Token type identifiers, looked up once from html5lib's ``tokenTypes``
# table so the parser can compare ``token['type']`` against short names.
(
    DOCTYPE,
    CHARACTERS,
    SPACE_CHARACTERS,
    START_TAG,
    END_TAG,
    EMPTY_TAG,
    COMMENT,
    PARSE_ERROR,
) = map(
    html5lib.constants.tokenTypes.__getitem__,
    (
        'Doctype',
        'Characters',
        'SpaceCharacters',
        'StartTag',
        'EndTag',
        'EmptyTag',
        'Comment',
        'ParseError',
    ),
)


class HTMLParser(BaseParser):
    '''HTML parser driven directly by the html5lib tokenizer.

    No document tree is built: tokens are translated one by one into a
    flat stream of ``Element``, ``Comment`` and ``Doctype`` objects.
    '''

    @property
    def parser_error(self):
        '''Return the exception class ``ValueError``.

        This is the same exception type that :meth:`parse` raises when it
        meets a token type it does not handle.
        '''
        return ValueError

    def parse(self, file, encoding=None):
        '''Tokenize ``file`` and lazily yield the pieces that are found.

        Args:
            file: The input, passed unchanged to
                ``html5lib.tokenizer.HTMLTokenizer``.
            encoding: Optional encoding passed to the tokenizer. When it is
                truthy, the tokenizer's ``useChardet`` and ``parseMeta``
                options are both turned off; otherwise both are turned on.

        Yields:
            * ``Element(tag, attrib, text, None, False)`` for each start
              tag, once the character data following it has been
              collected.
            * ``Element(tag, EmptyFrozenDict(), None, tail, True)`` for
              each end tag, once the character data following it has been
              collected.
            * ``Comment`` for each comment token and ``Doctype`` for each
              doctype token, as soon as the token is seen.

        Raises:
            ValueError: The tokenizer produced a token whose type is not
                handled here.
        '''
        # An explicit encoding disables both kinds of encoding sniffing.
        sniff_encoding = not encoding
        tokenizer = html5lib.tokenizer.HTMLTokenizer(
            file,
            encoding=encoding,
            useChardet=sniff_encoding,
            parseMeta=sniff_encoding,
        )

        def pending_elements(name, attributes, text, tail):
            # Emit whatever has been collected since the previous tag.
            if text is not None:
                yield Element(name, attributes, text.getvalue(), None, False)

            if tail is not None:
                yield Element(
                    name, EmptyFrozenDict(), None, tail.getvalue(), True
                )

        tag = None
        attrib = None
        text_buffer = None  # Character data seen after a start tag.
        tail_buffer = None  # Character data seen after an end tag.

        for token in tokenizer:
            kind = token['type']

            if kind == START_TAG or kind == END_TAG:
                # Any tag closes the run of text that preceded it.
                for element in pending_elements(
                        tag, attrib, text_buffer, tail_buffer):
                    yield element

                text_buffer = None
                tail_buffer = None
                tag = token['name']

                if kind == START_TAG:
                    attrib = FrozenDict(dict(token['data']))
                    text_buffer = io.StringIO()

                    if tag == 'script':
                        # Switch the tokenizer to its script-data state so
                        # the script body is not tokenized as markup.
                        tokenizer.state = tokenizer.scriptDataState
                else:
                    tail_buffer = io.StringIO()

            elif kind == CHARACTERS or kind == SPACE_CHARACTERS:
                if text_buffer is not None:
                    text_buffer.write(token['data'])

                if tail_buffer is not None:
                    tail_buffer.write(token['data'])

            elif kind == COMMENT:
                yield Comment(token['data'])

            elif kind == DOCTYPE:
                yield Doctype('{} {} {}'.format(
                    token['name'], token['publicId'], token['systemId']))

            elif kind != PARSE_ERROR:
                # Parse errors are skipped; anything else is unexpected.
                raise ValueError('Unhandled token {}'.format(token))

        # Emit the text that followed the last tag of the document.
        for element in pending_elements(
                tag, attrib, text_buffer, tail_buffer):
            yield element
if __name__ == '__main__':
    # Manual smoke test: dump the raw html5lib tokens of a sample page,
    # then dump the elements this module's parser produces for the same
    # page.
    path = os.path.join(
        os.path.dirname(__file__), '..', '..',
        'testing', 'samples', 'xkcd_1.html'
    )

    with open(path, 'rb') as in_file:
        tokenizer = html5lib.tokenizer.HTMLTokenizer(in_file)

        for token in tokenizer:
            print(token)

        # The tokenizer pass above has consumed the stream. Rewind it,
        # otherwise the parser below starts at end-of-file and prints
        # nothing.
        in_file.seek(0)

        html_parser = HTMLParser()

        for element in html_parser.parse(in_file):
            print(element)