Source code for wpull.regexstream

'''Regular expression streams.'''


class RegexStream(object):
    '''Streams file with regular expressions.

    The file is read in chunks of ``read_size``. Each chunk is searched
    together with the first ``overlap_size`` characters of the following
    chunk, so a match that straddles a chunk boundary can still be found
    as long as it ends inside the overlap.

    Args:
        file: File object. Only its ``read(size)`` method is used.
        pattern: A compiled regular expression object. The text reported
            for a match is its last matched capture group
            (``match.lastindex``), or the whole match when the pattern
            has no capture groups.
        read_size (int): The size of a chunk of text that is searched.
        overlap_size (int): The amount of overlap between chunks of text
            that is searched.
    '''
    def __init__(self, file, pattern, read_size=16384, overlap_size=4096):
        self._file = file
        self._pattern = pattern
        self._read_size = read_size
        self._overlap_size = overlap_size

    def stream(self):
        '''Iterate the file stream.

        Returns:
            iterator: Each item is a tuple:

            1. None, regex match
            2. str

            Items whose first element is ``None`` carry text that was not
            matched; items whose first element is a match object carry
            the text of the reported group.
        '''
        read = self._file.read
        search = self._pattern.search
        read_size = self._read_size
        overlap_size = self._overlap_size

        # Absolute (whole-stream) offsets: where chunk_a begins and where
        # the next search has to resume.
        chunk_a_index = 0
        search_start_index = 0
        # Absolute offset of the most recently reported empty match, so the
        # same empty match is never reported twice (see below).
        last_empty_index = None

        chunk_a = read(read_size)

        while chunk_a:
            chunk_b = read(read_size)
            current_chunk = chunk_a + chunk_b[:overlap_size]
            # May exceed len(current_chunk); search() clamps endpos.
            offset_end = len(chunk_a) + overlap_size

            while True:
                offset_start = search_start_index - chunk_a_index
                match = search(current_chunk, offset_start, offset_end)

                if not match:
                    # Only flush up to the end of chunk_a; the overlap is
                    # searched again as the head of the next chunk.
                    unmatched_part = chunk_a[offset_start:]

                    if unmatched_part:
                        yield (None, unmatched_part)

                    search_start_index += len(unmatched_part)
                    break

                # lastindex is None when the pattern has no capture group;
                # fall back to group 0 (the whole match) instead of raising.
                group_index = match.lastindex or 0
                start_index, end_index = match.span(group_index)
                matched_part = match.group(group_index)
                unmatched_part = current_chunk[offset_start:start_index]

                if unmatched_part:
                    yield (None, unmatched_part)

                # An empty group can be found again at the same position
                # (on the next search, or in the next chunk's window);
                # report it only once.
                match_index = chunk_a_index + start_index

                if matched_part or match_index != last_empty_index:
                    yield (match, matched_part)

                if not matched_part:
                    last_empty_index = match_index

                progress = len(unmatched_part) + len(matched_part)

                if not progress:
                    # Nothing was consumed: searching again from the same
                    # position would loop forever. Step over one character,
                    # emitting it as unmatched text.
                    skipped_part = current_chunk[offset_start:offset_start + 1]

                    if not skipped_part:
                        # End of the searchable window; move to next chunk.
                        break

                    yield (None, skipped_part)
                    progress = 1

                search_start_index += progress

            chunk_a_index += len(chunk_a)
            chunk_a = chunk_b