Source code for wpull.urlrewrite

'''URL rewriting.'''
import re
from wpull.url import parse_url_or_log, URLInfo


class URLRewriter(object):
    '''Clean up URLs.

    Two independent rewrites are available; both are disabled by default:

    * ``session_id``: remove session identifiers from the path and from
      the query string.
    * ``hash_fragment``: move a fragment that starts with ``!`` into an
      ``_escaped_fragment_`` query parameter.

    Only ``http`` and ``https`` URLs are rewritten.
    '''

    def __init__(self, hash_fragment: bool=False, session_id: bool=False):
        self._session_id_enabled = session_id
        self._hash_fragment_enabled = hash_fragment

    def rewrite(self, url_info: URLInfo) -> URLInfo:
        '''Rewrite the given URL.

        The session ID rewrite (if enabled) runs first, then the hash
        fragment rewrite (if enabled) runs on its result.

        Returns:
            URLInfo: The rewritten URL, or the input unchanged when its
            scheme is not HTTP(S) or no rewrite applies.
        '''
        if url_info.scheme not in ('http', 'https'):
            return url_info

        if self._session_id_enabled:
            url_info = self._without_session_id(url_info)

        if self._hash_fragment_enabled and url_info.fragment.startswith('!'):
            url_info = self._with_escaped_fragment(url_info)

        return url_info

    @staticmethod
    def _without_session_id(url_info: URLInfo) -> URLInfo:
        '''Rebuild the URL with session IDs stripped from path and query.'''
        cleaned_path = strip_path_session_id(url_info.path)
        cleaned_query = strip_query_session_id(url_info.query)
        rebuilt = '{scheme}://{authority}{path}?{query}#{fragment}'.format(
            scheme=url_info.scheme,
            authority=url_info.authority,
            path=cleaned_path,
            query=cleaned_query,
            fragment=url_info.fragment,
        )
        # Keep the incoming URLInfo if the rebuilt URL yields a falsy
        # parse result.
        return parse_url_or_log(rebuilt) or url_info

    @staticmethod
    def _with_escaped_fragment(url_info: URLInfo) -> URLInfo:
        '''Append the fragment (minus its leading ``!``) to the query.'''
        # Join with "&" when a query already exists, otherwise start one.
        if url_info.query:
            template = '{}&_escaped_fragment_={}'
        else:
            template = '{}?_escaped_fragment_={}'
        rebuilt = template.format(url_info.url, url_info.fragment[1:])
        # Keep the incoming URLInfo if the rebuilt URL yields a falsy
        # parse result.
        return parse_url_or_log(rebuilt) or url_info
# The strip session ID functions are based on the surt project.
# https://github.com/internetarchive/surt/blob/746f506dd6f0798adaa5bfd92101b73ed00f2831/surt/URLRegexTransformer.py
# Copyright 2012-2013 Internet Archive. AGPL v3.

# Path patterns for session IDs embedded as a parenthesised path segment
# in front of an ".aspx" resource. Group 1 is the path up to the session
# segment, group 2 is the session segment itself and group 3 is the rest
# of the path. Matching is case-insensitive.
#
# Raw strings are used so that regex escapes such as "\(" and "\." are not
# treated as (invalid) string escape sequences by the Python parser.
SESSION_ID_PATH_PATTERNS = (
    # One or more "<letter>(<24 alphanumerics>)" groups wrapped in
    # parentheses, e.g. "/(S(xxxxxxxxxxxxxxxxxxxxxxxx))/page.aspx".
    re.compile(
        r"^(.*/)(\((?:[a-z]\([0-9a-z]{24}\))+\)/)([^\?]+\.aspx.*)$", re.I
    ),
    # A bare "(<24 alphanumerics>)" segment,
    # e.g. "/(xxxxxxxxxxxxxxxxxxxxxxxx)/page.aspx".
    re.compile(r"^(.*/)(\([0-9a-z]{24}\)/)([^\?]+\.aspx.*)$", re.I),
)
def strip_path_session_id(path):
    '''Strip session ID from URL path.

    Every pattern in ``SESSION_ID_PATH_PATTERNS`` is tried in order; when
    one matches, the path is replaced by its first and third groups (the
    second group, the session segment, is dropped) before the next
    pattern is tried.
    '''
    for session_regex in SESSION_ID_PATH_PATTERNS:
        found = session_regex.match(path)
        if found is None:
            continue
        path = ''.join(found.group(1, 3))
    return path
# Query string patterns for well-known session parameters. In each
# pattern, group 1 is everything before the session parameter and group 2
# is whatever follows a trailing "&" (None when nothing follows).
# Matching is case-insensitive.
SESSION_ID_QUERY_PATTERNS = tuple(
    re.compile(expression, re.IGNORECASE)
    for expression in (
        "^(.*)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$",
        "^(.*)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$",
        "^(.*)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$",
        "^(.*)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$",
        "^(.*)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",
    )
)
def strip_query_session_id(query):
    '''Strip session ID from URL query string.

    Every pattern in ``SESSION_ID_QUERY_PATTERNS`` is tried in order; when
    one matches, the query is replaced by the text before the session
    parameter plus any text captured after it. Separators that belong to
    the text before the parameter are kept as-is.
    '''
    for session_regex in SESSION_ID_QUERY_PATTERNS:
        found = session_regex.match(query)
        if found is None:
            continue
        before, after = found.group(1), found.group(2)
        # Group 2 is None or empty when nothing follows the session ID.
        query = before + after if after else before
    return query