'''File names and paths.'''
import abc
import base64
import hashlib
import os
import re
import urllib.parse
import collections
class BasePathNamer(metaclass=abc.ABCMeta):
    '''Abstract interface implemented by every path namer.

    Concrete namers must override :meth:`get_filename`; the class cannot
    be instantiated until they do.
    '''

    @abc.abstractmethod
    def get_filename(self, url_info):
        '''Return the filename that corresponds to the given URLInfo.

        Args:
            url_info: The URL information object describing the resource.

        Returns:
            The filename chosen by the concrete namer.
        '''
class PathNamer(BasePathNamer):
    '''Path namer that builds a directory hierarchy mirroring the URL.

    Args:
        root (str): The base path under which every filename is placed.
        index (str): The filename to use when the URL path does not
            indicate one.
        use_dir (bool): Include directories based on the URL path.
        cut (int): Number of leading directory parts to drop from the
            generated path.
        protocol (bool): Include the URL scheme in the directory structure.
        hostname (bool): Include the hostname in the directory structure.
        os_type (str): Passed to :func:`safe_filename`. The value
            ``windows`` additionally selects the alternative query and
            port delimiters.
        no_control (bool): Passed to :func:`safe_filename`.
        ascii_only (bool): Passed to :func:`safe_filename`.
        case (str): Passed to :func:`safe_filename`.
        max_filename_length (int): Passed to :func:`safe_filename` as
            ``max_length``.

    Raises:
        IOError: If `root` is an existing regular file.

    See also: :func:`url_to_filename`, :func:`url_to_dir_parts`,
    :func:`safe_filename`.
    '''

    def __init__(self, root, index='index.html', use_dir=False, cut=None,
                 protocol=False, hostname=False, os_type='unix',
                 no_control=True, ascii_only=True,
                 case=None, max_filename_length=None):
        # Layout options.
        self._root = root
        self._index = index
        self._use_dir = use_dir
        self._cut = cut
        self._protocol = protocol
        self._hostname = hostname

        # Sanitising options forwarded to safe_filename().
        self._os_type = os_type
        self._no_control = no_control
        self._ascii_only = ascii_only
        self._case = case
        self._max_filename_length = max_filename_length

        if os.path.isfile(root):
            raise IOError('Root cannot be a file.')

    def get_filename(self, url_info):
        '''Return the sanitised path, under the root, for the given URLInfo.

        FTP URLs fall back to ``.listing`` instead of the index filename,
        and have every path component percent-decoded before sanitising.
        '''
        url = url_info.url
        is_ftp = url_info.scheme == 'ftp'
        alt_char = self._os_type == 'windows'
        components = []

        if self._use_dir:
            directories = url_to_dir_parts(
                url, self._protocol, self._hostname, alt_char=alt_char
            )
            leading = self._cut or 0

            if leading > 0:
                # Drop up to `leading` parts from the front of the list.
                del directories[:leading]

            components.extend(directories)

        fallback_name = '.listing' if is_ftp else self._index
        components.append(
            url_to_filename(url, fallback_name, alt_char=alt_char)
        )

        if is_ftp:
            components = [
                urllib.parse.unquote(component) for component in components
            ]

        sanitised = [self.safe_filename(component) for component in components]

        return os.path.join(self._root, *sanitised)

    def safe_filename(self, part):
        '''Return a safe filename or file part using this namer's options.'''
        options = dict(
            os_type=self._os_type,
            no_control=self._no_control,
            ascii_only=self._ascii_only,
            case=self._case,
            max_length=self._max_filename_length,
        )
        return safe_filename(part, **options)
def url_to_filename(url, index='index.html', alt_char=False):
    '''Derive a filename from the last path segment of a URL.

    Args:
        url (str): The URL.
        index (str): The name used when the URL path does not end in a
            filename. For example, ``/images/`` yields ``index.html``.
        alt_char (bool): If True, the query delimiter written into the
            filename is ``@`` instead of ``?``.

    The result contains no directories and is not sanitized.

    Returns:
        str
    '''
    assert isinstance(url, str), 'Expect str. Got {}.'.format(type(url))

    split_url = urllib.parse.urlsplit(url)

    # Everything after the final slash; empty when the path ends in a
    # slash or is empty.
    name = split_url.path.rpartition('/')[2] or index
    query = split_url.query

    if not query:
        return name

    separator = '@' if alt_char else '?'

    return '{0}{1}{2}'.format(name, separator, query)
def url_to_dir_parts(url, include_protocol=False, include_hostname=False,
                     alt_char=False):
    '''Return a list of directory parts from a URL.

    Args:
        url (str): The URL.
        include_protocol (bool): If True, the scheme from the URL will be
            included.
        include_hostname (bool): If True, the hostname from the URL will be
            included.
        alt_char (bool): If True, the character for the port delimiter
            will be ``+`` instead of ``:``.

    This function does not include the filename and the paths are not
    sanitized.

    Returns:
        list
    '''
    assert isinstance(url, str), 'Expect str. Got {}.'.format(type(url))

    url_split_result = urllib.parse.urlsplit(url)
    parts = []

    if include_protocol:
        parts.append(url_split_result.scheme)

    if include_hostname:
        hostname = url_split_result.hostname

        if url_split_result.port:
            port_delim = '+' if alt_char else ':'
            hostname = '{0}{1}{2}'.format(
                hostname, port_delim, url_split_result.port
            )

        parts.append(hostname)

    path = url_split_result.path
    path_parts = [path_part for path_part in path.split('/') if path_part]

    # The last path segment is a filename unless the *path* ends in a slash.
    # Testing the path instead of the whole URL keeps a query string or
    # fragment from hiding the trailing slash, and popping only from the
    # path segments keeps the scheme/hostname when the path is empty.
    if path_parts and not path.endswith('/'):
        path_parts.pop()

    parts.extend(path_parts)

    return parts
class PercentEncoder(collections.defaultdict):
    '''Percent encoder that remembers the encoding of each byte it has seen.

    The mapping is keyed by single-byte ``bytes`` objects; the value is
    either the byte itself or its ``%XX`` escape.

    Args:
        unix (bool): Escape the slash.
        control (bool): Escape bytes 0-31, and bytes 128-159 as well when
            `ascii_` is also set.
        windows (bool): Escape the characters ``\\|/:?"*<>``.
        ascii_ (bool): Escape every byte above 127.
    '''
    # The percent-encoder was inspired from urllib.parse

    def __init__(self, unix=False, control=False, windows=False, ascii_=False):
        super().__init__()
        self.unix = unix
        self.control = control
        self.windows = windows
        self.ascii = ascii_

    def _needs_escape(self, char, code):
        '''Return whether the single byte `char` (ordinal `code`) is escaped.'''
        if self.unix and char == b'/':
            return True

        if self.control:
            if 0 <= code <= 31:
                return True

            if self.ascii and 128 <= code <= 159:
                return True

        if self.windows and char in br'\|/:?"*<>':
            return True

        return bool(self.ascii and code > 127)

    def __missing__(self, char):
        assert isinstance(char, bytes), \
            'Expect bytes. Got {}.'.format(type(char))

        code = ord(char)

        if self._needs_escape(char, code):
            encoded = '%{0:02X}'.format(code).encode('ascii')
        else:
            encoded = char

        # Store the result so later lookups of this byte skip __missing__.
        self[char] = encoded

        return encoded

    def quote(self, bytes_string):
        '''Return `bytes_string` with every byte passed through the encoder.'''
        # Slicing (rather than iterating) keeps each item a bytes object.
        single_bytes = (
            bytes_string[offset:offset + 1]
            for offset in range(len(bytes_string))
        )
        return b''.join([self[single] for single in single_bytes])
# Module-level cache of PercentEncoder instances, keyed by the
# (unix, no_control, windows, ascii_only) tuple built in safe_filename().
_encoder_cache = {}
def safe_filename(filename, os_type='unix', no_control=True, ascii_only=True,
                  case=None, encoding='utf8', max_length=None):
    '''Return a safe filename or path part.

    Args:
        filename (str): The filename or path component.
        os_type (str): If ``unix``, escape the slash. If ``windows``, escape
            extra Windows characters and a trailing space or dot.
        no_control (bool): If True, escape control characters.
        ascii_only (bool): If True, escape non-ASCII characters.
        case (str): If ``lower``, lowercase the string. If ``upper``, uppercase
            the string.
        encoding (str): The character encoding.
        max_length (int): The maximum length of the filename. A longer name
            is cut to ``max_length - 8`` characters and followed by the
            first 8 hex digits of its SHA-1 digest, so the result is never
            shorter than 8 characters.

    This function assumes that `filename` has not already been percent-encoded.

    Returns:
        str
    '''
    assert isinstance(filename, str), \
        'Expect str. Got {}.'.format(type(filename))

    # The special directory names are escaped whole so they can never be
    # interpreted as "current" or "parent" directory.
    if filename in ('.', os.curdir):
        new_filename = '%2E'
    elif filename in ('..', os.pardir):
        new_filename = '%2E%2E'
    else:
        unix = os_type == 'unix'
        windows = os_type == 'windows'
        encoder_args = (unix, no_control, windows, ascii_only)

        if encoder_args not in _encoder_cache:
            _encoder_cache[encoder_args] = PercentEncoder(
                unix=unix, control=no_control, windows=windows,
                ascii_=ascii_only
            )

        encoder = _encoder_cache[encoder_args]
        encoded_filename = filename.encode(encoding)
        new_filename = encoder.quote(encoded_filename).decode(encoding)

    # Percent-encode a trailing space or dot for Windows. endswith() is
    # safe on an empty name, and ord() supplies the integer that the
    # ``X`` format code requires.
    if os_type == 'windows' and new_filename.endswith((' ', '.')):
        new_filename = '{0}%{1:02X}'.format(
            new_filename[:-1], ord(new_filename[-1])
        )

    if max_length and len(new_filename) > max_length:
        # Keep the result distinguishable after truncation by appending a
        # short digest of the full name.
        hash_obj = hashlib.sha1(new_filename.encode(encoding))
        new_length = max(0, max_length - 8)
        new_filename = '{0}{1}'.format(
            new_filename[:new_length], hash_obj.hexdigest()[:8]
        )

    if case == 'lower':
        new_filename = new_filename.lower()
    elif case == 'upper':
        new_filename = new_filename.upper()

    return new_filename
def anti_clobber_dir_path(dir_path, suffix='.d'):
    '''Return a directory path whose components do not collide with a file.

    The normalized path is checked one leading prefix at a time. The first
    prefix that is an existing regular file gets `suffix` appended to its
    last component and the adjusted path is returned immediately.

    Args:
        dir_path (str): A directory path.
        suffix (str): The suffix to append to the part of the path that is
            a file.

    Returns:
        str: The adjusted path, or the normalized path when no prefix is
        a file.
    '''
    normalized = os.path.normpath(dir_path)
    components = normalized.split(os.sep)

    for count, _ in enumerate(components, start=1):
        prefix = os.sep.join(components[:count])

        if not os.path.isfile(prefix):
            continue

        components[count - 1] += suffix
        return os.sep.join(components)

    return normalized
def parse_content_disposition(text):
    '''Parse the filename out of a Content-Disposition header value.

    Args:
        text (str): The header value, for example
            ``attachment; filename="report.pdf"``.

    A quoted value ends at the first quote character that is not preceded
    by a backslash, and backslash-escaped quote characters inside it are
    unescaped. An unquoted value ends at the first semicolon and is
    stripped of surrounding whitespace.

    Returns:
        str, None: The filename, or None if there is no ``filename``
        parameter or its quoted value is empty or unterminated.
    '''
    match = re.search(r'filename\s*=\s*(.+)', text, re.IGNORECASE)

    if not match:
        return None

    value = match.group(1)

    if value[0] in '"\'':
        # Consume either a backslash escape or any character that is not
        # the opening quote, so the match stops at the first unescaped
        # closing quote rather than the last quote on the line.
        quoted = re.match(r'(["\'])((?:\\.|(?!\1).)+)\1', value)

        if not quoted:
            return None

        quote_char = quoted.group(1)

        return quoted.group(2).replace('\\' + quote_char, quote_char)

    return value.partition(';')[0].strip()