Source code for wpull.string

# encoding=utf-8
'''String and binary data functions.'''
import codecs
import itertools

from wpull.thirdparty.dammit import UnicodeDammit, EncodingDetector


def to_bytes(instance, encoding='utf-8', error='strict'):
    '''Recursively encode an object into bytes.

    Args:
        instance: The object to convert. ``bytes`` are returned untouched,
            anything exposing an ``encode`` method is encoded, and lists,
            tuples and dicts are rebuilt with their members converted.
        encoding (str): Codec name handed to ``encode``.
        error (str): Error handler name handed to ``encode``.

    Returns:
        The converted object. Values of any other type are returned as is.
    '''
    if isinstance(instance, bytes):
        return instance

    if hasattr(instance, 'encode'):
        return instance.encode(encoding, error)

    if isinstance(instance, list):
        return [to_bytes(member, encoding, error) for member in instance]

    if isinstance(instance, tuple):
        return tuple(to_bytes(member, encoding, error) for member in instance)

    if isinstance(instance, dict):
        # Both keys and values are converted.
        return {
            to_bytes(key, encoding, error): to_bytes(value, encoding, error)
            for key, value in instance.items()
        }

    return instance


def to_str(instance, encoding='utf-8', error='strict'):
    '''Convert an instance recursively to string.

    Args:
        instance: The object to convert. ``str`` is returned untouched,
            anything exposing a ``decode`` method is decoded, and lists,
            tuples and dicts are rebuilt with their members converted.
        encoding (str): Codec name handed to ``decode``.
        error (str): Error handler name handed to ``decode``. Defaults to
            ``'strict'``, which is what ``decode`` uses when no handler
            is given.

    Returns:
        The converted object. Values of any other type are returned as is.
    '''
    if isinstance(instance, str):
        return instance
    elif hasattr(instance, 'decode'):
        return instance.decode(encoding, error)
    elif isinstance(instance, list):
        return [to_str(item, encoding, error) for item in instance]
    elif isinstance(instance, tuple):
        return tuple(to_str(item, encoding, error) for item in instance)
    elif isinstance(instance, dict):
        # Both keys and values are converted.
        return {
            to_str(key, encoding, error): to_str(value, encoding, error)
            for key, value in instance.items()
        }
    else:
        return instance


def normalize_codec_name(name):
    '''Resolve a character set label to Python's own codec name.

    The lowercased label is first looked up in
    ``UnicodeDammit.CHARSET_ALIASES``; if it has no entry there, the label
    is used as given. The result is then resolved via
    :func:`codecs.lookup`.

    Returns:
        str, None: The codec name, or None if Python has no such codec.
    '''
    aliased = UnicodeDammit.CHARSET_ALIASES.get(name.lower(), name)

    try:
        codec_info = codecs.lookup(aliased)
    except (LookupError, TypeError, ValueError):
        # TypeError occurs when name contains \x00 (ValueError in Py3.5)
        return None

    return codec_info.name


def detect_encoding(data, encoding=None, fallback='latin1', is_html=False):
    '''Work out which codec is able to decode the data.

    Candidates come from ``EncodingDetector`` (seeded with ``encoding``
    when one is given) followed by ``fallback``. The first candidate that
    passes :func:`try_decoding` wins.

    Args:
        data: The binary data to inspect.
        encoding (str, None): An optional encoding name to offer to the
            detector as an override.
        fallback (str): Codec tried after all detector candidates.
        is_html (bool): Passed through to ``EncodingDetector``.

    Returns:
        str: The name of the codec

    Raises:
        ValueError: The codec could not be detected. This error can only
        occur if fallback is not a "lossless" codec.
    '''
    normalized_hint = normalize_codec_name(encoding) if encoding else None
    overrides = (normalized_hint,) if normalized_hint else ()

    detector = EncodingDetector(
        data, override_encodings=overrides, is_html=is_html
    )

    # it's never ascii :)
    # Falling back on UTF-8/CP-1252/Latin-1 reduces chance of failure,
    # so ascii is only accepted when the caller asked for it as fallback.
    reject_ascii = fallback != 'ascii'

    for raw_name in itertools.chain(detector.encodings, (fallback,)):
        codec_name = normalize_codec_name(raw_name) if raw_name else None

        if not codec_name:
            # Empty candidate, or one Python has no codec for.
            continue

        if reject_ascii and codec_name == 'ascii':
            continue

        if try_decoding(data, codec_name):
            return codec_name

    raise ValueError('Unable to detect encoding.')


def try_decoding(data, encoding):
    '''Tell whether the given Python codec can decode the data.

    When strict decoding of the whole input fails and the input is longer
    than 16 bytes, the check is repeated with the last 1, 2 and 3 bytes
    removed so that a multi-byte sequence cut off at the end does not
    disqualify the codec.

    Returns:
        bool: True if the data (or one of the trimmed variants) decodes.
    '''
    def decodes_cleanly(chunk):
        try:
            chunk.decode(encoding, 'strict')
        except UnicodeError:
            return False
        return True

    if decodes_cleanly(data):
        return True

    # Data under 16 bytes is very unlikely to be truncated
    if len(data) <= 16:
        return False

    shortened = (data[:-cut] for cut in (1, 2, 3))
    return any(chunk and decodes_cleanly(chunk) for chunk in shortened)


def format_size(num, format_str='{num:.1f} {unit}'):
    '''Render a byte count as human readable text using binary units.

    The value is divided by 1024 until its magnitude drops below 1024 or
    the largest unit (TiB) is reached.

    Args:
        num: The size in bytes.
        format_str (str): Template formatted with the keyword fields
            ``num`` and ``unit``.

    Returns:
        str: The formatted size.

    http://stackoverflow.com/a/1094933/1524507
    '''
    value = num
    unit = 'B'

    for larger_unit in ('KiB', 'MiB', 'GiB', 'TiB'):
        if -1024 < value < 1024:
            break

        value = value / 1024.0
        unit = larger_unit

    return format_str.format(num=value, unit=unit)


# Every possible byte value, in ascending order.
ALL_BYTES = bytes(range(256))
# Byte values outside printable ASCII: 0-31 and 127-255.
CONTROL_BYTES = bytes(itertools.chain(range(32), range(127, 256)))


def printable_bytes(data):
    '''Strip every byte that is not printable ASCII.

    Bytes listed in ``CONTROL_BYTES`` are deleted; all remaining bytes are
    kept unchanged.

    This function is intended for sniffing content types such as UTF-16
    encoded text.
    '''
    printable_only = data.translate(ALL_BYTES, CONTROL_BYTES)
    return printable_only


def printable_str(text, keep_newlines=False):
    '''Escape any control or non-ASCII characters from string.

    This function is intended for use with strings from an untrusted
    source such as writing to a console or writing to logs. It is
    designed to prevent things like ANSI escape sequences from
    showing.

    Use :func:`repr` or :func:`ascii` instead for things such as
    Exception messages.

    Args:
        text: The value to escape. A ``str`` is escaped with
            :func:`ascii` and the surrounding quotes are dropped; any
            other object is returned as its full :func:`ascii`
            representation.
        keep_newlines (bool): If True, carriage returns and line feeds
            are left as real characters instead of escape sequences.
            A literal backslash followed by the letter r or n in the
            input stays escaped.

    Returns:
        str: The escaped, ASCII-only text.

    Note:
        Escaping follows :func:`ascii`, so a string containing both
        single and double quotes has its single quotes backslash-escaped.
    '''
    if isinstance(text, str):
        # Drop the quote characters ascii() wraps around a str.
        new_text = ascii(text)[1:-1]
    else:
        new_text = ascii(text)

    if keep_newlines:
        # In ascii() output a literal backslash appears as a doubled
        # backslash. Splitting on that pair first ensures only genuine
        # \r and \n escape sequences are turned back into characters;
        # a plain replace would also match the second half of an escaped
        # backslash followed by "r" or "n".
        escaped_backslash = '\\\\'
        new_text = escaped_backslash.join(
            piece.replace('\\r', '\r').replace('\\n', '\n')
            for piece in new_text.split(escaped_backslash)
        )

    return new_text


def coerce_str_to_ascii(string):
    '''Return a copy of the string that contains only ASCII characters.

    Every non-ASCII character is replaced with the replacement character
    of the ``replace`` error handler (a question mark).

    .. deprecated :: 0.1002
       Use :func:`printable_str` instead.
    '''
    ascii_bytes = string.encode('ascii', 'replace')
    return ascii_bytes.decode('ascii')