Source code for wpull.cookie

# encoding=utf-8
'''HTTP Cookies.'''
from http.cookiejar import DefaultCookiePolicy
import http.cookiejar
import logging
import re
import time
import io
import traceback
import warnings

import wpull.util

_logger = logging.getLogger(__name__)


[docs]class DeFactoCookiePolicy(DefaultCookiePolicy): '''Cookie policy that limits the content and length of the cookie. Args: cookie_jar: The CookieJar instance. This policy class is *not* designed to be shared between CookieJar instances. ''' def __init__(self, *args, **kwargs): self.cookie_jar = kwargs.pop('cookie_jar') DefaultCookiePolicy.__init__(self, *args, **kwargs)
[docs] def set_ok(self, cookie, request): if not DefaultCookiePolicy.set_ok(self, cookie, request): return False try: new_cookie_length = (self.cookie_length(cookie.domain) + len(cookie.path) + len(cookie.name) + len(cookie.value or '')) except TypeError: # cookiejar is not infallible #220 _logger.debug('Cookie handling error', exc_info=1) return False if new_cookie_length >= 4100: return False if self.count_cookies(cookie.domain) >= 50: cookies = self.cookie_jar._cookies try: cookies[cookie.domain][cookie.path][cookie.name] except KeyError: return False if not wpull.util.is_ascii(str(cookie)): return False return True
[docs] def count_cookies(self, domain): '''Return the number of cookies for the given domain.''' cookies = self.cookie_jar._cookies if domain in cookies: return sum( [len(cookie) for cookie in cookies[domain].values()] ) else: return 0
[docs] def cookie_length(self, domain): '''Return approximate length of all cookie key-values for a domain.''' cookies = self.cookie_jar._cookies if domain not in cookies: return 0 length = 0 for path in cookies[domain]: for name in cookies[domain][path]: cookie = cookies[domain][path][name] length += len(path) + len(name) + len(cookie.value or '') return length
[docs]class BetterMozillaCookieJar(http.cookiejar.FileCookieJar): '''MozillaCookieJar that is compatible with Wget/Curl. It ignores file header checks and supports session cookies. ''' # This class from cpython/Lib/http/cookiejar.py changeset 95436:ea94f6c87f5d # Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, # 2011, 2012, 2013, 2014, 2015 Python Software Foundation; All Rights # Reserved magic_re = re.compile(r'.') header = """\ # Netscape HTTP Cookie File # http://curl.haxx.se/rfc/cookie_spec.html # This is a generated file! Do not edit. """ def _really_load(self, f, filename, ignore_discard, ignore_expires): now = time.time() magic = f.readline() if not self.magic_re.search(magic): raise http.cookiejar.LoadError( "%r does not look like a Netscape format cookies file" % filename) line = "" try: while 1: line = f.readline() if line == "": break # last field may be absent, so keep any trailing tab if line.endswith("\n"): line = line[:-1] # skip comments and blank lines XXX what is $ for? if (line.strip().startswith(("#", "$")) or line.strip() == ""): continue domain, domain_specified, path, secure, expires, name, value = \ line.split("\t") secure = (secure == "TRUE") domain_specified = (domain_specified == "TRUE") if name == "": # cookies.txt regards 'Set-Cookie: foo' as a cookie # with no name, whereas http.cookiejar regards it as a # cookie with no value. name = value value = None initial_dot = domain.startswith(".") assert domain_specified == initial_dot discard = False if expires in ("0", ""): expires = None discard = True # assume path_specified is false c = http.cookiejar.Cookie( 0, name, value, None, False, domain, domain_specified, initial_dot, path, False, secure, expires, discard, None, None, {}) if not ignore_discard and c.discard: continue if not ignore_expires and c.is_expired(now): continue self.set_cookie(c) except OSError: raise except Exception: f = io.StringIO() traceback.print_exc(None, f) msg = f.getvalue() warnings.warn("http.cookiejar bug!\n%s" % msg, stacklevel=2) raise http.cookiejar.LoadError( "invalid Netscape format cookies file %r: %r" % (filename, line))
[docs] def save(self, filename=None, ignore_discard=False, ignore_expires=False): if filename is None: if self.filename is not None: filename = self.filename else: raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT) with open(filename, "w") as f: f.write(self.header) now = time.time() for cookie in self: if not ignore_discard and cookie.discard: continue if not ignore_expires and cookie.is_expired(now): continue if cookie.secure: secure = "TRUE" else: secure = "FALSE" if cookie.domain.startswith("."): initial_dot = "TRUE" else: initial_dot = "FALSE" if cookie.expires is not None: expires = str(cookie.expires) else: expires = "0" if cookie.value is None: # cookies.txt regards 'Set-Cookie: foo' as a cookie # with no name, whereas http.cookiejar regards it as a # cookie with no value. name = "" value = cookie.name else: name = cookie.name value = cookie.value f.write( "\t".join([cookie.domain, initial_dot, cookie.path, secure, expires, name, value]) + "\n")