Source code for wpull.application.tasks.download

import asyncio
import gettext
import logging
import functools

import tornado.netutil

from wpull.backport.logging import BraceMessage as __
from wpull.cookie import BetterMozillaCookieJar
from wpull.processor.coprocessor.phantomjs import PhantomJSParams
from wpull.namevalue import NameValueRecord
from wpull.pipeline.pipeline import ItemTask
from wpull.pipeline.session import ItemSession
from wpull.pipeline.app import AppSession
import wpull.resmon
import wpull.string

from wpull.protocol.http.stream import Stream as HTTPStream
import wpull.util
import wpull.processor.coprocessor.youtubedl
import wpull.driver.phantomjs
import wpull.application.hook

_logger = logging.getLogger(__name__)
_ = gettext.gettext


class ParserSetupTask(ItemTask[AppSession]):
    @asyncio.coroutine
    def process(self, session: AppSession):
        self._build_html_parser(session)
        self._build_demux_document_scraper(session)

    @classmethod
    def _build_html_parser(cls, session: AppSession):
        if session.args.html_parser == 'html5lib':
            from wpull.document.htmlparse.html5lib_ import HTMLParser
        else:
            from wpull.document.htmlparse.lxml_ import HTMLParser

        session.factory.class_map['HTMLParser'] = HTMLParser
        session.factory.new('HTMLParser')

    @classmethod
    def _build_demux_document_scraper(cls, session: AppSession):
        '''Create demux document scraper.'''
        session.factory.new(
            'DemuxDocumentScraper', cls._build_document_scrapers(session))

    @classmethod
    def _build_document_scrapers(cls, session: AppSession):
        '''Create the document scrapers.

        Returns:
            A list of document scrapers.
        '''
        html_parser = session.factory['HTMLParser']
        element_walker = session.factory.new('ElementWalker')

        scrapers = [
            session.factory.new(
                'HTMLScraper',
                html_parser,
                element_walker,
                followed_tags=session.args.follow_tags,
                ignored_tags=session.args.ignore_tags,
                only_relative=session.args.relative,
                robots=session.args.robots,
                encoding_override=session.args.remote_encoding,
            ),
        ]

        if 'css' in session.args.link_extractors:
            css_scraper = session.factory.new(
                'CSSScraper',
                encoding_override=session.args.remote_encoding,
            )
            scrapers.append(css_scraper)
            element_walker.css_scraper = css_scraper

        if 'javascript' in session.args.link_extractors:
            javascript_scraper = session.factory.new(
                'JavaScriptScraper',
                encoding_override=session.args.remote_encoding,
            )
            scrapers.append(javascript_scraper)
            element_walker.javascript_scraper = javascript_scraper

        if session.args.sitemaps:
            scrapers.append(session.factory.new(
                'SitemapScraper', html_parser,
                encoding_override=session.args.remote_encoding,
            ))

        return scrapers


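# Illustrative sketch, not part of wpull: the scraper list built above is
# handed to 'DemuxDocumentScraper', which demultiplexes a document across
# scrapers. A standalone analogue of that pattern (the interface here is an
# assumption, not wpull's actual API) tries each scraper in order and keeps
# the first non-None result:
class _DemuxScraperSketch:
    def __init__(self, scrapers):
        self._scrapers = scrapers

    def scrape(self, request, response):
        for scraper in self._scrapers:
            result = scraper.scrape(request, response)
            if result is not None:
                return result

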
class ClientSetupTask(ItemTask[AppSession]):
    @asyncio.coroutine
    def process(self, session: AppSession):
        self._build_web_client(session)
        self._build_ftp_client(session)

    @classmethod
    def _build_request_factory(cls, session: AppSession):
        '''Create the request factory.

        A request factory is any callable object that returns a
        :class:`.http.Request`. The callable must accept the same
        arguments as Request.

        Returns:
            A callable object
        '''
        def request_factory(*args, **kwargs):
            request = session.factory.class_map['Request'](*args, **kwargs)

            user_agent = session.args.user_agent or session.default_user_agent
            request.fields['User-Agent'] = user_agent

            if session.args.referer:
                request.fields['Referer'] = session.args.referer

            for header_string in session.args.header:
                request.fields.parse(header_string)

            if session.args.http_compression:
                request.fields['Accept-Encoding'] = 'gzip, deflate'

            if session.args.no_cache:
                request.fields['Cache-Control'] = 'no-cache, must-revalidate'
                request.fields['Pragma'] = 'no-cache'

            return request

        return request_factory

    @classmethod
    def _build_http_client(cls, session: AppSession):
        '''Create the HTTP client.

        Returns:
            Client: An instance of :class:`.http.Client`.
        '''
        # TODO:
        # recorder = self._build_recorder()

        stream_factory = functools.partial(
            HTTPStream,
            ignore_length=session.args.ignore_length,
            keep_alive=session.args.http_keep_alive)

        return session.factory.new(
            'HTTPClient',
            connection_pool=session.factory['ConnectionPool'],
            stream_factory=stream_factory
        )

    @classmethod
    def _build_web_client(cls, session: AppSession):
        '''Build the Web Client.'''
        cookie_jar = cls._build_cookie_jar(session)
        http_client = cls._build_http_client(session)

        redirect_factory = functools.partial(
            session.factory.class_map['RedirectTracker'],
            max_redirects=session.args.max_redirect
        )

        return session.factory.new(
            'WebClient',
            http_client,
            redirect_tracker_factory=redirect_factory,
            cookie_jar=cookie_jar,
            request_factory=cls._build_request_factory(session),
        )

    @classmethod
    def _build_cookie_jar(cls, session: AppSession):
        '''Build the cookie jar.'''
        if not session.args.cookies:
            return

        if session.args.load_cookies or session.args.save_cookies:
            session.factory.set('CookieJar', BetterMozillaCookieJar)

            cookie_jar = session.factory.new('CookieJar')

            if session.args.load_cookies:
                cookie_jar.load(session.args.load_cookies, ignore_discard=True)
        else:
            cookie_jar = session.factory.new('CookieJar')

        policy = session.factory.new('CookiePolicy', cookie_jar=cookie_jar)
        cookie_jar.set_policy(policy)

        _logger.debug(__('Loaded cookies: {0}', list(cookie_jar)))

        cookie_jar_wrapper = session.factory.new(
            'CookieJarWrapper',
            cookie_jar,
            save_filename=session.args.save_cookies,
            keep_session_cookies=session.args.keep_session_cookies,
        )

        return cookie_jar_wrapper

    @classmethod
    def _build_ftp_client(cls, session: AppSession):
        '''Build the FTP client.'''
        return session.factory.new(
            'FTPClient',
            connection_pool=session.factory['ConnectionPool'],
            # TODO: recorder
            # recorder=session.factory['DemuxRecorder'],
        )


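# Illustrative sketch, not part of wpull: as the docstring above says, a
# request factory is any callable accepting the same arguments as Request.
# Baking per-run defaults into a closure, as _build_request_factory does
# with session state, looks like this (request_class and user_agent are
# hypothetical parameters):
def _request_factory_sketch(request_class, user_agent):
    def request_factory(*args, **kwargs):
        request = request_class(*args, **kwargs)
        request.fields['User-Agent'] = user_agent
        return request

    return request_factory

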
class ProxyServerSetupTask(ItemTask[AppSession]):
    @asyncio.coroutine
    def process(self, session: AppSession):
        '''Build the MITM proxy server.'''
        args = session.args

        if not (args.phantomjs or args.youtube_dl or args.proxy_server):
            return

        proxy_server = session.factory.new(
            'HTTPProxyServer',
            session.factory['HTTPClient'],
        )
        cookie_jar = session.factory.get('CookieJarWrapper')
        proxy_coprocessor = session.factory.new(
            'ProxyCoprocessor',
            session
        )

        proxy_socket = tornado.netutil.bind_sockets(
            session.args.proxy_server_port,
            address=session.args.proxy_server_address
        )[0]
        proxy_port = proxy_socket.getsockname()[1]

        proxy_async_server = yield from asyncio.start_server(
            proxy_server, sock=proxy_socket)

        session.async_servers.append(proxy_async_server)
        session.proxy_server_port = proxy_port


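# Illustrative sketch, stdlib only: when the proxy port is 0, bind_sockets()
# lets the OS pick a free port, which is why the task above reads the actual
# port back from the socket. The same trick with a plain socket:
def _ephemeral_port_sketch(address='localhost'):
    import socket

    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.bind((address, 0))
    return sock, sock.getsockname()[1]

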
class ProcessorSetupTask(ItemTask[AppSession]):
    @asyncio.coroutine
    def process(self, session: AppSession):
        self._build_processor(session)

    @classmethod
    def _build_processor(cls, session: AppSession):
        '''Create the Processor.

        Returns:
            Processor: An instance of :class:`.processor.BaseProcessor`.
        '''
        web_processor = cls._build_web_processor(session)
        ftp_processor = cls._build_ftp_processor(session)

        delegate_processor = session.factory.new('Processor')

        delegate_processor.register('http', web_processor)
        delegate_processor.register('https', web_processor)
        delegate_processor.register('ftp', ftp_processor)

    @classmethod
    def _build_web_processor(cls, session: AppSession):
        '''Build the WebProcessor.'''
        args = session.args
        url_filter = session.factory['DemuxURLFilter']
        document_scraper = session.factory['DemuxDocumentScraper']
        file_writer = session.factory['FileWriter']
        post_data = cls._get_post_data(session.args)
        web_client = session.factory['WebClient']

        robots_txt_checker = cls._build_robots_txt_checker(session)

        http_username = args.user or args.http_user
        http_password = args.password or args.http_password
        ftp_username = args.user or args.ftp_user
        ftp_password = args.password or args.ftp_password

        fetch_rule = session.factory.new(
            'FetchRule',
            url_filter=url_filter,
            robots_txt_checker=robots_txt_checker,
            http_login=(http_username, http_password),
            ftp_login=(ftp_username, ftp_password),
            duration_timeout=args.session_timeout,
        )

        waiter = session.factory.new(
            'Waiter',
            wait=args.wait,
            random_wait=args.random_wait,
            max_wait=args.waitretry)

        result_rule = session.factory.new(
            'ResultRule',
            ssl_verification=args.check_certificate,
            retry_connrefused=args.retry_connrefused,
            retry_dns_error=args.retry_dns_error,
            waiter=waiter,
            statistics=session.factory['Statistics'],
        )

        processing_rule = session.factory.new(
            'ProcessingRule',
            fetch_rule,
            document_scraper=document_scraper,
            sitemaps=session.args.sitemaps,
            url_rewriter=session.factory.get('URLRewriter'),
        )

        web_processor_fetch_params = session.factory.new(
            'WebProcessorFetchParams',
            post_data=post_data,
            strong_redirects=args.strong_redirects,
            content_on_error=args.content_on_error,
        )

        processor = session.factory.new(
            'WebProcessor',
            web_client,
            web_processor_fetch_params,
        )

        return processor

    @classmethod
    def _build_ftp_processor(cls, session: AppSession):
        '''Build the FTPProcessor.'''
        ftp_client = session.factory['FTPClient']

        fetch_params = session.factory.new(
            'FTPProcessorFetchParams',
            remove_listing=session.args.remove_listing,
            retr_symlinks=session.args.retr_symlinks,
            preserve_permissions=session.args.preserve_permissions,
            glob=session.args.glob,
        )

        return session.factory.new(
            'FTPProcessor',
            ftp_client,
            fetch_params,
        )

    @classmethod
    def _get_post_data(cls, args):
        '''Return the POST data.'''
        if args.post_data:
            return args.post_data
        elif args.post_file:
            return args.post_file.read()

    @classmethod
    def _build_robots_txt_checker(cls, session: AppSession):
        '''Build the robots.txt checker.'''
        if session.args.robots:
            robots_txt_pool = session.factory.new('RobotsTxtPool')
            robots_txt_checker = session.factory.new(
                'RobotsTxtChecker',
                web_client=session.factory['WebClient'],
                robots_txt_pool=robots_txt_pool
            )

            return robots_txt_checker

    @classmethod
    def _build_recorder(cls, session: AppSession):
        '''Create the Recorder.

        Returns:
            DemuxRecorder: An instance of :class:`.recorder.DemuxRecorder`.
        '''
        # FIXME: ``recorders`` was never defined in this method; recorder
        # support is still a TODO (see ``_build_http_client`` and
        # ``_build_ftp_client``). An empty list keeps the name bound.
        recorders = []

        return session.factory.new('DemuxRecorder', recorders)


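# Illustrative sketch, not part of wpull: the delegate 'Processor' above is
# a scheme-keyed dispatcher; register() maps 'http', 'https', and 'ftp' to
# concrete processors. A minimal standalone analogue (the real interface is
# assumed, not confirmed):
class _DelegateProcessorSketch:
    def __init__(self):
        self._processors = {}

    def register(self, scheme, processor):
        self._processors[scheme] = processor

    def processor_for(self, scheme):
        return self._processors[scheme]

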
class CoprocessorSetupTask(ItemTask[AppSession]):
    @asyncio.coroutine
    def process(self, session: AppSession):
        args = session.args

        if args.phantomjs or args.youtube_dl or args.proxy_server:
            proxy_port = session.proxy_server_port
            assert proxy_port

        if args.phantomjs:
            phantomjs_coprocessor = self._build_phantomjs_coprocessor(
                session, proxy_port)
        else:
            phantomjs_coprocessor = None

        if args.youtube_dl:
            youtube_dl_coprocessor = self._build_youtube_dl_coprocessor(
                session, proxy_port)
        else:
            youtube_dl_coprocessor = None

    @classmethod
    def _build_phantomjs_coprocessor(cls, session: AppSession, proxy_port: int):
        '''Build the PhantomJS client, controller, and coprocessor.'''
        page_settings = {}
        default_headers = NameValueRecord()

        for header_string in session.args.header:
            default_headers.parse(header_string)

        # Since we can only pass a one-to-one mapping to PhantomJS,
        # we put these last since NameValueRecord.items() will use only
        # the first value added for each key.
        default_headers.add('Accept-Language', '*')

        if not session.args.http_compression:
            default_headers.add('Accept-Encoding', 'identity')

        default_headers = dict(default_headers.items())

        if session.args.read_timeout:
            page_settings['resourceTimeout'] = session.args.read_timeout * 1000

        page_settings['userAgent'] = session.args.user_agent \
            or session.default_user_agent

        # Test early for executable
        wpull.driver.phantomjs.get_version(session.args.phantomjs_exe)

        phantomjs_params = PhantomJSParams(
            wait_time=session.args.phantomjs_wait,
            num_scrolls=session.args.phantomjs_scroll,
            smart_scroll=session.args.phantomjs_smart_scroll,
            snapshot=session.args.phantomjs_snapshot,
            custom_headers=default_headers,
            page_settings=page_settings,
            load_time=session.args.phantomjs_max_time,
        )

        extra_args = [
            '--proxy',
            '{}:{}'.format(session.args.proxy_server_address, proxy_port),
            '--ignore-ssl-errors=true'
        ]

        phantomjs_driver_factory = functools.partial(
            session.factory.class_map['PhantomJSDriver'],
            exe_path=session.args.phantomjs_exe,
            extra_args=extra_args,
        )

        phantomjs_coprocessor = session.factory.new(
            'PhantomJSCoprocessor',
            phantomjs_driver_factory,
            session.factory['ProcessingRule'],
            phantomjs_params,
            root_path=session.args.directory_prefix,
            warc_recorder=session.factory.get('WARCRecorder'),
        )

        return phantomjs_coprocessor

    @classmethod
    def _build_youtube_dl_coprocessor(cls, session: AppSession, proxy_port: int):
        '''Build the youtube-dl coprocessor.'''
        # Test early for executable
        wpull.processor.coprocessor.youtubedl.get_version(
            session.args.youtube_dl_exe)

        coprocessor = session.factory.new(
            'YoutubeDlCoprocessor',
            session.args.youtube_dl_exe,
            (session.args.proxy_server_address, proxy_port),
            root_path=session.args.directory_prefix,
            user_agent=session.args.user_agent or session.default_user_agent,
            warc_recorder=session.factory.get('WARCRecorder'),
            inet_family=session.args.inet_family,
            # The proxy will always present an invalid MITM certificate.
            # check_certificate=session.args.check_certificate
            check_certificate=False
        )

        return coprocessor


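# Illustrative sketch, stdlib only: PhantomJS takes a one-to-one header
# mapping, and the comment in _build_phantomjs_coprocessor notes that
# NameValueRecord.items() keeps only the first value added per key.
# dict.setdefault() reproduces that first-wins collapse:
def _first_value_mapping_sketch(pairs):
    mapping = {}
    for key, value in pairs:
        mapping.setdefault(key, value)
    return mapping

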
class ProcessTask(ItemTask[ItemSession]):
    @asyncio.coroutine
    def process(self, session: ItemSession):
        yield from session.app_session.factory['Processor'].process(session)

        assert session.is_processed

        session.finish()


class BackgroundAsyncTask(ItemTask[ItemSession]):
    @asyncio.coroutine
    def process(self, session: ItemSession):
        for task in session.app_session.background_async_tasks:
            if task.done():
                # Awaiting a finished task re-raises any exception it
                # captured, so background failures are not silently dropped.
                yield from task


class CheckQuotaTask(ItemTask[ItemSession]):
    @asyncio.coroutine
    def process(self, session: ItemSession):
        statistics = session.app_session.factory['Statistics']

        if statistics.is_quota_exceeded:
            session.app_session.factory['Application'].stop()


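# Illustrative sketch, not part of wpull: is_quota_exceeded reduces to a
# byte-count comparison against the configured quota (attribute semantics
# here are assumed):
def _quota_exceeded_sketch(downloaded_bytes, quota_bytes):
    return quota_bytes is not None and downloaded_bytes >= quota_bytes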