Source code for wpull.application.tasks.rule

import gettext
import logging

import asyncio

from wpull.urlfilter import HTTPSOnlyFilter, SchemeFilter, RecursiveFilter, \
    FollowFTPFilter, SpanHostsFilter, ParentFilter, BackwardDomainFilter, \
    HostnameFilter, TriesFilter, RegexFilter, DirectoryFilter, \
    BackwardFilenameFilter, LevelFilter
from wpull.pipeline.pipeline import ItemTask
from wpull.pipeline.app import AppSession

_logger = logging.getLogger(__name__)
_ = gettext.gettext


[docs]class URLFiltersSetupTask(ItemTask[AppSession]): @asyncio.coroutine
[docs] def process(self, session: AppSession): self._build_url_rewriter(session) session.factory.new('DemuxURLFilter', self._build_url_filters(session))
@classmethod def _build_url_rewriter(cls, session: AppSession): '''Build URL rewriter if needed.''' if session.args.escaped_fragment or session.args.strip_session_id: return session.factory.new( 'URLRewriter', hash_fragment=session.args.escaped_fragment, session_id=session.args.strip_session_id ) @classmethod def _build_url_filters(cls, session: AppSession): '''Create the URL filter instances. Returns: A list of URL filter instances ''' args = session.args filters = [ HTTPSOnlyFilter() if args.https_only else SchemeFilter(), RecursiveFilter( enabled=args.recursive, page_requisites=args.page_requisites ), FollowFTPFilter(follow=args.follow_ftp), ] if args.no_parent: filters.append(ParentFilter()) if args.domains or args.exclude_domains: filters.append( BackwardDomainFilter(args.domains, args.exclude_domains) ) if args.hostnames or args.exclude_hostnames: filters.append( HostnameFilter(args.hostnames, args.exclude_hostnames) ) if args.tries: filters.append(TriesFilter(args.tries)) if args.level and args.recursive or args.page_requisites_level: filters.append( LevelFilter(args.level, inline_max_depth=args.page_requisites_level) ) if args.accept_regex or args.reject_regex: filters.append(RegexFilter(args.accept_regex, args.reject_regex)) if args.include_directories or args.exclude_directories: filters.append( DirectoryFilter( args.include_directories, args.exclude_directories ) ) if args.accept or args.reject: filters.append(BackwardFilenameFilter(args.accept, args.reject)) return filters
[docs]class URLFiltersPostURLImportSetupTask(ItemTask[AppSession]): @asyncio.coroutine
[docs] def process(self, session: AppSession): args = session.args span_hosts_filter = SpanHostsFilter( tuple(session.factory['URLTable'].get_hostnames()), enabled=args.span_hosts, page_requisites='page-requisites' in args.span_hosts_allow, linked_pages='linked-pages' in args.span_hosts_allow, ) demux_url_filter = session.factory['DemuxURLFilter'] demux_url_filter.url_filters.append(span_hosts_filter)