Source code for wpull.pipeline.item

# encoding=utf-8
'''URL items.'''
import enum
import gettext
import logging

from wpull.url import URLInfo

_ = gettext.gettext
_logger = logging.getLogger(__name__)


class Status(enum.Enum):
    '''Processing state of a URL.'''

    #: The item has not yet been processed.
    todo = 'todo'

    #: The item is in progress of being processed.
    in_progress = 'in_progress'

    #: The item has been processed successfully.
    done = 'done'

    #: The item encountered an error during processing.
    error = 'error'

    #: The item was excluded from processing due to some rejection filters.
    skipped = 'skipped'
class LinkType(enum.Enum):
    '''Kind of content that a link is expected to point to.'''

    #: HTML document.
    html = 'html'

    #: Stylesheet file. Recursion on links is usually safe.
    css = 'css'

    #: JavaScript file. Possible to recurse links on this file.
    javascript = 'javascript'

    #: Image or video file. Recursion on this type will not be useful.
    media = 'media'

    #: A Sitemap.xml file.
    sitemap = 'sitemap'

    #: FTP File.
    file = 'file'

    #: FTP directory.
    directory = 'directory'
class URLDatabaseMixin:
    '''Mixin that exposes selected instance attributes as name/value pairs.

    The class using this mixin is expected to provide a
    ``database_attributes`` iterable of attribute names; the mixin itself
    does not define one.
    '''

    def database_items(self):
        '''Yield ``(name, value)`` for each attribute that is not ``None``.

        Names are taken, in order, from ``self.database_attributes`` and
        looked up on the instance with ``getattr``.
        '''
        for attribute in self.database_attributes:
            current = getattr(self, attribute)

            if current is None:
                continue

            yield attribute, current
class URLProperties(URLDatabaseMixin):
    '''URL properties that determine whether a URL is fetched.

    Attributes:
        parent_url (str): The parent or referral URL that linked to this URL.
        root_url (str): The earliest ancestor URL of this URL. This URL
            is typically the URL supplied at the start of the program.
        status (Status): Processing status of this URL.
        try_count (int): The number of attempts on this URL.
        level (int): The recursive depth of this URL. A level of ``0``
            indicates the URL was initially supplied to the program (the
            top URL).
            Level ``1`` means the URL was linked from the top URL.
        inline_level (int): Whether this URL was an embedded object (such as an
            image or a stylesheet) of the parent URL.

            The value represents the recursive depth of the object. For
            example, an iframe is depth 1 and the images in the iframe
            is depth 2.

        link_type (LinkType): Describes the expected document type.
        priority: Listed in ``database_attributes`` and initialised to
            ``None``. Its meaning is not stated in this module
            (presumably a fetch-ordering priority -- confirm against the
            URL table implementation).
    '''
    database_attributes = ('parent_url', 'root_url', 'status', 'try_count',
                           'level', 'inline_level', 'link_type', 'priority')

    def __init__(self):
        self.parent_url = None
        self.root_url = None
        self.status = None
        self.try_count = None
        self.level = None
        self.inline_level = None
        self.link_type = None
        self.priority = None

    @property
    def parent_url_info(self) -> URLInfo:
        '''Return URL Info for the parent URL.'''
        return URLInfo.parse(self.parent_url)

    @property
    def root_url_info(self) -> URLInfo:
        '''Return URL Info for the root URL.'''
        # Must parse ``root_url``; parsing ``parent_url`` here would make
        # this property a duplicate of ``parent_url_info``.
        return URLInfo.parse(self.root_url)
class URLData(URLDatabaseMixin):
    '''Request data used when fetching the URL.

    Attributes:
        post_data (str): If given, the URL should be fetched as a
            POST request containing `post_data`.
    '''

    database_attributes = ('post_data',)

    def __init__(self):
        # No POST payload until one is assigned.
        self.post_data = None
class URLResult(URLDatabaseMixin):
    '''Outcome data recorded for a fetched URL.

    Attributes:
        status_code (int): The HTTP or FTP status code.
        filename (str): The path to where the file was saved.
    '''

    database_attributes = ('status_code', 'filename')

    def __init__(self):
        # Both result fields start out unset.
        self.status_code = self.filename = None
class URLRecord(URLProperties, URLData, URLResult):
    '''An entry in the URL table describing a URL to be downloaded.

    Attributes:
        url (str): The URL.
    '''
    # NOTE(review): ``database_attributes`` is resolved through the MRO to
    # the tuple defined on ``URLProperties``, so ``database_items()`` on a
    # record does not yield ``post_data``, ``status_code`` or ``filename``.
    # Confirm whether that is intended before changing it.

    def __init__(self):
        # The base initialisers do not chain to ``super().__init__()``, so a
        # single ``super().__init__()`` call would stop after
        # ``URLProperties`` and leave ``post_data``, ``status_code`` and
        # ``filename`` undefined on a fresh record. Run each one explicitly.
        URLProperties.__init__(self)
        URLData.__init__(self)
        URLResult.__init__(self)
        self.url = None

    @property
    def url_info(self) -> URLInfo:
        '''Return URL Info for this URL.'''
        return URLInfo.parse(self.url)