Source code for wpull.processor.coprocessor.proxy

import gettext
import logging
from http.cookiejar import CookieJar

from typing import Optional, cast

import wpull.string
from wpull.application.hook import Actions
from wpull.backport.logging import BraceMessage as __
from wpull.database.base import BaseURLTable
from wpull.pipeline.app import AppSession
from wpull.pipeline.item import URLRecord, Status
from wpull.pipeline.session import ItemSession
from wpull.processor.rule import FetchRule, ResultRule, ProcessingRule
from wpull.processor.web import WebProcessor
from wpull.protocol.http.request import Request, Response
from wpull.proxy.server import HTTPProxyServer, HTTPProxySession
from wpull.cookiewrapper import CookieJarWrapper
from wpull.writer import BaseFileWriter

# Module-level logger, named after this module.
_logger = logging.getLogger(__name__)
# Short alias for gettext's lookup function; wraps the user-facing message
# strings in this module.
_ = gettext.gettext


class ProxyItemSession(ItemSession):
    '''Item session variant that always reports itself as virtual.'''

    @property
    def is_virtual(self) -> bool:
        '''Always ``True`` for this session type.'''
        return True

    def skip(self) -> None:
        '''Flag the item as processed and set its status to skipped.'''
        # The flag is set before the status update, as in the original
        # implementation; ``set_status`` is inherited from ``ItemSession``.
        self._processed = True
        self.set_status(Status.skipped)
class ProxyCoprocessor(object):
    '''Proxy coprocessor.

    Listens for the proxy server's ``begin_session`` event and creates a
    :class:`ProxyCoprocessorSession` for each proxy session it is given.
    '''

    def __init__(self, app_session: AppSession):
        '''Register the session listener on the application's proxy server.

        Args:
            app_session: Application session whose factory provides the
                ``'HTTPProxyServer'`` instance.
        '''
        self._app_session = app_session

        server = cast(
            HTTPProxyServer, app_session.factory['HTTPProxyServer']
        )
        dispatcher = server.event_dispatcher
        dispatcher.add_listener(
            HTTPProxyServer.Event.begin_session,
            self._proxy_server_session_callback,
        )

    def _proxy_server_session_callback(self, session: HTTPProxySession):
        '''Create a coprocessor session for the given proxy session.'''
        # The new object is intentionally not stored here: its constructor
        # registers its own bound methods as callbacks on ``session``.
        ProxyCoprocessorSession(self._app_session, session)
class ProxyCoprocessorSession(object):
    '''Runs the fetch, result and processing rules for one proxy session.

    On construction the instance looks up its collaborators in the
    application factory, opens a file writer session, and connects its
    callbacks to the proxy session's hook and event dispatchers. A new
    :class:`ProxyItemSession` is created for every client request.
    '''

    def __init__(self, app_session: AppSession,
                 http_proxy_session: HTTPProxySession):
        '''Look up collaborators and subscribe to the proxy session.

        Args:
            app_session: Application session providing the factory.
            http_proxy_session: Proxy session whose hooks and events are
                handled by this instance.
        '''
        self._app_session = app_session
        self._http_proxy_session = http_proxy_session

        factory = app_session.factory

        # Unlike the other components this one is fetched with ``get``;
        # the callbacks below test it for truthiness before using it.
        self._cookie_jar = cast(
            CookieJarWrapper, factory.get('CookieJarWrapper')
        )
        self._fetch_rule = cast(FetchRule, factory['FetchRule'])
        self._result_rule = cast(ResultRule, factory['ResultRule'])
        self._processing_rule = cast(
            ProcessingRule, factory['ProcessingRule']
        )

        file_writer = cast(BaseFileWriter, factory['FileWriter'])
        self._file_writer_session = file_writer.session()

        # Replaced with a fresh ProxyItemSession on each client request.
        self._item_session = None

        # Hooks: the callbacks' return values are handed back to the
        # hook dispatcher.
        hooks = http_proxy_session.hook_dispatcher
        hooks.connect(
            HTTPProxySession.Event.client_request,
            self._client_request_callback
        )
        hooks.connect(
            HTTPProxySession.Event.server_begin_response,
            self._server_begin_response_callback
        )

        # Events: plain listeners that return nothing.
        events = http_proxy_session.event_dispatcher
        events.add_listener(
            HTTPProxySession.Event.server_end_response,
            self._server_end_response_callback
        )
        events.add_listener(
            HTTPProxySession.Event.server_response_error,
            self._server_response_error_callback
        )

    @classmethod
    def _new_url_record(cls, request: Request) -> URLRecord:
        '''Return a new URLRecord for the request's URL.

        The record is marked in progress, with a try count and level of
        zero.
        '''
        record = URLRecord()
        record.url = request.url_info.url
        record.status = Status.in_progress
        record.try_count = 0
        record.level = 0
        return record

    def _new_item_session(self, request: Request) -> ProxyItemSession:
        '''Add the request URL to the URL table and return an item session.

        Args:
            request: The client request passing through the proxy.

        Returns:
            A new ProxyItemSession built around a fresh URL record.
        '''
        url_table = cast(BaseURLTable, self._app_session.factory['URLTable'])
        url_table.add_one(request.url_info.url)

        return ProxyItemSession(
            self._app_session, self._new_url_record(request)
        )

    def _client_request_callback(self, request: Request):
        '''Request callback handler.

        Starts a new item session for the request, adds the cookie header
        when a cookie jar is available, checks the fetch rule and passes
        the request to the file writer session.

        Args:
            request: The client request received by the proxy.

        Returns:
            The verdict from ``FetchRule.check_subsequent_web_request``.
        '''
        item_session = self._new_item_session(request)
        item_session.request = request
        self._item_session = item_session

        if self._cookie_jar:
            self._cookie_jar.add_cookie_header(request)

        # ``reason`` is not used here. It must not be renamed to ``_``,
        # which is the gettext alias in this module.
        verdict, reason = self._fetch_rule.check_subsequent_web_request(
            item_session
        )

        # The writer sees the request regardless of the verdict.
        self._file_writer_session.process_request(request)

        if verdict:
            _logger.info(__(
                _('Fetching ‘{url}’.'),
                url=request.url_info.url
            ))

        return verdict

    def _server_begin_response_callback(self, response: Response):
        '''Pre-response callback handler.

        Stores the response on the current item session, extracts cookies
        when a cookie jar is available, runs the pre-response rule and
        passes the response to the file writer session.

        Args:
            response: The response whose header has been received.

        Returns:
            ``True`` when the pre-response action is ``Actions.NORMAL``,
            ``False`` otherwise.
        '''
        item_session = self._item_session
        item_session.response = response

        if self._cookie_jar:
            self._cookie_jar.extract_cookies(response, item_session.request)

        action = self._result_rule.handle_pre_response(item_session)
        self._file_writer_session.process_response(response)

        return action == Actions.NORMAL

    def _server_end_response_callback(self, response: Response):
        '''Response callback handler.

        Logs the completed fetch, then routes the response through the
        result rule and the file writer session according to its status
        code.

        Args:
            response: Response supplied by the event dispatcher. It is not
                consulted; the response stored on the item session by
                ``_server_begin_response_callback`` is used instead.
        '''
        item_session = self._item_session
        request = item_session.request
        response = item_session.response

        _logger.info(__(
            _('Fetched ‘{url}’: {status_code} {reason}. '
              'Length: {content_length} [{content_type}].'),
            url=request.url,
            status_code=response.status_code,
            reason=wpull.string.printable_str(response.reason),
            content_length=wpull.string.printable_str(
                response.fields.get('Content-Length', _('none'))),
            content_type=wpull.string.printable_str(
                response.fields.get('Content-Type', _('none'))),
        ))

        self._result_rule.handle_response(item_session)

        if response.status_code in WebProcessor.DOCUMENT_STATUS_CODES:
            # Document: save it, scrape it, then report it with the
            # filename returned by the writer.
            filename = self._file_writer_session.save_document(response)
            self._processing_rule.scrape_document(item_session)
            self._result_rule.handle_document(item_session, filename)
        elif response.status_code in WebProcessor.NO_DOCUMENT_STATUS_CODES:
            self._file_writer_session.discard_document(response)
            self._result_rule.handle_no_document(item_session)
        else:
            # Any other status code is treated as a document error.
            self._file_writer_session.discard_document(response)
            self._result_rule.handle_document_error(item_session)

    def _server_response_error_callback(self, error: BaseException):
        '''Pass a server response error to the result rule.

        Args:
            error: The exception reported by the proxy session.
        '''
        self._result_rule.handle_error(self._item_session, error)