diff --git a/aioscrapy/__init__.py b/aioscrapy/__init__.py index d6b9b11..453b7b2 100644 --- a/aioscrapy/__init__.py +++ b/aioscrapy/__init__.py @@ -8,10 +8,12 @@ # Declare top-level shortcuts from aioscrapy.spiders import Spider from aioscrapy.http import Request, FormRequest +from aioscrapy.settings import Settings +from aioscrapy.crawler import Crawler __all__ = [ - '__version__', 'version_info', 'Spider', 'Request', 'FormRequest', + '__version__', 'version_info', 'Spider', 'Request', 'FormRequest', 'Crawler' ] diff --git a/aioscrapy/core/engine.py b/aioscrapy/core/engine.py index b81ea8a..7e3c629 100644 --- a/aioscrapy/core/engine.py +++ b/aioscrapy/core/engine.py @@ -4,6 +4,7 @@ import logging from typing import Optional, AsyncGenerator, Union, Callable +import aioscrapy from aioscrapy import Spider from aioscrapy import signals from aioscrapy.core.downloader import DownloaderTV @@ -35,7 +36,7 @@ def remove_request(self, request: Request) -> None: class ExecutionEngine(object): - def __init__(self, crawler) -> None: + def __init__(self, crawler: "aioscrapy.Crawler") -> None: self.crawler = crawler self.settings = crawler.settings self.signals = crawler.signals diff --git a/aioscrapy/core/scheduler.py b/aioscrapy/core/scheduler.py index 997d4b5..ec99945 100644 --- a/aioscrapy/core/scheduler.py +++ b/aioscrapy/core/scheduler.py @@ -1,9 +1,7 @@ from abc import abstractmethod from typing import Optional, Type, TypeVar -from aioscrapy import Spider -from aioscrapy.dupefilters import DupeFilterBase -from aioscrapy.http.request import Request +import aioscrapy from aioscrapy.queue import AbsQueue from aioscrapy.statscollectors import StatsCollector from aioscrapy.utils.misc import load_instance @@ -29,7 +27,7 @@ def __subclasscheck__(cls, subclass): class BaseScheduler(metaclass=BaseSchedulerMeta): @classmethod - async def from_crawler(cls, crawler): + async def from_crawler(cls, crawler: "aioscrapy.Crawler") -> "BaseScheduler": """ Factory method which receives the current :class:`~scrapy.crawler.Crawler` object as argument. """ @@ -53,7 +51,7 @@ async def has_pending_requests(self) -> bool: raise NotImplementedError() @abstractmethod - async def enqueue_request(self, request: Request) -> bool: + async def enqueue_request(self, request: aioscrapy.Request) -> bool: """ Process a request received by the engine. @@ -67,7 +65,7 @@ async def enqueue_request(self, request: Request) -> bool: raise NotImplementedError() @abstractmethod - async def next_request(self) -> Optional[Request]: + async def next_request(self) -> Optional[aioscrapy.Request]: """ Return the next :class:`~scrapy.http.Request` to be processed, or ``None`` to indicate that there are no requests to be considered ready at the moment. 
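The reworked `BaseScheduler` interface above now receives a typed `aioscrapy.Crawler` in `from_crawler()` and deals in `aioscrapy.Request` objects in `enqueue_request()`/`next_request()`. Below is a minimal sketch of a custom scheduler written against the abstract methods shown in this hunk; the in-memory `deque` backing store is an assumption for illustration only, and the concrete `Scheduler` further down additionally supports batched pops and persistence.

```python
from collections import deque
from typing import Optional

import aioscrapy
from aioscrapy.core.scheduler import BaseScheduler


class DequeScheduler(BaseScheduler):
    """Illustrative scheduler keeping requests in a plain in-memory deque."""

    def __init__(self) -> None:
        self._queue: deque = deque()

    @classmethod
    async def from_crawler(cls, crawler: "aioscrapy.Crawler") -> "DequeScheduler":
        # crawler.settings / crawler.spider are available here if the queue needs them.
        return cls()

    async def has_pending_requests(self) -> bool:
        return bool(self._queue)

    async def enqueue_request(self, request: aioscrapy.Request) -> bool:
        self._queue.append(request)
        return True

    async def next_request(self) -> Optional[aioscrapy.Request]:
        return self._queue.popleft() if self._queue else None
```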
@@ -87,7 +85,7 @@ class Scheduler(BaseScheduler): def __init__( self, queue: AbsQueue, - spider: Spider, + spider: aioscrapy.Spider, stats=Optional[StatsCollector], persist: bool = True ): @@ -97,7 +95,7 @@ def __init__( self.persist = persist @classmethod - async def from_crawler(cls: Type[SchedulerTV], crawler) -> SchedulerTV: + async def from_crawler(cls: Type[SchedulerTV], crawler: "aioscrapy.Crawler") -> SchedulerTV: instance = cls( await load_instance(crawler.settings['SCHEDULER_QUEUE_CLASS'], spider=crawler.spider), crawler.spider, @@ -120,13 +118,13 @@ async def close(self, reason: str) -> None: async def flush(self) -> None: await call_helper(self.queue.clear) - async def enqueue_request(self, request: Request) -> bool: + async def enqueue_request(self, request: aioscrapy.Request) -> bool: await call_helper(self.queue.push, request) if self.stats: self.stats.inc_value(self.queue.inc_key, spider=self.spider) return True - async def next_request(self, count: int = 1) -> Optional[Request]: + async def next_request(self, count: int = 1) -> Optional[aioscrapy.Request]: async for request in self.queue.pop(count): if request and self.stats: self.stats.inc_value(self.queue.inc_key, spider=self.spider) diff --git a/aioscrapy/core/scraper.py b/aioscrapy/core/scraper.py index 689f932..f52e208 100644 --- a/aioscrapy/core/scraper.py +++ b/aioscrapy/core/scraper.py @@ -4,6 +4,7 @@ import logging from typing import Any, AsyncGenerator, Set, Union, Optional +import aioscrapy from aioscrapy import signals, Spider from aioscrapy.exceptions import CloseSpider, DropItem, IgnoreRequest from aioscrapy.http import Request, Response @@ -54,7 +55,7 @@ class Scraper: def __init__( self, - crawler, + crawler: "aioscrapy.Crawler", slot: Slot, spidermw: SpiderMiddlewareManager, itemproc: ItemPipelineManager, @@ -72,7 +73,7 @@ def __init__( self.concurrent_parser = asyncio.Semaphore(crawler.settings.getint('CONCURRENT_PARSER', 1)) @classmethod - async def from_crawler(cls, crawler): + async def from_crawler(cls, crawler: "aioscrapy.Crawler") -> "Scraper": instance: "Scraper" = cls( crawler, Slot(crawler.settings.getint('SCRAPER_SLOT_MAX_ACTIVE_SIZE')), diff --git a/aioscrapy/crawler.py b/aioscrapy/crawler.py index 6faaddf..3c55746 100644 --- a/aioscrapy/crawler.py +++ b/aioscrapy/crawler.py @@ -4,7 +4,7 @@ import signal import sys import warnings -from typing import Optional +from typing import Optional, Type, Union, Any from zope.interface.exceptions import DoesNotImplement @@ -15,8 +15,9 @@ MultipleInvalid = None from zope.interface.verify import verifyClass +from aioscrapy.logformatter import LogFormatter from aioscrapy import signals, Spider -from aioscrapy.settings import overridden_settings +from aioscrapy.settings import overridden_settings, Settings from aioscrapy.utils.log import ( get_scrapy_root_handler, install_scrapy_root_handler, @@ -31,7 +32,6 @@ from aioscrapy.utils.tools import async_generator_wrapper from aioscrapy.middleware import ExtensionManager from aioscrapy.core.engine import ExecutionEngine -from aioscrapy.settings import Settings from aioscrapy.signalmanager import SignalManager from aioscrapy.utils.ossignal import install_shutdown_handlers, signal_names from aioscrapy.statscollectors import StatsCollector @@ -41,7 +41,7 @@ class Crawler: - def __init__(self, spidercls, settings=None): + def __init__(self, spidercls: Type[Spider], settings: Union[Settings, dict, None] = None) -> None: if isinstance(spidercls, Spider): raise ValueError('The spidercls argument must be a class, not an 
object') @@ -49,7 +49,7 @@ def __init__(self, spidercls, settings=None): if isinstance(settings, dict) or settings is None: settings = Settings(settings) - self.spidercls: Spider = spidercls + self.spidercls = spidercls self.settings = settings.copy() self.spidercls.update_settings(self.settings) @@ -59,10 +59,10 @@ def __init__(self, spidercls, settings=None): self.crawling = False self.spider: Optional[Spider] = None self.engine: Optional[ExecutionEngine] = None - self.extensions: Optional[ExecutionEngine] = None - self.logformatter: Optional[ExecutionEngine] = None + self.extensions: Optional[ExtensionManager] = None + self.logformatter: Optional[LogFormatter] = None - async def crawl(self, *args, **kwargs): + async def crawl(self, *args, **kwargs) -> None: try: if self.crawling: raise RuntimeError("Crawling already taking place") @@ -96,7 +96,7 @@ async def crawl(self, *args, **kwargs): await self.engine.close() raise e - async def stop(self): + async def stop(self) -> None: """Starts a graceful stop of the crawler and returns a deferred that is fired when the crawler is stopped.""" if self.crawling: @@ -112,7 +112,7 @@ class CrawlerRunner: ) @staticmethod - def _get_spider_loader(settings): + def _get_spider_loader(settings: Settings) -> ISpiderLoader: """ Get SpiderLoader instance from settings """ cls_path = settings.get('SPIDER_LOADER_CLASS') loader_cls = load_object(cls_path) @@ -128,7 +128,7 @@ def _get_spider_loader(settings): ) return loader_cls.from_settings(settings.frozencopy()) - def __init__(self, settings=None): + def __init__(self, settings: Union[Settings, dict, None] = None) -> None: if isinstance(settings, dict) or settings is None: settings = Settings(settings) self.settings = settings @@ -144,12 +144,18 @@ def spiders(self): category=AioScrapyDeprecationWarning, stacklevel=2) return self.spider_loader - def crawl_soon(self, crawler_or_spidercls, *args, settings=None, **kwargs): + def crawl_soon( + self, + crawler_or_spidercls: Union[Type[Spider], Crawler], + *args, + settings: Union[Settings, dict, None] = None, + **kwargs + ) -> None: crawler = self.crawl(crawler_or_spidercls, settings=settings) self.crawlers.setdefault(crawler, (args, kwargs)) self.active_crawler(crawler, *args, **kwargs) - def active_crawler(self, crawler, *args, **kwargs): + def active_crawler(self, crawler: Crawler, *args, **kwargs) -> None: task = asyncio.create_task(crawler.crawl(*args, **kwargs)) self._active.add(task) @@ -161,7 +167,13 @@ def _done(result): task.add_done_callback(_done) - def crawl(self, crawler_or_spidercls, *args, settings=None, **kwargs): + def crawl( + self, + crawler_or_spidercls: Union[Type[Spider], Crawler], + *args, + settings: Union[Settings, dict, None] = None, + **kwargs + ) -> Crawler: if isinstance(crawler_or_spidercls, Spider): raise ValueError( 'The crawler_or_spidercls argument cannot be a spider object, ' @@ -170,7 +182,11 @@ def crawl(self, crawler_or_spidercls, *args, settings=None, **kwargs): self.crawlers.setdefault(crawler, (args, kwargs)) return crawler - def create_crawler(self, crawler_or_spidercls, settings): + def create_crawler( + self, + crawler_or_spidercls: Union[Type[Spider], Crawler, str], + settings: Union[Settings, dict, None] + ) -> Crawler: if isinstance(crawler_or_spidercls, Spider): raise ValueError( 'The crawler_or_spidercls argument cannot be a spider object, ' @@ -179,44 +195,52 @@ def create_crawler(self, crawler_or_spidercls, settings): return crawler_or_spidercls return self._create_crawler(crawler_or_spidercls, settings) - def 
_create_crawler(self, spidercls, settings): + def _create_crawler( + self, + spidercls: Union[Type[Spider], str], + settings: Union[Settings, dict, None] + ) -> Crawler: if isinstance(spidercls, str): spidercls = self.spider_loader.load(spidercls) return Crawler(spidercls, settings=settings) - async def stop(self): - return await asyncio.gather(*[c.stop() for c in self.crawlers]) + async def stop(self) -> None: + await asyncio.gather(*[c.stop() for c in self.crawlers]) class CrawlerProcess(CrawlerRunner): - def __init__(self, settings=None, install_root_handler=True): + def __init__( + self, + settings: Union[Settings, dict, None] = None, + install_root_handler: bool = True + ) -> None: super().__init__(settings) install_shutdown_handlers(self._signal_shutdown) configure_logging(self.settings, install_root_handler) - def _signal_shutdown(self, signum, _): + def _signal_shutdown(self, signum: Any, _) -> None: install_shutdown_handlers(self._signal_kill) signame = signal_names[signum] logger.info("Received %(signame)s, shutting down gracefully. Send again to force ", {'signame': signame}) asyncio.create_task(self._graceful_stop_reactor()) - def _signal_kill(self, signum, _): + def _signal_kill(self, signum: Any, _) -> None: install_shutdown_handlers(signal.SIG_IGN) signame = signal_names[signum] logger.info('Received %(signame)s twice, forcing unclean shutdown', {'signame': signame}) asyncio.create_task(self._stop_reactor()) - async def run(self): + async def run(self) -> None: for crawler, (args, kwargs) in self.crawlers.items(): self.active_crawler(crawler, *args, **kwargs) while self._active: await asyncio.gather(*self._active) await self.recycle_db_connect() - def start(self): + def start(self) -> None: if sys.platform.startswith('win'): asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) else: @@ -227,17 +251,17 @@ def start(self): pass asyncio.run(self.run()) - async def _graceful_stop_reactor(self): + async def _graceful_stop_reactor(self) -> None: await self.stop() await self.recycle_db_connect() - async def _stop_reactor(self): + async def _stop_reactor(self) -> None: try: await self.recycle_db_connect() finally: asyncio.get_event_loop().stop() - async def recycle_db_connect(self): - # 回收所以的链接 + async def recycle_db_connect(self) -> None: + # recycle pool of db_manager if not len(self._active): await db_manager.close_all() diff --git a/aioscrapy/db/__init__.py b/aioscrapy/db/__init__.py index f4591c9..690a9e5 100644 --- a/aioscrapy/db/__init__.py +++ b/aioscrapy/db/__init__.py @@ -1,6 +1,9 @@ import logging +from typing import Any -from aioscrapy.db._aioredis import redis_manager +import aioscrapy +from aioscrapy.db.absmanager import AbsDBPoolManager +from aioscrapy.db.aioredis import redis_manager db_manager_map = { 'redis': redis_manager @@ -8,7 +11,7 @@ try: from aiomysql import create_pool - from aioscrapy.db._aiomysql import mysql_manager + from aioscrapy.db.aiomysql import mysql_manager db_manager_map['mysql'] = mysql_manager except ImportError: @@ -16,7 +19,7 @@ try: import aio_pika - from aioscrapy.db._aiorabbitmq import rabbitmq_manager + from aioscrapy.db.aiorabbitmq import rabbitmq_manager db_manager_map['rabbitmq'] = rabbitmq_manager except ImportError: @@ -30,22 +33,22 @@ class DBManager: @staticmethod - def get_manager(db_type): + def get_manager(db_type: str) -> AbsDBPoolManager: manager = db_manager_map.get(db_type) assert manager is not None, f"Not support db type:{db_type}" return manager - def get_pool(self, db_type, alias='default'): + def 
get_pool(self, db_type: str, alias='default') -> Any: manager = self.get_manager(db_type) return manager.get_pool(alias) @staticmethod - async def close_all(): + async def close_all() -> None: for manager in db_manager_map.values(): await manager.close_all() @staticmethod - async def from_dict(db_args: dict): + async def from_dict(db_args: dict) -> None: for db_type, args in db_args.items(): manager = db_manager_map.get(db_type) if manager is None: @@ -53,14 +56,14 @@ async def from_dict(db_args: dict): await manager.from_dict(args) @staticmethod - async def from_settings(settings: "aioscrapy.settings.Setting"): + async def from_settings(settings: aioscrapy.Settings) -> None: for manager in db_manager_map.values(): await manager.from_settings(settings) - async def from_crawler(self, crawler): + async def from_crawler(self, crawler: "aioscrapy.Crawler") -> None: return await self.from_settings(crawler.settings) - def __getattr__(self, db_type: str): + def __getattr__(self, db_type: str) -> Any: if db_type not in db_manager_map: raise AttributeError(f'Not support db type: {db_type}') return db_manager_map[db_type] diff --git a/aioscrapy/db/absmanager.py b/aioscrapy/db/absmanager.py index a88e3f6..bc95185 100644 --- a/aioscrapy/db/absmanager.py +++ b/aioscrapy/db/absmanager.py @@ -1,6 +1,6 @@ from abc import ABCMeta, abstractmethod -import aioscrapy.crawler +import aioscrapy class AbsDBPoolManager(object, metaclass=ABCMeta): @@ -31,10 +31,10 @@ async def from_dict(self, db_args: dict): """Create pool with dict""" @abstractmethod - async def from_settings(self, settings: "aioscrapy.settings.Settings"): + async def from_settings(self, settings: aioscrapy.Settings): """Create pool with settings""" - async def from_crawler(self, crawler: "aioscrapy.crawler.Crawler"): + async def from_crawler(self, crawler: "aioscrapy.Crawler"): return await self.from_settings(crawler.settings) def __call__(self, alias: str): diff --git a/aioscrapy/db/_aiomysql.py b/aioscrapy/db/aiomysql.py similarity index 97% rename from aioscrapy/db/_aiomysql.py rename to aioscrapy/db/aiomysql.py index f2280c6..62e813b 100644 --- a/aioscrapy/db/_aiomysql.py +++ b/aioscrapy/db/aiomysql.py @@ -4,6 +4,7 @@ from aiomysql import create_pool +import aioscrapy from aioscrapy.db.absmanager import AbsDBPoolManager logger = logging.getLogger(__name__) @@ -84,14 +85,13 @@ async def from_dict(self, db_args: dict): for alias, mysql_args in db_args.items(): await self.create(alias, mysql_args) - async def from_settings(self, settings: "aioscrapy.settings.Setting"): + async def from_settings(self, settings: aioscrapy.Settings): for alias, mysql_args in settings.getdict('MYSQL_ARGS').items(): await self.create(alias, mysql_args) mysql_manager = AioMysqlPoolManager() - if __name__ == '__main__': import asyncio diff --git a/aioscrapy/db/_aiorabbitmq.py b/aioscrapy/db/aiorabbitmq.py similarity index 98% rename from aioscrapy/db/_aiorabbitmq.py rename to aioscrapy/db/aiorabbitmq.py index 0e73234..d3ccf57 100644 --- a/aioscrapy/db/_aiorabbitmq.py +++ b/aioscrapy/db/aiorabbitmq.py @@ -5,6 +5,7 @@ from aio_pika.exceptions import QueueEmpty from aio_pika.pool import Pool +import aioscrapy from aioscrapy.db.absmanager import AbsDBPoolManager logger = logging.getLogger(__name__) @@ -133,7 +134,7 @@ async def from_dict(self, db_args: dict): for alias, rabbitmq_args in db_args.items(): await self.create(alias, rabbitmq_args) - async def from_settings(self, settings: "aioscrapy.settings.Settings"): + async def from_settings(self, settings: 
aioscrapy.Settings): for alias, rabbitmq_args in settings.getdict('RABBITMQ_ARGS').items(): await self.create(alias, rabbitmq_args) diff --git a/aioscrapy/db/_aioredis.py b/aioscrapy/db/aioredis.py similarity index 97% rename from aioscrapy/db/_aioredis.py rename to aioscrapy/db/aioredis.py index a1f64a0..81c0ff7 100644 --- a/aioscrapy/db/_aioredis.py +++ b/aioscrapy/db/aioredis.py @@ -1,5 +1,6 @@ from redis.asyncio import BlockingConnectionPool, Redis +import aioscrapy from aioscrapy.db.absmanager import AbsDBPoolManager @@ -59,7 +60,7 @@ async def from_dict(self, db_args: dict): for alias, redis_args in db_args.items(): await self.create(alias, redis_args) - async def from_settings(self, settings: "aioscrapy.settings.Settings"): + async def from_settings(self, settings: aioscrapy.Settings): """Create redis with settings""" for alias, redis_args in settings.getdict('REDIS_ARGS').items(): await self.create(alias, redis_args) @@ -67,7 +68,6 @@ async def from_settings(self, settings: "aioscrapy.settings.Settings"): redis_manager = AioRedisPoolManager() - if __name__ == '__main__': import asyncio diff --git a/aioscrapy/http/request/__init__.py b/aioscrapy/http/request/__init__.py index 435309b..a98bdc2 100644 --- a/aioscrapy/http/request/__init__.py +++ b/aioscrapy/http/request/__init__.py @@ -5,19 +5,29 @@ See documentation in docs/topics/request-response.rst """ import hashlib -from typing import Callable -from typing import Iterable, Optional, Union +import inspect +import json +from typing import Callable, List, Optional, Tuple, Type, TypeVar from w3lib.url import canonicalize_url from w3lib.url import safe_url_string +import aioscrapy from aioscrapy.http.headers import Headers from aioscrapy.utils.curl import curl_to_request_kwargs -from aioscrapy.utils.python import to_bytes +from aioscrapy.utils.python import to_unicode from aioscrapy.utils.url import escape_ajax +RequestTypeVar = TypeVar("RequestTypeVar", bound="Request") + class Request(object): + attributes: Tuple[str, ...] 
= ( + "url", "callback", "method", "headers", "body", + "cookies", "meta", "encoding", "priority", + "dont_filter", "errback", "flags", "cb_kwargs", + "fingerprint" + ) def __init__( self, @@ -32,7 +42,7 @@ def __init__( priority: int = 0, dont_filter: bool = False, errback: Optional[Callable] = None, - flags: Optional[None] = None, + flags: Optional[List[str]] = None, cb_kwargs: Optional[Callable] = None, fingerprint: Optional[str] = None, ): @@ -57,21 +67,21 @@ def __init__( self.flags = [] if flags is None else list(flags) @property - def cb_kwargs(self): + def cb_kwargs(self) -> dict: if self._cb_kwargs is None: self._cb_kwargs = {} return self._cb_kwargs @property - def meta(self): + def meta(self) -> dict: if self._meta is None: self._meta = {} return self._meta - def _get_url(self): + def _get_url(self) -> str: return self._url - def _set_url(self, url): + def _set_url(self, url: str) -> None: if not isinstance(url, str): raise TypeError(f'Request url must be str or unicode, got {type(url).__name__}') @@ -87,21 +97,18 @@ def _set_url(self, url): url = property(_get_url, _set_url) - def _get_body(self): + def _get_body(self) -> str: return self._body - def _set_body(self, body): - if body is None: - self._body = '' - else: - self._body = body + def _set_body(self, body: str) -> None: + self._body = '' if body is None else body body = property(_get_body, _set_body) - def _set_fingerprint(self, fingerprint): + def _set_fingerprint(self, fingerprint: str) -> None: self._fingerprint = fingerprint - def _get_fingerprint(self): + def _get_fingerprint(self) -> str: if self._fingerprint is None: self._fingerprint = self.make_fingerprint() return self._fingerprint @@ -109,75 +116,83 @@ def _get_fingerprint(self): fingerprint = property(_get_fingerprint, _set_fingerprint) @property - def encoding(self): + def encoding(self) -> str: return self._encoding - def __str__(self): + def __str__(self) -> str: return f"<{self.method} {self.url}>" __repr__ = __str__ - def copy(self): + def copy(self) -> "Request": """Return a copy of this Request""" return self.replace() - def replace(self, *args, **kwargs): - """Create a new Request with the same attributes except for those - given new values. - """ - for x in ['url', 'method', 'headers', 'body', 'cookies', 'meta', 'flags', 'encoding', - 'priority', 'dont_filter', 'callback', 'errback', 'cb_kwargs', 'fingerprint']: + def replace(self, *args, **kwargs) -> "Request": + """Create a new Request with the same attributes except for those given new values.""" + for x in self.attributes: kwargs.setdefault(x, getattr(self, x)) cls = kwargs.pop('cls', self.__class__) return cls(*args, **kwargs) @classmethod - def from_curl(cls, curl_command, ignore_unknown_options=True, **kwargs): - """Create a Request object from a string containing a `cURL - `_ command. It populates the HTTP method, the - URL, the headers, the cookies and the body. It accepts the same - arguments as the :class:`Request` class, taking preference and - overriding the values of the same arguments contained in the cURL - command. - - Unrecognized options are ignored by default. To raise an error when - finding unknown options call this method by passing - ``ignore_unknown_options=False``. - - .. 
caution:: Using :meth:`from_curl` from :class:`~scrapy.http.Request` - subclasses, such as :class:`~scrapy.http.JSONRequest`, or - :class:`~scrapy.http.XmlRpcRequest`, as well as having - :ref:`downloader middlewares ` - and - :ref:`spider middlewares ` - enabled, such as - :class:`~scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware`, - :class:`~scrapy.downloadermiddlewares.useragent.UserAgentMiddleware`, - or - :class:`~scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware`, - may modify the :class:`~scrapy.http.Request` object. - - To translate a cURL command into a aioscrapy request, - you may use `curl2scrapy `_. - - """ + def from_curl( + cls: Type[RequestTypeVar], curl_command: str, ignore_unknown_options: bool = True, **kwargs + ) -> RequestTypeVar: + """Create a Request object from a string containing a `cURL""" request_kwargs = curl_to_request_kwargs(curl_command, ignore_unknown_options) request_kwargs.update(kwargs) return cls(**request_kwargs) - def serialize(self, callback): - from aioscrapy.utils.reqser import request_to_dict - return callback(request_to_dict(self)) - def make_fingerprint( self, - include_headers: Optional[Iterable[Union[bytes, str]]] = None, keep_fragments: bool = False, - ): + ) -> str: """ make the request fingerprint. """ - fp = hashlib.sha1() - fp.update(to_bytes(self.method)) - fp.update(to_bytes(canonicalize_url(self.url, keep_fragments=keep_fragments))) - fp.update(to_bytes(self.body) or b'') - fp.hexdigest() - return fp.hexdigest() + return hashlib.sha1( + json.dumps({ + 'method': to_unicode(self.method), + 'url': canonicalize_url(self.url, keep_fragments=keep_fragments), + 'body': self.body, + }, sort_keys=True).encode() + ).hexdigest() + + def to_dict(self, *, spider: Optional["aioscrapy.Spider"] = None) -> dict: + """Return a dictionary containing the Request's data. + + Use :func:`~scrapy.utils.request.request_from_dict` to convert back into a :class:`~scrapy.Request` object. + + If a spider is given, this method will try to find out the name of the spider methods used as callback + and errback and include them in the output dict, raising an exception if they cannot be found. + """ + d = { + "url": self.url, # urls are safe (safe_string_url) + "callback": _find_method(spider, self.callback) if callable(self.callback) else self.callback, + "errback": _find_method(spider, self.errback) if callable(self.errback) else self.errback, + "headers": dict(self.headers), + } + if self._fingerprint: + d['fingerprint'] = self._fingerprint + + for attr in self.attributes: + if attr != 'fingerprint': + d.setdefault(attr, getattr(self, attr)) + if type(self) is not Request: + d["_class"] = self.__module__ + '.' + self.__class__.__name__ + return d + + +def _find_method(obj, func): + """Helper function for Request.to_dict""" + # Only instance methods contain ``__func__`` + if obj and hasattr(func, '__func__'): + members = inspect.getmembers(obj, predicate=inspect.ismethod) + for name, obj_func in members: + # We need to use __func__ to access the original function object because instance + # method objects are generated each time attribute is retrieved from instance. 
+ # + # Reference: The standard type hierarchy + # https://docs.python.org/3/reference/datamodel.html + if obj_func.__func__ is func.__func__: + return name + raise ValueError(f"Function {func} is not an instance method in: {obj}") diff --git a/aioscrapy/http/request/form.py b/aioscrapy/http/request/form.py index dea0d12..79972d4 100644 --- a/aioscrapy/http/request/form.py +++ b/aioscrapy/http/request/form.py @@ -4,18 +4,19 @@ See documentation in docs/topics/request-response.rst """ - +from typing import List, Optional, Tuple, Union from urllib.parse import urlencode from aioscrapy.http.request import Request from aioscrapy.utils.python import to_bytes, is_listlike +FormdataType = Optional[Union[dict, List[Tuple[str, str]]]] + class FormRequest(Request): valid_form_methods = ['GET', 'POST'] - def __init__(self, *args, **kwargs): - formdata = kwargs.pop('formdata', None) + def __init__(self, *args, formdata: FormdataType = None, **kwargs) -> None: if formdata and kwargs.get('method') is None: kwargs['method'] = 'POST' @@ -23,12 +24,12 @@ def __init__(self, *args, **kwargs): if formdata: items = formdata.items() if isinstance(formdata, dict) else formdata - querystr = _urlencode(items, self.encoding) + form_query: str = _urlencode(items, self.encoding) if self.method == 'POST': self.headers.setdefault('Content-Type', 'application/x-www-form-urlencoded') - self._set_body(querystr) + self._set_body(form_query) else: - self._set_url(self.url + ('&' if '?' in self.url else '?') + querystr) + self._set_url(self.url + ('&' if '?' in self.url else '?') + form_query) def _urlencode(seq, enc): diff --git a/aioscrapy/http/request/json_request.py b/aioscrapy/http/request/json_request.py index 661cd05..48bf2d0 100644 --- a/aioscrapy/http/request/json_request.py +++ b/aioscrapy/http/request/json_request.py @@ -8,15 +8,17 @@ import copy import json import warnings +from typing import Optional, Tuple from aioscrapy.http.request import Request from aioscrapy.utils.deprecate import create_deprecated_class class JsonRequest(Request): - def __init__(self, *args, **kwargs): - dumps_kwargs = copy.deepcopy(kwargs.pop('dumps_kwargs', {})) - dumps_kwargs.setdefault('sort_keys', True) + attributes: Tuple[str, ...] 
= Request.attributes + ("dumps_kwargs",) + + def __init__(self, *args, dumps_kwargs: Optional[dict] = None, **kwargs) -> None: + dumps_kwargs = copy.deepcopy(dumps_kwargs) if dumps_kwargs is not None else {} self._dumps_kwargs = dumps_kwargs body_passed = kwargs.get('body', None) is not None @@ -36,7 +38,11 @@ def __init__(self, *args, **kwargs): self.headers.setdefault('Content-Type', 'application/json') self.headers.setdefault('Accept', 'application/json, text/javascript, */*; q=0.01') - def replace(self, *args, **kwargs): + @property + def dumps_kwargs(self) -> dict: + return self._dumps_kwargs + + def replace(self, *args, **kwargs) -> Request: body_passed = kwargs.get('body', None) is not None data = kwargs.pop('data', None) data_passed = data is not None @@ -49,7 +55,7 @@ def replace(self, *args, **kwargs): return super().replace(*args, **kwargs) - def _dumps(self, data): + def _dumps(self, data: dict) -> str: """Convert to JSON """ return json.dumps(data, **self._dumps_kwargs) diff --git a/aioscrapy/queue/__init__.py b/aioscrapy/queue/__init__.py index 0e9848e..110b589 100644 --- a/aioscrapy/queue/__init__.py +++ b/aioscrapy/queue/__init__.py @@ -1,33 +1,22 @@ from abc import ABCMeta, abstractmethod -from typing import Optional +from typing import Optional, Any +import aioscrapy from aioscrapy.serializer import AbsSerializer -from aioscrapy.utils.reqser import request_to_dict, request_from_dict +from aioscrapy.utils.reqser import request_from_dict -class AbsQueue(object, metaclass=ABCMeta): +class AbsQueue(metaclass=ABCMeta): """Per-spider base queue class""" def __init__( - self, container, - spider: Optional[str] = None, + self, + container: Any, + spider: Optional[aioscrapy.Spider] = None, key: Optional[str] = None, serializer: Optional[AbsSerializer] = None - ): - """Initialize per-spider redis queue. - - Parameters - ---------- - container : Redis/Queue - The queue for Request. - spider : Spider - aioscrapy spider instance. - key: str - Redis key where to put and get messages. - serializer : object - Serializer object with ``loads`` and ``dumps`` methods. 
- - """ + ) -> None: + """Initialize per-spider queue""" self.container = container self.spider = spider self.key = key @@ -35,40 +24,40 @@ def __init__( @property @abstractmethod - def inc_key(self): + def inc_key(self) -> str: """stats inc_value""" @classmethod @abstractmethod - async def from_spider(cls, spider) -> "AbsQueue": + async def from_spider(cls, spider: aioscrapy.Spider) -> "AbsQueue": """get queue instance from spider""" - def _encode_request(self, request): + def _encode_request(self, request: aioscrapy.Request) -> Any: """Encode a request object""" - obj = request_to_dict(request, self.spider) + obj = request.to_dict(spider=self.spider) return self.serializer.dumps(obj) - def _decode_request(self, encoded_request): + def _decode_request(self, encoded_request: Any) -> aioscrapy.Request: """Decode an request previously encoded""" obj = self.serializer.loads(encoded_request) - return request_from_dict(obj, self.spider) + return request_from_dict(obj, spider=self.spider) - def __len__(self): + def __len__(self) -> None: """Return the length of the queue""" raise Exception('please use len()') @abstractmethod - async def len(self): + async def len(self) -> int: """Return the length of the queue""" @abstractmethod - async def push(self, request): + async def push(self, request: aioscrapy.Request) -> None: """Push a request""" @abstractmethod - async def pop(self, timeout=0): + async def pop(self, timeout: int = 0) -> Optional[aioscrapy.Request]: """Pop a request""" @abstractmethod - async def clear(self): + async def clear(self) -> None: """Clear queue/stack""" diff --git a/aioscrapy/queue/memory.py b/aioscrapy/queue/memory.py index 815747b..ab31879 100644 --- a/aioscrapy/queue/memory.py +++ b/aioscrapy/queue/memory.py @@ -2,6 +2,7 @@ from asyncio.queues import QueueEmpty from typing import Optional +import aioscrapy from aioscrapy.queue import AbsQueue from aioscrapy.serializer import AbsSerializer from aioscrapy.utils.misc import load_object @@ -11,21 +12,22 @@ class MemoryQueueBase(AbsQueue): inc_key = 'scheduler/enqueued/memory' def __init__( - self, container, spider, + self, + container: Queue, + spider: Optional[aioscrapy.Spider], key: Optional[str] = None, serializer: Optional[AbsSerializer] = None, max_size: int = 0 - ): + ) -> None: super().__init__(container, spider, key, serializer) self.max_size = max_size @classmethod - async def from_spider(cls, spider) -> "MemoryQueueBase": - settings = spider.settings - max_size = settings.getint("QUEUE_MAXSIZE", 0) - queue = cls.get_queue(max_size) - queue_key = settings.get("SCHEDULER_QUEUE_KEY", '%(spider)s:requests') - serializer = settings.get("SCHEDULER_SERIALIZER", "aioscrapy.serializer.PickleSerializer") + async def from_spider(cls, spider: aioscrapy.Spider) -> "MemoryQueueBase": + max_size: int = spider.settings.getint("QUEUE_MAXSIZE", 0) + queue: Queue = cls.get_queue(max_size) + queue_key: str = spider.settings.get("SCHEDULER_QUEUE_KEY", '%(spider)s:requests') + serializer: str = spider.settings.get("SCHEDULER_SERIALIZER", "aioscrapy.serializer.PickleSerializer") serializer: AbsSerializer = load_object(serializer) return cls( queue, @@ -39,14 +41,14 @@ def len(self) -> int: return self.container.qsize() @staticmethod - def get_queue(max_size): + def get_queue(max_size: int) -> Queue: raise NotImplementedError - async def push(self, request): + async def push(self, request) -> None: data = self._encode_request(request) await self.container.put(data) - async def pop(self, count: int = 1): + async def pop(self, count: int 
= 1) -> None: for _ in range(count): try: data = self.container.get_nowait() @@ -54,34 +56,34 @@ async def pop(self, count: int = 1): break yield self._decode_request(data) - async def clear(self, timeout: int = 0): + async def clear(self, timeout: int = 0) -> None: self.container = self.get_queue(self.max_size) class MemoryFifoQueue(MemoryQueueBase): @staticmethod - def get_queue(max_size): + def get_queue(max_size: int) -> Queue: return Queue(max_size) class MemoryLifoQueue(MemoryFifoQueue): @staticmethod - def get_queue(max_size): + def get_queue(max_size: int) -> LifoQueue: return LifoQueue(max_size) class MemoryPriorityQueue(MemoryFifoQueue): @staticmethod - def get_queue(max_size): + def get_queue(max_size: int) -> PriorityQueue: return PriorityQueue(max_size) - async def push(self, request): + async def push(self, request: aioscrapy.Request) -> None: data = self._encode_request(request) score = request.priority await self.container.put((score, data)) - async def pop(self, count: int = 1): + async def pop(self, count: int = 1) -> Optional[aioscrapy.Request]: for _ in range(count): try: score, data = self.container.get_nowait() diff --git a/aioscrapy/queue/rabbitmq.py b/aioscrapy/queue/rabbitmq.py index 3d77eb6..a6354c3 100644 --- a/aioscrapy/queue/rabbitmq.py +++ b/aioscrapy/queue/rabbitmq.py @@ -1,3 +1,6 @@ +from typing import Optional + +import aioscrapy from aioscrapy.db import db_manager from aioscrapy.queue import AbsQueue from aioscrapy.serializer import AbsSerializer @@ -8,11 +11,11 @@ class RabbitMqPriorityQueue(AbsQueue): inc_key = 'scheduler/enqueued/rabbitmq' @classmethod - def from_dict(cls, data: dict) -> "AbsQueue": - alias = data.get("alias", 'queue') - server = db_manager.rabbitmq.executor(alias) - spider_name = data["spider_name"] - serializer = data.get("serializer", "aioscrapy.serializer.JsonSerializer") + def from_dict(cls, data: dict) -> "RabbitMqPriorityQueue": + alias: str = data.get("alias", 'queue') + server: aioscrapy.db.aiorabbitmq.RabbitmqExecutor = db_manager.rabbitmq.executor(alias) + spider_name: str = data["spider_name"] + serializer: str = data.get("serializer", "aioscrapy.serializer.JsonSerializer") serializer: AbsSerializer = load_object(serializer) return cls( server, @@ -21,12 +24,11 @@ def from_dict(cls, data: dict) -> "AbsQueue": ) @classmethod - async def from_spider(cls, spider) -> "RabbitMqPriorityQueue": - settings = spider.settings - alias = settings.get("SCHEDULER_QUEUE_ALIAS", 'queue') - executor = db_manager.rabbitmq.executor(alias) - queue_key = settings.get("SCHEDULER_QUEUE_KEY", '%(spider)s:requests') - serializer = settings.get("SCHEDULER_SERIALIZER", "aioscrapy.serializer.JsonSerializer") + async def from_spider(cls, spider: aioscrapy.Spider) -> "RabbitMqPriorityQueue": + alias: str = spider.settings.get("SCHEDULER_QUEUE_ALIAS", 'queue') + executor: aioscrapy.db.aiorabbitmq.RabbitmqExecutor = db_manager.rabbitmq.executor(alias) + queue_key: str = spider.settings.get("SCHEDULER_QUEUE_KEY", '%(spider)s:requests') + serializer: str = spider.settings.get("SCHEDULER_SERIALIZER", "aioscrapy.serializer.JsonSerializer") serializer: AbsSerializer = load_object(serializer) return cls( executor, @@ -38,7 +40,7 @@ async def from_spider(cls, spider) -> "RabbitMqPriorityQueue": async def len(self) -> int: return await self.container.get_message_count(self.key) - async def push(self, request): + async def push(self, request: aioscrapy.Request) -> None: data = self._encode_request(request) score = request.priority await self.container.publish( @@ 
-47,13 +49,13 @@ async def push(self, request): priority=score ) - async def pop(self, count: int = 1): + async def pop(self, count: int = 1) -> Optional[aioscrapy.Request]: result = await self.container.get_message(self.key) if result: yield self._decode_request(result) - async def clear(self): - return await self.container.clean_message_queue(self.key) + async def clear(self) -> None: + await self.container.clean_message_queue(self.key) SpiderPriorityQueue = RabbitMqPriorityQueue diff --git a/aioscrapy/queue/redis.py b/aioscrapy/queue/redis.py index 13abb4c..be65546 100644 --- a/aioscrapy/queue/redis.py +++ b/aioscrapy/queue/redis.py @@ -1,10 +1,12 @@ import logging from abc import ABC +from typing import Optional -from aioscrapy.queue import AbsQueue +import aioscrapy from aioscrapy.db import db_manager -from aioscrapy.utils.misc import load_object +from aioscrapy.queue import AbsQueue from aioscrapy.serializer import AbsSerializer +from aioscrapy.utils.misc import load_object logger = logging.getLogger(__name__) @@ -13,11 +15,11 @@ class RedisQueueBase(AbsQueue, ABC): inc_key = 'scheduler/enqueued/redis' @classmethod - def from_dict(cls, data: dict) -> "AbsQueue": - alias = data.get("alias", 'queue') - server = db_manager.redis(alias) - spider_name = data["spider_name"] - serializer = data.get("serializer", "aioscrapy.serializer.JsonSerializer") + def from_dict(cls, data: dict) -> "RedisQueueBase": + alias: str = data.get("alias", 'queue') + server: aioscrapy.db.aioredis.Redis = db_manager.redis(alias) + spider_name: str = data["spider_name"] + serializer: str = data.get("serializer", "aioscrapy.serializer.JsonSerializer") serializer: AbsSerializer = load_object(serializer) return cls( server, @@ -26,12 +28,11 @@ def from_dict(cls, data: dict) -> "AbsQueue": ) @classmethod - async def from_spider(cls, spider) -> "RedisQueueBase": - settings = spider.settings - alias = settings.get("SCHEDULER_QUEUE_ALIAS", 'queue') - server = db_manager.redis(alias) - queue_key = settings.get("SCHEDULER_QUEUE_KEY", '%(spider)s:requests') - serializer = settings.get("SCHEDULER_SERIALIZER", "aioscrapy.serializer.JsonSerializer") + async def from_spider(cls, spider: aioscrapy.Spider) -> "RedisQueueBase": + alias: str = spider.settings.get("SCHEDULER_QUEUE_ALIAS", 'queue') + server: aioscrapy.db.aioredis.Redis = db_manager.redis(alias) + queue_key: str = spider.settings.get("SCHEDULER_QUEUE_KEY", '%(spider)s:requests') + serializer: str = spider.settings.get("SCHEDULER_SERIALIZER", "aioscrapy.serializer.JsonSerializer") serializer: AbsSerializer = load_object(serializer) return cls( server, @@ -40,7 +41,7 @@ async def from_spider(cls, spider) -> "RedisQueueBase": serializer=serializer ) - async def clear(self): + async def clear(self) -> None: """Clear queue/stack""" await self.container.delete(self.key) @@ -48,14 +49,14 @@ async def clear(self): class RedisFifoQueue(RedisQueueBase): """Per-spider FIFO queue""" - async def len(self): + async def len(self) -> int: return await self.container.llen(self.key) - async def push(self, request): + async def push(self, request: aioscrapy.Request) -> None: """Push a request""" await self.container.lpush(self.key, self._encode_request(request)) - async def pop(self, count: int = 1): + async def pop(self, count: int = 1) -> Optional[aioscrapy.Request]: """Pop a request""" async with self.container.pipeline(transaction=True) as pipe: for _ in range(count): @@ -69,16 +70,16 @@ async def pop(self, count: int = 1): class RedisPriorityQueue(RedisQueueBase): """Per-spider 
priority queue abstraction using redis' sorted set""" - async def len(self): + async def len(self) -> int: return await self.container.zcard(self.key) - async def push(self, request): + async def push(self, request: aioscrapy.Request) -> None: """Push a request""" data = self._encode_request(request) score = request.priority await self.container.zadd(self.key, {data: score}) - async def pop(self, count: int = 1): + async def pop(self, count: int = 1) -> Optional[aioscrapy.Request]: async with self.container.pipeline(transaction=True) as pipe: stop = count - 1 if count - 1 > 0 else 0 results, _ = await ( @@ -93,14 +94,14 @@ async def pop(self, count: int = 1): class RedisLifoQueue(RedisQueueBase): """Per-spider LIFO queue.""" - async def len(self): + async def len(self) -> int: return await self.container.llen(self.key) - async def push(self, request): + async def push(self, request: aioscrapy.Request) -> None: """Push a request""" await self.container.lpush(self.key, self._encode_request(request)) - async def pop(self, count: int = 1): + async def pop(self, count: int = 1) -> Optional[aioscrapy.Request]: """Pop a request""" async with self.container.pipeline(transaction=True) as pipe: for _ in range(count): @@ -111,7 +112,6 @@ async def pop(self, count: int = 1): yield self._decode_request(result) -# TODO: Deprecate the use of these names. SpiderQueue = RedisFifoQueue SpiderStack = RedisLifoQueue SpiderPriorityQueue = RedisPriorityQueue diff --git a/aioscrapy/settings/default_settings.py b/aioscrapy/settings/default_settings.py index 777bbc8..63ff60c 100644 --- a/aioscrapy/settings/default_settings.py +++ b/aioscrapy/settings/default_settings.py @@ -14,7 +14,6 @@ """ import sys -from importlib import import_module from os.path import join, abspath, dirname AUTOTHROTTLE_ENABLED = False @@ -166,7 +165,4 @@ URLLENGTH_LIMIT = 2083 -USER_AGENT = f'Aioscrapy/{import_module("aioscrapy").__version__}' - - CLOSE_SPIDER_ON_IDLE = False diff --git a/aioscrapy/utils/misc.py b/aioscrapy/utils/misc.py index 1f5bae1..a0f9835 100644 --- a/aioscrapy/utils/misc.py +++ b/aioscrapy/utils/misc.py @@ -2,7 +2,8 @@ from importlib import import_module from pkgutil import iter_modules -from .tools import call_helper + +from aioscrapy.utils.tools import call_helper def walk_modules(path): diff --git a/aioscrapy/utils/reqser.py b/aioscrapy/utils/reqser.py index 1906a53..55a3974 100644 --- a/aioscrapy/utils/reqser.py +++ b/aioscrapy/utils/reqser.py @@ -1,97 +1,15 @@ """ Helper functions for serializing (and deserializing) requests. """ -import inspect +from typing import Optional -from aioscrapy.http import Request -from aioscrapy.utils.misc import load_object -from aioscrapy.utils.python import to_unicode +import aioscrapy +from aioscrapy.utils.request import request_from_dict as _from_dict -def request_to_dict(request, spider=None): - """Convert Request object to a dict. +def request_to_dict(request: "aioscrapy.Request", spider: Optional["aioscrapy.Spider"] = None) -> dict: + return request.to_dict(spider=spider) - If a spider is given, it will try to find out the name of the spider method - used in the callback and store that as the callback. 
- """ - cb = request.callback - if callable(cb): - cb = _find_method(spider, cb) - eb = request.errback - if callable(eb): - eb = _find_method(spider, eb) - d = { - 'url': to_unicode(request.url), # urls should be safe (safe_string_url) - 'callback': cb, - 'errback': eb, - 'method': request.method, - 'headers': dict(request.headers), - 'body': request.body, - 'cookies': request.cookies, - 'meta': request.meta, - '_encoding': request._encoding, - 'priority': request.priority, - 'dont_filter': request.dont_filter, - 'flags': request.flags, - 'cb_kwargs': request.cb_kwargs, - 'fingerprint': request._fingerprint, - } - if type(request) is not Request: - d['_class'] = request.__module__ + '.' + request.__class__.__name__ - return d - -def request_from_dict(d, spider=None): - """Create Request object from a dict. - - If a spider is given, it will try to resolve the callbacks looking at the - spider for methods with the same name. - """ - cb = d.get('callback') or 'parse' - if cb and spider: - cb = _get_method(spider, cb) - eb = d.get('errback') - if eb and spider: - eb = _get_method(spider, eb) - request_cls = load_object(d['_class']) if '_class' in d else Request - return request_cls( - url=to_unicode(d['url']), - callback=cb, - errback=eb, - method=d.get('method', 'GET'), - headers=d.get('headers'), - body=d.get('body'), - cookies=d.get('cookies'), - meta=d.get('meta'), - encoding=d.get('_encoding', 'utf-8'), - priority=d.get('priority', 0), - dont_filter=d.get('dont_filter', True), - flags=d.get('flags'), - cb_kwargs=d.get('cb_kwargs'), - fingerprint=d.get('fingerprint'), - ) - - -def _find_method(obj, func): - # Only instance methods contain ``__func__`` - if obj and hasattr(func, '__func__'): - members = inspect.getmembers(obj, predicate=inspect.ismethod) - for name, obj_func in members: - # We need to use __func__ to access the original - # function object because instance method objects - # are generated each time attribute is retrieved from - # instance. - # - # Reference: The standard type hierarchy - # https://docs.python.org/3/reference/datamodel.html - if obj_func.__func__ is func.__func__: - return name - raise ValueError(f"Function {func} is not an instance method in: {obj}") - - -def _get_method(obj, name): - name = str(name) - try: - return getattr(obj, name) - except AttributeError: - raise ValueError(f"Method {name!r} not found in: {obj}") +def request_from_dict(d: dict, spider: Optional["aioscrapy.Spider"] = None) -> "aioscrapy.Request": + return _from_dict(d, spider=spider) diff --git a/aioscrapy/utils/request.py b/aioscrapy/utils/request.py index cdd65ab..b9fc0af 100644 --- a/aioscrapy/utils/request.py +++ b/aioscrapy/utils/request.py @@ -8,8 +8,9 @@ from w3lib.http import headers_dict_to_raw -from aioscrapy.http import Request +from aioscrapy import Spider, Request from aioscrapy.utils.httpobj import urlparse_cached +from aioscrapy.utils.misc import load_object from aioscrapy.utils.python import to_bytes, to_unicode @@ -36,3 +37,27 @@ def referer_str(request: Request) -> Optional[str]: if referrer is None: return referrer return to_unicode(referrer, errors='replace') + + +def request_from_dict(d: dict, *, spider: Optional[Spider] = None) -> Request: + """Create a :class:`~scrapy.Request` object from a dict. + + If a spider is given, it will try to resolve the callbacks looking at the + spider for methods with the same name. 
+ """ + request_cls = load_object(d["_class"]) if "_class" in d else Request + kwargs = {key: value for key, value in d.items() if key in request_cls.attributes} + if d.get("callback") and spider: + kwargs["callback"] = _get_method(spider, d["callback"]) + if d.get("errback") and spider: + kwargs["errback"] = _get_method(spider, d["errback"]) + return request_cls(**kwargs) + + +def _get_method(obj, name): + """Helper function for request_from_dict""" + name = str(name) + try: + return getattr(obj, name) + except AttributeError: + raise ValueError(f"Method {name!r} not found in: {obj}")