diff --git a/aioscrapy/__init__.py b/aioscrapy/__init__.py
index d6b9b11..453b7b2 100644
--- a/aioscrapy/__init__.py
+++ b/aioscrapy/__init__.py
@@ -8,10 +8,12 @@
# Declare top-level shortcuts
from aioscrapy.spiders import Spider
from aioscrapy.http import Request, FormRequest
+from aioscrapy.settings import Settings
+from aioscrapy.crawler import Crawler
__all__ = [
- '__version__', 'version_info', 'Spider', 'Request', 'FormRequest',
+ '__version__', 'version_info', 'Spider', 'Request', 'FormRequest', 'Settings', 'Crawler',
]
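
With Settings and Crawler exported as top-level shortcuts, the string annotations used throughout this patch ("aioscrapy.Crawler", aioscrapy.Settings) resolve from a plain `import aioscrapy`. A minimal construction sketch, not part of the patch; the spider class and setting value are illustrative:

    import aioscrapy

    class QuotesSpider(aioscrapy.Spider):
        name = "quotes"                               # hypothetical example spider
        start_urls = ["https://quotes.toscrape.com"]

    settings = aioscrapy.Settings({"CLOSE_SPIDER_ON_IDLE": True})
    crawler = aioscrapy.Crawler(QuotesSpider, settings=settings)
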
diff --git a/aioscrapy/core/engine.py b/aioscrapy/core/engine.py
index b81ea8a..7e3c629 100644
--- a/aioscrapy/core/engine.py
+++ b/aioscrapy/core/engine.py
@@ -4,6 +4,7 @@
import logging
from typing import Optional, AsyncGenerator, Union, Callable
+import aioscrapy
from aioscrapy import Spider
from aioscrapy import signals
from aioscrapy.core.downloader import DownloaderTV
@@ -35,7 +36,7 @@ def remove_request(self, request: Request) -> None:
class ExecutionEngine(object):
- def __init__(self, crawler) -> None:
+ def __init__(self, crawler: "aioscrapy.Crawler") -> None:
self.crawler = crawler
self.settings = crawler.settings
self.signals = crawler.signals
diff --git a/aioscrapy/core/scheduler.py b/aioscrapy/core/scheduler.py
index 997d4b5..ec99945 100644
--- a/aioscrapy/core/scheduler.py
+++ b/aioscrapy/core/scheduler.py
@@ -1,9 +1,7 @@
from abc import abstractmethod
from typing import Optional, Type, TypeVar
-from aioscrapy import Spider
-from aioscrapy.dupefilters import DupeFilterBase
-from aioscrapy.http.request import Request
+import aioscrapy
from aioscrapy.queue import AbsQueue
from aioscrapy.statscollectors import StatsCollector
from aioscrapy.utils.misc import load_instance
@@ -29,7 +27,7 @@ def __subclasscheck__(cls, subclass):
class BaseScheduler(metaclass=BaseSchedulerMeta):
@classmethod
- async def from_crawler(cls, crawler):
+ async def from_crawler(cls, crawler: "aioscrapy.Crawler") -> "BaseScheduler":
"""
Factory method which receives the current :class:`~scrapy.crawler.Crawler` object as argument.
"""
@@ -53,7 +51,7 @@ async def has_pending_requests(self) -> bool:
raise NotImplementedError()
@abstractmethod
- async def enqueue_request(self, request: Request) -> bool:
+ async def enqueue_request(self, request: aioscrapy.Request) -> bool:
"""
Process a request received by the engine.
@@ -67,7 +65,7 @@ async def enqueue_request(self, request: Request) -> bool:
raise NotImplementedError()
@abstractmethod
- async def next_request(self) -> Optional[Request]:
+ async def next_request(self) -> Optional[aioscrapy.Request]:
"""
Return the next :class:`~scrapy.http.Request` to be processed, or ``None``
to indicate that there are no requests to be considered ready at the moment.
@@ -87,7 +85,7 @@ class Scheduler(BaseScheduler):
def __init__(
self,
queue: AbsQueue,
- spider: Spider,
+ spider: aioscrapy.Spider,
stats=Optional[StatsCollector],
persist: bool = True
):
@@ -97,7 +95,7 @@ def __init__(
self.persist = persist
@classmethod
- async def from_crawler(cls: Type[SchedulerTV], crawler) -> SchedulerTV:
+ async def from_crawler(cls: Type[SchedulerTV], crawler: "aioscrapy.Crawler") -> SchedulerTV:
instance = cls(
await load_instance(crawler.settings['SCHEDULER_QUEUE_CLASS'], spider=crawler.spider),
crawler.spider,
@@ -120,13 +118,13 @@ async def close(self, reason: str) -> None:
async def flush(self) -> None:
await call_helper(self.queue.clear)
- async def enqueue_request(self, request: Request) -> bool:
+ async def enqueue_request(self, request: aioscrapy.Request) -> bool:
await call_helper(self.queue.push, request)
if self.stats:
self.stats.inc_value(self.queue.inc_key, spider=self.spider)
return True
- async def next_request(self, count: int = 1) -> Optional[Request]:
+ async def next_request(self, count: int = 1) -> Optional[aioscrapy.Request]:
async for request in self.queue.pop(count):
if request and self.stats:
self.stats.inc_value(self.queue.inc_key, spider=self.spider)
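
The BaseScheduler contract (from_crawler, has_pending_requests, enqueue_request, next_request) is all the engine needs from a scheduler. A hedged in-memory sketch of that interface, not part of the patch — the concrete Scheduler above delegates to an AbsQueue instead:

    import asyncio
    from typing import Optional

    import aioscrapy
    from aioscrapy.core.scheduler import BaseScheduler

    class InMemoryScheduler(BaseScheduler):
        """Illustrative scheduler backed by asyncio.Queue (hypothetical)."""

        def __init__(self) -> None:
            self._queue: "asyncio.Queue[aioscrapy.Request]" = asyncio.Queue()

        @classmethod
        async def from_crawler(cls, crawler: "aioscrapy.Crawler") -> "InMemoryScheduler":
            return cls()

        async def has_pending_requests(self) -> bool:
            return not self._queue.empty()

        async def enqueue_request(self, request: aioscrapy.Request) -> bool:
            await self._queue.put(request)
            return True

        async def next_request(self) -> Optional[aioscrapy.Request]:
            try:
                return self._queue.get_nowait()
            except asyncio.QueueEmpty:
                return None
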
diff --git a/aioscrapy/core/scraper.py b/aioscrapy/core/scraper.py
index 689f932..f52e208 100644
--- a/aioscrapy/core/scraper.py
+++ b/aioscrapy/core/scraper.py
@@ -4,6 +4,7 @@
import logging
from typing import Any, AsyncGenerator, Set, Union, Optional
+import aioscrapy
from aioscrapy import signals, Spider
from aioscrapy.exceptions import CloseSpider, DropItem, IgnoreRequest
from aioscrapy.http import Request, Response
@@ -54,7 +55,7 @@ class Scraper:
def __init__(
self,
- crawler,
+ crawler: "aioscrapy.Crawler",
slot: Slot,
spidermw: SpiderMiddlewareManager,
itemproc: ItemPipelineManager,
@@ -72,7 +73,7 @@ def __init__(
self.concurrent_parser = asyncio.Semaphore(crawler.settings.getint('CONCURRENT_PARSER', 1))
@classmethod
- async def from_crawler(cls, crawler):
+ async def from_crawler(cls, crawler: "aioscrapy.Crawler") -> "Scraper":
instance: "Scraper" = cls(
crawler,
Slot(crawler.settings.getint('SCRAPER_SLOT_MAX_ACTIVE_SIZE')),
diff --git a/aioscrapy/crawler.py b/aioscrapy/crawler.py
index 6faaddf..3c55746 100644
--- a/aioscrapy/crawler.py
+++ b/aioscrapy/crawler.py
@@ -4,7 +4,7 @@
import signal
import sys
import warnings
-from typing import Optional
+from typing import Optional, Type, Union, Any
from zope.interface.exceptions import DoesNotImplement
@@ -15,8 +15,9 @@
MultipleInvalid = None
from zope.interface.verify import verifyClass
+from aioscrapy.logformatter import LogFormatter
from aioscrapy import signals, Spider
-from aioscrapy.settings import overridden_settings
+from aioscrapy.settings import overridden_settings, Settings
from aioscrapy.utils.log import (
get_scrapy_root_handler,
install_scrapy_root_handler,
@@ -31,7 +32,6 @@
from aioscrapy.utils.tools import async_generator_wrapper
from aioscrapy.middleware import ExtensionManager
from aioscrapy.core.engine import ExecutionEngine
-from aioscrapy.settings import Settings
from aioscrapy.signalmanager import SignalManager
from aioscrapy.utils.ossignal import install_shutdown_handlers, signal_names
from aioscrapy.statscollectors import StatsCollector
@@ -41,7 +41,7 @@
class Crawler:
- def __init__(self, spidercls, settings=None):
+ def __init__(self, spidercls: Type[Spider], settings: Union[Settings, dict, None] = None) -> None:
if isinstance(spidercls, Spider):
raise ValueError('The spidercls argument must be a class, not an object')
@@ -49,7 +49,7 @@ def __init__(self, spidercls, settings=None):
if isinstance(settings, dict) or settings is None:
settings = Settings(settings)
- self.spidercls: Spider = spidercls
+ self.spidercls = spidercls
self.settings = settings.copy()
self.spidercls.update_settings(self.settings)
@@ -59,10 +59,10 @@ def __init__(self, spidercls, settings=None):
self.crawling = False
self.spider: Optional[Spider] = None
self.engine: Optional[ExecutionEngine] = None
- self.extensions: Optional[ExecutionEngine] = None
- self.logformatter: Optional[ExecutionEngine] = None
+ self.extensions: Optional[ExtensionManager] = None
+ self.logformatter: Optional[LogFormatter] = None
- async def crawl(self, *args, **kwargs):
+ async def crawl(self, *args, **kwargs) -> None:
try:
if self.crawling:
raise RuntimeError("Crawling already taking place")
@@ -96,7 +96,7 @@ async def crawl(self, *args, **kwargs):
await self.engine.close()
raise e
- async def stop(self):
+ async def stop(self) -> None:
"""Starts a graceful stop of the crawler and returns a deferred that is
fired when the crawler is stopped."""
if self.crawling:
@@ -112,7 +112,7 @@ class CrawlerRunner:
)
@staticmethod
- def _get_spider_loader(settings):
+ def _get_spider_loader(settings: Settings) -> ISpiderLoader:
""" Get SpiderLoader instance from settings """
cls_path = settings.get('SPIDER_LOADER_CLASS')
loader_cls = load_object(cls_path)
@@ -128,7 +128,7 @@ def _get_spider_loader(settings):
)
return loader_cls.from_settings(settings.frozencopy())
- def __init__(self, settings=None):
+ def __init__(self, settings: Union[Settings, dict, None] = None) -> None:
if isinstance(settings, dict) or settings is None:
settings = Settings(settings)
self.settings = settings
@@ -144,12 +144,18 @@ def spiders(self):
category=AioScrapyDeprecationWarning, stacklevel=2)
return self.spider_loader
- def crawl_soon(self, crawler_or_spidercls, *args, settings=None, **kwargs):
+ def crawl_soon(
+ self,
+ crawler_or_spidercls: Union[Type[Spider], Crawler],
+ *args,
+ settings: Union[Settings, dict, None] = None,
+ **kwargs
+ ) -> None:
crawler = self.crawl(crawler_or_spidercls, settings=settings)
self.crawlers.setdefault(crawler, (args, kwargs))
self.active_crawler(crawler, *args, **kwargs)
- def active_crawler(self, crawler, *args, **kwargs):
+ def active_crawler(self, crawler: Crawler, *args, **kwargs) -> None:
task = asyncio.create_task(crawler.crawl(*args, **kwargs))
self._active.add(task)
@@ -161,7 +167,13 @@ def _done(result):
task.add_done_callback(_done)
- def crawl(self, crawler_or_spidercls, *args, settings=None, **kwargs):
+ def crawl(
+ self,
+ crawler_or_spidercls: Union[Type[Spider], Crawler],
+ *args,
+ settings: Union[Settings, dict, None] = None,
+ **kwargs
+ ) -> Crawler:
if isinstance(crawler_or_spidercls, Spider):
raise ValueError(
'The crawler_or_spidercls argument cannot be a spider object, '
@@ -170,7 +182,11 @@ def crawl(self, crawler_or_spidercls, *args, settings=None, **kwargs):
self.crawlers.setdefault(crawler, (args, kwargs))
return crawler
- def create_crawler(self, crawler_or_spidercls, settings):
+ def create_crawler(
+ self,
+ crawler_or_spidercls: Union[Type[Spider], Crawler, str],
+ settings: Union[Settings, dict, None]
+ ) -> Crawler:
if isinstance(crawler_or_spidercls, Spider):
raise ValueError(
'The crawler_or_spidercls argument cannot be a spider object, '
@@ -179,44 +195,52 @@ def create_crawler(self, crawler_or_spidercls, settings):
return crawler_or_spidercls
return self._create_crawler(crawler_or_spidercls, settings)
- def _create_crawler(self, spidercls, settings):
+ def _create_crawler(
+ self,
+ spidercls: Union[Type[Spider], str],
+ settings: Union[Settings, dict, None]
+ ) -> Crawler:
if isinstance(spidercls, str):
spidercls = self.spider_loader.load(spidercls)
return Crawler(spidercls, settings=settings)
- async def stop(self):
- return await asyncio.gather(*[c.stop() for c in self.crawlers])
+ async def stop(self) -> None:
+ await asyncio.gather(*[c.stop() for c in self.crawlers])
class CrawlerProcess(CrawlerRunner):
- def __init__(self, settings=None, install_root_handler=True):
+ def __init__(
+ self,
+ settings: Union[Settings, dict, None] = None,
+ install_root_handler: bool = True
+ ) -> None:
super().__init__(settings)
install_shutdown_handlers(self._signal_shutdown)
configure_logging(self.settings, install_root_handler)
- def _signal_shutdown(self, signum, _):
+ def _signal_shutdown(self, signum: Any, _) -> None:
install_shutdown_handlers(self._signal_kill)
signame = signal_names[signum]
logger.info("Received %(signame)s, shutting down gracefully. Send again to force ",
{'signame': signame})
asyncio.create_task(self._graceful_stop_reactor())
- def _signal_kill(self, signum, _):
+ def _signal_kill(self, signum: Any, _) -> None:
install_shutdown_handlers(signal.SIG_IGN)
signame = signal_names[signum]
logger.info('Received %(signame)s twice, forcing unclean shutdown',
{'signame': signame})
asyncio.create_task(self._stop_reactor())
- async def run(self):
+ async def run(self) -> None:
for crawler, (args, kwargs) in self.crawlers.items():
self.active_crawler(crawler, *args, **kwargs)
while self._active:
await asyncio.gather(*self._active)
await self.recycle_db_connect()
- def start(self):
+ def start(self) -> None:
if sys.platform.startswith('win'):
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
else:
@@ -227,17 +251,17 @@ def start(self):
pass
asyncio.run(self.run())
- async def _graceful_stop_reactor(self):
+ async def _graceful_stop_reactor(self) -> None:
await self.stop()
await self.recycle_db_connect()
- async def _stop_reactor(self):
+ async def _stop_reactor(self) -> None:
try:
await self.recycle_db_connect()
finally:
asyncio.get_event_loop().stop()
- async def recycle_db_connect(self):
- # 回收所以的链接
+ async def recycle_db_connect(self) -> None:
+ # recycle all db connection pools held by db_manager
if not len(self._active):
await db_manager.close_all()
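
Entry-point usage is unchanged by the annotations above; a sketch assuming a hypothetical MySpider class:

    from aioscrapy.crawler import CrawlerProcess
    from myproject.spiders import MySpider   # hypothetical project layout

    process = CrawlerProcess(settings={"CLOSE_SPIDER_ON_IDLE": True})
    process.crawl(MySpider)   # registers the crawler; it is started inside run()
    process.start()           # asyncio.run(self.run()) drives all registered crawlers
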
diff --git a/aioscrapy/db/__init__.py b/aioscrapy/db/__init__.py
index f4591c9..690a9e5 100644
--- a/aioscrapy/db/__init__.py
+++ b/aioscrapy/db/__init__.py
@@ -1,6 +1,9 @@
import logging
+from typing import Any
-from aioscrapy.db._aioredis import redis_manager
+import aioscrapy
+from aioscrapy.db.absmanager import AbsDBPoolManager
+from aioscrapy.db.aioredis import redis_manager
db_manager_map = {
'redis': redis_manager
@@ -8,7 +11,7 @@
try:
from aiomysql import create_pool
- from aioscrapy.db._aiomysql import mysql_manager
+ from aioscrapy.db.aiomysql import mysql_manager
db_manager_map['mysql'] = mysql_manager
except ImportError:
@@ -16,7 +19,7 @@
try:
import aio_pika
- from aioscrapy.db._aiorabbitmq import rabbitmq_manager
+ from aioscrapy.db.aiorabbitmq import rabbitmq_manager
db_manager_map['rabbitmq'] = rabbitmq_manager
except ImportError:
@@ -30,22 +33,22 @@
class DBManager:
@staticmethod
- def get_manager(db_type):
+ def get_manager(db_type: str) -> AbsDBPoolManager:
manager = db_manager_map.get(db_type)
assert manager is not None, f"Not support db type:{db_type}"
return manager
- def get_pool(self, db_type, alias='default'):
+ def get_pool(self, db_type: str, alias: str = 'default') -> Any:
manager = self.get_manager(db_type)
return manager.get_pool(alias)
@staticmethod
- async def close_all():
+ async def close_all() -> None:
for manager in db_manager_map.values():
await manager.close_all()
@staticmethod
- async def from_dict(db_args: dict):
+ async def from_dict(db_args: dict) -> None:
for db_type, args in db_args.items():
manager = db_manager_map.get(db_type)
if manager is None:
@@ -53,14 +56,14 @@ async def from_dict(db_args: dict):
await manager.from_dict(args)
@staticmethod
- async def from_settings(settings: "aioscrapy.settings.Setting"):
+ async def from_settings(settings: aioscrapy.Settings) -> None:
for manager in db_manager_map.values():
await manager.from_settings(settings)
- async def from_crawler(self, crawler):
+ async def from_crawler(self, crawler: "aioscrapy.Crawler") -> None:
return await self.from_settings(crawler.settings)
- def __getattr__(self, db_type: str):
+ def __getattr__(self, db_type: str) -> Any:
if db_type not in db_manager_map:
raise AttributeError(f'Not support db type: {db_type}')
return db_manager_map[db_type]
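
Every pool manager registered in db_manager_map is driven the same way through DBManager; a sketch assuming an illustrative REDIS_ARGS shape (see AioRedisPoolManager.create for the real keys):

    import asyncio

    from aioscrapy.db import db_manager
    from aioscrapy.settings import Settings

    async def main() -> None:
        settings = Settings({
            "REDIS_ARGS": {
                "default": {"url": "redis://localhost:6379/0"},   # assumed argument shape
            }
        })
        await db_manager.from_settings(settings)              # build every configured pool
        redis = db_manager.get_pool("redis", alias="default")
        print(await redis.ping())                             # assumes a redis.asyncio client is returned
        await db_manager.close_all()

    asyncio.run(main())
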
diff --git a/aioscrapy/db/absmanager.py b/aioscrapy/db/absmanager.py
index a88e3f6..bc95185 100644
--- a/aioscrapy/db/absmanager.py
+++ b/aioscrapy/db/absmanager.py
@@ -1,6 +1,6 @@
from abc import ABCMeta, abstractmethod
-import aioscrapy.crawler
+import aioscrapy
class AbsDBPoolManager(object, metaclass=ABCMeta):
@@ -31,10 +31,10 @@ async def from_dict(self, db_args: dict):
"""Create pool with dict"""
@abstractmethod
- async def from_settings(self, settings: "aioscrapy.settings.Settings"):
+ async def from_settings(self, settings: aioscrapy.Settings):
"""Create pool with settings"""
- async def from_crawler(self, crawler: "aioscrapy.crawler.Crawler"):
+ async def from_crawler(self, crawler: "aioscrapy.Crawler"):
return await self.from_settings(crawler.settings)
def __call__(self, alias: str):
diff --git a/aioscrapy/db/_aiomysql.py b/aioscrapy/db/aiomysql.py
similarity index 97%
rename from aioscrapy/db/_aiomysql.py
rename to aioscrapy/db/aiomysql.py
index f2280c6..62e813b 100644
--- a/aioscrapy/db/_aiomysql.py
+++ b/aioscrapy/db/aiomysql.py
@@ -4,6 +4,7 @@
from aiomysql import create_pool
+import aioscrapy
from aioscrapy.db.absmanager import AbsDBPoolManager
logger = logging.getLogger(__name__)
@@ -84,14 +85,13 @@ async def from_dict(self, db_args: dict):
for alias, mysql_args in db_args.items():
await self.create(alias, mysql_args)
- async def from_settings(self, settings: "aioscrapy.settings.Setting"):
+ async def from_settings(self, settings: aioscrapy.Settings):
for alias, mysql_args in settings.getdict('MYSQL_ARGS').items():
await self.create(alias, mysql_args)
mysql_manager = AioMysqlPoolManager()
-
if __name__ == '__main__':
import asyncio
diff --git a/aioscrapy/db/_aiorabbitmq.py b/aioscrapy/db/aiorabbitmq.py
similarity index 98%
rename from aioscrapy/db/_aiorabbitmq.py
rename to aioscrapy/db/aiorabbitmq.py
index 0e73234..d3ccf57 100644
--- a/aioscrapy/db/_aiorabbitmq.py
+++ b/aioscrapy/db/aiorabbitmq.py
@@ -5,6 +5,7 @@
from aio_pika.exceptions import QueueEmpty
from aio_pika.pool import Pool
+import aioscrapy
from aioscrapy.db.absmanager import AbsDBPoolManager
logger = logging.getLogger(__name__)
@@ -133,7 +134,7 @@ async def from_dict(self, db_args: dict):
for alias, rabbitmq_args in db_args.items():
await self.create(alias, rabbitmq_args)
- async def from_settings(self, settings: "aioscrapy.settings.Settings"):
+ async def from_settings(self, settings: aioscrapy.Settings):
for alias, rabbitmq_args in settings.getdict('RABBITMQ_ARGS').items():
await self.create(alias, rabbitmq_args)
diff --git a/aioscrapy/db/_aioredis.py b/aioscrapy/db/aioredis.py
similarity index 97%
rename from aioscrapy/db/_aioredis.py
rename to aioscrapy/db/aioredis.py
index a1f64a0..81c0ff7 100644
--- a/aioscrapy/db/_aioredis.py
+++ b/aioscrapy/db/aioredis.py
@@ -1,5 +1,6 @@
from redis.asyncio import BlockingConnectionPool, Redis
+import aioscrapy
from aioscrapy.db.absmanager import AbsDBPoolManager
@@ -59,7 +60,7 @@ async def from_dict(self, db_args: dict):
for alias, redis_args in db_args.items():
await self.create(alias, redis_args)
- async def from_settings(self, settings: "aioscrapy.settings.Settings"):
+ async def from_settings(self, settings: aioscrapy.Settings):
"""Create redis with settings"""
for alias, redis_args in settings.getdict('REDIS_ARGS').items():
await self.create(alias, redis_args)
@@ -67,7 +68,6 @@ async def from_settings(self, settings: "aioscrapy.settings.Settings"):
redis_manager = AioRedisPoolManager()
-
if __name__ == '__main__':
import asyncio
diff --git a/aioscrapy/http/request/__init__.py b/aioscrapy/http/request/__init__.py
index 435309b..a98bdc2 100644
--- a/aioscrapy/http/request/__init__.py
+++ b/aioscrapy/http/request/__init__.py
@@ -5,19 +5,29 @@
See documentation in docs/topics/request-response.rst
"""
import hashlib
-from typing import Callable
-from typing import Iterable, Optional, Union
+import inspect
+import json
+from typing import Callable, List, Optional, Tuple, Type, TypeVar
from w3lib.url import canonicalize_url
from w3lib.url import safe_url_string
+import aioscrapy
from aioscrapy.http.headers import Headers
from aioscrapy.utils.curl import curl_to_request_kwargs
-from aioscrapy.utils.python import to_bytes
+from aioscrapy.utils.python import to_unicode
from aioscrapy.utils.url import escape_ajax
+RequestTypeVar = TypeVar("RequestTypeVar", bound="Request")
+
class Request(object):
+ attributes: Tuple[str, ...] = (
+ "url", "callback", "method", "headers", "body",
+ "cookies", "meta", "encoding", "priority",
+ "dont_filter", "errback", "flags", "cb_kwargs",
+ "fingerprint"
+ )
def __init__(
self,
@@ -32,7 +42,7 @@ def __init__(
priority: int = 0,
dont_filter: bool = False,
errback: Optional[Callable] = None,
- flags: Optional[None] = None,
+ flags: Optional[List[str]] = None,
cb_kwargs: Optional[Callable] = None,
fingerprint: Optional[str] = None,
):
@@ -57,21 +67,21 @@ def __init__(
self.flags = [] if flags is None else list(flags)
@property
- def cb_kwargs(self):
+ def cb_kwargs(self) -> dict:
if self._cb_kwargs is None:
self._cb_kwargs = {}
return self._cb_kwargs
@property
- def meta(self):
+ def meta(self) -> dict:
if self._meta is None:
self._meta = {}
return self._meta
- def _get_url(self):
+ def _get_url(self) -> str:
return self._url
- def _set_url(self, url):
+ def _set_url(self, url: str) -> None:
if not isinstance(url, str):
raise TypeError(f'Request url must be str or unicode, got {type(url).__name__}')
@@ -87,21 +97,18 @@ def _set_url(self, url):
url = property(_get_url, _set_url)
- def _get_body(self):
+ def _get_body(self) -> str:
return self._body
- def _set_body(self, body):
- if body is None:
- self._body = ''
- else:
- self._body = body
+ def _set_body(self, body: str) -> None:
+ self._body = '' if body is None else body
body = property(_get_body, _set_body)
- def _set_fingerprint(self, fingerprint):
+ def _set_fingerprint(self, fingerprint: Optional[str]) -> None:
self._fingerprint = fingerprint
- def _get_fingerprint(self):
+ def _get_fingerprint(self) -> str:
if self._fingerprint is None:
self._fingerprint = self.make_fingerprint()
return self._fingerprint
@@ -109,75 +116,83 @@ def _get_fingerprint(self):
fingerprint = property(_get_fingerprint, _set_fingerprint)
@property
- def encoding(self):
+ def encoding(self) -> str:
return self._encoding
- def __str__(self):
+ def __str__(self) -> str:
return f"<{self.method} {self.url}>"
__repr__ = __str__
- def copy(self):
+ def copy(self) -> "Request":
"""Return a copy of this Request"""
return self.replace()
- def replace(self, *args, **kwargs):
- """Create a new Request with the same attributes except for those
- given new values.
- """
- for x in ['url', 'method', 'headers', 'body', 'cookies', 'meta', 'flags', 'encoding',
- 'priority', 'dont_filter', 'callback', 'errback', 'cb_kwargs', 'fingerprint']:
+ def replace(self, *args, **kwargs) -> "Request":
+ """Create a new Request with the same attributes except for those given new values."""
+ for x in self.attributes:
kwargs.setdefault(x, getattr(self, x))
cls = kwargs.pop('cls', self.__class__)
return cls(*args, **kwargs)
@classmethod
- def from_curl(cls, curl_command, ignore_unknown_options=True, **kwargs):
- """Create a Request object from a string containing a `cURL
- `_ command. It populates the HTTP method, the
- URL, the headers, the cookies and the body. It accepts the same
- arguments as the :class:`Request` class, taking preference and
- overriding the values of the same arguments contained in the cURL
- command.
-
- Unrecognized options are ignored by default. To raise an error when
- finding unknown options call this method by passing
- ``ignore_unknown_options=False``.
-
- .. caution:: Using :meth:`from_curl` from :class:`~scrapy.http.Request`
- subclasses, such as :class:`~scrapy.http.JSONRequest`, or
- :class:`~scrapy.http.XmlRpcRequest`, as well as having
- :ref:`downloader middlewares `
- and
- :ref:`spider middlewares `
- enabled, such as
- :class:`~scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware`,
- :class:`~scrapy.downloadermiddlewares.useragent.UserAgentMiddleware`,
- or
- :class:`~scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware`,
- may modify the :class:`~scrapy.http.Request` object.
-
- To translate a cURL command into a aioscrapy request,
- you may use `curl2scrapy `_.
-
- """
+ def from_curl(
+ cls: Type[RequestTypeVar], curl_command: str, ignore_unknown_options: bool = True, **kwargs
+ ) -> RequestTypeVar:
+ """Create a Request object from a string containing a `cURL"""
request_kwargs = curl_to_request_kwargs(curl_command, ignore_unknown_options)
request_kwargs.update(kwargs)
return cls(**request_kwargs)
- def serialize(self, callback):
- from aioscrapy.utils.reqser import request_to_dict
- return callback(request_to_dict(self))
-
def make_fingerprint(
self,
- include_headers: Optional[Iterable[Union[bytes, str]]] = None,
keep_fragments: bool = False,
- ):
+ ) -> str:
""" make the request fingerprint. """
- fp = hashlib.sha1()
- fp.update(to_bytes(self.method))
- fp.update(to_bytes(canonicalize_url(self.url, keep_fragments=keep_fragments)))
- fp.update(to_bytes(self.body) or b'')
- fp.hexdigest()
- return fp.hexdigest()
+ return hashlib.sha1(
+ json.dumps({
+ 'method': to_unicode(self.method),
+ 'url': canonicalize_url(self.url, keep_fragments=keep_fragments),
+ 'body': self.body,
+ }, sort_keys=True).encode()
+ ).hexdigest()
+
+ def to_dict(self, *, spider: Optional["aioscrapy.Spider"] = None) -> dict:
+ """Return a dictionary containing the Request's data.
+
+ Use :func:`~aioscrapy.utils.request.request_from_dict` to convert back into a :class:`~aioscrapy.Request` object.
+
+ If a spider is given, this method will try to find out the name of the spider methods used as callback
+ and errback and include them in the output dict, raising an exception if they cannot be found.
+ """
+ d = {
+ "url": self.url, # urls are safe (safe_string_url)
+ "callback": _find_method(spider, self.callback) if callable(self.callback) else self.callback,
+ "errback": _find_method(spider, self.errback) if callable(self.errback) else self.errback,
+ "headers": dict(self.headers),
+ }
+ if self._fingerprint:
+ d['fingerprint'] = self._fingerprint
+
+ for attr in self.attributes:
+ if attr != 'fingerprint':
+ d.setdefault(attr, getattr(self, attr))
+ if type(self) is not Request:
+ d["_class"] = self.__module__ + '.' + self.__class__.__name__
+ return d
+
+
+def _find_method(obj, func):
+ """Helper function for Request.to_dict"""
+ # Only instance methods contain ``__func__``
+ if obj and hasattr(func, '__func__'):
+ members = inspect.getmembers(obj, predicate=inspect.ismethod)
+ for name, obj_func in members:
+ # We need to use __func__ to access the original function object because instance
+ # method objects are generated each time attribute is retrieved from instance.
+ #
+ # Reference: The standard type hierarchy
+ # https://docs.python.org/3/reference/datamodel.html
+ if obj_func.__func__ is func.__func__:
+ return name
+ raise ValueError(f"Function {func} is not an instance method in: {obj}")
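
The new attributes tuple ties replace(), to_dict() and make_fingerprint() together with request_from_dict; a round-trip sketch (URL and payload are illustrative):

    from aioscrapy import Request
    from aioscrapy.utils.request import request_from_dict

    req = Request(
        "https://example.com/items?page=1",
        method="POST",
        body='{"page": 1}',
        meta={"download_timeout": 30},
        priority=10,
    )

    # sha1 over the canonical url, method and body, serialized as sorted JSON
    print(req.fingerprint)

    # to_dict() keeps every name in Request.attributes, plus "_class" for subclasses
    data = req.to_dict()
    restored = request_from_dict(data)
    assert restored.url == req.url and restored.fingerprint == req.fingerprint
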
diff --git a/aioscrapy/http/request/form.py b/aioscrapy/http/request/form.py
index dea0d12..79972d4 100644
--- a/aioscrapy/http/request/form.py
+++ b/aioscrapy/http/request/form.py
@@ -4,18 +4,19 @@
See documentation in docs/topics/request-response.rst
"""
-
+from typing import List, Optional, Tuple, Union
from urllib.parse import urlencode
from aioscrapy.http.request import Request
from aioscrapy.utils.python import to_bytes, is_listlike
+FormdataType = Optional[Union[dict, List[Tuple[str, str]]]]
+
class FormRequest(Request):
valid_form_methods = ['GET', 'POST']
- def __init__(self, *args, **kwargs):
- formdata = kwargs.pop('formdata', None)
+ def __init__(self, *args, formdata: FormdataType = None, **kwargs) -> None:
if formdata and kwargs.get('method') is None:
kwargs['method'] = 'POST'
@@ -23,12 +24,12 @@ def __init__(self, *args, **kwargs):
if formdata:
items = formdata.items() if isinstance(formdata, dict) else formdata
- querystr = _urlencode(items, self.encoding)
+ form_query: str = _urlencode(items, self.encoding)
if self.method == 'POST':
self.headers.setdefault('Content-Type', 'application/x-www-form-urlencoded')
- self._set_body(querystr)
+ self._set_body(form_query)
else:
- self._set_url(self.url + ('&' if '?' in self.url else '?') + querystr)
+ self._set_url(self.url + ('&' if '?' in self.url else '?') + form_query)
def _urlencode(seq, enc):
diff --git a/aioscrapy/http/request/json_request.py b/aioscrapy/http/request/json_request.py
index 661cd05..48bf2d0 100644
--- a/aioscrapy/http/request/json_request.py
+++ b/aioscrapy/http/request/json_request.py
@@ -8,15 +8,17 @@
import copy
import json
import warnings
+from typing import Optional, Tuple
from aioscrapy.http.request import Request
from aioscrapy.utils.deprecate import create_deprecated_class
class JsonRequest(Request):
- def __init__(self, *args, **kwargs):
- dumps_kwargs = copy.deepcopy(kwargs.pop('dumps_kwargs', {}))
- dumps_kwargs.setdefault('sort_keys', True)
+ attributes: Tuple[str, ...] = Request.attributes + ("dumps_kwargs",)
+
+ def __init__(self, *args, dumps_kwargs: Optional[dict] = None, **kwargs) -> None:
+ dumps_kwargs = copy.deepcopy(dumps_kwargs) if dumps_kwargs is not None else {}
self._dumps_kwargs = dumps_kwargs
body_passed = kwargs.get('body', None) is not None
@@ -36,7 +38,11 @@ def __init__(self, *args, **kwargs):
self.headers.setdefault('Content-Type', 'application/json')
self.headers.setdefault('Accept', 'application/json, text/javascript, */*; q=0.01')
- def replace(self, *args, **kwargs):
+ @property
+ def dumps_kwargs(self) -> dict:
+ return self._dumps_kwargs
+
+ def replace(self, *args, **kwargs) -> Request:
body_passed = kwargs.get('body', None) is not None
data = kwargs.pop('data', None)
data_passed = data is not None
@@ -49,7 +55,7 @@ def replace(self, *args, **kwargs):
return super().replace(*args, **kwargs)
- def _dumps(self, data):
+ def _dumps(self, data: dict) -> str:
"""Convert to JSON """
return json.dumps(data, **self._dumps_kwargs)
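
With "dumps_kwargs" added to attributes, replace() (and therefore copy() and to_dict()) no longer drops the JSON options; a sketch assuming the usual data= keyword handled in the unchanged part of __init__:

    from aioscrapy.http.request.json_request import JsonRequest

    req = JsonRequest(
        "https://api.example.com/items",
        data={"page": 1},                    # serialized with json.dumps(..., **dumps_kwargs)
        dumps_kwargs={"sort_keys": True},    # no longer defaulted; pass it explicitly if needed
    )

    clone = req.replace(priority=5)
    assert clone.dumps_kwargs == {"sort_keys": True}
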
diff --git a/aioscrapy/queue/__init__.py b/aioscrapy/queue/__init__.py
index 0e9848e..110b589 100644
--- a/aioscrapy/queue/__init__.py
+++ b/aioscrapy/queue/__init__.py
@@ -1,33 +1,22 @@
from abc import ABCMeta, abstractmethod
-from typing import Optional
+from typing import Optional, Any
+import aioscrapy
from aioscrapy.serializer import AbsSerializer
-from aioscrapy.utils.reqser import request_to_dict, request_from_dict
+from aioscrapy.utils.reqser import request_from_dict
-class AbsQueue(object, metaclass=ABCMeta):
+class AbsQueue(metaclass=ABCMeta):
"""Per-spider base queue class"""
def __init__(
- self, container,
- spider: Optional[str] = None,
+ self,
+ container: Any,
+ spider: Optional[aioscrapy.Spider] = None,
key: Optional[str] = None,
serializer: Optional[AbsSerializer] = None
- ):
- """Initialize per-spider redis queue.
-
- Parameters
- ----------
- container : Redis/Queue
- The queue for Request.
- spider : Spider
- aioscrapy spider instance.
- key: str
- Redis key where to put and get messages.
- serializer : object
- Serializer object with ``loads`` and ``dumps`` methods.
-
- """
+ ) -> None:
+ """Initialize per-spider queue"""
self.container = container
self.spider = spider
self.key = key
@@ -35,40 +24,40 @@ def __init__(
@property
@abstractmethod
- def inc_key(self):
+ def inc_key(self) -> str:
"""stats inc_value"""
@classmethod
@abstractmethod
- async def from_spider(cls, spider) -> "AbsQueue":
+ async def from_spider(cls, spider: aioscrapy.Spider) -> "AbsQueue":
"""get queue instance from spider"""
- def _encode_request(self, request):
+ def _encode_request(self, request: aioscrapy.Request) -> Any:
"""Encode a request object"""
- obj = request_to_dict(request, self.spider)
+ obj = request.to_dict(spider=self.spider)
return self.serializer.dumps(obj)
- def _decode_request(self, encoded_request):
+ def _decode_request(self, encoded_request: Any) -> aioscrapy.Request:
"""Decode an request previously encoded"""
obj = self.serializer.loads(encoded_request)
- return request_from_dict(obj, self.spider)
+ return request_from_dict(obj, spider=self.spider)
- def __len__(self):
+ def __len__(self) -> None:
"""Return the length of the queue"""
raise Exception('please use len()')
@abstractmethod
- async def len(self):
+ async def len(self) -> int:
"""Return the length of the queue"""
@abstractmethod
- async def push(self, request):
+ async def push(self, request: aioscrapy.Request) -> None:
"""Push a request"""
@abstractmethod
- async def pop(self, timeout=0):
+ async def pop(self, timeout: int = 0) -> Optional[aioscrapy.Request]:
"""Pop a request"""
@abstractmethod
- async def clear(self):
+ async def clear(self) -> None:
"""Clear queue/stack"""
diff --git a/aioscrapy/queue/memory.py b/aioscrapy/queue/memory.py
index 815747b..ab31879 100644
--- a/aioscrapy/queue/memory.py
+++ b/aioscrapy/queue/memory.py
@@ -2,6 +2,7 @@
from asyncio.queues import QueueEmpty
from typing import Optional
+import aioscrapy
from aioscrapy.queue import AbsQueue
from aioscrapy.serializer import AbsSerializer
from aioscrapy.utils.misc import load_object
@@ -11,21 +12,22 @@ class MemoryQueueBase(AbsQueue):
inc_key = 'scheduler/enqueued/memory'
def __init__(
- self, container, spider,
+ self,
+ container: Queue,
+ spider: Optional[aioscrapy.Spider],
key: Optional[str] = None,
serializer: Optional[AbsSerializer] = None,
max_size: int = 0
- ):
+ ) -> None:
super().__init__(container, spider, key, serializer)
self.max_size = max_size
@classmethod
- async def from_spider(cls, spider) -> "MemoryQueueBase":
- settings = spider.settings
- max_size = settings.getint("QUEUE_MAXSIZE", 0)
- queue = cls.get_queue(max_size)
- queue_key = settings.get("SCHEDULER_QUEUE_KEY", '%(spider)s:requests')
- serializer = settings.get("SCHEDULER_SERIALIZER", "aioscrapy.serializer.PickleSerializer")
+ async def from_spider(cls, spider: aioscrapy.Spider) -> "MemoryQueueBase":
+ max_size: int = spider.settings.getint("QUEUE_MAXSIZE", 0)
+ queue: Queue = cls.get_queue(max_size)
+ queue_key: str = spider.settings.get("SCHEDULER_QUEUE_KEY", '%(spider)s:requests')
+ serializer: str = spider.settings.get("SCHEDULER_SERIALIZER", "aioscrapy.serializer.PickleSerializer")
serializer: AbsSerializer = load_object(serializer)
return cls(
queue,
@@ -39,14 +41,14 @@ def len(self) -> int:
return self.container.qsize()
@staticmethod
- def get_queue(max_size):
+ def get_queue(max_size: int) -> Queue:
raise NotImplementedError
- async def push(self, request):
+ async def push(self, request: aioscrapy.Request) -> None:
data = self._encode_request(request)
await self.container.put(data)
- async def pop(self, count: int = 1):
+ async def pop(self, count: int = 1) -> Optional[aioscrapy.Request]:
for _ in range(count):
try:
data = self.container.get_nowait()
@@ -54,34 +56,34 @@ async def pop(self, count: int = 1):
break
yield self._decode_request(data)
- async def clear(self, timeout: int = 0):
+ async def clear(self, timeout: int = 0) -> None:
self.container = self.get_queue(self.max_size)
class MemoryFifoQueue(MemoryQueueBase):
@staticmethod
- def get_queue(max_size):
+ def get_queue(max_size: int) -> Queue:
return Queue(max_size)
class MemoryLifoQueue(MemoryFifoQueue):
@staticmethod
- def get_queue(max_size):
+ def get_queue(max_size: int) -> LifoQueue:
return LifoQueue(max_size)
class MemoryPriorityQueue(MemoryFifoQueue):
@staticmethod
- def get_queue(max_size):
+ def get_queue(max_size: int) -> PriorityQueue:
return PriorityQueue(max_size)
- async def push(self, request):
+ async def push(self, request: aioscrapy.Request) -> None:
data = self._encode_request(request)
score = request.priority
await self.container.put((score, data))
- async def pop(self, count: int = 1):
+ async def pop(self, count: int = 1) -> Optional[aioscrapy.Request]:
for _ in range(count):
try:
score, data = self.container.get_nowait()
diff --git a/aioscrapy/queue/rabbitmq.py b/aioscrapy/queue/rabbitmq.py
index 3d77eb6..a6354c3 100644
--- a/aioscrapy/queue/rabbitmq.py
+++ b/aioscrapy/queue/rabbitmq.py
@@ -1,3 +1,6 @@
+from typing import Optional
+
+import aioscrapy
from aioscrapy.db import db_manager
from aioscrapy.queue import AbsQueue
from aioscrapy.serializer import AbsSerializer
@@ -8,11 +11,11 @@ class RabbitMqPriorityQueue(AbsQueue):
inc_key = 'scheduler/enqueued/rabbitmq'
@classmethod
- def from_dict(cls, data: dict) -> "AbsQueue":
- alias = data.get("alias", 'queue')
- server = db_manager.rabbitmq.executor(alias)
- spider_name = data["spider_name"]
- serializer = data.get("serializer", "aioscrapy.serializer.JsonSerializer")
+ def from_dict(cls, data: dict) -> "RabbitMqPriorityQueue":
+ alias: str = data.get("alias", 'queue')
+ server: aioscrapy.db.aiorabbitmq.RabbitmqExecutor = db_manager.rabbitmq.executor(alias)
+ spider_name: str = data["spider_name"]
+ serializer: str = data.get("serializer", "aioscrapy.serializer.JsonSerializer")
serializer: AbsSerializer = load_object(serializer)
return cls(
server,
@@ -21,12 +24,11 @@ def from_dict(cls, data: dict) -> "AbsQueue":
)
@classmethod
- async def from_spider(cls, spider) -> "RabbitMqPriorityQueue":
- settings = spider.settings
- alias = settings.get("SCHEDULER_QUEUE_ALIAS", 'queue')
- executor = db_manager.rabbitmq.executor(alias)
- queue_key = settings.get("SCHEDULER_QUEUE_KEY", '%(spider)s:requests')
- serializer = settings.get("SCHEDULER_SERIALIZER", "aioscrapy.serializer.JsonSerializer")
+ async def from_spider(cls, spider: aioscrapy.Spider) -> "RabbitMqPriorityQueue":
+ alias: str = spider.settings.get("SCHEDULER_QUEUE_ALIAS", 'queue')
+ executor: aioscrapy.db.aiorabbitmq.RabbitmqExecutor = db_manager.rabbitmq.executor(alias)
+ queue_key: str = spider.settings.get("SCHEDULER_QUEUE_KEY", '%(spider)s:requests')
+ serializer: str = spider.settings.get("SCHEDULER_SERIALIZER", "aioscrapy.serializer.JsonSerializer")
serializer: AbsSerializer = load_object(serializer)
return cls(
executor,
@@ -38,7 +40,7 @@ async def from_spider(cls, spider) -> "RabbitMqPriorityQueue":
async def len(self) -> int:
return await self.container.get_message_count(self.key)
- async def push(self, request):
+ async def push(self, request: aioscrapy.Request) -> None:
data = self._encode_request(request)
score = request.priority
await self.container.publish(
@@ -47,13 +49,13 @@ async def push(self, request):
priority=score
)
- async def pop(self, count: int = 1):
+ async def pop(self, count: int = 1) -> Optional[aioscrapy.Request]:
result = await self.container.get_message(self.key)
if result:
yield self._decode_request(result)
- async def clear(self):
- return await self.container.clean_message_queue(self.key)
+ async def clear(self) -> None:
+ await self.container.clean_message_queue(self.key)
SpiderPriorityQueue = RabbitMqPriorityQueue
diff --git a/aioscrapy/queue/redis.py b/aioscrapy/queue/redis.py
index 13abb4c..be65546 100644
--- a/aioscrapy/queue/redis.py
+++ b/aioscrapy/queue/redis.py
@@ -1,10 +1,12 @@
import logging
from abc import ABC
+from typing import Optional
-from aioscrapy.queue import AbsQueue
+import aioscrapy
from aioscrapy.db import db_manager
-from aioscrapy.utils.misc import load_object
+from aioscrapy.queue import AbsQueue
from aioscrapy.serializer import AbsSerializer
+from aioscrapy.utils.misc import load_object
logger = logging.getLogger(__name__)
@@ -13,11 +15,11 @@ class RedisQueueBase(AbsQueue, ABC):
inc_key = 'scheduler/enqueued/redis'
@classmethod
- def from_dict(cls, data: dict) -> "AbsQueue":
- alias = data.get("alias", 'queue')
- server = db_manager.redis(alias)
- spider_name = data["spider_name"]
- serializer = data.get("serializer", "aioscrapy.serializer.JsonSerializer")
+ def from_dict(cls, data: dict) -> "RedisQueueBase":
+ alias: str = data.get("alias", 'queue')
+ server: aioscrapy.db.aioredis.Redis = db_manager.redis(alias)
+ spider_name: str = data["spider_name"]
+ serializer: str = data.get("serializer", "aioscrapy.serializer.JsonSerializer")
serializer: AbsSerializer = load_object(serializer)
return cls(
server,
@@ -26,12 +28,11 @@ def from_dict(cls, data: dict) -> "AbsQueue":
)
@classmethod
- async def from_spider(cls, spider) -> "RedisQueueBase":
- settings = spider.settings
- alias = settings.get("SCHEDULER_QUEUE_ALIAS", 'queue')
- server = db_manager.redis(alias)
- queue_key = settings.get("SCHEDULER_QUEUE_KEY", '%(spider)s:requests')
- serializer = settings.get("SCHEDULER_SERIALIZER", "aioscrapy.serializer.JsonSerializer")
+ async def from_spider(cls, spider: aioscrapy.Spider) -> "RedisQueueBase":
+ alias: str = spider.settings.get("SCHEDULER_QUEUE_ALIAS", 'queue')
+ server: aioscrapy.db.aioredis.Redis = db_manager.redis(alias)
+ queue_key: str = spider.settings.get("SCHEDULER_QUEUE_KEY", '%(spider)s:requests')
+ serializer: str = spider.settings.get("SCHEDULER_SERIALIZER", "aioscrapy.serializer.JsonSerializer")
serializer: AbsSerializer = load_object(serializer)
return cls(
server,
@@ -40,7 +41,7 @@ async def from_spider(cls, spider) -> "RedisQueueBase":
serializer=serializer
)
- async def clear(self):
+ async def clear(self) -> None:
"""Clear queue/stack"""
await self.container.delete(self.key)
@@ -48,14 +49,14 @@ async def clear(self):
class RedisFifoQueue(RedisQueueBase):
"""Per-spider FIFO queue"""
- async def len(self):
+ async def len(self) -> int:
return await self.container.llen(self.key)
- async def push(self, request):
+ async def push(self, request: aioscrapy.Request) -> None:
"""Push a request"""
await self.container.lpush(self.key, self._encode_request(request))
- async def pop(self, count: int = 1):
+ async def pop(self, count: int = 1) -> Optional[aioscrapy.Request]:
"""Pop a request"""
async with self.container.pipeline(transaction=True) as pipe:
for _ in range(count):
@@ -69,16 +70,16 @@ async def pop(self, count: int = 1):
class RedisPriorityQueue(RedisQueueBase):
"""Per-spider priority queue abstraction using redis' sorted set"""
- async def len(self):
+ async def len(self) -> int:
return await self.container.zcard(self.key)
- async def push(self, request):
+ async def push(self, request: aioscrapy.Request) -> None:
"""Push a request"""
data = self._encode_request(request)
score = request.priority
await self.container.zadd(self.key, {data: score})
- async def pop(self, count: int = 1):
+ async def pop(self, count: int = 1) -> Optional[aioscrapy.Request]:
async with self.container.pipeline(transaction=True) as pipe:
stop = count - 1 if count - 1 > 0 else 0
results, _ = await (
@@ -93,14 +94,14 @@ async def pop(self, count: int = 1):
class RedisLifoQueue(RedisQueueBase):
"""Per-spider LIFO queue."""
- async def len(self):
+ async def len(self) -> int:
return await self.container.llen(self.key)
- async def push(self, request):
+ async def push(self, request: aioscrapy.Request) -> None:
"""Push a request"""
await self.container.lpush(self.key, self._encode_request(request))
- async def pop(self, count: int = 1):
+ async def pop(self, count: int = 1) -> Optional[aioscrapy.Request]:
"""Pop a request"""
async with self.container.pipeline(transaction=True) as pipe:
for _ in range(count):
@@ -111,7 +112,6 @@ async def pop(self, count: int = 1):
yield self._decode_request(result)
-# TODO: Deprecate the use of these names.
SpiderQueue = RedisFifoQueue
SpiderStack = RedisLifoQueue
SpiderPriorityQueue = RedisPriorityQueue
diff --git a/aioscrapy/settings/default_settings.py b/aioscrapy/settings/default_settings.py
index 777bbc8..63ff60c 100644
--- a/aioscrapy/settings/default_settings.py
+++ b/aioscrapy/settings/default_settings.py
@@ -14,7 +14,6 @@
"""
import sys
-from importlib import import_module
from os.path import join, abspath, dirname
AUTOTHROTTLE_ENABLED = False
@@ -166,7 +165,4 @@
URLLENGTH_LIMIT = 2083
-USER_AGENT = f'Aioscrapy/{import_module("aioscrapy").__version__}'
-
-
CLOSE_SPIDER_ON_IDLE = False
diff --git a/aioscrapy/utils/misc.py b/aioscrapy/utils/misc.py
index 1f5bae1..a0f9835 100644
--- a/aioscrapy/utils/misc.py
+++ b/aioscrapy/utils/misc.py
@@ -2,7 +2,8 @@
from importlib import import_module
from pkgutil import iter_modules
-from .tools import call_helper
+
+from aioscrapy.utils.tools import call_helper
def walk_modules(path):
diff --git a/aioscrapy/utils/reqser.py b/aioscrapy/utils/reqser.py
index 1906a53..55a3974 100644
--- a/aioscrapy/utils/reqser.py
+++ b/aioscrapy/utils/reqser.py
@@ -1,97 +1,15 @@
"""
Helper functions for serializing (and deserializing) requests.
"""
-import inspect
+from typing import Optional
-from aioscrapy.http import Request
-from aioscrapy.utils.misc import load_object
-from aioscrapy.utils.python import to_unicode
+import aioscrapy
+from aioscrapy.utils.request import request_from_dict as _from_dict
-def request_to_dict(request, spider=None):
- """Convert Request object to a dict.
+def request_to_dict(request: "aioscrapy.Request", spider: Optional["aioscrapy.Spider"] = None) -> dict:
+ return request.to_dict(spider=spider)
- If a spider is given, it will try to find out the name of the spider method
- used in the callback and store that as the callback.
- """
- cb = request.callback
- if callable(cb):
- cb = _find_method(spider, cb)
- eb = request.errback
- if callable(eb):
- eb = _find_method(spider, eb)
- d = {
- 'url': to_unicode(request.url), # urls should be safe (safe_string_url)
- 'callback': cb,
- 'errback': eb,
- 'method': request.method,
- 'headers': dict(request.headers),
- 'body': request.body,
- 'cookies': request.cookies,
- 'meta': request.meta,
- '_encoding': request._encoding,
- 'priority': request.priority,
- 'dont_filter': request.dont_filter,
- 'flags': request.flags,
- 'cb_kwargs': request.cb_kwargs,
- 'fingerprint': request._fingerprint,
- }
- if type(request) is not Request:
- d['_class'] = request.__module__ + '.' + request.__class__.__name__
- return d
-
-def request_from_dict(d, spider=None):
- """Create Request object from a dict.
-
- If a spider is given, it will try to resolve the callbacks looking at the
- spider for methods with the same name.
- """
- cb = d.get('callback') or 'parse'
- if cb and spider:
- cb = _get_method(spider, cb)
- eb = d.get('errback')
- if eb and spider:
- eb = _get_method(spider, eb)
- request_cls = load_object(d['_class']) if '_class' in d else Request
- return request_cls(
- url=to_unicode(d['url']),
- callback=cb,
- errback=eb,
- method=d.get('method', 'GET'),
- headers=d.get('headers'),
- body=d.get('body'),
- cookies=d.get('cookies'),
- meta=d.get('meta'),
- encoding=d.get('_encoding', 'utf-8'),
- priority=d.get('priority', 0),
- dont_filter=d.get('dont_filter', True),
- flags=d.get('flags'),
- cb_kwargs=d.get('cb_kwargs'),
- fingerprint=d.get('fingerprint'),
- )
-
-
-def _find_method(obj, func):
- # Only instance methods contain ``__func__``
- if obj and hasattr(func, '__func__'):
- members = inspect.getmembers(obj, predicate=inspect.ismethod)
- for name, obj_func in members:
- # We need to use __func__ to access the original
- # function object because instance method objects
- # are generated each time attribute is retrieved from
- # instance.
- #
- # Reference: The standard type hierarchy
- # https://docs.python.org/3/reference/datamodel.html
- if obj_func.__func__ is func.__func__:
- return name
- raise ValueError(f"Function {func} is not an instance method in: {obj}")
-
-
-def _get_method(obj, name):
- name = str(name)
- try:
- return getattr(obj, name)
- except AttributeError:
- raise ValueError(f"Method {name!r} not found in: {obj}")
+def request_from_dict(d: dict, spider: Optional["aioscrapy.Spider"] = None) -> "aioscrapy.Request":
+ return _from_dict(d, spider=spider)
diff --git a/aioscrapy/utils/request.py b/aioscrapy/utils/request.py
index cdd65ab..b9fc0af 100644
--- a/aioscrapy/utils/request.py
+++ b/aioscrapy/utils/request.py
@@ -8,8 +8,9 @@
from w3lib.http import headers_dict_to_raw
-from aioscrapy.http import Request
+from aioscrapy import Spider, Request
from aioscrapy.utils.httpobj import urlparse_cached
+from aioscrapy.utils.misc import load_object
from aioscrapy.utils.python import to_bytes, to_unicode
@@ -36,3 +37,27 @@ def referer_str(request: Request) -> Optional[str]:
if referrer is None:
return referrer
return to_unicode(referrer, errors='replace')
+
+
+def request_from_dict(d: dict, *, spider: Optional[Spider] = None) -> Request:
+ """Create a :class:`~scrapy.Request` object from a dict.
+
+ If a spider is given, it will try to resolve the callbacks looking at the
+ spider for methods with the same name.
+ """
+ request_cls = load_object(d["_class"]) if "_class" in d else Request
+ kwargs = {key: value for key, value in d.items() if key in request_cls.attributes}
+ if d.get("callback") and spider:
+ kwargs["callback"] = _get_method(spider, d["callback"])
+ if d.get("errback") and spider:
+ kwargs["errback"] = _get_method(spider, d["errback"])
+ return request_cls(**kwargs)
+
+
+def _get_method(obj, name):
+ """Helper function for request_from_dict"""
+ name = str(name)
+ try:
+ return getattr(obj, name)
+ except AttributeError:
+ raise ValueError(f"Method {name!r} not found in: {obj}")
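
End-to-end, to_dict(spider=...) stores callbacks by method name and request_from_dict resolves them back to bound methods; a sketch with a hypothetical spider:

    from aioscrapy import Request, Spider
    from aioscrapy.utils.request import request_from_dict

    class ItemSpider(Spider):
        name = "items"                       # hypothetical spider

        async def parse_item(self, response):
            yield {"url": response.url}

    spider = ItemSpider()
    req = Request("https://example.com/item/1", callback=spider.parse_item)

    data = req.to_dict(spider=spider)        # callback stored as the string "parse_item"
    restored = request_from_dict(data, spider=spider)
    assert restored.callback.__func__ is spider.parse_item.__func__
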