chore: Type hints
ConlinH committed Aug 12, 2022
1 parent d3609d2 commit 4f41ce4
Showing 21 changed files with 303 additions and 318 deletions.
4 changes: 3 additions & 1 deletion aioscrapy/__init__.py
@@ -8,10 +8,12 @@
# Declare top-level shortcuts
from aioscrapy.spiders import Spider
from aioscrapy.http import Request, FormRequest
from aioscrapy.settings import Settings
from aioscrapy.crawler import Crawler


__all__ = [
'__version__', 'version_info', 'Spider', 'Request', 'FormRequest',
'__version__', 'version_info', 'Spider', 'Request', 'FormRequest', 'Crawler'
]


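
For context, the re-exported Crawler shortcut added above lets downstream code annotate against the package root. A minimal, hypothetical sketch (MySpider and the settings value are placeholders, not part of this commit):

# Hypothetical usage of the new top-level shortcut; MySpider is illustrative only.
from aioscrapy import Crawler, Spider

class MySpider(Spider):
    name = "example"

# Annotating with Crawler no longer requires importing aioscrapy.crawler directly;
# a plain dict is accepted for settings per the Crawler.__init__ hint later in this commit.
crawler: Crawler = Crawler(MySpider, settings={"CONCURRENT_REQUESTS": 4})
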
3 changes: 2 additions & 1 deletion aioscrapy/core/engine.py
@@ -4,6 +4,7 @@
import logging
from typing import Optional, AsyncGenerator, Union, Callable

import aioscrapy
from aioscrapy import Spider
from aioscrapy import signals
from aioscrapy.core.downloader import DownloaderTV
@@ -35,7 +36,7 @@ def remove_request(self, request: Request) -> None:

class ExecutionEngine(object):

def __init__(self, crawler) -> None:
def __init__(self, crawler: "aioscrapy.Crawler") -> None:
self.crawler = crawler
self.settings = crawler.settings
self.signals = crawler.signals
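
The quoted "aioscrapy.Crawler" annotation above is a forward reference: the package is imported as a module, so the engine avoids a circular import of aioscrapy.crawler at class-definition time. An illustrative sketch of the same pattern, using a made-up component name:

# Illustrative only: the forward-reference pattern this commit applies.
import aioscrapy

class MyComponent:  # hypothetical component, not part of aioscrapy
    def __init__(self, crawler: "aioscrapy.Crawler") -> None:
        # The string annotation is resolved lazily by type checkers,
        # so aioscrapy.crawler never needs to be imported here directly.
        self.crawler = crawler
        self.settings = crawler.settings
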
18 changes: 8 additions & 10 deletions aioscrapy/core/scheduler.py
@@ -1,9 +1,7 @@
from abc import abstractmethod
from typing import Optional, Type, TypeVar

from aioscrapy import Spider
from aioscrapy.dupefilters import DupeFilterBase
from aioscrapy.http.request import Request
import aioscrapy
from aioscrapy.queue import AbsQueue
from aioscrapy.statscollectors import StatsCollector
from aioscrapy.utils.misc import load_instance
@@ -29,7 +27,7 @@ def __subclasscheck__(cls, subclass):
class BaseScheduler(metaclass=BaseSchedulerMeta):

@classmethod
async def from_crawler(cls, crawler):
async def from_crawler(cls, crawler: "aioscrapy.Crawler") -> "BaseScheduler":
"""
Factory method which receives the current :class:`~scrapy.crawler.Crawler` object as argument.
"""
@@ -53,7 +51,7 @@ async def has_pending_requests(self) -> bool:
raise NotImplementedError()

@abstractmethod
async def enqueue_request(self, request: Request) -> bool:
async def enqueue_request(self, request: aioscrapy.Request) -> bool:
"""
Process a request received by the engine.
@@ -67,7 +65,7 @@ async def enqueue_request(self, request: Request) -> bool:
raise NotImplementedError()

@abstractmethod
async def next_request(self) -> Optional[Request]:
async def next_request(self) -> Optional[aioscrapy.Request]:
"""
Return the next :class:`~scrapy.http.Request` to be processed, or ``None``
to indicate that there are no requests to be considered ready at the moment.
@@ -87,7 +85,7 @@ class Scheduler(BaseScheduler):
def __init__(
self,
queue: AbsQueue,
spider: Spider,
spider: aioscrapy.Spider,
stats=Optional[StatsCollector],
persist: bool = True
):
@@ -97,7 +95,7 @@ def __init__(
self.persist = persist

@classmethod
async def from_crawler(cls: Type[SchedulerTV], crawler) -> SchedulerTV:
async def from_crawler(cls: Type[SchedulerTV], crawler: "aioscrapy.Crawler") -> SchedulerTV:
instance = cls(
await load_instance(crawler.settings['SCHEDULER_QUEUE_CLASS'], spider=crawler.spider),
crawler.spider,
@@ -120,13 +118,13 @@ async def close(self, reason: str) -> None:
async def flush(self) -> None:
await call_helper(self.queue.clear)

async def enqueue_request(self, request: Request) -> bool:
async def enqueue_request(self, request: aioscrapy.Request) -> bool:
await call_helper(self.queue.push, request)
if self.stats:
self.stats.inc_value(self.queue.inc_key, spider=self.spider)
return True

async def next_request(self, count: int = 1) -> Optional[Request]:
async def next_request(self, count: int = 1) -> Optional[aioscrapy.Request]:
async for request in self.queue.pop(count):
if request and self.stats:
self.stats.inc_value(self.queue.inc_key, spider=self.spider)
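
Taken together, the hinted BaseScheduler methods above suggest roughly the following shape for a custom scheduler. This is a hedged, in-memory sketch based only on the signatures visible in this diff; the real base class and its metaclass check may expect additional hooks (e.g. open/close) that are not shown here:

# Hedged sketch only: a deque-backed scheduler against the hinted signatures.
# aioscrapy's shipped Scheduler uses SCHEDULER_QUEUE_CLASS and a stats collector;
# this toy version exists purely to illustrate the annotated API.
from collections import deque
from typing import Deque, Optional

import aioscrapy
from aioscrapy.core.scheduler import BaseScheduler

class InMemoryScheduler(BaseScheduler):
    def __init__(self) -> None:
        self._pending: Deque[aioscrapy.Request] = deque()

    @classmethod
    async def from_crawler(cls, crawler: "aioscrapy.Crawler") -> "InMemoryScheduler":
        return cls()

    async def has_pending_requests(self) -> bool:
        return bool(self._pending)

    async def enqueue_request(self, request: aioscrapy.Request) -> bool:
        self._pending.append(request)
        return True

    async def next_request(self) -> Optional[aioscrapy.Request]:
        return self._pending.popleft() if self._pending else None
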
5 changes: 3 additions & 2 deletions aioscrapy/core/scraper.py
@@ -4,6 +4,7 @@
import logging
from typing import Any, AsyncGenerator, Set, Union, Optional

import aioscrapy
from aioscrapy import signals, Spider
from aioscrapy.exceptions import CloseSpider, DropItem, IgnoreRequest
from aioscrapy.http import Request, Response
@@ -54,7 +55,7 @@ class Scraper:

def __init__(
self,
crawler,
crawler: "aioscrapy.Crawler",
slot: Slot,
spidermw: SpiderMiddlewareManager,
itemproc: ItemPipelineManager,
@@ -72,7 +73,7 @@ def __init__(
self.concurrent_parser = asyncio.Semaphore(crawler.settings.getint('CONCURRENT_PARSER', 1))

@classmethod
async def from_crawler(cls, crawler):
async def from_crawler(cls, crawler: "aioscrapy.Crawler") -> "Scraper":
instance: "Scraper" = cls(
crawler,
Slot(crawler.settings.getint('SCRAPER_SLOT_MAX_ACTIVE_SIZE')),
78 changes: 51 additions & 27 deletions aioscrapy/crawler.py
@@ -4,7 +4,7 @@
import signal
import sys
import warnings
from typing import Optional
from typing import Optional, Type, Union, Any

from zope.interface.exceptions import DoesNotImplement

@@ -15,8 +15,9 @@
MultipleInvalid = None

from zope.interface.verify import verifyClass
from aioscrapy.logformatter import LogFormatter
from aioscrapy import signals, Spider
from aioscrapy.settings import overridden_settings
from aioscrapy.settings import overridden_settings, Settings
from aioscrapy.utils.log import (
get_scrapy_root_handler,
install_scrapy_root_handler,
@@ -31,7 +32,6 @@
from aioscrapy.utils.tools import async_generator_wrapper
from aioscrapy.middleware import ExtensionManager
from aioscrapy.core.engine import ExecutionEngine
from aioscrapy.settings import Settings
from aioscrapy.signalmanager import SignalManager
from aioscrapy.utils.ossignal import install_shutdown_handlers, signal_names
from aioscrapy.statscollectors import StatsCollector
@@ -41,15 +41,15 @@

class Crawler:

def __init__(self, spidercls, settings=None):
def __init__(self, spidercls: Type[Spider], settings: Union[Settings, dict, None] = None) -> None:

if isinstance(spidercls, Spider):
raise ValueError('The spidercls argument must be a class, not an object')

if isinstance(settings, dict) or settings is None:
settings = Settings(settings)

self.spidercls: Spider = spidercls
self.spidercls = spidercls
self.settings = settings.copy()
self.spidercls.update_settings(self.settings)

@@ -59,10 +59,10 @@ def __init__(self, spidercls, settings=None):
self.crawling = False
self.spider: Optional[Spider] = None
self.engine: Optional[ExecutionEngine] = None
self.extensions: Optional[ExecutionEngine] = None
self.logformatter: Optional[ExecutionEngine] = None
self.extensions: Optional[ExtensionManager] = None
self.logformatter: Optional[LogFormatter] = None

async def crawl(self, *args, **kwargs):
async def crawl(self, *args, **kwargs) -> None:
try:
if self.crawling:
raise RuntimeError("Crawling already taking place")
@@ -96,7 +96,7 @@ async def crawl(self, *args, **kwargs):
await self.engine.close()
raise e

async def stop(self):
async def stop(self) -> None:
"""Starts a graceful stop of the crawler and returns a deferred that is
fired when the crawler is stopped."""
if self.crawling:
@@ -112,7 +112,7 @@ class CrawlerRunner:
)

@staticmethod
def _get_spider_loader(settings):
def _get_spider_loader(settings: Settings) -> ISpiderLoader:
""" Get SpiderLoader instance from settings """
cls_path = settings.get('SPIDER_LOADER_CLASS')
loader_cls = load_object(cls_path)
@@ -128,7 +128,7 @@ def _get_spider_loader(settings):
)
return loader_cls.from_settings(settings.frozencopy())

def __init__(self, settings=None):
def __init__(self, settings: Union[Settings, dict, None] = None) -> None:
if isinstance(settings, dict) or settings is None:
settings = Settings(settings)
self.settings = settings
@@ -144,12 +144,18 @@ def spiders(self):
category=AioScrapyDeprecationWarning, stacklevel=2)
return self.spider_loader

def crawl_soon(self, crawler_or_spidercls, *args, settings=None, **kwargs):
def crawl_soon(
self,
crawler_or_spidercls: Union[Type[Spider], Crawler],
*args,
settings: Union[Settings, dict, None] = None,
**kwargs
) -> None:
crawler = self.crawl(crawler_or_spidercls, settings=settings)
self.crawlers.setdefault(crawler, (args, kwargs))
self.active_crawler(crawler, *args, **kwargs)

def active_crawler(self, crawler, *args, **kwargs):
def active_crawler(self, crawler: Crawler, *args, **kwargs) -> None:
task = asyncio.create_task(crawler.crawl(*args, **kwargs))
self._active.add(task)

@@ -161,7 +167,13 @@ def _done(result):

task.add_done_callback(_done)

def crawl(self, crawler_or_spidercls, *args, settings=None, **kwargs):
def crawl(
self,
crawler_or_spidercls: Union[Type[Spider], Crawler],
*args,
settings: Union[Settings, dict, None] = None,
**kwargs
) -> Crawler:
if isinstance(crawler_or_spidercls, Spider):
raise ValueError(
'The crawler_or_spidercls argument cannot be a spider object, '
@@ -170,7 +182,11 @@ def crawl(self, crawler_or_spidercls, *args, settings=None, **kwargs):
self.crawlers.setdefault(crawler, (args, kwargs))
return crawler

def create_crawler(self, crawler_or_spidercls, settings):
def create_crawler(
self,
crawler_or_spidercls: Union[Type[Spider], Crawler, str],
settings: Union[Settings, dict, None]
) -> Crawler:
if isinstance(crawler_or_spidercls, Spider):
raise ValueError(
'The crawler_or_spidercls argument cannot be a spider object, '
@@ -179,44 +195,52 @@ def create_crawler(self, crawler_or_spidercls, settings):
return crawler_or_spidercls
return self._create_crawler(crawler_or_spidercls, settings)

def _create_crawler(self, spidercls, settings):
def _create_crawler(
self,
spidercls: Union[Type[Spider], str],
settings: Union[Settings, dict, None]
) -> Crawler:
if isinstance(spidercls, str):
spidercls = self.spider_loader.load(spidercls)
return Crawler(spidercls, settings=settings)

async def stop(self):
return await asyncio.gather(*[c.stop() for c in self.crawlers])
async def stop(self) -> None:
await asyncio.gather(*[c.stop() for c in self.crawlers])


class CrawlerProcess(CrawlerRunner):

def __init__(self, settings=None, install_root_handler=True):
def __init__(
self,
settings: Union[Settings, dict, None] = None,
install_root_handler: bool = True
) -> None:
super().__init__(settings)
install_shutdown_handlers(self._signal_shutdown)
configure_logging(self.settings, install_root_handler)

def _signal_shutdown(self, signum, _):
def _signal_shutdown(self, signum: Any, _) -> None:
install_shutdown_handlers(self._signal_kill)
signame = signal_names[signum]
logger.info("Received %(signame)s, shutting down gracefully. Send again to force ",
{'signame': signame})
asyncio.create_task(self._graceful_stop_reactor())

def _signal_kill(self, signum, _):
def _signal_kill(self, signum: Any, _) -> None:
install_shutdown_handlers(signal.SIG_IGN)
signame = signal_names[signum]
logger.info('Received %(signame)s twice, forcing unclean shutdown',
{'signame': signame})
asyncio.create_task(self._stop_reactor())

async def run(self):
async def run(self) -> None:
for crawler, (args, kwargs) in self.crawlers.items():
self.active_crawler(crawler, *args, **kwargs)
while self._active:
await asyncio.gather(*self._active)
await self.recycle_db_connect()

def start(self):
def start(self) -> None:
if sys.platform.startswith('win'):
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
else:
@@ -227,17 +251,17 @@ def start(self):
pass
asyncio.run(self.run())

async def _graceful_stop_reactor(self):
async def _graceful_stop_reactor(self) -> None:
await self.stop()
await self.recycle_db_connect()

async def _stop_reactor(self):
async def _stop_reactor(self) -> None:
try:
await self.recycle_db_connect()
finally:
asyncio.get_event_loop().stop()

async def recycle_db_connect(self):
# 回收所以的链接
async def recycle_db_connect(self) -> None:
# recycle pool of db_manager
if not len(self._active):
await db_manager.close_all()
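
Based only on the signatures added above, the typed CrawlerProcess can be driven roughly as follows. A hedged usage sketch: MySpider and the settings values are placeholders, not part of this commit.

# Hedged usage sketch of the typed CrawlerProcess API shown in this diff.
from aioscrapy import Spider
from aioscrapy.crawler import CrawlerProcess

class MySpider(Spider):
    name = "example"

process = CrawlerProcess(settings={"CONCURRENT_REQUESTS": 2})
process.crawl(MySpider)  # registers and returns a Crawler (see crawl() above)
process.start()          # selects an event loop policy and runs all registered crawlers
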
