From 8afef47b6c4ebd04406f024ebce8aa2824ceae06 Mon Sep 17 00:00:00 2001
From: unknown
Date: Sat, 6 May 2023 15:51:18 +0800
Subject: [PATCH] fix closespider

---
 .../handlers/playwright/webdriver.py     | 10 +--
 aioscrapy/libs/extensions/closespider.py | 27 +++----
 .../{demo_http2.py => demo_httpx.py}     | 27 ++++---
 example/singlespider/demo_memory.py      |  3 +-
 example/singlespider/demo_playwright.py  |  4 +-
 example/singlespider/demo_sink_mongo.py  |  6 +-
 example/singlespider/demo_sink_mysql.py  | 75 +++++++++++++++++++
 7 files changed, 113 insertions(+), 39 deletions(-)
 rename example/singlespider/{demo_http2.py => demo_httpx.py} (69%)
 create mode 100644 example/singlespider/demo_sink_mysql.py

diff --git a/aioscrapy/core/downloader/handlers/playwright/webdriver.py b/aioscrapy/core/downloader/handlers/playwright/webdriver.py
index 5ea994d..76cb036 100644
--- a/aioscrapy/core/downloader/handlers/playwright/webdriver.py
+++ b/aioscrapy/core/downloader/handlers/playwright/webdriver.py
@@ -41,10 +41,10 @@ def __init__(
         self.on_response = on_response
         self.user_agent = user_agent
 
-        self.driver: Playwright = None
-        self.browser: Browser = None
-        self.context: BrowserContext = None
-        self.page: Page = None
+        self.driver: Optional[Playwright] = None
+        self.browser: Optional[Browser] = None
+        self.context: Optional[BrowserContext] = None
+        self.page: Optional[Page] = None
         self.url = None
 
     async def setup(self):
@@ -67,7 +67,7 @@ async def setup(self):
             context_args.update({'user_agent': self.user_agent})
 
         self.driver = await async_playwright().start()
-        # self.browser = await getattr(self.driver, self.driver_type).launch(**browser_args)
+        self.browser = await getattr(self.driver, self.driver_type).launch(**browser_args)
         self.browser = await self.driver.chromium.launch(**browser_args)
         self.context = await self.browser.new_context(**context_args)
         self.page = await self.context.new_page()
diff --git a/aioscrapy/libs/extensions/closespider.py b/aioscrapy/libs/extensions/closespider.py
index 45b5f63..3cf223a 100644
--- a/aioscrapy/libs/extensions/closespider.py
+++ b/aioscrapy/libs/extensions/closespider.py
@@ -4,6 +4,7 @@
 See documentation in docs/topics/extensions.rst
 """
 import asyncio
+from typing import Optional
 from collections import defaultdict
 
 from aioscrapy import signals
@@ -26,13 +27,14 @@ def __init__(self, crawler):
             raise NotConfigured
 
         self.counter = defaultdict(int)
+        self.task: Optional[asyncio.tasks.Task] = None
 
         if self.close_on.get('errorcount'):
             crawler.signals.connect(self.error_count, signal=signals.spider_error)
         if self.close_on.get('pagecount'):
             crawler.signals.connect(self.page_count, signal=signals.response_received)
         if self.close_on.get('timeout'):
-            crawler.signals.connect(self.spider_opened, signal=signals.spider_opened)
+            crawler.signals.connect(self.timeout_close, signal=signals.spider_opened)
         if self.close_on.get('itemcount'):
             crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
         crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)
@@ -44,26 +46,25 @@ def from_crawler(cls, crawler):
     async def error_count(self, failure, response, spider):
         self.counter['errorcount'] += 1
         if self.counter['errorcount'] == self.close_on['errorcount']:
-            asyncio.create_task(self.crawler.engine.close_spider(spider, 'closespider_errorcount'))
+            asyncio.create_task(self.crawler.engine.stop(reason='closespider_errorcount'))
 
     async def page_count(self, response, request, spider):
         self.counter['pagecount'] += 1
         if self.counter['pagecount'] == self.close_on['pagecount']:
-            asyncio.create_task(self.crawler.engine.close_spider(spider, 'closespider_pagecount'))
+            asyncio.create_task(self.crawler.engine.stop(reason='closespider_pagecount'))
 
-    async def spider_opened(self, spider):
-        self.task = asyncio.create_task(self.timeout_close(spider))
-
     async def timeout_close(self, spider):
-        await asyncio.sleep(self.close_on['timeout'])
-        asyncio.create_task(self.crawler.engine.close_spider(spider, reason='closespider_timeout'))
-
+        async def close():
+            await asyncio.sleep(self.close_on['timeout'])
+            asyncio.create_task(self.crawler.engine.stop(reason='closespider_timeout'))
+
+        self.task = asyncio.create_task(close())
+
     async def item_scraped(self, item, spider):
         self.counter['itemcount'] += 1
         if self.counter['itemcount'] == self.close_on['itemcount']:
-            asyncio.create_task(self.crawler.engine.close_spider(spider, 'closespider_itemcount'))
+            asyncio.create_task(self.crawler.engine.stop(reason='closespider_itemcount'))
 
     def spider_closed(self, spider):
-        task = getattr(self, 'task', False)
-        if task and not task.done():
-            task.cancel()
+        if self.task and not self.task.done():
+            self.task.cancel()
diff --git a/example/singlespider/demo_http2.py b/example/singlespider/demo_httpx.py
similarity index 69%
rename from example/singlespider/demo_http2.py
rename to example/singlespider/demo_httpx.py
index af9e237..a7a1139 100644
--- a/example/singlespider/demo_http2.py
+++ b/example/singlespider/demo_httpx.py
@@ -6,24 +6,23 @@
 logger = logging.getLogger(__name__)
 
 
-class DemoMemorySpider(Spider):
-    name = 'DemoMemorySpider'
-    custom_settings = {
-        "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
-        # 'DOWNLOAD_DELAY': 3,
-        # 'RANDOMIZE_DOWNLOAD_DELAY': True,
-        # 'CONCURRENT_REQUESTS': 1,
-        # 'LOG_LEVEL': 'INFO'
-        # 'DUPEFILTER_CLASS': 'aioscrapy.dupefilters.disk.RFPDupeFilter',
-        "CLOSE_SPIDER_ON_IDLE": True,
-        'DOWNLOAD_HANDLERS': {
+class DemoHttpxSpider(Spider):
+    name = 'DemoHttpxSpider'
+    custom_settings = dict(
+        USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
+        # DOWNLOAD_DELAY=3,
+        # RANDOMIZE_DOWNLOAD_DELAY=True,
+        # CONCURRENT_REQUESTS=1,
+        LOG_LEVEL='INFO',
+        CLOSE_SPIDER_ON_IDLE=True,
+        DOWNLOAD_HANDLERS={
             'http': 'aioscrapy.core.downloader.handlers.httpx.HttpxDownloadHandler',
             'https': 'aioscrapy.core.downloader.handlers.httpx.HttpxDownloadHandler',
         },
-        'HTTPX_CLIENT_SESSION_ARGS': {
+        HTTPX_CLIENT_SESSION_ARGS={
             'http2': True
         }
-    }
+    )
 
     start_urls = ['https://quotes.toscrape.com']
 
@@ -59,4 +58,4 @@ async def process_item(self, item):
 
 
 if __name__ == '__main__':
-    DemoMemorySpider.start()
+    DemoHttpxSpider.start()
diff --git a/example/singlespider/demo_memory.py b/example/singlespider/demo_memory.py
index 2cab063..6d48837 100644
--- a/example/singlespider/demo_memory.py
+++ b/example/singlespider/demo_memory.py
@@ -13,8 +13,7 @@ class DemoMemorySpider(Spider):
         # 'DOWNLOAD_DELAY': 3,
         # 'RANDOMIZE_DOWNLOAD_DELAY': True,
         # 'CONCURRENT_REQUESTS': 1,
-        # 'LOG_LEVEL': 'INFO'
-        # 'DUPEFILTER_CLASS': 'aioscrapy.dupefilters.disk.RFPDupeFilter',
+        # 'LOG_LEVEL': 'INFO',
         "CLOSE_SPIDER_ON_IDLE": True,
     }
 
diff --git a/example/singlespider/demo_playwright.py b/example/singlespider/demo_playwright.py
index bd3b3da..5ea5f32 100644
--- a/example/singlespider/demo_playwright.py
+++ b/example/singlespider/demo_playwright.py
@@ -86,8 +86,8 @@ async def process_action(self, driver: PlaywrightDriver):
         img_bytes = await driver.page.screenshot(type="jpeg", quality=50)
         return img_bytes
 
-    async def process_item(self, item):
-        print(item)
+    # async def process_item(self, item):
+    #     print(item)
 
 
 if __name__ == '__main__':
diff --git a/example/singlespider/demo_sink_mongo.py b/example/singlespider/demo_sink_mongo.py
index 7320bc4..52ec795 100644
--- a/example/singlespider/demo_sink_mongo.py
+++ b/example/singlespider/demo_sink_mongo.py
@@ -6,8 +6,8 @@
 logger = logging.getLogger(__name__)
 
 
-class DemoMemorySpider(Spider):
-    name = 'DemoMemorySpider'
+class DemoMongoSpider(Spider):
+    name = 'DemoMongoSpider'
     custom_settings = {
         "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
         # 'DOWNLOAD_DELAY': 3,
@@ -68,4 +68,4 @@ async def process_item(self, item):
 
 
 if __name__ == '__main__':
-    DemoMemorySpider.start()
+    DemoMongoSpider.start()
diff --git a/example/singlespider/demo_sink_mysql.py b/example/singlespider/demo_sink_mysql.py
new file mode 100644
index 0000000..cfb9951
--- /dev/null
+++ b/example/singlespider/demo_sink_mysql.py
@@ -0,0 +1,75 @@
+import logging
+
+from aioscrapy import Request
+from aioscrapy.spiders import Spider
+
+logger = logging.getLogger(__name__)
+
+
+class DemoMysqlSpider(Spider):
+    name = 'DemoMysqlSpider'
+    custom_settings = dict(
+        USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
+        # DOWNLOAD_DELAY=3,
+        # RANDOMIZE_DOWNLOAD_DELAY=True,
+        # CONCURRENT_REQUESTS=1,
+        # LOG_LEVEL='INFO',
+        # DUPEFILTER_CLASS='aioscrapy.dupefilters.disk.RFPDupeFilter',
+        CLOSE_SPIDER_ON_IDLE=True,
+        # mysql parameter
+        MYSQL_ARGS={
+            'default': {
+                'host': '127.0.0.1',
+                'user': 'root',
+                'password': 'root',
+                'port': 3306,
+                'charset': 'utf8mb4',
+                'db': 'test',
+            },
+        },
+        ITEM_PIPELINES={
+            'aioscrapy.libs.pipelines.sink.MysqlPipeline': 100,
+        },
+        SAVE_CACHE_NUM=1000,  # write cached items to MySQL in batches of 1000
+        SAVE_CACHE_INTERVAL=10,  # flush the cache every 10 seconds
+    )
+
+    start_urls = ['https://quotes.toscrape.com']
+
+    @staticmethod
+    async def process_request(request, spider):
+        """ request middleware """
+        pass
+
+    @staticmethod
+    async def process_response(request, response, spider):
+        """ response middleware """
+        return response
+
+    @staticmethod
+    async def process_exception(request, exception, spider):
+        """ exception middleware """
+        pass
+
+    async def parse(self, response):
+        for quote in response.css('div.quote'):
+            yield {
+                'save_table_name': 'article',  # name of the table to store into
+                'save_db_alias': 'default',  # which MySQL connection to use, i.e. a key of MYSQL_ARGS
+                # 'save_db_name': 'xxx',  # database to store into; defaults to the 'db' value in MYSQL_ARGS
+
+                'author': quote.xpath('span/small/text()').get(),
+                'text': quote.css('span.text::text').get(),
+            }
+
+        next_page = response.css('li.next a::attr("href")').get()
+        if next_page is not None:
+            # yield response.follow(next_page, self.parse)
+            yield Request(f"https://quotes.toscrape.com{next_page}", callback=self.parse)
+
+    async def process_item(self, item):
+        print(item)
+
+
+if __name__ == '__main__':
+    DemoMysqlSpider.start()
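
A minimal way to exercise the reworked CloseSpider shutdown paths (a sketch, not part of this patch; it assumes the extension keeps Scrapy-style CLOSESPIDER_TIMEOUT / CLOSESPIDER_ITEMCOUNT setting names for its 'timeout' and 'itemcount' thresholds, and the DemoCloseSpider name is hypothetical):

import logging

from aioscrapy import Request
from aioscrapy.spiders import Spider

logger = logging.getLogger(__name__)


class DemoCloseSpider(Spider):
    name = 'DemoCloseSpider'
    custom_settings = dict(
        CLOSE_SPIDER_ON_IDLE=True,
        # Assumed setting names; the CloseSpider extension is expected to map them
        # to its close_on['timeout'] / close_on['itemcount'] counters.
        CLOSESPIDER_TIMEOUT=60,    # schedule engine.stop() 60 seconds after the spider opens
        CLOSESPIDER_ITEMCOUNT=20,  # or stop earlier once 20 items have been scraped
    )

    start_urls = ['https://quotes.toscrape.com']

    async def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'author': quote.xpath('span/small/text()').get(),
                'text': quote.css('span.text::text').get(),
            }

        next_page = response.css('li.next a::attr("href")').get()
        if next_page is not None:
            yield Request(f"https://quotes.toscrape.com{next_page}", callback=self.parse)


if __name__ == '__main__':
    DemoCloseSpider.start()

With these settings the patched extension should create a task that calls engine.stop(reason='closespider_timeout') once the timeout elapses, or stop earlier on the item threshold, and the pending timeout task is cancelled in spider_closed.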