From 717c37d13fd73bea8d09ee7da7f46d7169753e38 Mon Sep 17 00:00:00 2001
From: conlin-huang <9950188884@qq.com>
Date: Sun, 26 Mar 2023 23:03:43 +0800
Subject: [PATCH] feat: support playwright

---
Reviewer notes (text between the three-dash line and the diffstat is ignored by git am):

* This patch had been flattened onto a few long lines. Line breaks and Python
  indentation are restored. Indentation of the context lines in hunks against
  existing files (scraper.py, crawler.py, http/__init__.py, text.py, setup.py)
  was inferred; verify against the tree or apply with --ignore-whitespace.
* webdriver.py, all changes line-neutral so hunk sizes are unchanged:
  - PlaywrightDriver now accepts the `timeout` keyword that PlaywrightHandler
    passes (the old misspelling `timout` is still honoured through **kwargs),
    and the millisecond value is no longer multiplied by 1000 a second time.
  - setup() tolerates browser_args / context_args / on_event left at None.
  - The browser engine named by driver_type is launched instead of always
    chromium; the default "--no-sandbox" flag is only added for chromium.
  - format_context_proxy keeps the proxy port (only credentials are stripped).
* handlers/playwright/__init__.py: bare `raise` preserves the traceback.
* demo_playwright.py: comments translated to English.
* The index lines below still carry the blob ids of the original submission.

 .../handlers/playwright/__init__.py           |  75 +++++++++++++
 .../handlers/playwright/driverpool.py         |  53 +++++++++
 .../handlers/playwright/webdriver.py          | 104 ++++++++++++++++++
 aioscrapy/core/scraper.py                     |   5 +-
 aioscrapy/crawler.py                          |   2 +-
 aioscrapy/http/__init__.py                    |   4 +-
 aioscrapy/http/response/playwright.py         |  39 +++++++
 aioscrapy/http/response/text.py               |   7 +-
 example/singlespider/demo_playwright.py       |  85 ++++++++++++++
 setup.py                                      |   6 +-
 10 files changed, 371 insertions(+), 9 deletions(-)
 create mode 100644 aioscrapy/core/downloader/handlers/playwright/__init__.py
 create mode 100644 aioscrapy/core/downloader/handlers/playwright/driverpool.py
 create mode 100644 aioscrapy/core/downloader/handlers/playwright/webdriver.py
 create mode 100644 aioscrapy/http/response/playwright.py
 create mode 100644 example/singlespider/demo_playwright.py

diff --git a/aioscrapy/core/downloader/handlers/playwright/__init__.py b/aioscrapy/core/downloader/handlers/playwright/__init__.py
new file mode 100644
index 0000000..6b7658f
--- /dev/null
+++ b/aioscrapy/core/downloader/handlers/playwright/__init__.py
@@ -0,0 +1,75 @@
+import logging
+
+from aioscrapy import Request
+from aioscrapy.core.downloader.handlers import BaseDownloadHandler
+from aioscrapy.http import PlaywrightResponse
+from aioscrapy.settings import Settings
+from .driverpool import WebDriverPool
+from .webdriver import PlaywrightDriver
+
+logger = logging.getLogger(__name__)
+
+
+class PlaywrightHandler(BaseDownloadHandler):
+    def __init__(self, settings: Settings):
+        self.settings = settings
+        playwright_client_args = settings.getdict('PLAYWRIGHT_CLIENT_ARGS')
+        self.url_regexes = playwright_client_args.pop('url_regexes', [])
+        pool_size = playwright_client_args.pop('pool_size', settings.getint("CONCURRENT_REQUESTS", 1))
+        self._webdriver_pool = WebDriverPool(pool_size=pool_size, driver_cls=PlaywrightDriver, **playwright_client_args)
+
+    @classmethod
+    def from_settings(cls, settings: Settings):
+        return cls(settings)
+
+    async def download_request(self, request: Request, spider) -> PlaywrightResponse:
+        cookies = dict(request.cookies)
+        timeout = request.meta.get('download_timeout', 5) * 1000  # seconds -> milliseconds
+        user_agent = (request.headers or self.settings.get('DEFAULT_REQUEST_HEADERS')).get("User-agent")
+        proxy: str = request.meta.get("proxy")
+        url = request.url
+
+        driver: PlaywrightDriver = await self._webdriver_pool.get(
+            user_agent=user_agent,
+            proxy=proxy,
+            timeout=timeout,
+            on_event={
+                name.replace('on_event', ''): getattr(spider, name) for name in dir(spider) if name.startswith('on_event')
+            },
+        )
+        try:
+            if cookies:
+                driver.url = url
+                await driver.set_cookies(cookies)
+            await driver.page.goto(url, wait_until=request.meta.get('wait_until', "networkidle"))
+            cache_response = {}
+            for url_regex in self.url_regexes:
+                async with driver.page.expect_response(url_regex, timeout=int(timeout/len(self.url_regexes))) as result:
+                    res = await result.value
+                cache_response[url_regex] = PlaywrightResponse(
+                    url=res.url,
+                    request=request,
+                    intercept_request=dict(
+                        url=res.request.url,
+                        headers=res.request.headers,
+                        data=res.request.post_data,
+                    ),
+                    headers=res.headers,
+                    body=await res.body(),
+                    status=res.status,
+                )
+            return PlaywrightResponse(
+                url=driver.page.url,
+                status=200,
+                text=await driver.page.content(),
+                cookies=await driver.get_cookies(),
+                cache_response=cache_response,
+                driver=driver,
+                driver_pool=self._webdriver_pool
+            )
+        except Exception:
+            await self._webdriver_pool.remove(driver)
+            raise
+
+    async def close(self):
+        await self._webdriver_pool.close()
diff --git a/aioscrapy/core/downloader/handlers/playwright/driverpool.py b/aioscrapy/core/downloader/handlers/playwright/driverpool.py
new file mode 100644
index 0000000..9a3eb46
--- /dev/null
+++ b/aioscrapy/core/downloader/handlers/playwright/driverpool.py
@@ -0,0 +1,53 @@
+# -*- coding: utf-8 -*-
+
+from asyncio import Lock
+from asyncio.queues import Queue
+
+from aioscrapy.utils.tools import singleton
+
+
+@singleton
+class WebDriverPool:
+    def __init__(
+            self, pool_size=5, driver_cls=None, **kwargs
+    ):
+        self.pool_size = pool_size
+        self.driver_cls = driver_cls
+        self.kwargs = kwargs
+
+        self.queue = Queue(maxsize=pool_size)
+        self.lock = Lock()
+        self.driver_count = 0
+
+    @property
+    def is_full(self):
+        return self.driver_count >= self.pool_size
+
+    async def create_driver(self, **args):
+        kwargs = self.kwargs.copy()
+        kwargs.update(args)
+        driver = self.driver_cls(**kwargs)
+        await driver.setup()
+        return driver
+
+    async def get(self, **kwargs):
+        async with self.lock:
+            if not self.is_full:
+                driver = await self.create_driver(**kwargs)
+                self.driver_count += 1
+            else:
+                driver = await self.queue.get()
+        return driver
+
+    async def release(self, driver):
+        await self.queue.put(driver)
+
+    async def remove(self, driver):
+        await driver.quit()
+        self.driver_count -= 1
+
+    async def close(self):
+        while not self.queue.empty():
+            driver = await self.queue.get()
+            await driver.quit()
+            self.driver_count -= 1
diff --git a/aioscrapy/core/downloader/handlers/playwright/webdriver.py b/aioscrapy/core/downloader/handlers/playwright/webdriver.py
new file mode 100644
index 0000000..e5d1c19
--- /dev/null
+++ b/aioscrapy/core/downloader/handlers/playwright/webdriver.py
@@ -0,0 +1,104 @@
+# -*- coding: utf-8 -*-
+
+import os
+from typing import Dict, Optional, Tuple, Callable
+
+try:
+    from typing import Literal  # python >= 3.8
+except ImportError:  # python <3.8
+    from typing_extensions import Literal
+
+from urllib.parse import urlparse, urlunparse
+
+from playwright.async_api import Page, BrowserContext, ViewportSize, ProxySettings
+from playwright.async_api import Playwright, Browser
+from playwright.async_api import async_playwright
+
+
+class PlaywrightDriver:
+    def __init__(
+            self,
+            *,
+            driver_type: Literal["chromium", "firefox", "webkit"] = "chromium",
+            proxy: Optional[str] = None,
+            browser_args: Optional[Dict] = None,
+            context_args: Optional[Dict] = None,
+            on_event: Optional[Dict] = None,
+            on_response: Optional[Callable] = None,
+            window_size: Optional[Tuple[int, int]] = None,
+            timeout: Optional[int] = None,
+            user_agent: str = None,
+            **kwargs
+    ):
+        """Store the launch options (timeout in milliseconds); nothing is started until setup() is awaited."""
+        self.driver_type = driver_type
+        self.proxy = proxy and self.format_context_proxy(proxy)
+        self.viewport = window_size and ViewportSize(width=window_size[0], height=window_size[1])
+        self.browser_args = browser_args
+        self.context_args = context_args
+        self.timeout = self.timout = timeout if timeout is not None else kwargs.get('timout', 30 * 1000)
+        self.on_event = on_event
+        self.on_response = on_response
+        self.user_agent = user_agent
+
+        self.driver: Playwright = None
+        self.browser: Browser = None
+        self.context: BrowserContext = None
+        self.page: Page = None
+        self.url = None
+
+    async def setup(self):
+        browser_args = dict(self.browser_args or {})  # both default to None in __init__
+        context_args = dict(self.context_args or {})
+        if browser_args.get('args') is None and self.driver_type == "chromium":
+            browser_args.update({'args': ["--no-sandbox"]})
+
+        if context_args.get("storage_state") is not None:
+            storage_state_path = context_args.get("storage_state")
+            os.makedirs(os.path.dirname(storage_state_path), exist_ok=True)
+
+        if self.proxy:
+            browser_args.update({'proxy': self.proxy})
+            context_args.update({'proxy': self.proxy})
+        if self.viewport:
+            context_args.update({"viewport": self.viewport})
+            context_args.update({"screen": self.viewport})
+        if self.user_agent:
+            context_args.update({'user_agent': self.user_agent})
+
+        self.driver = await async_playwright().start()
+        # Launch the engine named by driver_type rather than always launching chromium.
+        self.browser = await getattr(self.driver, self.driver_type).launch(**browser_args)
+        self.context = await self.browser.new_context(**context_args)
+        self.page = await self.context.new_page()
+        self.page.set_default_timeout(self.timeout)  # already in milliseconds
+
+        for event, callback in (self.on_event or {}).items():
+            self.page.on(event, callback)
+        self.on_response and self.page.on("response", self.on_response)
+
+    @staticmethod
+    def format_context_proxy(proxy) -> ProxySettings:
+        parsed_url = urlparse(proxy)
+        return ProxySettings(
+            server=urlunparse(parsed_url._replace(netloc=parsed_url.netloc.rpartition("@")[2])),
+            username=parsed_url.username,
+            password=parsed_url.password,
+        )
+
+    async def quit(self):
+        await self.page.close()
+        await self.context.close()
+        await self.browser.close()
+        await self.driver.stop()
+
+    async def get_cookies(self):
+        return {
+            cookie["name"]: cookie["value"]
+            for cookie in await self.page.context.cookies()
+        }
+
+    async def set_cookies(self, cookies: dict):
+        await self.page.context.add_cookies([
+            {"name": key, "value": value, "url": self.url or self.page.url} for key, value in cookies.items()
+        ])
diff --git a/aioscrapy/core/scraper.py b/aioscrapy/core/scraper.py
index bbc6a1b..7aa1f45 100644
--- a/aioscrapy/core/scraper.py
+++ b/aioscrapy/core/scraper.py
@@ -7,6 +7,7 @@
 import aioscrapy
 from aioscrapy import signals, Spider
 from aioscrapy.exceptions import CloseSpider, DropItem, IgnoreRequest
+from aioscrapy.http import PlaywrightResponse
 from aioscrapy.http import Request, Response
 from aioscrapy.logformatter import LogFormatter
 from aioscrapy.middleware import ItemPipelineManager, SpiderMiddlewareManager
@@ -118,6 +119,9 @@ async def _scrape(self, result: Union[Response, BaseException], request: Request
                          exc_info=e,
                          extra={'spider': self.spider})
         finally:
+            if isinstance(result, PlaywrightResponse):
+                await result.release()
+
             # Delete the cache result from the slot
             self.slot.finish_response(request, result)
 
@@ -168,7 +172,6 @@ async def handle_spider_error(self, exc: BaseException, request: Request, respon
 
     async def handle_spider_output(self, result: AsyncGenerator, request: Request, response: Response) -> None:
         """Iter each Request/Item (given in the output parameter) returned from the given spider"""
-
         if not result:
             return
 
diff --git a/aioscrapy/crawler.py b/aioscrapy/crawler.py
index 3c55746..2414b98 100644
--- a/aioscrapy/crawler.py
+++ b/aioscrapy/crawler.py
@@ -242,7 +242,7 @@ async def run(self) -> None:
 
     def start(self) -> None:
         if sys.platform.startswith('win'):
-            asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
+            asyncio.set_event_loop(asyncio.windows_events.ProactorEventLoop())
         else:
             try:
                 import uvloop
diff --git a/aioscrapy/http/__init__.py b/aioscrapy/http/__init__.py
index a79e1c2..bc35b42 100644
--- a/aioscrapy/http/__init__.py
+++ b/aioscrapy/http/__init__.py
@@ -8,8 +8,8 @@
 from aioscrapy.http.request import Request
 from aioscrapy.http.request.form import FormRequest
 from aioscrapy.http.request.json_request import JsonRequest
-
 from aioscrapy.http.response import Response
 from aioscrapy.http.response.html import HtmlResponse
-from aioscrapy.http.response.xml import XmlResponse
+from aioscrapy.http.response.playwright import PlaywrightResponse
 from aioscrapy.http.response.text import TextResponse
+from aioscrapy.http.response.xml import XmlResponse
diff --git a/aioscrapy/http/response/playwright.py b/aioscrapy/http/response/playwright.py
new file mode 100644
index 0000000..6bcc49c
--- /dev/null
+++ b/aioscrapy/http/response/playwright.py
@@ -0,0 +1,39 @@
+import re
+from typing import Optional
+
+from playwright.async_api import Response
+
+from aioscrapy.http.response.text import TextResponse
+
+
+class PlaywrightResponse(TextResponse):
+    def __init__(
+            self,
+            *args,
+            text: str = '',
+            cache_response: Optional[dict] = None,
+            driver: Optional["PlaywrightDriver"] = None,
+            driver_pool: Optional["WebDriverPool"] = None,
+            intercept_request: Optional[dict] = None,
+            **kwargs
+    ):
+        self.driver = driver
+        self.driver_pool = driver_pool
+        self._text = text
+        self.cache_response = cache_response or {}
+        self.intercept_request = intercept_request
+        super().__init__(*args, **kwargs)
+
+    async def release(self):
+        self.driver_pool and self.driver and await self.driver_pool.release(self.driver)
+
+    @property
+    def text(self):
+        return self._text or super().text
+
+    @text.setter
+    def text(self, text):
+        self._text = text
+
+    def get_response(self, url_regex):
+        return self.cache_response.get(url_regex)
diff --git a/aioscrapy/http/response/text.py b/aioscrapy/http/response/text.py
index 3c31f52..b185c86 100644
--- a/aioscrapy/http/response/text.py
+++ b/aioscrapy/http/response/text.py
@@ -5,15 +5,14 @@
 See documentation in docs/topics/request-response.rst
 """
 
-import ujson
 import warnings
 from contextlib import suppress
 from typing import Generator
 from urllib.parse import urljoin
 
 import parsel
+import ujson
 from parsel import Selector
-
 from w3lib.encoding import (html_body_declared_encoding, html_to_unicode,
                             http_content_type_encoding, resolve_encoding)
 from w3lib.html import strip_html5_whitespace
@@ -32,8 +31,8 @@ class TextResponse(Response):
     _DEFAULT_ENCODING = 'ascii'
     _cached_decoded_json = _NONE
 
-    def __init__(self, *args, **kwargs):
-        self._encoding = kwargs.pop('encoding', None)
+    def __init__(self, *args, encoding=None, **kwargs):
+        self._encoding = encoding
         self._cached_benc = None
         self._cached_ubody = None
         self._cached_selector = None
diff --git a/example/singlespider/demo_playwright.py b/example/singlespider/demo_playwright.py
new file mode 100644
index 0000000..fc1e1a3
--- /dev/null
+++ b/example/singlespider/demo_playwright.py
@@ -0,0 +1,85 @@
+import logging
+
+from aioscrapy import Request
+from aioscrapy.spiders import Spider
+from aioscrapy.http import PlaywrightResponse
+
+logger = logging.getLogger(__name__)
+
+
+class DemoMemorySpider(Spider):
+    name = 'DemoMemorySpider'
+
+    custom_settings = dict(
+        USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
+        # DOWNLOAD_DELAY=3,
+        # RANDOMIZE_DOWNLOAD_DELAY=True,
+        CONCURRENT_REQUESTS=3,
+        LOG_LEVEL='INFO',
+        CLOSE_SPIDER_ON_IDLE=True,
+        DOWNLOAD_HANDLERS={
+            'http': 'aioscrapy.core.downloader.handlers.playwright.PlaywrightHandler',
+            'https': 'aioscrapy.core.downloader.handlers.playwright.PlaywrightHandler',
+        },
+        PLAYWRIGHT_CLIENT_ARGS=dict(
+            driver_type="chromium",  # chromium, firefox, webkit
+            wait_until="networkidle",  # event that marks the page as loaded; one of: "commit", "domcontentloaded", "load", "networkidle"
+            window_size=(1024, 800),
+            # url_regexes=["xxxx"],
+            browser_args=dict(
+                executable_path=None, channel=None, args=None, ignore_default_args=None, handle_sigint=None,
+                handle_sigterm=None, handle_sighup=None, timeout=None, env=None, headless=False, devtools=None,
+                proxy=None, downloads_path=None, slow_mo=None, traces_dir=None, chromium_sandbox=None,
+                firefox_user_prefs=None,
+            ),
+            context_args=dict(
+                no_viewport=None, ignore_https_errors=None, java_script_enabled=None,
+                bypass_csp=None, user_agent=None, locale=None, timezone_id=None, geolocation=None, permissions=None,
+                extra_http_headers=None, offline=None, http_credentials=None, device_scale_factor=None,
+                is_mobile=None, has_touch=None, color_scheme=None, reduced_motion=None, forced_colors=None,
+                accept_downloads=None, default_browser_type=None, proxy=None, record_har_path=None,
+                record_har_omit_content=None, record_video_dir=None, record_video_size=None, storage_state=None,
+                base_url=None, strict_selectors=None, service_workers=None, record_har_url_filter=None,
+                record_har_mode=None, record_har_content=None,
+            ),
+        )
+
+    )
+
+    start_urls = ['https://hanyu.baidu.com/zici/s?wd=%E9%BB%84&query=%E9%BB%84']
+
+    @staticmethod
+    async def process_request(request, spider):
+        """ request middleware """
+        pass
+
+    @staticmethod
+    async def process_response(request, response, spider):
+        """ response middleware """
+        return response
+
+    @staticmethod
+    async def process_exception(request, exception, spider):
+        """ exception middleware """
+        pass
+
+    async def parse(self, response: PlaywrightResponse):
+        # res = response.get_response("xxxx")
+        # print(res.text[:100])
+
+        yield {
+            'pingyin': response.xpath('//div[@id="pinyin"]/span/b/text()').get(),
+            'fan': response.xpath('//*[@id="traditional"]/span/text()').get(),
+        }
+
+        new_character = response.xpath('//a[@class="img-link"]/@href').getall()
+        for character in new_character:
+            new_url = 'https://hanyu.baidu.com/zici' + character
+            yield Request(new_url, callback=self.parse, dont_filter=True)
+
+    async def process_item(self, item):
+        print(item)
+
+
+if __name__ == '__main__':
+    DemoMemorySpider.start()
diff --git a/setup.py b/setup.py
index ff8d40f..cd98570 100644
--- a/setup.py
+++ b/setup.py
@@ -14,11 +14,15 @@
     "redis>=4.3.1",
 ]
 extras_require = {
-    "all": ["aiomysql>=0.1.1", "httpx[http2]>=0.23.0", "aio-pika>=8.1.1", "cryptography", "motor>=3.1.1"],
+    "all": [
+        "aiomysql>=0.1.1", "httpx[http2]>=0.23.0", "aio-pika>=8.1.1",
+        "cryptography", "motor>=3.1.1", "playwright>=1.31.1"
+    ],
     "aiomysql": ["aiomysql>=0.1.1", "cryptography"],
     "httpx": ["httpx[http2]>=0.23.0"],
     "aio-pika": ["aio-pika>=8.1.1"],
     "mongo": ["motor>=3.1.1"],
+    "playwright": ["playwright>=1.31.1"],
 }
 
 setup(