feat: support playwright
conlin-huang committed Mar 26, 2023
1 parent 9d15081 commit 717c37d
Showing 10 changed files with 371 additions and 9 deletions.
75 changes: 75 additions & 0 deletions aioscrapy/core/downloader/handlers/playwright/__init__.py
@@ -0,0 +1,75 @@
import logging

from aioscrapy import Request
from aioscrapy.core.downloader.handlers import BaseDownloadHandler
from aioscrapy.http import PlaywrightResponse
from aioscrapy.settings import Settings
from .driverpool import WebDriverPool
from .webdriver import PlaywrightDriver

logger = logging.getLogger(__name__)


class PlaywrightHandler(BaseDownloadHandler):
def __init__(self, settings: Settings):
self.settings = settings
playwright_client_args = settings.getdict('PLAYWRIGHT_CLIENT_ARGS')
self.url_regexes = playwright_client_args.pop('url_regexes', [])
pool_size = playwright_client_args.pop('pool_size', settings.getint("CONCURRENT_REQUESTS", 1))
self._webdriver_pool = WebDriverPool(pool_size=pool_size, driver_cls=PlaywrightDriver, **playwright_client_args)

@classmethod
def from_settings(cls, settings: Settings):
return cls(settings)

async def download_request(self, request: Request, spider) -> PlaywrightResponse:
cookies = dict(request.cookies)
        # Playwright timeouts are in milliseconds; download_timeout is in seconds.
        timeout = request.meta.get('download_timeout', 5) * 1000
        user_agent = (request.headers or self.settings.get('DEFAULT_REQUEST_HEADERS')).get("User-Agent")
proxy: str = request.meta.get("proxy")
url = request.url

driver: PlaywrightDriver = await self._webdriver_pool.get(
user_agent=user_agent,
proxy=proxy,
timeout=timeout,
            on_event={
                # Map spider methods named on_event_<name> to page events,
                # e.g. on_event_console -> page.on("console", ...).
                name[len('on_event_'):]: getattr(spider, name)
                for name in dir(spider) if name.startswith('on_event_')
            },
)
try:
if cookies:
driver.url = url
await driver.set_cookies(cookies)
await driver.page.goto(url, wait_until=request.meta.get('wait_until', "networkidle"))
            cache_response = {}
            # Wait for each configured pattern in turn; the timeout budget is
            # split evenly across the patterns.
            for url_regex in self.url_regexes:
                async with driver.page.expect_response(url_regex, timeout=int(timeout / len(self.url_regexes))) as result:
res = await result.value
cache_response[url_regex] = PlaywrightResponse(
url=res.url,
request=request,
intercept_request=dict(
url=res.request.url,
headers=res.request.headers,
data=res.request.post_data,
),
headers=res.headers,
body=await res.body(),
status=res.status,
)
            return PlaywrightResponse(
                url=driver.page.url,
                status=200,  # the real page status is not tracked; assume success
text=await driver.page.content(),
cookies=await driver.get_cookies(),
cache_response=cache_response,
driver=driver,
driver_pool=self._webdriver_pool
)
        except Exception:
            # A failed driver is removed rather than released, so the pool
            # creates a fresh one for the next request.
            await self._webdriver_pool.remove(driver)
            raise

async def close(self):
await self._webdriver_pool.close()
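
A minimal sketch of how a spider might opt in to this handler. Only PLAYWRIGHT_CLIENT_ARGS, url_regexes, pool_size and the on_event_* naming convention appear in the handler above; the DOWNLOAD_HANDLERS wiring follows the usual Scrapy convention and should be treated as an assumption:

from aioscrapy import Spider

class DemoSpider(Spider):
    name = 'demo'
    start_urls = ['https://example.com']
    custom_settings = {
        # Assumed wiring, following the usual Scrapy DOWNLOAD_HANDLERS convention.
        'DOWNLOAD_HANDLERS': {
            'http': 'aioscrapy.core.downloader.handlers.playwright.PlaywrightHandler',
            'https': 'aioscrapy.core.downloader.handlers.playwright.PlaywrightHandler',
        },
        'PLAYWRIGHT_CLIENT_ARGS': {
            'driver_type': 'chromium',
            'pool_size': 2,
            # Responses whose URL matches a pattern here are captured into
            # response.cache_response for the spider to read back.
            'url_regexes': ['.*/api/items.*'],
        },
    }

    # Picked up by the handler via the on_event_* convention and registered
    # as page.on("console", ...).
    async def on_event_console(self, msg):
        print('console:', msg.text)

    async def parse(self, response):
        yield {'html_length': len(response.text)}

The remaining keys of PLAYWRIGHT_CLIENT_ARGS are forwarded verbatim to the PlaywrightDriver constructor (see webdriver.py below).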
53 changes: 53 additions & 0 deletions aioscrapy/core/downloader/handlers/playwright/driverpool.py
@@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-

from asyncio import Lock
from asyncio.queues import Queue

from aioscrapy.utils.tools import singleton


@singleton
class WebDriverPool:
    """A lazily filled, process-wide pool of browser drivers."""

def __init__(
self, pool_size=5, driver_cls=None, **kwargs
):
self.pool_size = pool_size
self.driver_cls = driver_cls
self.kwargs = kwargs

self.queue = Queue(maxsize=pool_size)
self.lock = Lock()
self.driver_count = 0

@property
def is_full(self):
return self.driver_count >= self.pool_size

async def create_driver(self, **args):
kwargs = self.kwargs.copy()
kwargs.update(args)
driver = self.driver_cls(**kwargs)
await driver.setup()
return driver

    async def get(self, **kwargs):
        # Create drivers on demand until the pool is full; after that,
        # wait for one to be released back into the queue.
        async with self.lock:
            if not self.is_full:
                driver = await self.create_driver(**kwargs)
                self.driver_count += 1
            else:
                driver = await self.queue.get()
            return driver

    async def release(self, driver):
        # Hand a healthy driver back for reuse by the next get().
        await self.queue.put(driver)

    async def remove(self, driver):
        # Discard a broken driver; the count drops so get() can create a fresh one.
        await driver.quit()
        self.driver_count -= 1

    async def close(self):
        # Quit every idle driver; drivers currently checked out must be
        # released or removed by their borrowers.
        while not self.queue.empty():
            driver = await self.queue.get()
            await driver.quit()
            self.driver_count -= 1
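
A short sketch of the pool's checkout/return cycle; the keyword arguments are forwarded to the PlaywrightDriver constructor defined in webdriver.py below:

import asyncio

from aioscrapy.core.downloader.handlers.playwright.driverpool import WebDriverPool
from aioscrapy.core.downloader.handlers.playwright.webdriver import PlaywrightDriver

async def fetch(pool: WebDriverPool, url: str) -> str:
    driver = await pool.get()  # new driver until the pool is full, then reuse
    try:
        await driver.page.goto(url)
        return await driver.page.content()
    finally:
        # Return the driver for the next caller; on a fatal browser error
        # call pool.remove(driver) instead so it gets replaced.
        await pool.release(driver)

async def main():
    pool = WebDriverPool(pool_size=2, driver_cls=PlaywrightDriver, driver_type='chromium')
    try:
        print(len(await fetch(pool, 'https://example.com')))
    finally:
        await pool.close()

asyncio.run(main())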
104 changes: 104 additions & 0 deletions aioscrapy/core/downloader/handlers/playwright/webdriver.py
@@ -0,0 +1,104 @@
# -*- coding: utf-8 -*-

import os
from typing import Dict, Optional, Tuple, Callable

try:
from typing import Literal # python >= 3.8
except ImportError: # python <3.8
from typing_extensions import Literal

from urllib.parse import urlparse, urlunparse

from playwright.async_api import Page, BrowserContext, ViewportSize, ProxySettings
from playwright.async_api import Playwright, Browser
from playwright.async_api import async_playwright


class PlaywrightDriver:
def __init__(
self,
*,
driver_type: Literal["chromium", "firefox", "webkit"] = "chromium",
proxy: Optional[str] = None,
browser_args: Optional[Dict] = None,
context_args: Optional[Dict] = None,
on_event: Optional[Dict] = None,
on_response: Optional[Callable] = None,
window_size: Optional[Tuple[int, int]] = None,
        timeout: int = 30 * 1000,
        user_agent: Optional[str] = None,
**kwargs
):

self.driver_type = driver_type
self.proxy = proxy and self.format_context_proxy(proxy)
self.viewport = window_size and ViewportSize(width=window_size[0], height=window_size[1])
self.browser_args = browser_args
self.context_args = context_args
        self.timeout = timeout
self.on_event = on_event
self.on_response = on_response
self.user_agent = user_agent

        self.driver: Optional[Playwright] = None
        self.browser: Optional[Browser] = None
        self.context: Optional[BrowserContext] = None
        self.page: Optional[Page] = None
self.url = None

async def setup(self):
        browser_args = (self.browser_args or {}).copy()
        context_args = (self.context_args or {}).copy()
        if browser_args.get('args') is None:
            browser_args.update({'args': ["--no-sandbox"]})

        if context_args.get("storage_state") is not None:
            # Ensure the directory that will hold the persisted storage state exists.
            storage_state_path = context_args.get("storage_state")
            os.makedirs(os.path.dirname(storage_state_path) or '.', exist_ok=True)

if self.proxy:
browser_args.update({'proxy': self.proxy})
context_args.update({'proxy': self.proxy})
if self.viewport:
context_args.update({"viewport": self.viewport})
context_args.update({"screen": self.viewport})
if self.user_agent:
context_args.update({'user_agent': self.user_agent})

        self.driver = await async_playwright().start()
        # Launch the browser selected by driver_type (chromium/firefox/webkit).
        self.browser = await getattr(self.driver, self.driver_type).launch(**browser_args)
        self.context = await self.browser.new_context(**context_args)
        self.page = await self.context.new_page()
        # timeout is already in milliseconds, which is what Playwright expects.
        self.page.set_default_timeout(self.timeout)

        for event, callback in (self.on_event or {}).items():
            self.page.on(event, callback)
        if self.on_response:
            self.page.on("response", self.on_response)

    @staticmethod
    def format_context_proxy(proxy: str) -> ProxySettings:
        # Playwright expects proxy credentials as separate fields, so strip
        # them from the URL while keeping the scheme, host and port.
        parsed_url = urlparse(proxy)
        netloc = parsed_url.hostname if parsed_url.port is None else f'{parsed_url.hostname}:{parsed_url.port}'
        return ProxySettings(
            server=urlunparse(parsed_url._replace(netloc=netloc)),
            username=parsed_url.username,
            password=parsed_url.password,
        )

async def quit(self):
await self.page.close()
await self.context.close()
await self.browser.close()
await self.driver.stop()

async def get_cookies(self):
return {
cookie["name"]: cookie["value"]
for cookie in await self.page.context.cookies()
}

async def set_cookies(self, cookies: dict):
await self.page.context.add_cookies([
{"name": key, "value": value, "url": self.url or self.page.url} for key, value in cookies.items()
])
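
The wrapper can also be exercised on its own, which makes the setup sequence easier to see; a minimal sketch (the proxy URL and target site are placeholders):

import asyncio

from aioscrapy.core.downloader.handlers.playwright.webdriver import PlaywrightDriver

async def main():
    driver = PlaywrightDriver(
        driver_type='firefox',
        proxy='http://user:secret@127.0.0.1:8888',  # hypothetical proxy
        window_size=(1280, 720),
        timeout=10 * 1000,  # milliseconds, like all Playwright timeouts
    )
    await driver.setup()
    try:
        await driver.page.goto('https://example.com')
        await driver.set_cookies({'session': 'abc123'})
        print(await driver.get_cookies())
    finally:
        await driver.quit()

asyncio.run(main())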
5 changes: 4 additions & 1 deletion aioscrapy/core/scraper.py
@@ -7,6 +7,7 @@
import aioscrapy
from aioscrapy import signals, Spider
from aioscrapy.exceptions import CloseSpider, DropItem, IgnoreRequest
from aioscrapy.http import PlaywrightResponse
from aioscrapy.http import Request, Response
from aioscrapy.logformatter import LogFormatter
from aioscrapy.middleware import ItemPipelineManager, SpiderMiddlewareManager
@@ -118,6 +119,9 @@ async def _scrape(self, result: Union[Response, BaseException], request: Request
exc_info=e,
extra={'spider': self.spider})
finally:
if isinstance(result, PlaywrightResponse):
await result.release()

# Delete the cache result from the slot
self.slot.finish_response(request, result)

@@ -168,7 +172,6 @@ async def handle_spider_error(exc: BaseException, request: Request, respon

async def handle_spider_output(self, result: AsyncGenerator, request: Request, response: Response) -> None:
"""Iter each Request/Item (given in the output parameter) returned from the given spider"""

if not result:
return

2 changes: 1 addition & 1 deletion aioscrapy/crawler.py
@@ -242,7 +242,7 @@ async def run(self) -> None:

def start(self) -> None:
if sys.platform.startswith('win'):
-            asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
+            asyncio.set_event_loop(asyncio.windows_events.ProactorEventLoop())
else:
try:
import uvloop
4 changes: 2 additions & 2 deletions aioscrapy/http/__init__.py
@@ -8,8 +8,8 @@
from aioscrapy.http.request import Request
from aioscrapy.http.request.form import FormRequest
from aioscrapy.http.request.json_request import JsonRequest

from aioscrapy.http.response import Response
from aioscrapy.http.response.html import HtmlResponse
-from aioscrapy.http.response.xml import XmlResponse
+from aioscrapy.http.response.playwright import PlaywrightResponse
from aioscrapy.http.response.text import TextResponse
+from aioscrapy.http.response.xml import XmlResponse
39 changes: 39 additions & 0 deletions aioscrapy/http/response/playwright.py
@@ -0,0 +1,39 @@
from typing import Optional, TYPE_CHECKING

if TYPE_CHECKING:
    from aioscrapy.core.downloader.handlers.playwright.driverpool import WebDriverPool
    from aioscrapy.core.downloader.handlers.playwright.webdriver import PlaywrightDriver

from aioscrapy.http.response.text import TextResponse


class PlaywrightResponse(TextResponse):
def __init__(
self,
*args,
text: str = '',
cache_response: Optional[dict] = None,
driver: Optional["PlaywrightDriver"] = None,
driver_pool: Optional["WebDriverPool"] = None,
intercept_request: Optional[dict] = None,
**kwargs
):
self.driver = driver
self.driver_pool = driver_pool
self._text = text
self.cache_response = cache_response or {}
self.intercept_request = intercept_request
super().__init__(*args, **kwargs)

    async def release(self):
        # Return the driver that produced this response to the pool; the
        # scraper calls this after the spider callback has finished.
        if self.driver_pool and self.driver:
            await self.driver_pool.release(self.driver)

@property
def text(self):
return self._text or super().text

@text.setter
def text(self, text):
self._text = text

    def get_response(self, url_regex):
        # Intercepted response captured for this pattern, or None if the
        # pattern was not configured or never matched.
        return self.cache_response.get(url_regex)
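
A sketch of how a spider callback might consume these responses; the regex is a placeholder that must match one configured in url_regexes:

from aioscrapy import Spider

class ItemsSpider(Spider):
    name = 'items'

    async def parse(self, response):
        # response is a PlaywrightResponse when the playwright handler is active.
        api = response.get_response('.*/api/items.*')
        if api is not None:
            yield {
                'api_url': api.intercept_request['url'],  # what the page requested
                'api_body': api.text,                     # decoded body of the captured call
            }
        # The driver behind this response goes back to the pool when the
        # scraper awaits response.release() after this callback finishes.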
7 changes: 3 additions & 4 deletions aioscrapy/http/response/text.py
@@ -5,15 +5,14 @@
See documentation in docs/topics/request-response.rst
"""

-import ujson
import warnings
from contextlib import suppress
from typing import Generator
from urllib.parse import urljoin

import parsel
+import ujson
from parsel import Selector

from w3lib.encoding import (html_body_declared_encoding, html_to_unicode,
http_content_type_encoding, resolve_encoding)
from w3lib.html import strip_html5_whitespace
@@ -32,8 +31,8 @@ class TextResponse(Response):
_DEFAULT_ENCODING = 'ascii'
_cached_decoded_json = _NONE

-    def __init__(self, *args, **kwargs):
-        self._encoding = kwargs.pop('encoding', None)
+    def __init__(self, *args, encoding=None, **kwargs):
+        self._encoding = encoding
self._cached_benc = None
self._cached_ubody = None
self._cached_selector = None