-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
conlin-huang
committed
Mar 26, 2023
1 parent
9d15081
commit 717c37d
Showing
10 changed files
with
371 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
import logging | ||
|
||
from aioscrapy import Request | ||
from aioscrapy.core.downloader.handlers import BaseDownloadHandler | ||
from aioscrapy.http import PlaywrightResponse | ||
from aioscrapy.settings import Settings | ||
from .driverpool import WebDriverPool | ||
from .webdriver import PlaywrightDriver | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class PlaywrightHandler(BaseDownloadHandler):
    """Download handler that fetches pages through pooled Playwright drivers.

    The ``PLAYWRIGHT_CLIENT_ARGS`` setting (a dict) configures the handler:

    * ``url_regexes`` -- URL patterns handed to ``page.expect_response``; the
      matching responses are exposed as ``PlaywrightResponse.cache_response``.
    * ``pool_size`` -- size of the driver pool; defaults to the
      ``CONCURRENT_REQUESTS`` setting (1 if that is unset).
    * every remaining key is forwarded to ``WebDriverPool`` as a keyword
      argument.
    """

    # Spider attributes starting with this prefix are registered as page-event
    # callbacks.
    _EVENT_PREFIX = 'on_event'

    def __init__(self, settings: Settings):
        self.settings = settings
        playwright_client_args = settings.getdict('PLAYWRIGHT_CLIENT_ARGS')
        # Handler-level options are popped so that only driver options remain.
        self.url_regexes = playwright_client_args.pop('url_regexes', [])
        pool_size = playwright_client_args.pop('pool_size', settings.getint("CONCURRENT_REQUESTS", 1))
        self._webdriver_pool = WebDriverPool(pool_size=pool_size, driver_cls=PlaywrightDriver, **playwright_client_args)

    @classmethod
    def from_settings(cls, settings: Settings):
        """Alternate constructor: build the handler from ``settings``."""
        return cls(settings)

    @classmethod
    def _spider_event_callbacks(cls, spider) -> dict:
        """Map Playwright event names to the spider's ``on_event*`` attributes.

        Only the leading prefix is stripped, together with any underscore that
        separates it from the event name, so both ``on_eventresponse`` and
        ``on_event_response`` register for the ``response`` event. Attributes
        that leave an empty event name are skipped.
        """
        prefix = cls._EVENT_PREFIX
        callbacks = {}
        for name in dir(spider):
            if not name.startswith(prefix):
                continue
            event = name[len(prefix):].lstrip('_')
            if event:
                callbacks[event] = getattr(spider, name)
        return callbacks

    async def download_request(self, request: Request, spider) -> PlaywrightResponse:
        """Load ``request.url`` in a pooled driver and return the rendered page.

        The driver is not handed back to the pool on success: it is attached to
        the returned response together with the pool (presumably so the caller
        can keep using the page and release it later -- confirm against the
        callers). On any exception the driver is removed from the pool and the
        exception is re-raised.
        """
        cookies = dict(request.cookies)
        # download_timeout is scaled by 1000 (presumably seconds -> milliseconds).
        timeout = request.meta.get('download_timeout', 5) * 1000
        # Fall back to an empty mapping so that a missing DEFAULT_REQUEST_HEADERS
        # setting yields user_agent=None instead of an AttributeError.
        headers = request.headers or self.settings.get('DEFAULT_REQUEST_HEADERS') or {}
        user_agent = headers.get("User-agent")
        proxy: str = request.meta.get("proxy")
        url = request.url

        driver: PlaywrightDriver = await self._webdriver_pool.get(
            user_agent=user_agent,
            proxy=proxy,
            timeout=timeout,
            on_event=self._spider_event_callbacks(spider),
        )
        try:
            if cookies:
                # set_cookies() reads driver.url to scope the cookies to this
                # request, because the page has not navigated yet.
                driver.url = url
                await driver.set_cookies(cookies)
            await driver.page.goto(url, wait_until=request.meta.get('wait_until', "networkidle"))

            # NOTE(review): the expectations below are registered only after
            # goto() has returned, so only responses arriving afterwards can
            # match -- confirm this is the intended behaviour.
            patterns = self.url_regexes
            # The overall timeout budget is split evenly between the patterns.
            per_pattern_timeout = int(timeout / len(patterns)) if patterns else 0
            cache_response = {}
            for url_regex in patterns:
                async with driver.page.expect_response(url_regex, timeout=per_pattern_timeout) as result:
                    res = await result.value
                cache_response[url_regex] = PlaywrightResponse(
                    url=res.url,
                    request=request,
                    intercept_request=dict(
                        url=res.request.url,
                        headers=res.request.headers,
                        data=res.request.post_data,
                    ),
                    headers=res.headers,
                    body=await res.body(),
                    status=res.status,
                )
            return PlaywrightResponse(
                url=driver.page.url,
                status=200,
                text=await driver.page.content(),
                cookies=await driver.get_cookies(),
                cache_response=cache_response,
                driver=driver,
                driver_pool=self._webdriver_pool
            )
        except Exception:
            # A driver that failed mid-request is discarded rather than reused.
            await self._webdriver_pool.remove(driver)
            # Bare raise keeps the original traceback intact.
            raise

    async def close(self):
        """Close the underlying driver pool."""
        await self._webdriver_pool.close()
53 changes: 53 additions & 0 deletions
53
aioscrapy/core/downloader/handlers/playwright/driverpool.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
from asyncio import Lock | ||
from asyncio.queues import Queue | ||
|
||
from aioscrapy.utils.tools import singleton | ||
|
||
|
||
@singleton
class WebDriverPool:
    """Bounded pool of browser drivers.

    At most ``pool_size`` drivers are alive at once. ``get`` creates a new
    driver while the pool is below capacity and otherwise waits for an idle
    one; ``release`` returns a driver for reuse; ``remove`` discards a driver
    and frees its slot.

    NOTE(review): the class is wrapped in the project's ``singleton``
    decorator, so constructor arguments presumably only take effect for the
    first instantiation -- confirm against ``aioscrapy.utils.tools``.
    """

    def __init__(
            self, pool_size=5, driver_cls=None, **kwargs
    ):
        """
        :param pool_size: maximum number of live drivers
        :param driver_cls: class instantiated for every new driver; instances
            must provide async ``setup()`` and ``quit()``
        :param kwargs: default keyword arguments for ``driver_cls``
        """
        # Local import: only Lock and Queue are imported at module level.
        from asyncio import Condition

        self.pool_size = pool_size
        self.driver_cls = driver_cls
        self.kwargs = kwargs

        self.queue = Queue(maxsize=pool_size)  # idle drivers ready for reuse
        self.lock = Lock()
        # Signalled whenever a driver is released or removed. It shares
        # self.lock, so waiting in get() gives the lock up instead of blocking
        # every other caller.
        self._slot_available = Condition(self.lock)
        self.driver_count = 0  # drivers alive (idle + in use)

    @property
    def is_full(self):
        """True once ``pool_size`` drivers have been created and not removed."""
        return self.driver_count >= self.pool_size

    async def create_driver(self, **args):
        """Instantiate and set up a driver; ``args`` override the pool defaults."""
        kwargs = self.kwargs.copy()
        kwargs.update(args)
        driver = self.driver_cls(**kwargs)
        await driver.setup()
        return driver

    async def get(self, **kwargs):
        """Return a driver, creating one while the pool is below capacity.

        ``kwargs`` are only used when a new driver is created; a reused idle
        driver keeps the options it was created with.

        When the pool is at capacity and nothing is idle, the caller waits on
        the condition. Waiting releases the lock, so a later ``remove`` lets
        the waiter create a replacement driver instead of stalling forever on
        an idle queue that will never be refilled.
        """
        async with self._slot_available:
            while self.is_full and self.queue.empty():
                await self._slot_available.wait()
            if not self.is_full:
                driver = await self.create_driver(**kwargs)
                self.driver_count += 1
            else:
                driver = self.queue.get_nowait()
        return driver

    async def release(self, driver):
        """Put ``driver`` back into the idle queue and wake one waiter."""
        await self.queue.put(driver)
        async with self._slot_available:
            self._slot_available.notify()

    async def remove(self, driver):
        """Shut ``driver`` down and free its slot.

        The slot is freed even if ``driver.quit()`` raises, so a failing
        shutdown cannot shrink the pool permanently.
        """
        try:
            await driver.quit()
        finally:
            self.driver_count -= 1
            async with self._slot_available:
                self._slot_available.notify()

    async def close(self):
        """Shut down every idle driver; drivers currently in use are untouched."""
        while not self.queue.empty():
            driver = self.queue.get_nowait()
            await self.remove(driver)
104 changes: 104 additions & 0 deletions
104
aioscrapy/core/downloader/handlers/playwright/webdriver.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
import os | ||
from typing import Dict, Optional, Tuple, Callable | ||
|
||
try: | ||
from typing import Literal # python >= 3.8 | ||
except ImportError: # python <3.8 | ||
from typing_extensions import Literal | ||
|
||
from urllib.parse import urlparse, urlunparse | ||
|
||
from playwright.async_api import Page, BrowserContext, ViewportSize, ProxySettings | ||
from playwright.async_api import Playwright, Browser | ||
from playwright.async_api import async_playwright | ||
|
||
|
||
class PlaywrightDriver:
    """A Playwright instance, browser, context and page managed as one unit."""

    def __init__(
            self,
            *,
            driver_type: Literal["chromium", "firefox", "webkit"] = "chromium",
            proxy: Optional[str] = None,
            browser_args: Optional[Dict] = None,
            context_args: Optional[Dict] = None,
            on_event: Optional[Dict] = None,
            on_response: Optional[Callable] = None,
            window_size: Optional[Tuple[int, int]] = None,
            timout: int = 30 * 1000,
            user_agent: str = None,
            timeout: Optional[int] = None,
            **kwargs
    ):
        """
        :param driver_type: browser engine to launch
        :param proxy: proxy URL, optionally carrying ``user:password@`` credentials
        :param browser_args: extra keyword arguments for ``BrowserType.launch``
        :param context_args: extra keyword arguments for ``Browser.new_context``
        :param on_event: mapping of page event name -> callback
        :param on_response: callback registered for the page ``response`` event
        :param window_size: ``(width, height)`` used for viewport and screen
        :param timout: default page timeout in milliseconds (historical
            spelling, kept for backward compatibility)
        :param user_agent: user agent applied to the browser context
        :param timeout: correctly spelled alias of ``timout``; takes precedence
            when given
        :param kwargs: accepted and ignored
        """
        self.driver_type = driver_type
        self.proxy = proxy and self.format_context_proxy(proxy)
        self.viewport = window_size and ViewportSize(width=window_size[0], height=window_size[1])
        self.browser_args = browser_args
        self.context_args = context_args
        self.timout = timout if timeout is None else timeout
        self.on_event = on_event
        self.on_response = on_response
        self.user_agent = user_agent

        self.driver: Playwright = None
        self.browser: Browser = None
        self.context: BrowserContext = None
        self.page: Page = None
        self.url = None

    async def setup(self):
        """Start Playwright, launch the browser and open a page.

        If any step fails, whatever was already started is shut down again
        before the exception propagates.
        """
        # Copies keep the caller's dicts untouched; None means "no extra options".
        browser_args = dict(self.browser_args or {})
        context_args = dict(self.context_args or {})
        # The sandbox switch is a Chromium flag, so it is only defaulted there.
        if browser_args.get('args') is None and self.driver_type == "chromium":
            browser_args['args'] = ["--no-sandbox"]

        # storage_state may be a path or an in-memory state dict; only a path
        # has a parent directory to create, and a bare filename has none.
        storage_state = context_args.get("storage_state")
        if isinstance(storage_state, (str, os.PathLike)):
            storage_dir = os.path.dirname(storage_state)
            if storage_dir:
                os.makedirs(storage_dir, exist_ok=True)

        if self.proxy:
            browser_args['proxy'] = self.proxy
            context_args['proxy'] = self.proxy
        if self.viewport:
            context_args['viewport'] = self.viewport
            context_args['screen'] = self.viewport
        if self.user_agent:
            context_args['user_agent'] = self.user_agent

        try:
            self.driver = await async_playwright().start()
            self.browser = await getattr(self.driver, self.driver_type).launch(**browser_args)
            self.context = await self.browser.new_context(**context_args)
            self.page = await self.context.new_page()
            # self.timout is already in milliseconds, the unit Playwright expects.
            self.page.set_default_timeout(self.timout)

            for event, callback in (self.on_event or {}).items():
                self.page.on(event, callback)
            if self.on_response:
                self.page.on("response", self.on_response)
        except Exception:
            await self.quit()
            raise

    @staticmethod
    def format_context_proxy(proxy) -> ProxySettings:
        """Split a proxy URL into Playwright ``ProxySettings``.

        Credentials are moved out of the URL into ``username`` / ``password``;
        scheme, host and port stay in ``server``.
        """
        parsed = urlparse(proxy)
        host = parsed.hostname or ''
        if ':' in host:
            # urlparse strips the brackets of an IPv6 literal; restore them.
            host = '[{}]'.format(host)
        netloc = host if parsed.port is None else '{}:{}'.format(host, parsed.port)
        return ProxySettings(
            server=urlunparse(parsed._replace(netloc=netloc)),
            username=parsed.username,
            password=parsed.password,
        )

    async def quit(self):
        """Close page, context and browser, then stop Playwright.

        Objects that ``setup()`` never created are skipped, so this is safe to
        call after a partially failed setup.
        """
        if self.page is not None:
            await self.page.close()
        if self.context is not None:
            await self.context.close()
        if self.browser is not None:
            await self.browser.close()
        if self.driver is not None:
            await self.driver.stop()

    async def get_cookies(self):
        """Return the context's cookies as a ``{name: value}`` dict."""
        return {
            cookie["name"]: cookie["value"]
            for cookie in await self.page.context.cookies()
        }

    async def set_cookies(self, cookies: dict):
        """Add ``cookies`` (``{name: value}``) to the context.

        Cookies are scoped to ``self.url`` when it is set, otherwise to the
        page's current URL.
        """
        await self.page.context.add_cookies([
            {"name": key, "value": value, "url": self.url or self.page.url} for key, value in cookies.items()
        ])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
import re | ||
from typing import Optional | ||
|
||
from playwright.async_api import Response | ||
|
||
from aioscrapy.http.response.text import TextResponse | ||
|
||
|
||
class PlaywrightResponse(TextResponse):
    """Text response produced by the Playwright download handler.

    On top of the regular ``TextResponse`` data it can carry the rendered page
    text, responses captured per URL pattern, the request that triggered a
    captured response, and the driver / driver pool that produced it.
    """

    def __init__(
            self,
            *args,
            text: str = '',
            cache_response: Optional[dict] = None,
            driver: Optional["PlaywrightDriver"] = None,
            driver_pool: Optional["WebDriverPool"] = None,
            intercept_request: Optional[dict] = None,
            **kwargs
    ):
        """
        :param text: page text; when non-empty it takes precedence over the
            text derived by ``TextResponse``
        :param cache_response: captured responses keyed by URL pattern
        :param driver: driver that produced this response
        :param driver_pool: pool that ``driver`` is released to
        :param intercept_request: details of the intercepted request
        """
        # Own attributes are assigned before the base initialiser runs.
        self._text = text
        self.intercept_request = intercept_request
        self.cache_response = cache_response if cache_response else {}
        self.driver_pool = driver_pool
        self.driver = driver
        super().__init__(*args, **kwargs)

    async def release(self):
        """Release the driver to its pool; does nothing if either is missing."""
        if self.driver_pool and self.driver:
            await self.driver_pool.release(self.driver)

    @property
    def text(self):
        """Explicit text if one was supplied, otherwise the base class text."""
        explicit = self._text
        if explicit:
            return explicit
        return super().text

    @text.setter
    def text(self, text):
        self._text = text

    def get_response(self, url_regex):
        """Return the response captured for ``url_regex``, or ``None``."""
        captured = self.cache_response
        return captured.get(url_regex)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.