diff --git a/aioscrapy/VERSION b/aioscrapy/VERSION
index a77d7d9..5975b14 100644
--- a/aioscrapy/VERSION
+++ b/aioscrapy/VERSION
@@ -1 +1 @@
-1.2.7
\ No newline at end of file
+1.2.8
\ No newline at end of file
diff --git a/aioscrapy/core/downloader/handlers/pyhttpx.py b/aioscrapy/core/downloader/handlers/pyhttpx.py
new file mode 100644
index 0000000..99205ab
--- /dev/null
+++ b/aioscrapy/core/downloader/handlers/pyhttpx.py
@@ -0,0 +1,72 @@
+import asyncio
+import logging
+from urllib.parse import urlparse
+
+import pyhttpx
+
+from aioscrapy import Request
+from aioscrapy.core.downloader.handlers import BaseDownloadHandler
+from aioscrapy.http import HtmlResponse
+from aioscrapy.settings import Settings
+
+logger = logging.getLogger(__name__)
+
+
+class PyhttpxHandler(BaseDownloadHandler):
+    """Download handler that performs each request with a ``pyhttpx`` session."""
+
+    def __init__(self, settings):
+        self.settings: Settings = settings
+        # Keyword arguments forwarded verbatim to ``pyhttpx.HttpSession``.
+        self.pyhttpx_client_args: dict = self.settings.get('PYHTTPX_CLIENT_ARGS', {})
+        self.verify_ssl: bool = self.settings.get("VERIFY_SSL")
+        # NOTE: get_running_loop() raises RuntimeError when no loop is running.
+        self.loop = asyncio.get_running_loop()
+
+    @classmethod
+    def from_settings(cls, settings: Settings):
+        """Build the handler from the given settings object."""
+        return cls(settings)
+
+    async def download_request(self, request: Request, _) -> HtmlResponse:
+        """Send ``request`` in a worker thread and wrap the reply in an HtmlResponse."""
+        kwargs = {
+            'timeout': self.settings.get('DOWNLOAD_TIMEOUT'),
+            'cookies': dict(request.cookies),
+            'data': request.body or None,
+            'verify': self.verify_ssl,
+            'headers': request.headers or self.settings.get('DEFAULT_REQUEST_HEADERS'),
+        }
+
+        proxy = request.meta.get("proxy")
+        if proxy:
+            parsed_url = urlparse(proxy)
+            # Strip any credentials from the address; they go in ``proxy_auth``.
+            kwargs["proxies"] = {'https': parsed_url.netloc.split('@')[-1]}
+            if parsed_url.password or parsed_url.username:
+                kwargs['proxy_auth'] = (parsed_url.username, parsed_url.password)
+            logger.debug(f"use proxy {proxy}: {request.url}")
+
+        session_args = self.pyhttpx_client_args.copy()
+        session = pyhttpx.HttpSession(**session_args)
+        try:
+            # session.request is a synchronous call, so run it in a worker thread.
+            response = await asyncio.to_thread(session.request, request.method, request.url, **kwargs)
+            return HtmlResponse(
+                # Pass the requested URL so the response keeps its origin (was '').
+                request.url,
+                status=response.status_code,
+                headers=response.headers,
+                body=response.content,
+                cookies=dict(response.cookies),
+                encoding=response.encoding
+            )
+        finally:
+            # A session is created per request; release it when it offers close().
+            close_session = getattr(session, 'close', None)
+            if callable(close_session):
+                close_session()
+
+    async def close(self):
+        """Nothing to release here: sessions are created and closed per request."""
+        pass
diff --git a/example/singlespider/demo_pyhttpx.py b/example/singlespider/demo_pyhttpx.py
new file mode 100644
index 0000000..fadd4c6
--- /dev/null
+++ b/example/singlespider/demo_pyhttpx.py
@@ -0,0 +1,58 @@
+import logging
+
+from aioscrapy import Request
+from aioscrapy.spiders import Spider
+from aioscrapy.http import Response
+
+logger = logging.getLogger(__name__)
+
+
+class DemoPyhttpxSpider(Spider):
+    """Example spider that routes http/https downloads through PyhttpxHandler."""
+    name = 'DemoPyhttpxSpider'
+
+    custom_settings = dict(
+        USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
+        # DOWNLOAD_DELAY=3,
+        # RANDOMIZE_DOWNLOAD_DELAY=True,
+        CONCURRENT_REQUESTS=1,
+        LOG_LEVEL='INFO',
+        CLOSE_SPIDER_ON_IDLE=True,
+        DOWNLOAD_HANDLERS={
+            'http': 'aioscrapy.core.downloader.handlers.pyhttpx.PyhttpxHandler',
+            'https': 'aioscrapy.core.downloader.handlers.pyhttpx.PyhttpxHandler',
+        },
+        PYHTTPX_CLIENT_ARGS=dict(
+            browser_type='chrome',
+            http2=True
+        )
+    )
+
+    start_urls = ['https://tls.peet.ws/api/all']
+
+    @staticmethod
+    async def process_request(request, spider):
+        """ request middleware """
+        pass
+
+    @staticmethod
+    async def process_response(request, response, spider):
+        """ response middleware """
+        return response
+
+    @staticmethod
+    async def process_exception(request, exception, spider):
+        """ exception middleware """
+        pass
+
+    async def parse(self, response: Response):
+        """ print the text of the downloaded response """
+        print(response.text)
+
+    async def process_item(self, item):
+        """ print the received item """
+        print(item)
+
+
+if __name__ == '__main__':
+    DemoPyhttpxSpider.start()
diff --git a/setup.py b/setup.py
index cd98570..8f7e3d0 100644
--- a/setup.py
+++ b/setup.py
@@ -16,13 +16,14 @@ extras_require = {
     "all": [
         "aiomysql>=0.1.1", "httpx[http2]>=0.23.0", "aio-pika>=8.1.1",
-        "cryptography", "motor>=3.1.1", "playwright>=1.31.1"
+        "cryptography", "motor>=3.1.1", "playwright>=1.31.1", "pyhttpx>=2.10.1"
     ],
     "aiomysql": ["aiomysql>=0.1.1", "cryptography"],
     "httpx": ["httpx[http2]>=0.23.0"],
     "aio-pika": ["aio-pika>=8.1.1"],
     "mongo": ["motor>=3.1.1"],
     "playwright": ["playwright>=1.31.1"],
+    "pyhttpx": ["pyhttpx>=2.10.1"]
 }
 
 setup(