-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
conlin-huang
committed
Mar 28, 2023
1 parent
ce56a4f
commit e7f7c1a
Showing
4 changed files
with
116 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
1.2.7 | ||
1.2.8 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
import asyncio | ||
import logging | ||
from urllib.parse import urlparse | ||
|
||
import pyhttpx | ||
|
||
from aioscrapy import Request | ||
from aioscrapy.core.downloader.handlers import BaseDownloadHandler | ||
from aioscrapy.http import HtmlResponse | ||
from aioscrapy.settings import Settings | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class PyhttpxHandler(BaseDownloadHandler):
    """Download handler that sends requests through ``pyhttpx``.

    A fresh ``pyhttpx.HttpSession`` is created for every request from the
    ``PYHTTPX_CLIENT_ARGS`` setting, and the blocking ``session.request`` call
    is pushed to a worker thread with ``asyncio.to_thread``.
    """

    def __init__(self, settings):
        """Cache the settings values the handler reads on every request.

        Calls ``asyncio.get_running_loop()``, so it has to be constructed
        while an event loop is running.
        """
        self.settings: Settings = settings
        self.pyhttpx_client_args: dict = self.settings.get('PYHTTPX_CLIENT_ARGS', {})
        self.verify_ssl: bool = self.settings.get("VERIFY_SSL")
        # Not used by this class itself; kept so the attribute stays available.
        self.loop = asyncio.get_running_loop()

    @classmethod
    def from_settings(cls, settings: Settings):
        """Alternate constructor: build the handler from *settings*."""
        return cls(settings)

    async def download_request(self, request: Request, _) -> HtmlResponse:
        """Perform *request* with pyhttpx and wrap the result in an ``HtmlResponse``.

        The second positional argument is accepted for interface compatibility
        and ignored.
        """
        kwargs = {
            'timeout': self.settings.get('DOWNLOAD_TIMEOUT'),
            'cookies': dict(request.cookies),
            'data': request.body or None,
            'verify': self.verify_ssl,
            # Fall back to the project-wide defaults when the request has no headers.
            'headers': request.headers or self.settings.get('DEFAULT_REQUEST_HEADERS'),
        }

        proxy = request.meta.get("proxy")
        if proxy:
            parsed_url = urlparse(proxy)
            # Strip any "user:pass@" prefix; credentials go through proxy_auth.
            # NOTE(review): only the 'https' key is populated - confirm pyhttpx
            # also applies it to plain-http targets.
            kwargs["proxies"] = {'https': parsed_url.netloc.split('@')[-1]}
            if parsed_url.password or parsed_url.username:
                kwargs['proxy_auth'] = (parsed_url.username, parsed_url.password)
            logger.debug(f"use proxy {proxy}: {request.url}")

        # Copy so per-request session construction never mutates the shared dict.
        session_args = self.pyhttpx_client_args.copy()
        # NOTE(review): the session is never explicitly closed - confirm
        # whether pyhttpx.HttpSession needs a close() call here.
        session = pyhttpx.HttpSession(**session_args)
        response = await asyncio.to_thread(session.request, request.method, request.url, **kwargs)
        return HtmlResponse(
            # Previously an empty string, which left the response without a URL.
            request.url,
            status=response.status_code,
            headers=response.headers,
            body=response.content,
            cookies=dict(response.cookies),
            encoding=response.encoding,
        )

    async def close(self):
        """Nothing to release: sessions are created per request."""
        pass
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
import logging | ||
|
||
from aioscrapy import Request | ||
from aioscrapy.spiders import Spider | ||
from aioscrapy.http import Response | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class DemoPyhttpxSpider(Spider):
    """Demo spider that routes http/https downloads through ``PyhttpxHandler``.

    It requests the URL in ``start_urls`` and prints the response text.
    """

    name = 'DemoPyhttpxSpider'

    custom_settings = {
        'USER_AGENT': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
        # 'DOWNLOAD_DELAY': 3,
        # 'RANDOMIZE_DOWNLOAD_DELAY': True,
        'CONCURRENT_REQUESTS': 1,
        'LOG_LEVEL': 'INFO',
        'CLOSE_SPIDER_ON_IDLE': True,
        # Use the pyhttpx-backed handler for both schemes.
        'DOWNLOAD_HANDLERS': {
            'http': 'aioscrapy.core.downloader.handlers.pyhttpx.PyhttpxHandler',
            'https': 'aioscrapy.core.downloader.handlers.pyhttpx.PyhttpxHandler',
        },
        # Keyword arguments the handler passes to pyhttpx.HttpSession.
        'PYHTTPX_CLIENT_ARGS': {
            'browser_type': 'chrome',
            'http2': True,
        },
    }

    start_urls = ['https://tls.peet.ws/api/all']

    @staticmethod
    async def process_request(request, spider):
        """Request middleware hook; does nothing."""
        return None

    @staticmethod
    async def process_response(request, response, spider):
        """Response middleware hook; passes the response through unchanged."""
        return response

    @staticmethod
    async def process_exception(request, exception, spider):
        """Exception middleware hook; does nothing."""
        return None

    async def parse(self, response: Response):
        """Print the text of the downloaded response."""
        body_text = response.text
        print(body_text)

    async def process_item(self, item):
        """Print the item handed to the spider."""
        print(item)
|
||
|
||
# Script entry point: launch the demo spider when this file is run directly.
if __name__ == "__main__":
    DemoPyhttpxSpider.start()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters