diff --git a/README.md b/README.md index 792ceee..ff77432 100644 --- a/README.md +++ b/README.md @@ -1,171 +1,171 @@ - -### aio-scrapy - -An asyncio + aiolibs crawler imitate scrapy framework - -English | [中文](./doc/README_ZH.md) - -### Overview -- aio-scrapy framework is base on opensource project Scrapy & scrapy_redis. -- aio-scrapy implements compatibility with scrapyd. -- aio-scrapy implements redis queue and rabbitmq queue. -- aio-scrapy is a fast high-level web crawling and web scraping framework, used to crawl websites and extract structured data from their pages. -- Distributed crawling/scraping. -### Requirements - -- Python 3.9+ -- Works on Linux, Windows, macOS, BSD - -### Install - -The quick way: - -```shell -# Install the latest aio-scrapy -pip install git+https://github.com/conlin-huang/aio-scrapy - -# default -pip install aio-scrapy - -# Install all dependencies -pip install aio-scrapy[all] - -# When you need to use mysql/httpx/rabbitmq/mongo -pip install aio-scrapy[aiomysql,httpx,aio-pika,mongo] -``` - -### Usage - -#### create project spider: - -```shell -aioscrapy startproject project_quotes -``` - -``` -cd project_quotes -aioscrapy genspider quotes -``` - -quotes.py - -```python -from aioscrapy.spiders import Spider - - -class QuotesMemorySpider(Spider): - name = 'QuotesMemorySpider' - - start_urls = ['https://quotes.toscrape.com'] - - async def parse(self, response): - for quote in response.css('div.quote'): - yield { - 'author': quote.xpath('span/small/text()').get(), - 'text': quote.css('span.text::text').get(), - } - - next_page = response.css('li.next a::attr("href")').get() - if next_page is not None: - yield response.follow(next_page, self.parse) - - -if __name__ == '__main__': - QuotesMemorySpider.start() - -``` - -run the spider: - -```shell -aioscrapy crawl quotes -``` - -#### create single script spider: - -```shell -aioscrapy genspider single_quotes -t single -``` - -single_quotes.py: - -```python -from aioscrapy.spiders import 
Spider - - -class QuotesMemorySpider(Spider): - name = 'QuotesMemorySpider' - custom_settings = { - "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36", - 'CLOSE_SPIDER_ON_IDLE': True, - # 'DOWNLOAD_DELAY': 3, - # 'RANDOMIZE_DOWNLOAD_DELAY': True, - # 'CONCURRENT_REQUESTS': 1, - # 'LOG_LEVEL': 'INFO' - } - - start_urls = ['https://quotes.toscrape.com'] - - @staticmethod - async def process_request(request, spider): - """ request middleware """ - return request - - @staticmethod - async def process_response(request, response, spider): - """ response middleware """ - return response - - @staticmethod - async def process_exception(request, exception, spider): - """ exception middleware """ - pass - - async def parse(self, response): - for quote in response.css('div.quote'): - yield { - 'author': quote.xpath('span/small/text()').get(), - 'text': quote.css('span.text::text').get(), - } - - next_page = response.css('li.next a::attr("href")').get() - if next_page is not None: - yield response.follow(next_page, self.parse) - - async def process_item(self, item): - print(item) - - -if __name__ == '__main__': - QuotesMemorySpider.start() - -``` - -run the spider: - -```shell -aioscrapy runspider quotes.py -``` - - -### more commands: - -```shell -aioscrapy -h -``` - -### Documentation -[doc](./doc/documentation.md) - -### Ready - -please submit your sugguestion to owner by issue - -## Thanks - -[aiohttp](https://github.com/aio-libs/aiohttp/) - -[scrapy](https://github.com/scrapy/scrapy) - + +### aio-scrapy + +An asyncio + aiolibs crawler that imitates the scrapy framework + +English | [中文](./doc/README_ZH.md) + +### Overview +- aio-scrapy framework is based on opensource project Scrapy & scrapy_redis. +- aio-scrapy implements compatibility with scrapyd. +- aio-scrapy implements redis queue and rabbitmq queue. 
+- aio-scrapy is a fast high-level web crawling and web scraping framework, used to crawl websites and extract structured data from their pages. +- Distributed crawling/scraping. +### Requirements + +- Python 3.9+ +- Works on Linux, Windows, macOS, BSD + +### Install + +The quick way: + +```shell +# Install the latest aio-scrapy +pip install git+https://github.com/conlin-huang/aio-scrapy + +# default +pip install aio-scrapy + +# Install all dependencies +pip install aio-scrapy[all] + +# When you need to use mysql/httpx/rabbitmq/mongo +pip install aio-scrapy[aiomysql,httpx,aio-pika,mongo] +``` + +### Usage + +#### create project spider: + +```shell +aioscrapy startproject project_quotes +``` + +``` +cd project_quotes +aioscrapy genspider quotes +``` + +quotes.py + +```python +from aioscrapy.spiders import Spider + + +class QuotesMemorySpider(Spider): + name = 'QuotesMemorySpider' + + start_urls = ['https://quotes.toscrape.com'] + + async def parse(self, response): + for quote in response.css('div.quote'): + yield { + 'author': quote.xpath('span/small/text()').get(), + 'text': quote.css('span.text::text').get(), + } + + next_page = response.css('li.next a::attr("href")').get() + if next_page is not None: + yield response.follow(next_page, self.parse) + + +if __name__ == '__main__': + QuotesMemorySpider.start() + +``` + +run the spider: + +```shell +aioscrapy crawl quotes +``` + +#### create single script spider: + +```shell +aioscrapy genspider single_quotes -t single +``` + +single_quotes.py: + +```python +from aioscrapy.spiders import Spider + + +class QuotesMemorySpider(Spider): + name = 'QuotesMemorySpider' + custom_settings = { + "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36", + 'CLOSE_SPIDER_ON_IDLE': True, + # 'DOWNLOAD_DELAY': 3, + # 'RANDOMIZE_DOWNLOAD_DELAY': True, + # 'CONCURRENT_REQUESTS': 1, + # 'LOG_LEVEL': 'INFO' + } + + start_urls = ['https://quotes.toscrape.com'] + + 
@staticmethod + async def process_request(request, spider): + """ request middleware """ + pass + + @staticmethod + async def process_response(request, response, spider): + """ response middleware """ + return response + + @staticmethod + async def process_exception(request, exception, spider): + """ exception middleware """ + pass + + async def parse(self, response): + for quote in response.css('div.quote'): + yield { + 'author': quote.xpath('span/small/text()').get(), + 'text': quote.css('span.text::text').get(), + } + + next_page = response.css('li.next a::attr("href")').get() + if next_page is not None: + yield response.follow(next_page, self.parse) + + async def process_item(self, item): + print(item) + + +if __name__ == '__main__': + QuotesMemorySpider.start() + +``` + +run the spider: + +```shell +aioscrapy runspider quotes.py +``` + + +### more commands: + +```shell +aioscrapy -h +``` + +### Documentation +[doc](./doc/documentation.md) + +### Ready + +please submit your suggestion to owner by issue + +## Thanks + +[aiohttp](https://github.com/aio-libs/aiohttp/) + +[scrapy](https://github.com/scrapy/scrapy) + diff --git a/aioscrapy/VERSION b/aioscrapy/VERSION index 05060b8..a96f385 100644 --- a/aioscrapy/VERSION +++ b/aioscrapy/VERSION @@ -1 +1 @@ -1.2.15 \ No newline at end of file +1.2.16 \ No newline at end of file diff --git a/aioscrapy/templates/spiders/single.tmpl b/aioscrapy/templates/spiders/single.tmpl index aef46a7..10feabc 100644 --- a/aioscrapy/templates/spiders/single.tmpl +++ b/aioscrapy/templates/spiders/single.tmpl @@ -1,38 +1,38 @@ -from aioscrapy import Spider - - -class $classname(Spider): - name = '$name' - custom_settings = { - "CLOSE_SPIDER_ON_IDLE": True - } - start_urls = [] - - @staticmethod - async def process_request(request, spider): - """ request middleware """ - return request - - @staticmethod - async def process_response(request, response, spider): - """ response middleware """ - return response - - @staticmethod - async def 
process_exception(request, exception, spider): - """ exception middleware """ - pass - - async def parse(self, response): - item = { - 'title': '\n'.join(response.xpath('//title/text()').extract()), - } - yield item - - async def process_item(self, item): - print(item) - - -if __name__ == '__main__': - dome = $classname() - dome.start() +from aioscrapy import Spider + + +class $classname(Spider): + name = '$name' + custom_settings = { + "CLOSE_SPIDER_ON_IDLE": True + } + start_urls = [] + + @staticmethod + async def process_request(request, spider): + """ request middleware """ + pass + + @staticmethod + async def process_response(request, response, spider): + """ response middleware """ + return response + + @staticmethod + async def process_exception(request, exception, spider): + """ exception middleware """ + pass + + async def parse(self, response): + item = { + 'title': '\n'.join(response.xpath('//title/text()').extract()), + } + yield item + + async def process_item(self, item): + print(item) + + +if __name__ == '__main__': + dome = $classname() + dome.start() diff --git a/doc/README_ZH.md b/doc/README_ZH.md index c323f15..5cc1314 100644 --- a/doc/README_ZH.md +++ b/doc/README_ZH.md @@ -1,178 +1,178 @@ - - -### aio-scrapy - -基于asyncio及aio全家桶, 使用scrapy框架流程及标准的一个异步爬虫框架 - -[English](../README.md) | 中文 - -### 概述 - -- aio-scrapy框架基于开源项目Scrapy & scrapy_redis,可以理解为scrapy-redis的asyncio版本。 -- aio-scrapy实现了对scrapyd的支持。 -- aio-scrapy实现了redis队列和rabbitmq队列。 -- aio-scrapy是一个快速的高级web爬行和web抓取框架,用于抓取网站并从其页面提取结构化数据。 -- 分布式爬虫。 -### 需求 - -- Python 3.7+ -- Works on Linux, Windows, macOS, BSD - -### 安装 - -快速安装方式: - -```shell -# 安装最新的代码 -pip install git+https://github.com/conlin-huang/aio-scrapy - -# default -pip install aio-scrapy - -# 安装所有的依赖 -pip install aio-scrapy[all] - -# 需要使用到mysql/httpx/rabbitmq/mongo相关功能 -pip install aio-scrapy[aiomysql,httpx,aio-pika,mongo] -``` - -### 用法 - -#### 创建项目爬虫: - -```shell -aioscrapy startproject project_quotes -``` - -``` -cd project_quotes 
-aioscrapy genspider quotes -``` - -quotes.py - -```python -from aioscrapy.spiders import Spider - - -class QuotesMemorySpider(Spider): - name = 'QuotesMemorySpider' - - start_urls = ['https://quotes.toscrape.com'] - - async def parse(self, response): - for quote in response.css('div.quote'): - yield { - 'author': quote.xpath('span/small/text()').get(), - 'text': quote.css('span.text::text').get(), - } - - next_page = response.css('li.next a::attr("href")').get() - if next_page is not None: - yield response.follow(next_page, self.parse) - - -if __name__ == '__main__': - QuotesMemorySpider.start() - -``` - -运行爬虫: - -```shell -aioscrapy crawl quotes -``` - -#### 创建单个爬虫脚本: - -```shell -aioscrapy singlespider single_quotes -``` - -single_quotes.py: - -```python -from aioscrapy.spiders import Spider - - -class QuotesMemorySpider(Spider): - name = 'QuotesMemorySpider' - custom_settings = { - "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36", - 'CLOSE_SPIDER_ON_IDLE': True, - # 'DOWNLOAD_DELAY': 3, - # 'RANDOMIZE_DOWNLOAD_DELAY': True, - # 'CONCURRENT_REQUESTS': 1, - # 'LOG_LEVEL': 'INFO' - } - - start_urls = ['https://quotes.toscrape.com'] - - @staticmethod - async def process_request(request, spider): - """ request middleware """ - return request - - @staticmethod - async def process_response(request, response, spider): - """ response middleware """ - return response - - @staticmethod - async def process_exception(request, exception, spider): - """ exception middleware """ - pass - - async def parse(self, response): - for quote in response.css('div.quote'): - yield { - 'author': quote.xpath('span/small/text()').get(), - 'text': quote.css('span.text::text').get(), - } - - next_page = response.css('li.next a::attr("href")').get() - if next_page is not None: - yield response.follow(next_page, self.parse) - - async def process_item(self, item): - print(item) - - -if __name__ == '__main__': - 
QuotesMemorySpider.start() - -``` - -运行爬虫: - -```shell -aioscrapy runspider quotes.py -``` - - -### 更多命令: - -```shell -aioscrapy -h -``` -### 使用文档 - -[文档](./documentation_zh.md) - -### 准备 - -请向我通过issue的方式提出您的建议 - -### 联系 - -QQ: 995018884 -WeChat: h995018884 - -## 感谢 - -[aiohttp](https://github.com/aio-libs/aiohttp/) - -[scrapy](https://github.com/scrapy/scrapy) - + + +### aio-scrapy + +基于asyncio及aio全家桶, 使用scrapy框架流程及标准的一个异步爬虫框架 + +[English](../README.md) | 中文 + +### 概述 + +- aio-scrapy框架基于开源项目Scrapy & scrapy_redis,可以理解为scrapy-redis的asyncio版本。 +- aio-scrapy实现了对scrapyd的支持。 +- aio-scrapy实现了redis队列和rabbitmq队列。 +- aio-scrapy是一个快速的高级web爬行和web抓取框架,用于抓取网站并从其页面提取结构化数据。 +- 分布式爬虫。 +### 需求 + +- Python 3.7+ +- Works on Linux, Windows, macOS, BSD + +### 安装 + +快速安装方式: + +```shell +# 安装最新的代码 +pip install git+https://github.com/conlin-huang/aio-scrapy + +# default +pip install aio-scrapy + +# 安装所有的依赖 +pip install aio-scrapy[all] + +# 需要使用到mysql/httpx/rabbitmq/mongo相关功能 +pip install aio-scrapy[aiomysql,httpx,aio-pika,mongo] +``` + +### 用法 + +#### 创建项目爬虫: + +```shell +aioscrapy startproject project_quotes +``` + +``` +cd project_quotes +aioscrapy genspider quotes +``` + +quotes.py + +```python +from aioscrapy.spiders import Spider + + +class QuotesMemorySpider(Spider): + name = 'QuotesMemorySpider' + + start_urls = ['https://quotes.toscrape.com'] + + async def parse(self, response): + for quote in response.css('div.quote'): + yield { + 'author': quote.xpath('span/small/text()').get(), + 'text': quote.css('span.text::text').get(), + } + + next_page = response.css('li.next a::attr("href")').get() + if next_page is not None: + yield response.follow(next_page, self.parse) + + +if __name__ == '__main__': + QuotesMemorySpider.start() + +``` + +运行爬虫: + +```shell +aioscrapy crawl quotes +``` + +#### 创建单个爬虫脚本: + +```shell +aioscrapy singlespider single_quotes +``` + +single_quotes.py: + +```python +from aioscrapy.spiders import Spider + + +class QuotesMemorySpider(Spider): + name = 
'QuotesMemorySpider' + custom_settings = { + "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36", + 'CLOSE_SPIDER_ON_IDLE': True, + # 'DOWNLOAD_DELAY': 3, + # 'RANDOMIZE_DOWNLOAD_DELAY': True, + # 'CONCURRENT_REQUESTS': 1, + # 'LOG_LEVEL': 'INFO' + } + + start_urls = ['https://quotes.toscrape.com'] + + @staticmethod + async def process_request(request, spider): + """ request middleware """ + pass + + @staticmethod + async def process_response(request, response, spider): + """ response middleware """ + return response + + @staticmethod + async def process_exception(request, exception, spider): + """ exception middleware """ + pass + + async def parse(self, response): + for quote in response.css('div.quote'): + yield { + 'author': quote.xpath('span/small/text()').get(), + 'text': quote.css('span.text::text').get(), + } + + next_page = response.css('li.next a::attr("href")').get() + if next_page is not None: + yield response.follow(next_page, self.parse) + + async def process_item(self, item): + print(item) + + +if __name__ == '__main__': + QuotesMemorySpider.start() + +``` + +运行爬虫: + +```shell +aioscrapy runspider quotes.py +``` + + +### 更多命令: + +```shell +aioscrapy -h +``` +### 使用文档 + +[文档](./documentation_zh.md) + +### 准备 + +请向我通过issue的方式提出您的建议 + +### 联系 + +QQ: 995018884 +WeChat: h995018884 + +## 感谢 + +[aiohttp](https://github.com/aio-libs/aiohttp/) + +[scrapy](https://github.com/scrapy/scrapy) +