Skip to content

Commit

Permalink
1.18 新增sakura主题适配,修复文章重复爬取问题
Browse files Browse the repository at this point in the history
  • Loading branch information
noionion committed May 19, 2021
1 parent ceefac2 commit e29889d
Show file tree
Hide file tree
Showing 5 changed files with 126 additions and 25 deletions.
20 changes: 12 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,18 @@
![](https://cdn.nlark.com/yuque/0/2021/png/8391485/1612877553087-3087b091-93ce-40fd-a49f-8baf0f0f49c4.png#align=left&display=inline&height=521&margin=%5Bobject%20Object%5D&name=image.png&originHeight=521&originWidth=386&size=161076&status=done&style=none&width=386)

```
目前 release 1.17 版本:
① 支持butterfly、volantis、matery 主题的友链获取
支持**小康友链及volantis 主题友链,即部署于 gitee 上的 issuse 友链获取**
③ 支持**butterfly、volantis、matery 主题**的最新文章获取
目前 release 1.18 版本:
① 支持butterfly、volantis、matery、sakura主题的友链获取
② 支持小康友链及 volantis 主题友链,即部署于 gitee 上的 issues 友链获取
③ 支持 butterfly、volantis、matery、sakura 主题的最新文章获取
④ 支持大部分拥有 sitemap 网站的文章获取
⑤ 拥有友链屏蔽、关键词屏蔽、等自定义 yaml 的配置项
⑥ 代码重构并规范化,便于二次开发
⑥ 代码重构并规范化,便于二次开发
bug修复
① 重复爬取同一文章问题
```
预览链接:https://zfe.space/friendcircle/
预览链接:https://noionion.top/friendcircle/

教程请查阅:https://zfe.space/post/friend-link-circle.html

Expand Down Expand Up @@ -83,13 +86,14 @@
```PY
# component
from theme import butterfly,matery,volantis
from theme import butterfly,matery,volantis,sakura
# theme fit message
themes = [
butterfly,
matery,
volantis
volantis,
sakura
]
```
Expand Down
8 changes: 4 additions & 4 deletions handlers/coreDatas.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,11 +136,11 @@ def query_leancloud():
query_list = query_leancloud()

# 重复审查
def repeat(name):
def repeat(link):
upload = 'true'
for query_item in query_list:
title = query_item.get('title')
if name == title:
url = query_item.get('link')
if link == url:
upload = 'false'
return upload

Expand All @@ -152,7 +152,7 @@ def repeat(name):
friendpoor.set('link', item['link'])
friendpoor.set('author', item['name'])
friendpoor.set('headimg', item['img'])
upload = repeat(item['title'])
upload = repeat(item['link'])
if upload == 'true':
try:
friendpoor.save()
Expand Down
8 changes: 4 additions & 4 deletions handlers/coreLink.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ def sitmap_get(user_info, post_poor, config=config.yml):
print('-------执行sitemap规则----------')
print('执行链接:', user_info[1])
link = user_info[1]
error_sitmap = 'false'
error_sitmap = False
try:
result = request.get_data(link + '/sitemap.xml')
soup = BeautifulSoup(result, 'html.parser')
Expand All @@ -183,7 +183,7 @@ def takeSecond(elem):

new_link_list.sort(key=takeSecond, reverse=True)
if len(url) == 0:
error_sitmap = 'true'
error_sitmap = True
print('该网站可能没有sitemap')
block_word = config['setting']['block_word']
new_loc = []
Expand Down Expand Up @@ -263,13 +263,13 @@ def takeSecond(elem):
print(e.__traceback__.tb_frame.f_globals["__file__"])
print(e.__traceback__.tb_lineno)
print('网站不包含规范的时间格式!')
error_sitmap = 'true'
error_sitmap = True
except Exception as e:
print('无法请求sitemap')
print(e)
print(e.__traceback__.tb_frame.f_globals["__file__"])
print(e.__traceback__.tb_lineno)
error_sitmap = 'true'
error_sitmap = True
print('-----------结束sitemap规则----------')
print('\n')
return error_sitmap, post_poor
23 changes: 14 additions & 9 deletions run.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import sys

# component
from theme import butterfly,matery,volantis
from theme import butterfly,matery,volantis,sakura

# handlers
from handlers.coreSettings import configs
Expand All @@ -36,7 +36,8 @@
themes = [
butterfly,
matery,
volantis
volantis,
sakura
]

# ---------- #
Expand Down Expand Up @@ -91,22 +92,26 @@ def spider(item):
nonlocal total_count
nonlocal post_poor
nonlocal error_count
error = False
error = True
try:
total_count += 1
for themelinkfun in themes:
if error:
break
error = themelinkfun.get_last_post(item, post_poor)
error, post_poor = sitmap_get(item, post_poor)
if error:
print("-----------获取主页信息失败,采取sitemap策略----------")
error, post_poor = sitmap_get(item, post_poor)
print("-----------获取sitemap信息失败,采取主页爬虫策略----------")
for themelinkfun in themes:
if not error:
break
error = themelinkfun.get_last_post(item, post_poor)

except Exception as e:
print('\n')
print(item, "运用主页及sitemap爬虫爬取失败!请检查")
print('\n')
print(e)
error_count += 1

if error: error = 'true'
else: error = 'false'
item.append(error)
return item

Expand Down
92 changes: 92 additions & 0 deletions theme/sakura.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import datetime
from request_data import request

# sakura friend-link rule
def get_friendlink(friendpage_link, friend_poor):
    """Collect friend entries from a sakura-themed friend-link page.

    The page at ``friendpage_link`` is fetched with ``request.get_data`` and
    every ``<li class="link-item">`` element is inspected.  For each entry a
    ``[name, link, img]`` list is appended to ``friend_poor`` (mutated in
    place); nothing is returned.

    * ``name`` is the text of the entry's first ``<span>``.
    * ``link`` is the ``href`` of the entry's first ``<a>``; entries whose
      link contains ``#`` are ignored.
    * ``img`` is the ``data-src`` attribute of the entry's first ``<img>``,
      or ``None`` when there is no image or no such attribute.

    Entries without a usable link or name are skipped instead of aborting
    the whole scan with an ``AttributeError``/``TypeError``.
    """
    result = request.get_data(friendpage_link)
    soup = BeautifulSoup(result, 'html.parser')
    for item in soup.find_all('li', {"class": "link-item"}):
        anchor = item.find('a')
        name_tag = item.find('span')
        if anchor is None or name_tag is None:
            # Malformed entry: without a link or a name there is nothing to record.
            continue
        link = anchor.get('href')
        if not link or "#" in link:
            # No href at all, or an in-page/placeholder link.
            continue
        img_tag = item.find('img')
        img = img_tag.get('data-src') if img_tag is not None else None
        name = name_tag.text

        print('----------------------')
        try:
            print('好友名%r' % name)
        except Exception:
            # Presumably guards against consoles that cannot encode the name;
            # narrowed from a bare ``except`` so Ctrl-C still interrupts the run.
            print('非法用户名')
        print('头像链接%r' % img)
        print('主页链接%r' % link)
        friend_poor.append([name, link, img])

# Fetch the newest post(s) from a sakura-themed home page
def _parse_post_date(raw_text):
    """Convert the text of a sakura ``post-date`` element to a ``datetime``.

    Removes the decorations this module already stripped ("|", spaces,
    newlines, tabs and the "发布于" prefix) and parses the remainder as
    ``%Y-%m-%d``.  Raises ``ValueError`` when the remainder is not a date in
    that format.
    """
    cleaned = raw_text
    for noise in ("|", " ", "\n", "发布于", "\t"):
        cleaned = cleaned.replace(noise, "")
    return datetime.datetime.strptime(cleaned.strip(), "%Y-%m-%d")


def get_last_post(user_info,post_poor):
    """Append the most recent post(s) of a sakura-themed site to ``post_poor``.

    ``user_info`` is indexed as ``[name, link, img]``; ``user_info[1]`` is the
    home page that gets fetched.  The newest date among all
    ``<div class="post-date">`` elements inside the ``id="main"`` container is
    determined, and every ``<article class="post">`` carrying that date is
    appended to ``post_poor`` as a dict with the keys ``title``, ``time``,
    ``link``, ``name`` and ``img``.

    Returns ``True`` when nothing was collected (the page does not look like
    a sakura home page, or no article matched the newest date) and ``False``
    when at least one post was appended.

    Raises ``ValueError`` if a ``post-date`` element does not contain a
    ``%Y-%m-%d`` date.
    """
    # Local import keeps this block self-contained; urljoin resolves relative,
    # protocol-relative and absolute hrefs against the site's base URL.
    from urllib.parse import urljoin

    error_sitmap = False
    link = user_info[1]
    print('\n')
    print('-------执行sakura主页规则----------')
    print('执行链接:', link)
    result = request.get_data(link)
    soup = BeautifulSoup(result, 'html.parser')
    main_content = soup.find_all(id='main')
    time_excit = soup.find_all('div', {"class": "post-date"})
    if main_content and time_excit:
        # Stays True until an article with the newest date is actually stored.
        error_sitmap = True
        main = main_content[0]

        # Pass 1: find the newest publication date on the page.
        latest = datetime.datetime.strptime('1970-01-01', "%Y-%m-%d")
        for date_div in main.find_all('div', {"class": "post-date"}):
            latest = max(latest, _parse_post_date(date_div.text))
        lasttime = latest.strftime('%Y-%m-%d')
        print('最新时间是', lasttime)

        # A trailing slash makes relative hrefs resolve below the site root.
        base = link if link.endswith('/') else link + '/'

        # Pass 2: collect every article published on that date.
        for item in main.find_all('article', {"class": "post"}):
            date_div = item.find('div', {"class": "post-date"})
            anchor = item.find('a')
            heading = item.find('h3')
            if date_div is None or anchor is None or heading is None:
                # Incomplete article markup: skip it rather than crash.
                continue
            href = anchor.get('href')
            # Compare parsed dates, not strings, so "2021-5-9" matches "2021-05-09".
            if not href or _parse_post_date(date_div.text) != latest:
                continue

            error_sitmap = False
            print(lasttime)
            title = heading.text.strip()
            post_link = urljoin(base, href.strip())
            print(title.encode("gbk", 'ignore').decode('gbk', 'ignore'))
            print(post_link)
            print("-----------获取到匹配结果----------")
            post_poor.append({
                'title': title,
                'time': lasttime,
                'link': post_link,
                'name': user_info[0],
                'img': user_info[2]
            })
    else:
        error_sitmap = True
        print('貌似不是类似sakura主题!')
    print("-----------结束sakura主页规则----------")
    print('\n')
    return error_sitmap

0 comments on commit e29889d

Please sign in to comment.