zhihu.py
# -*- coding: utf-8 -*-
from __future__ import print_function
import datetime
import json
import logging
import logging.config
import time
import requests
from bs4 import BeautifulSoup
from account_pool import AccountPool
from redis_util import RedisUtil
from mongo_util import Mongo
class ZhihuCrawler:
    def __init__(self):
        self.base_url = 'https://www.zhihu.com'
        self.settings = 'https://www.zhihu.com/settings/profile'
        self.headers = {
            "User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:41.0) Gecko/20100101 Firefox/41.0',
            "Referer": 'http://www.zhihu.com/',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Host': 'www.zhihu.com',
        }
        # Starting point of the crawl
        self.start_user = None
        # Redis set key for user IDs crawled successfully
        self.pass_key = 'zhihu:pass'
        # Redis set key for user IDs that failed to be crawled
        self.fail_key = 'zhihu:fail'
        # Redis list key for user IDs waiting to be crawled
        self.queue_key = 'user'
        # Pool of Zhihu accounts
        self.pool = AccountPool()
        # Use a requests session to keep cookies between requests
        self.session = requests.session()
        # MongoDB stores the crawled user profiles
        self.mongo = Mongo(database='zhihu')
        # Redis stores the crawl state (queue and pass/fail sets)
        self.redis = RedisUtil(host='localhost', port=6379, namespace='zhihu')
        # Logger configuration (a sample config is sketched below)
        logging.config.fileConfig("./Log/zhihu.conf")
        self.logger = logging.getLogger('zhihu')
        self.use_account()
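    # The logging config file ./Log/zhihu.conf is not part of this snapshot.
    # A minimal fileConfig-style config that would satisfy the
    # logging.config.fileConfig(...) call above and define the 'zhihu' logger
    # might look like this (handler, level and format are assumptions, not the
    # repository's actual settings):
    #
    #   [loggers]
    #   keys=root,zhihu
    #
    #   [handlers]
    #   keys=consoleHandler
    #
    #   [formatters]
    #   keys=simpleFormatter
    #
    #   [logger_root]
    #   level=INFO
    #   handlers=consoleHandler
    #
    #   [logger_zhihu]
    #   level=INFO
    #   handlers=consoleHandler
    #   qualname=zhihu
    #   propagate=0
    #
    #   [handler_consoleHandler]
    #   class=StreamHandler
    #   level=INFO
    #   formatter=simpleFormatter
    #   args=(sys.stdout,)
    #
    #   [formatter_simpleFormatter]
    #   format=%(asctime)s %(levelname)s %(name)s %(message)s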
    def use_account(self):
        '''Switch to another account from the pool.'''
        cookie = self.pool.get()
        if cookie is None:
            self.logger.error('NO ACCOUNT')
            return False
        self.session.cookies.update(cookie)
        return self.is_login()
    def is_login(self):
        '''Check whether the current session is logged in.'''
        login_code = self.session.get(self.settings, headers=self.headers, allow_redirects=False).status_code
        return login_code == 200
    def get_user_basic(self, username):
        '''Fetch a user's basic profile, including the list of users they follow.'''
        home_url = self.base_url + '/people/' + username + '/following'
        req = self.session.get(url=home_url, headers=self.headers, verify=True)
        soup = BeautifulSoup(req.text, 'lxml')
        user_info = dict()
        # The profile page embeds its data as JSON in the data-state attribute
        # of <div id="data"> (see the sketch of its shape after this method)
        data = soup.find('div', id='data')['data-state']
        data = json.loads(data, encoding='utf-8')
        user = data['entities']['users'][username]
        followings = list(data['entities']['users'])
        followings.remove(username)
        img = soup.find('img', class_='Avatar Avatar--large UserAvatar-inner')
        user_info['avatar'] = img['src'] if img is not None else ''
        user_info['name'] = user['name']
        user_info['headline'] = user['headline']
        user_info['gender'] = 'Male' if user['gender'] else 'Female'
        user_info['description'] = user['description']
        user_info['business'] = user['business']['name'] if 'business' in user else ''
        user_info['answerCount'] = int(user['answerCount'])
        user_info['favoriteCount'] = int(user['favoriteCount'])
        user_info['thankedCount'] = int(user['thankedCount'])
        user_info['followerCount'] = int(user['followerCount'])
        user_info['followingCount'] = int(user['followingCount'])
        user_info['educations'] = list()
        user_info['employments'] = list()
        user_info['locations'] = list()
        for edu in user['educations']:
            info = dict()
            info['school'] = edu['school']['name'] if 'school' in edu else ''
            info['major'] = edu['major']['name'] if 'major' in edu else ''
            user_info['educations'].append(info)
        for loc in user['locations']:
            info = dict()
            info['name'] = loc['name']
            user_info['locations'].append(info)
        for em in user['employments']:
            info = dict()
            # Check for the 'company' key (not 'name') before reading the company name
            info['name'] = em['company']['name'] if 'company' in em else ''
            info['job'] = em['job']['name'] if 'job' in em else ''
            user_info['employments'].append(info)
        user_info['create_time'] = datetime.datetime.now()
        user_info['following'] = followings
        return user_info, followings
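    # For reference, the data-state JSON consumed above is expected to look
    # roughly like the sketch below. Only the fields actually read by
    # get_user_basic are shown; this shape is inferred from the parsing code,
    # not from Zhihu's documentation, and the real payload has many more keys.
    #
    #   {
    #     "entities": {
    #       "users": {
    #         "<url_token>": {
    #           "name": "...", "headline": "...", "gender": 1,
    #           "description": "...",
    #           "business": {"name": "..."},
    #           "answerCount": 0, "favoriteCount": 0, "thankedCount": 0,
    #           "followerCount": 0, "followingCount": 0,
    #           "educations": [{"school": {"name": "..."}, "major": {"name": "..."}}],
    #           "locations": [{"name": "..."}],
    #           "employments": [{"company": {"name": "..."}, "job": {"name": "..."}}]
    #         },
    #         "<followed_user_token>": {...}
    #       }
    #     }
    #   }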
    def following_crawler(self, depth, max_depth=5):
        '''BFS crawl of users along the following chain.

        depth: current depth
        max_depth: maximum depth
        '''
        if depth > max_depth:
            return
        # Depth markers pushed into the queue to separate BFS levels; two extra
        # markers are kept so depths[index + 1] stays in range at the last level
        depths = ['#{}'.format(i) for i in range(max_depth + 2)]
        index = 0
        s_cnt = self.redis.ssize(self.pass_key)
        f_cnt = self.redis.ssize(self.fail_key)
        if self.redis.empty(self.queue_key):
            self.start_user = raw_input('Which user should the crawl start from? ').strip()
            self.redis.put(self.queue_key, self.start_user)
            self.redis.put(self.queue_key, '#0')
        while index <= max_depth:
            while not self.redis.empty(self.queue_key):
                username = self.redis.get(self.queue_key)
                try:
                    # A depth marker ends the current BFS level
                    index = depths.index(username)
                    break
                except Exception:
                    pass
                if self.redis.sismem(self.pass_key, username) or self.redis.sismem(self.fail_key, username):
                    continue
                self.logger.info('[{}]'.format(username))
                try:
                    basic, followings = self.get_user_basic(username)
                    self.redis.sadd_items(self.pass_key, username)
                    self.redis.put(self.queue_key, *tuple(followings))
                    self.mongo.save_user(basic)
                    s_cnt += 1
                except Exception as e:
                    self.logger.info(str(e))
                    self.logger.info('--------{}--------failed'.format(username))
                    self.redis.sadd_items(self.fail_key, username)
                    f_cnt += 1
                # Zhihu's anti-crawling is aggressive and only two accounts are
                # available, so slow down periodically and rotate accounts
                if (f_cnt + s_cnt + 1) % 5 == 0:
                    self.logger.info('---------\nsleep at {}\n---------'.format(datetime.datetime.now()))
                    time.sleep(5)
                if (f_cnt + s_cnt + 1) % 50 == 0:
                    self.logger.info('---------\nsleep at {}\n---------'.format(datetime.datetime.now()))
                    time.sleep(15)
                if (f_cnt + s_cnt + 1) % 25 == 0:
                    if not self.use_account():
                        self.logger.error('Account Error')
                        raise Exception('Account Error')
                    else:
                        self.logger.info('--------\nchange account\n--------')
            self.redis.put(self.queue_key, depths[index + 1])
            self.logger.info(
                '---------\nDepth {} crawled.\t Fail/Success: {}/{} got\n----------'.format(index, f_cnt, s_cnt))
            index = index + 1
if __name__ == '__main__':
    zhihu = ZhihuCrawler()
    zhihu.following_crawler(0, max_depth=5)
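# The helper modules account_pool, redis_util and mongo_util are not shown in
# this file. Based solely on how they are called above, they are assumed to
# expose roughly the following interfaces (a sketch, not the actual code):
#
#   class AccountPool:
#       def get(self):                       # return a cookie dict for the next account, or None
#
#   class RedisUtil:
#       def __init__(self, host, port, namespace): ...
#       def put(self, key, *values):         # push values onto the list `key`
#       def get(self, key):                  # pop the next value from the list `key`
#       def empty(self, key):                # True if the list `key` is empty
#       def ssize(self, key):                # cardinality of the set `key`
#       def sadd_items(self, key, *values):  # add values to the set `key`
#       def sismem(self, key, value):        # membership test on the set `key`
#
#   class Mongo:
#       def __init__(self, database): ...
#       def save_user(self, user_info):      # insert the user document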