crawler.py
# -*- coding: utf-8 -*-
import gevent
from gevent import monkey
# Patch the standard library (sockets in particular) before requests and
# pymysql are imported, so their network I/O cooperates with gevent.
monkey.patch_all()

import json
from datetime import datetime

import requests
import pymysql

import config
class Crawler(object):

    def run(self, url):
        print('crawl', url)
        self.parse_page(url)

    def down(self, url):
        """Fetch a URL and return the response body, or None on failure."""
        try:
            return requests.get(url=url, headers=config.HEADERS).text
        except Exception as e:
            print('download error >>>', e)
            return None

    def parse_page(self, url):
        content = self.down(url)
        if content is None:
            return
        js = json.loads(content)
        datas = []
        for c in js['comments']:
            data = {}
            try:
                data['commentId'] = c['commentId']
                # Clean the comment text with the regex defined in config.
                data['content'] = config.PATTERN.sub('', c['content'])
                data['likedCount'] = int(c['likedCount'])
                # The API reports timestamps in milliseconds.
                data['time'] = datetime.fromtimestamp(c['time'] // 1000)
                data['userId'] = c['user']['userId']
                datas.append(data)
            except Exception as e:
                print('error while parsing JSON >>>', e)
        self.save(datas)
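
    # parse_page above assumes a JSON payload roughly shaped like the sketch
    # below (only the fields actually read by the code; everything else in
    # the real response is omitted, and the values are placeholders):
    #
    #   {
    #       "comments": [
    #           {
    #               "commentId": 123456,
    #               "content": "...",
    #               "likedCount": 0,
    #               "time": 1500000000000,       # milliseconds
    #               "user": {"userId": 654321}
    #           }
    #       ],
    #       "total": 10000                       # used by getTotal() below
    #   }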
    def save(self, datas):
        # The charset must be utf8mb4 so that emoji in comments can be stored.
        conn = pymysql.connect(host=config.HOST, user=config.USER, passwd=config.PASSWD,
                               db=config.DATABASE, charset='utf8mb4')
        cursor = conn.cursor()
        sql = ('insert into ' + config.TABLE_COMMENTS +
               ' (commentId,content,likedCount,time,userId,songId,songName)'
               ' VALUES (%s,%s,%s,%s,%s,%s,%s)')
        for data in datas:
            try:
                cursor.execute(sql, (data['commentId'], data['content'], data['likedCount'],
                                     data['time'], data['userId'], config.SONGID, config.SONGNAME))
                conn.commit()
            except Exception as e:
                print('error while saving >>>', e)
        cursor.close()
        conn.close()
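
    # The INSERT above assumes a pre-existing comments table (its name comes
    # from config.TABLE_COMMENTS). A rough sketch of the schema it implies,
    # with column names taken from the query and types guessed, not taken
    # from the source:
    #
    #   CREATE TABLE comments (
    #       commentId  BIGINT PRIMARY KEY,
    #       content    TEXT,
    #       likedCount INT,
    #       time       DATETIME,
    #       userId     BIGINT,
    #       songId     VARCHAR(32),
    #       songName   VARCHAR(255)
    #   ) DEFAULT CHARSET=utf8mb4;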
    def main(self, pages):
        # Build one paged URL per LIMIT_NUM comments and crawl them concurrently.
        url_list = [config.ROOT_URL % (num * config.LIMIT_NUM)
                    for num in range(0, pages // config.LIMIT_NUM + 1)]
        job_list = [gevent.spawn(self.run, url) for url in url_list]
        gevent.joinall(job_list)


def getTotal():
    """Return the total number of comments, or None if the request fails."""
    try:
        req = requests.get(config.ROOT_URL % (0), headers=config.HEADERS).text
        js = json.loads(req)
        return js['total']
    except Exception as e:
        print(e)
        return None


if __name__ == "__main__":
    total = getTotal()
    if total is None:
        print('could not fetch the total comment count, aborting')
    else:
        spider = Crawler()
        spider.main(total)
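

# ---------------------------------------------------------------------------
# This script imports a separate `config` module that is not shown here. A
# minimal sketch of what it is assumed to provide, based only on the names
# used above; every value below is an illustrative placeholder, not taken
# from the source:
#
#   import re
#
#   ROOT_URL = 'https://example.com/api/comments?offset=%d'  # one placeholder: paging offset
#   LIMIT_NUM = 20                      # comments returned per page
#   HEADERS = {'User-Agent': 'Mozilla/5.0'}
#   PATTERN = re.compile(r'\s+')        # regex used to clean comment text
#
#   HOST = 'localhost'                  # MySQL connection settings
#   USER = 'root'
#   PASSWD = 'password'
#   DATABASE = 'music'
#   TABLE_COMMENTS = 'comments'
#
#   SONGID = '12345'                    # song whose comments are crawled
#   SONGNAME = 'example song'
# ---------------------------------------------------------------------------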