-
Notifications
You must be signed in to change notification settings - Fork 23
/
Copy pathbaike.py
72 lines (61 loc) · 2.41 KB
/
baike.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/usr/bin/env python3
# encoding=utf-8
## 时间:2017-9-1 ##
## 作者:孙士标 ##
## 博客地址:http://bbiao.me
import re
import codecs
import requests
from bs4 import BeautifulSoup
__author__ = 'sunshibiao'
pattern = re.compile('article block untagged mb15 typs_\w+')
url = 'http://www.qiushibaike.com/hot/'
def get_html(url):
''' 获取html '''
html = requests.get(url).content
return html
def get_page_list(html):
''' 构造所有的页面 '''
next_page_url = url
html = get_html(next_page_url)
soup = BeautifulSoup(html, 'lxml')
content = soup.find('ul', attrs={'class': 'pagination'})
links = []
each_pages = []
for i in content.find_all('li'):
if i.find('span', attrs={'class': 'page-numbers'}):
nums = i.find('span', attrs={'class': 'page-numbers'})
num = nums.getText()
links.append(int(num))
for link in range(1, links[-1]+1):
each_page = next_page_url + 'page/' + str(link)
each_pages.append(each_page)
return each_pages
def get_text_or_pic(url):
''' 获取文本内容并写入文件, 如果是图片就保存图片 '''
with codecs.open('duanzi', 'wb', 'utf-8') as fp:
n = 1
for i in url:
html = get_html(i)
soup = BeautifulSoup(html, 'lxml')
content = soup.find('div', attrs={'id': 'content-left', 'class': 'col1'})
text_lists = []
for attr in set(re.findall(pattern, str(soup))):
for text_list in content.find_all('div', attrs={'class': attr}):
if text_list.find('div', {'class': 'thumb'}):
img_link = text_list.find('div', {'class': 'thumb'})
img = img_link.find('img')['src']
rel_img = 'https:' + img
print('正在下载第%s张图片' % n)
filename = ('第%s张.jpg' % n)
with open(filename, 'wb+') as jpg:
jpg.write(requests.get(rel_img).content)
n += 1
else:
text = text_list.find('div', attrs={'class': 'content'}).find('span').getText()
text_lists.append(text)
fp.write('{duanzi}'.format(duanzi='\n'.join(text_lists)))
def main():
get_text_or_pic(get_page_list(get_html(url)))
if __name__ == '__main__':
main()