-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsport_news_crwaling.py
executable file
·115 lines (91 loc) · 3.95 KB
/
sport_news_crwaling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import json
import time
from urllib.parse import urljoin

import redis
import requests
from bs4 import BeautifulSoup
class NewsCrwaling:
    """Scrape ranked sports headlines from Naver and Daum into Redis.

    Every crawl writes one JSON document per ``<site>:<category>`` key. The
    document maps the 1-based rank to ``{"title": ..., "url": ...}``.

    NOTE: ``flat_form_list``, ``naver_sports_crwaling`` and
    ``daum_sports_crwaling`` are properties that work purely through side
    effects and evaluate to ``None``. They stay properties so that existing
    attribute-style callers keep working.
    """

    # Seconds to wait on a portal before giving up on that category; without
    # a timeout a single stalled connection would block the crawler forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, host='ec2-3-34-134-147.ap-northeast-2.compute.amazonaws.com', port=6379,
                 db=1):
        """Open the Redis connection used to publish the rankings.

        Args:
            host: Redis server host name.
            port: Redis server port.
            db: Redis logical database index.
        """
        self.rconn = redis.StrictRedis(host=host, port=port, db=db, decode_responses=True)
        # TTL in seconds applied to every stored ranking (252000 s = 70 h).
        self.redis_data_expire_time = 252000

    @property
    def flat_form_list(self):
        """Add the supported site names to the ``flat_form_list`` Redis set."""
        # One SADD with all members instead of one round trip per member.
        self.rconn.sadd('flat_form_list', 'naver', 'daum')

    @property
    def naver_sports_crwaling(self):
        """Crawl every Naver sports category and store its ranking."""
        sport_news_category_urls = dict(
            baseball='kbaseball/index.nhn',
            wbaseball='wbaseball/index.nhn',
            football='kfootball/index.nhn',
            wfootball='wfootball/index.nhn',
            basketball='basketball/index.nhn',
            volleyball='volleyball/index.nhn',
            golf='golf/index.nhn',
            general='general/index.nhn',
            esports='esports/index.nhn',
        )
        self.crwaling_operator(
            web_site_name='naver',
            base_url="https://sports.news.naver.com/",
            sport_news_category_urls=sport_news_category_urls,
            select_location='#content > div > div.home_feature > div.feature_side > div > ol',
            a_tag_class_name='link_news_end',
        )

    @property
    def daum_sports_crwaling(self):
        """Crawl every Daum sports category and store its ranking."""
        sport_news_category_urls = dict(
            baseball='baseball',
            wbaseball='worldbaseball',
            football='soccer',
            basketball='basketball',
            volleyball='volleyball',
            golf='golf',
            general='general',
            esports='esports',
        )
        self.crwaling_operator(
            web_site_name='daum',
            base_url="https://sports.daum.net/",
            sport_news_category_urls=sport_news_category_urls,
            select_location='#cSub > div > div.top_rank > ol:nth-child(3)',
            a_tag_class_name='link_txt',
        )

    @staticmethod
    def _rank_headlines(headlines, base_url):
        """Build the ``rank -> article`` mapping from ``(title, href)`` pairs.

        Each ``href`` is resolved against ``base_url``: a relative link becomes
        absolute (plain concatenation would yield ``//`` after the host when
        the link is root-relative), and an absolute link is left unchanged.
        A missing or empty ``href`` is stored as-is.

        Args:
            headlines: Iterable of ``(title, href)`` pairs in ranking order.
            base_url: Site root used to resolve relative links.

        Returns:
            Dict keyed by 1-based rank with ``{'title', 'url'}`` values.
        """
        return {
            rank: {'title': title, 'url': urljoin(base_url, href) if href else href}
            for rank, (title, href) in enumerate(headlines, start=1)
        }

    def crwaling_operator(self, web_site_name, base_url, sport_news_category_urls, select_location,
                          a_tag_class_name):
        """Fetch each category page, extract its ranked links and store them.

        A category whose page cannot be downloaded, or whose ranking container
        is not found, is reported and skipped. The remaining categories are
        still refreshed and the skipped category's Redis key is left untouched.

        Args:
            web_site_name: Site label; used as the Redis key prefix.
            base_url: Site root, prepended to each category path.
            sport_news_category_urls: Mapping of category name to page path.
            select_location: CSS selector of the ranking container.
            a_tag_class_name: Class of the ``<a>`` tags holding the headlines.
        """
        for category, sport_news_category_url in sport_news_category_urls.items():
            page_url = base_url + sport_news_category_url
            try:
                response = requests.get(page_url, timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()
            except requests.RequestException as err:
                print(web_site_name, category, 'request failed:', err)
                continue

            soup = BeautifulSoup(response.text, 'html.parser')
            ranking_boxes = soup.select(select_location)
            if not ranking_boxes:
                # The selector matched nothing; indexing [0] here would raise
                # IndexError and abort every category that follows.
                print(web_site_name, category, 'ranking list not found at', page_url)
                continue

            anchors = ranking_boxes[0].find_all('a', class_=a_tag_class_name)
            news_dict = self._rank_headlines(
                ((anchor.text, anchor.get('href')) for anchor in anchors),
                base_url,
            )
            print(web_site_name, category, news_dict)

            payload = json.dumps(news_dict, ensure_ascii=False).encode('utf-8')
            key = web_site_name + ':' + category
            self.rconn.set(key, payload, ex=self.redis_data_expire_time)

    def main(self):
        """Run one full crawl: register the sites, then refresh Daum and Naver."""
        # Bare attribute access is intentional: these properties do their
        # work as a side effect of being read.
        self.flat_form_list
        self.daum_sports_crwaling
        self.naver_sports_crwaling
if __name__ == "__main__":
    # Script entry point: re-run the full crawl forever, pausing between runs.
    refresh_interval_seconds = 600  # ten minutes
    news_crwaling = NewsCrwaling()
    while True:
        news_crwaling.main()
        time.sleep(refresh_interval_seconds)