-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
88 lines (75 loc) · 2.45 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import json
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
def feedback2dict(item):
spans = item.find_all('span')
links = item.find_all('a')
date = spans[0].text
user_type = spans[1].text[3:]
if len(spans) > 3:
text = spans[3].text
else:
text = spans[2].text
user = links[0].text
if '(' in user:
pos = user.find('(')
user = u'hidden {}'.format(user[pos:])
return {
'user': user,
'user_type': user_type,
'date': date,
'text': text
}
def scrape_page(page, timeout=10):
driver.get('https://ec.europa.eu/info/law/better-regulation/have-your-say/initiatives/13375-Prolongation-du-reglement-sur-le-certificat-COVID-numerique-de-l%E2%80%99UE/feedback_fr?p_id=27926341&page={}'.format(page))
t = 0
while not 'Résultats' in driver.page_source:
time.sleep(1)
t += 1
if t > timeout:
return None
soup = BeautifulSoup(driver.page_source, 'html.parser')
items = soup.find_all('feedback-item')
return [feedback2dict(item) for item in items]
def create_driver():
while True:
try:
options = Options()
options.headless = True
driver = webdriver.Firefox(
executable_path='/usr/local/bin/geckodriver', options=options)
return driver
except Exception as e:
print(e)
print('Something went wrong creating driver, sleeping for 60s.')
time.sleep(60)
def get_current_page():
try:
with open('curpage.txt', 'rt') as f:
return int(f.read())
except:
return 0
def set_current_page(page):
with open('curpage.txt', 'wt') as f:
f.write('{}'.format(page))
if __name__ == '__main__':
driver = create_driver()
page = get_current_page()
while True:
try:
feedbacks = scrape_page(page)
except Exception as e:
print(e)
feedbacks = None
if feedbacks and len(feedbacks) > 0:
with open('certificat-covid-eu.json', 'at') as f:
for feedback in feedbacks:
f.write('{}\n'.format(json.dumps(feedback)))
print('page {} scrapped'.format(page))
page += 1
set_current_page(page)
else:
print('Something went wrong. Sleeping for 60s.')
time.sleep(60)