-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparser.py
141 lines (134 loc) · 6.84 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import urllib.request
try:
from BeautifulSoup import BeautifulSoup
except ImportError:
from bs4 import BeautifulSoup
import locale
from datetime import datetime
import csv
import json
import logging
import sys
from datetime import timedelta
import csvdbtools
class Parser:
    """Scrape Czech COVID-19 totals from the Czech Wikipedia article.

    ``parse`` downloads the rendered article through the MediaWiki API,
    reads the "Nakažení" (confirmed), "Zotavení" (recovered) and "Úmrtí"
    (deaths) table rows, hands each value to ``csvdbtools.csvAppendIfNew``
    and keeps a small JSON file with the latest numbers.
    """

    # Timestamp layout written to the current-numbers JSON. The UTC offset
    # is a fixed literal; it is not derived from the actual time zone.
    datetimeFormat = "%Y-%m-%dT%H:%M:%S+02:00"
    # Column headers (totals plus one per region); not referenced by any
    # method of this class.
    columnNames = ["Record number", "Date", "All", "Hlavní město Praha",
                   "Středočeský kraj", "Ústecký kraj", "Královéhradecký kraj",
                   "Zlínský kraj", "Olomoucký kraj", "Pardubický kraj",
                   "Kraj Vysočina", "Plzeňský kraj", "Jihomoravský kraj",
                   "Liberecký kraj", "Karlovarský kraj", "Moravskoslezský kraj",
                   "Jihočeský kraj"]

    # MediaWiki "parse" endpoint returning the article HTML wrapped in JSON.
    _sourceUrl = ("https://cs.wikipedia.org/w/api.php?action=parse"
                  "&page=Pandemie_covidu-19_v_%C4%8Cesku"
                  "&prop=text&formatversion=2&format=json")
    # Table row header text -> key in the current-numbers JSON.
    _rowKeys = {"Nakažení": "confirmed",
                "Zotavení": "recovered",
                "Úmrtí": "deaths"}

    def __init__(self):
        """Switch the process locale to Czech.

        ``parse`` reads dates with ``%B``, which matches month names of the
        active locale. Raises ``locale.Error`` when ``cs_CZ.utf8`` is not
        available on the system.
        """
        locale.setlocale(locale.LC_ALL, 'cs_CZ.utf8')

    @staticmethod
    def _splitNumberAndDate(cellText):
        """Split a cell such as ``"1 234 (12. března 2020)"``.

        Returns ``(number, date)``: ``number`` holds only the digits found
        before the first "(", ``date`` is the remainder with parentheses and
        surrounding whitespace removed (empty when there is no "(").
        """
        # partition() never raises, unlike unpacking the result of split().
        rawNumber, _, rawDate = cellText.partition("(")
        number = "".join(filter(str.isdigit, rawNumber))
        date = rawDate.replace("(", "").replace(")", "").strip()
        return number, date

    @staticmethod
    def _loadCurrentNumbers(path):
        """Load the previously stored numbers, or blank defaults.

        Defaults are returned when the file is missing, is not valid JSON,
        or does not have exactly the three expected object-valued keys.
        """
        defaults = {key: {"number": "", "date": ""}
                    for key in ("confirmed", "recovered", "deaths")}
        try:
            with open(path, "r") as file:
                stored = json.load(file)
        except (FileNotFoundError, ValueError):
            # First run, or a corrupt file: start from blank values.
            return defaults
        if (isinstance(stored, dict)
                and len(stored) == len(defaults)
                and all(isinstance(stored.get(key), dict) for key in defaults)):
            return stored
        return defaults

    def parse(self, pathToConfirmedCSV, pathToRecoveredCSV, pathToDeathsCSV, pathToCurrentNumbersJSON):
        """Fetch the article and record the confirmed/recovered/death totals.

        Args:
            pathToConfirmedCSV: path passed to ``csvdbtools.csvAppendIfNew``
                for the confirmed count.
            pathToRecoveredCSV: same, for the recovered count.
            pathToDeathsCSV: same, for the death count.
            pathToCurrentNumbersJSON: JSON file holding the latest numbers;
                read at the start and rewritten when something changed.

        Returns:
            A truthy value when at least one ``csvAppendIfNew`` call reported
            a change (presumably: a new row was appended - confirm against
            ``csvdbtools``). ``False`` when nothing changed, the download or
            HTML parsing failed, or a row's date could not be parsed.

        Raises:
            Any other error hit while walking the table rows is logged and
            re-raised. Errors decoding the API response also propagate.
        """
        try:
            # The context manager closes the HTTP response on every path.
            with urllib.request.urlopen(self._sourceUrl) as fp:
                html = json.loads(fp.read().decode("utf8"))["parse"]["text"]
        except urllib.error.HTTPError as e:
            logging.error('HTTPError = ' + str(e.code))
            return False
        except urllib.error.URLError as e:
            logging.error('URLError = ' + str(e.reason))
            return False

        try:
            soup = BeautifulSoup(html, 'html.parser')
        except Exception as e:
            # Without a parsed document there is nothing left to do.
            logging.error("BeatifulSoupException:\n" + str(e))
            return False

        current = self._loadCurrentNumbers(pathToCurrentNumbersJSON)
        csvPaths = {"confirmed": pathToConfirmedCSV,
                    "recovered": pathToRecoveredCSV,
                    "deaths": pathToDeathsCSV}

        update = False
        try:
            for trTag in soup.find_all('tr'):
                allThTags = trTag.find_all('th')
                if len(allThTags) != 1:
                    continue
                key = self._rowKeys.get(allThTags[0].get_text())
                if key is None:
                    continue

                cellText = trTag.find_all("td")[0].get_text()
                num, date = self._splitNumberAndDate(cellText)
                try:
                    # %B relies on the Czech locale set in __init__.
                    dateobject = datetime.strptime(date, "%d. %B %Y")
                except ValueError:
                    logging.error("Cannot parse date %r for %s", date, key)
                    return False

                current[key]["number"] = num
                if csvdbtools.csvAppendIfNew(dateobject, num, None, csvPaths[key]):
                    update = True

                # The article only gives a day; for today's figures store the
                # current time instead of midnight.
                if dateobject.date() == datetime.today().date():
                    datetimeConverted = datetime.now().strftime(self.datetimeFormat)
                else:
                    datetimeConverted = dateobject.strftime(self.datetimeFormat)
                current[key]["date"] = datetimeConverted
        except Exception:
            logging.error("Unexpected error:\n" + str(sys.exc_info()[0]))
            raise

        if update:
            with open(pathToCurrentNumbersJSON, "w+") as jsonfile:
                json.dump(current, jsonfile)
        return update