-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconventoarchiver.py
244 lines (192 loc) · 8.04 KB
/
conventoarchiver.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
"""Convento Archiver attempts to make inroads into archiving articles
from the My Convento newsroom platform.
We're interested in two links from My Convento resources, the press-release
itself, and the PDF copy of the same release.
Original: https://myconvento.com/public/newsroom/data/plugin/news/run/show_news/news_id/2998166/lang/22/id/
PDF: https://myconvento.com/application/controllers/newsroom/pdf.php?news_id=enc2_YVdaaU9YQmxNM05FTVhWT1VscEJSREJrV2s4MmR6MDk&l=22&lang=22&id=
Process:
1. Cycle through index pages capturing all IDs.
2. Create a listing of links to specific press releases.
3. Read each page of the press release for the PDF link.
4. Return a list of PDFs and press releases. These are formatted in
html, json, and plain text for now.
5. Optionally, submit to the Internet Archive (not implemented...).
"""
import configparser
import json
import re
import sys
import urllib.request
# Config all happens below. It requires config.cfg files to be setup
# correctly and to be provided as arguments to the script. We could
# maybe wrap all of this into a function, but as is, it will run first
# thing in the script, and ensures everything is setup okay. If not,
# the script won't run which is essentially what we're looking for if
# the configuration isn't correct.
CONFIG = ""
try:
CONFIG = sys.argv[1]
except IndexError as err:
print(
"Make sure you have provided one argument, e.g. `python conventoarchiver.py yourconfig.cfg`:",
err,
file=sys.stderr,
)
sys.exit(1)
config = configparser.ConfigParser()
config.read(CONFIG)
try:
indices_url = config["main"]["convento_indices"]
pages = int(config["main"]["number_of_pages"])
suffix = config["main"]["sitemap_suffix"]
except KeyError as err:
print("Problem reading config cannot access key:", err, file=sys.stderr)
sys.exit(1)
def _id_regex():
"""ID regex defines six groups of patterns, for which we are
interested in the fifth group (match[4]) which should be a
large integer that representa a page ID. The string that it should
match against looks as follows:
`a href="https://myconvento.com/public/newsroom/data/plugin/news/run/show_news/news_id/3107954/lang/`
:return: regex string for matching press release IDs (string)
"""
return '(a class="" )(href=")(.*)(news_id/)(\d*)(\/lang\/)'
def _title_regex():
"""Regex that defines three groups of which we are interested in
the second (match[1]) which should provide us with the title as
defined in HTML for a webpage. The string that it should match
against looks as follows:
`<title>Newsroom der ...</title>`
:return: regex string for matching the title element in an HTML
(string)
"""
return "(<title>)(.*)<(/title>)"
def _pdf_url_regex():
"""Regex that defines five groups of patterns, for which we are
interested in the fourth group (match[3]) which should be a URL
pointing at a PDF copy of a press release for which we already have
a link to the HTML version.
`<i class="fa fa-file-pdf-o"></i> <a href="https://myconvento.com/application/controllers/newsroom/pdf.php?news_id=enc2_YVdaaU9YQmxNM05FTVhWT1VscEJSREJrV2s4MmR6MDk&l=22&lang=22&id=" rel=`
:return: regex string for matching PDF URLs (string)
"""
return '(i class=")(fa fa-file-pdf-o)(.*href=")(.*)(" rel=")'
def capture_ids(indices_url):
"""Capture all IDs for press releases from the My Convento index
pages.
:param indices_url: My Convento index base URL, ending in page=
which will be incremented page=1, page=2 etc. to capture all
ids until there are no more ids to capture ()
:return: list of press-release IDs (list).
"""
ids = []
curr_ids = []
for page in range(1, pages):
page_url = "{}{}".format(indices_url, page)
print("Page URL:", page_url, file=sys.stderr)
resp = urllib.request.urlopen(page_url)
data = resp.read().decode("utf8")
id_re = re.compile(_id_regex())
matches = re.findall(id_re, data)
page_ids = [match[4] for match in matches]
if curr_ids and page_ids == curr_ids:
break
curr_ids = page_ids
ids = ids + page_ids
print("Number of IDs:", len(set(ids)), file=sys.stderr)
return set(ids)
def construct_pr_html_url(id_):
"""Construct a URL for the html press-release pages.
:param id_: ID of a press release in My Convento.
:return: URL (string).
"""
canonical_convento_url = (
"https://myconvento.com/public/newsroom/data/plugin/news/run/show_news/news_id/"
)
return "{}{}".format(canonical_convento_url, id_)
def capture_pdf_links(urls):
"""Capture links for press-release PDF files from My Convneto
press-release HTML pages.
The list output from here looks as follows:
[
(title (string), url (string), pdf_link (string)),
(title (string), url (string), pdf_link (string)),
(title (string), url (string), pdf_link (string)),
(title (string), url (string), pdf_link (string)),
]
Access to the values in the second position of the list might look
as follows, given list link_tuples:
title = link_tuples[1][0]
url = link_tuples[1][1]
pdf = link_tuples[1][2]
Through iteration, the caller can access all items.
:param urls: list of press-release URLs (list)
:return: list of PDF links (list)
"""
pdfs = []
for url in urls:
resp = urllib.request.urlopen(url)
data = resp.read().decode("utf8")
title_re = re.compile(_title_regex())
try:
title = title_re.findall(data)[0][1].replace("Newsroom der - ", "")
except IndexError:
title = None
pass
pdf_re = re.compile(_pdf_url_regex())
pdf_link = pdf_re.findall(data)[0][3]
pdfs.append((title, url, pdf_link))
return pdfs
def output_simple_html(link_tuples):
"""Output a simple HTML listing of all press-releases and PDF links.
:param link_tuples: list of tuples containing, title, link, and PDF
values (list)
:return: None (nonetype)
"""
links = ""
for linkset in link_tuples:
links = "{} <ul>\n".format(links)
links = "{} <li>Title: {}</li>\n".format(links, linkset[0])
links = '{} <li>HTML: <a href="{}">{}</a></li>\n'.format(
links, linkset[1], linkset[1]
)
links = '{} <li>PDF: <a href="{}">{}</a></li>\n'.format(
links, linkset[2], linkset[2]
)
links = "{} </ul>".format(links)
html = (
"<!DOCTYPE html>\n"
"<html>\n"
" <head><title>Newsroom Sitemap</title></head>\n"
" <body>\n{}\n </body>\n"
"</html>"
)
with open("sitemap-{}.htm".format(suffix), "wb") as sitemap:
sitemap.write(html.format(links).encode("utf8"))
def output_simple_json(link_tuples):
"""Output a simple JSON listing of all press-releases and PDF links.
:param link_tuples: list of tuples containing, title, link, and PDF
values (list)
:return: None (nonetype)
"""
with open("sitemap-{}.json".format(suffix), "wb") as sitemap:
sitemap.write(json.dumps(link_tuples, indent=2, sort_keys=True).encode("utf8"))
def output_simple_text(link_tuples):
"""Output a simple text listing of all press-releases and PDF links.
:param link_tuples: list of tuples containing, title, link, and PDF
values (list)
:return: None (nonetype)
"""
with open("sitemap-{}.txt".format(suffix), "wb") as sitemap:
for link in link_tuples:
sitemap.write("{}\n".format(link[1]).encode("utf8"))
sitemap.write("{}\n".format(link[2]).encode("utf8"))
def main():
"""Primary entry point of the script."""
ids = capture_ids(indices_url)
urls = [construct_pr_html_url(id_) for id_ in ids]
pdfs_releases = capture_pdf_links(urls)
output_simple_html(pdfs_releases)
output_simple_json(pdfs_releases)
output_simple_text(pdfs_releases)
if __name__ == "__main__":
main()