-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlibguider.py
64 lines (53 loc) · 2.14 KB
/
libguider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import argparse
import aiohttp
import asyncio
import async_timeout
import backoff
import json
from os import mkdir, path
from random import randint
import time
import urllib
# Command-line interface: the site ID and API key are mandatory; the guide
# status filter is optional and defaults to "1".
parser = argparse.ArgumentParser()
parser.add_argument(
    "--site_id",
    required=True,
    help="Libguides Site ID",
)
parser.add_argument(
    "--api_key",
    required=True,
    help="Libguides API key",
)
parser.add_argument(
    "--guide_status",
    default="1",
    help="Libguides statuses to include. ex. 1 or 1,2",
)
args = parser.parse_args()

# Module-level settings read by main() when it builds the API request.
site_id, lg_api_key, status = args.site_id, args.api_key, args.guide_status
# Retry with exponential backoff for up to 60 seconds. asyncio.TimeoutError is
# listed explicitly because the 10-second timeout below raises it and it is not
# an aiohttp.ClientError, so a slow response would otherwise never be retried.
@backoff.on_exception(
    backoff.expo,
    (aiohttp.ClientError, asyncio.TimeoutError),
    max_time=60,
)
async def fetch(session, url):
    """Return the body of *url* as text, fetched with a GET on *session*.

    Args:
        session: object whose ``get(url)`` is used as an async context
            manager (main() passes an ``aiohttp.ClientSession``).
        url: address to request.

    Returns:
        The response body as returned by ``response.text()``.

    Raises:
        aiohttp.ClientError or asyncio.TimeoutError: re-raised by the backoff
            decorator once the 60-second retry budget is used up.
    """
    # Each individual attempt is capped at 10 seconds.
    async with async_timeout.timeout(10):
        async with session.get(url) as response:
            return await response.text()
async def lg_pages_fetch(guide, session, semaphore):
    """Store one guide's metadata and download its pages that are not on disk yet.

    Files are written under ``./data/<guide id>/``: ``guide.json`` holds the
    guide serialized as JSON and ``page-<page id>.html`` holds each page body.
    A page whose file already exists is not fetched again.

    Args:
        guide: mapping with an ``"id"`` entry and a ``"pages"`` list whose
            items each have ``"id"`` and ``"url"`` entries.
        session: HTTP session handed through to ``fetch``.
        semaphore: ``asyncio.Semaphore`` held while this guide is processed,
            which bounds how many guides are worked on at the same time.

    Returns:
        None. A guide with no pages (or no ``"pages"`` entry) is skipped and
        nothing is written for it.
    """
    # Local import: the file-level import only brings in mkdir and path.
    from os import makedirs

    pages = guide.get("pages") or []
    if not pages:
        return

    async with semaphore:
        guide_path = f"./data/{guide['id']}"
        # makedirs also creates a missing ./data parent and tolerates the
        # directory already existing (including races between tasks).
        makedirs(guide_path, exist_ok=True)

        # The metadata does not depend on the page, so write it once per guide.
        with open(f"{guide_path}/guide.json", "w", encoding="utf-8") as guide_json:
            guide_json.write(json.dumps(guide))

        for page in pages:
            filename = f"{guide_path}/page-{page['id']}.html"
            if path.exists(filename):
                continue
            html = await fetch(session, page["url"])
            # Explicit UTF-8 so non-ASCII page content cannot fail on
            # platforms whose default text encoding is narrower.
            with open(filename, "w", encoding="utf-8") as f:
                f.write(html)
async def main():
    """Download the guide list from the LibGuides API, then fetch every guide's pages.

    Reads the module-level ``site_id``, ``lg_api_key`` and ``status`` values,
    requests the guide list (expanded with owner, subjects and pages), runs
    ``lg_pages_fetch`` for all guides concurrently with at most 15 guides in
    progress at once, and prints the elapsed time.
    """
    # Local imports: a bare ``import urllib`` does not guarantee that the
    # ``request`` and ``parse`` submodules are loaded.
    import urllib.parse
    import urllib.request

    started = time.monotonic()  # monotonic clock: unaffected by wall-clock changes
    semaphore = asyncio.Semaphore(15)

    # Percent-encode the user-supplied values; commas stay literal so that
    # list-valued parameters such as "1,2" are sent exactly as before.
    query = urllib.parse.urlencode(
        {
            "site_id": site_id,
            "key": lg_api_key,
            "status": status,
            "expand": "owner,subjects,pages",
        },
        safe=",",
    )
    # NOTE: this request is blocking, but no other task has been started yet.
    with urllib.request.urlopen(f"https://lgapi-us.libapps.com/1.1/guides?{query}") as r:
        lgb = json.loads(r.read().decode("utf-8"))

    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*[lg_pages_fetch(guide, session, semaphore) for guide in lgb])

    print(f"Operation took {time.monotonic() - started} seconds")
# Start the download only when the file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())
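# Shell one-liner for inspecting the downloaded files: extract every
# libproxy.temple.edu link, drop the ones containing "public", and list the
# distinct links with their occurrence counts in ascending order:
# egrep -oh 'libproxy\.temple\.edu\/[a-zA-Z0-9&?:/\.=-]*' * | grep -v public | sort | uniq -c | sort -n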