-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
241 lines (202 loc) · 8.01 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
#!/usr/bin/env python
#-*- coding: utf-8 -*-
"""
The program to handle converting Imperial Library to Kindle
1) Download an index of all books for each Game category
2) For each book in an index, convert the text to a small HTML format
3) Run kindlegen by feeding it an XML file describing the book's metadata
4) Move the book into a directory corresponding to its game
* Books have the filename "the-url-the-book-came-from-v1.mobi"
* Books will have titles in the format "[game] Book Title v1, The"
* TES Online has a TON of books which will clutter your Kindle completely
"""
# TODO: add PIL to the requirements and work on a PIL image generator
from bs4 import BeautifulSoup as BS # parser
from subprocess import call as s_call # sub-system call
from requests import get as re_get # fetch data from URI
from os import mkdir, listdir # list and make directories
from os.path import join, exists # check path exists, join path strings
from shutil import move as fmove # file moving
from string import printable # printable characters
from sys import argv # obvious
from time import sleep # be nice to the Imperial Library
# Where finished books are stored and how each output file is named
OUTDIR = "out"
OUTNAME = "{1}.mobi"  # formatted with (game, url-slug); only the slug is used
# Site that every relative index/book path is appended to
ROOT_URL = "http://www.imperial-library.info"
# Seconds to pause between two books
SLEEP_TIME = 2
# Crude detection: use a kindlegen binary lying in the current directory
# when there is one, otherwise call plain "kindlegen"
KGEN_BIN = "./kindlegen" if "kindlegen" in listdir(".") else "kindlegen"
# Index page of every game, keyed by game name.
# Each game's listing lives at /books/<game>/by-title on the site.
SOURCE_URLS = {
    game: "/books/{}/by-title".format(game)
    for game in (
        "daggerfall",
        "battlespire",
        "redguard",
        "morrowind",
        "shadowkey",
        "oblivion",
        "skyrim",
        "online",
    )
}
# Kindlegen reads a book's metadata from an OPF (XML) file, so that file is
# re-written for every book. Only two things change per book -- the title
# ({0}) and the author ({1}) -- so a format template is used instead of
# building a complex ETree.
# NOTE: the literal opens with a line continuation so the XML declaration is
# the very first thing in the written file; XML does not allow anything
# (not even a newline) in front of it.
# TODO: add a PIL-created image to the cover to make it feel authentic
OPF_BLOB = """\
<?xml version="1.0" encoding="iso-8859-1"?>
<package unique-identifier="uid" xmlns:opf="http://www.idpf.org/2007/opf" xmlns:asd="http://www.idpf.org/asdfaf">
<metadata>
<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core" xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/">
<dc:Title>{0}</dc:Title>
<dc:Language>en</dc:Language>
<dc:Creator>{1}</dc:Creator>
<dc:Copyrights>Bethesda</dc:Copyrights>
<dc:Publisher>Bethesda</dc:Publisher>
</dc-metadata>
</metadata>
<manifest>
<item id="text" media-type="text/x-oeb1-document" href="output.html"></item>
</manifest>
<spine toc="ncx">
<itemref idref="text"/>
</spine>
<guide>
<reference type="text" title="Book" href="output.html"/>
</guide>
</package>
"""
# Minimal HTML page wrapped around a book's text; {0} receives the body
# markup. The braces of the CSS rule are doubled so that str.format()
# emits them literally instead of treating them as a placeholder.
HTML_BLOB = """
<html>
<head><style>.rtecenter{{text-align:center;}}</style></head>
<body>
{0}
</body>
</html>
"""
def craft_url(url):
    """
    Build an absolute URL by appending the given fragment to the
    top-level ROOT_URL variable
    """
    absolute = "{0}{1}".format(ROOT_URL, url)
    return absolute
def get_bs(url):
    """
    Fetch a site-relative URL fragment and return a parser for it
    /content/something-id -> BS(ROOT_URL/content/something-id)

    The page text is cleaned before it is parsed so we don't get bad
    characters:
    * newlines and non-breaking spaces are turned into ordinary spaces
    * every character outside string.printable is dropped
    """
    text = re_get(craft_url(url)).text
    # Swap the space-like characters first: "\xa0" is not part of
    # string.printable, so filtering first would delete it outright and
    # glue the neighbouring words together.
    text = text.replace("\n", " ").replace("\xa0", " ")
    # A set gives constant-time membership tests for the per-character filter
    keep = frozenset(printable)
    text = "".join(c for c in text if c in keep)
    return BS(text, 'html.parser')
def craft_fname(section, url):
    """
    Build the file name the finished MOBI file is moved to
    OUTNAME is filled in with the section and the last "/"-separated
    piece of the URL
    """
    slug = url.rsplit("/", 1)[-1]
    return OUTNAME.format(section, slug)
def run_kindlegen():
    """
    Run the kindlegen compiler on book.opf, asking for output.mobi
    Returns the value of the sub-system call; relocating the resulting
    blob into a folder is left to the rest of the program
    """
    command = [KGEN_BIN, "-o", "output.mobi", "book.opf"]
    return s_call(command)
def get_title(bs, game):
    """
    Return the display title of the given book BS

    The title is read from the H1 tag (it seems to be the only one in the
    BS) and stripped of surrounding whitespace. A leading "The" is moved
    to the end of the title, and the result is prefixed with the first
    three letters of the game in lowercase. Versions of the book can end
    up before "The".
    Target output: "[dag] Real Barenziah v1, The"

    bs   -- parsed book page exposing bs.h1.text
    game -- name of the game, e.g. "daggerfall"
    """
    title = bs.h1.text.strip()
    # Split off the first word only; comparing the whole word (not just a
    # prefix) leaves titles such as "Theory of ..." untouched, and requiring
    # a remainder keeps a bare "The" as it is.
    words = title.split(None, 1)
    if len(words) == 2 and words[0].lower() == "the":
        title = "{1}, {0}".format(words[0], words[1])
    return "[{0}] {1}".format(game[:3].lower(), title)
def get_author(bs):
    """
    Pull the author's name out of a book page

    The name is taken from the first div with class "field-item odd",
    where it sits next to an "Author:" text label and some strange
    characters; the label and every non-printable character are removed
    and the remainder is stripped.
    Returns an empty string when the page has no such div.
    """
    candidates = bs.find_all("div", class_="field-item odd")
    if not candidates:
        return ""
    raw = candidates[0].text.replace("Author:", "")
    cleaned = "".join(ch for ch in raw if ch in printable)
    return cleaned.strip()
def gather_text(bs):
    """
    Return the markup of every paragraph of the book as a list of strings

    The paragraphs are read from the second div with class "node-content".
    Non-breaking spaces that BS inserts while parsing are swapped for
    ordinary spaces. Joining the list is left to the caller.
    """
    content_divs = bs.find_all("div", class_="node-content")
    body = content_divs[1]
    paragraphs = []
    for para in body.find_all("p"):
        paragraphs.append(str(para).replace("\xa0", " "))
    return paragraphs
def to_book(game, url):
    """
    Main book algorithm
    1. Fetch the page and detect whether the URL is a listing of multiple
       volumes (a "book-navigation" div is present)
    2. For each volume write output.html and book.opf in the working dir
    3. Run kindlegen on the newly created html/opf blobs
    4. Move the MOBI output into its respective folder with its new name

    game -- key of SOURCE_URLS; used for the title tag and output folder
    url  -- site-relative path of the book page
    Always returns True. An exception raised while launching kindlegen or
    moving the result is printed and the next volume is processed.
    """
    # Local import: only this function needs it
    from xml.sax.saxutils import escape

    raw_bs = get_bs(url)
    vol_ls = raw_bs.find_all("div", class_="book-navigation")
    # Default settings if the page grabbed is only 1 book long
    s_href = [url]
    s_bses = [raw_bs]
    # If there is more than one volume on the page, find them all
    if vol_ls:
        s_href = [x.a["href"] for x in vol_ls[0].find_all("li")]
        s_bses = [get_bs(href) for href in s_href]
    # Write HTML, OPF, and run the compiler
    for vol_url, bs in zip(s_href, s_bses):
        with open("output.html", "w") as hf:
            hf.write(HTML_BLOB.format("\n".join(gather_text(bs))))
        # Title and author are plain text taken from the page; characters
        # such as "&" or "<" must be escaped or the OPF is not valid XML.
        title = escape(get_title(bs, game))
        author = escape(get_author(bs))
        with open("book.opf", "w") as of:
            of.write(OPF_BLOB.format(title, author))
        try:
            run_kindlegen()  # run the compiler
            fmove("output.mobi", join(OUTDIR, game, craft_fname(game, vol_url)))
        except Exception as e:
            print("Error occurred: {}".format(str(e)))
    return True
def main(*args, **kwargs):
    """
    Drive the whole run: prepare folders, collect book links, convert them
    Positional and keyword arguments are accepted but not used
    """
    # make sure the output tree exists before anything is downloaded
    print("Checking main output directory...")
    if not exists(OUTDIR):
        print("Creating book output directory")
        mkdir(OUTDIR)
    print("Checking sub-directories...")
    for game in SOURCE_URLS:
        game_dir = join(OUTDIR, game)
        if not exists(game_dir):
            mkdir(game_dir)
    print("Done creating directories")
    # walk each game's index and remember (game, href) for every link found
    book_hrefs = []
    for game, source_url in SOURCE_URLS.items():
        print("Downloading index of {}...".format(source_url))
        listing = get_bs(source_url).find_all("div", class_="view-content")[0]
        for anchor in listing.find_all("a"):
            book_hrefs.append((game, anchor['href']))
    print("Total count: {}".format(len(book_hrefs)))
    # hand each link and its game to to_book(), pausing after every book
    count = 0
    for count, (game, href) in enumerate(book_hrefs, 1):
        print("Processing {}...".format(href))
        to_book(game, href)
        sleep(SLEEP_TIME)
    print("{0} books processed".format(count))
    # end
    return None
if __name__ == "__main__":
    # "-t" / "--test" only reports and stops; nothing is downloaded.
    # A plain if/else is used instead of quit(), which is a helper the
    # site module adds for interactive sessions and may be absent
    # (e.g. under "python -S").
    if "-t" in argv or "--test" in argv:
        print("Testing")
    else:
        main()
# end