-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathAPI.py
273 lines (222 loc) · 9.05 KB
/
API.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
import HP
import pickle
import os
from gutenberg_book import *
from gather_info import *
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers
from metadata import create_metadata
from paragraph import *
import paragraph_analyse as pa
def get_books(books_list=None, books_features=None, book_object=True):
    """
    Load the stored books, optionally restricted by id or by feature.

    Args:
        books_list: (Optional) iterable (set, list, ...) of books gutenberg ID to return. IDs that are
            not present in the stored dataset are ignored. if it is None then it will return all books.
        books_features: (Optional) a dictionary {feature: items} whose values are sets or lists. only
            books whose metadata[feature] contains every one of the items are kept.
        book_object: if it is True then returns GutenbergBooks else it will return metadata
    Returns:
        a dictionary of books objects {id: GutenbergBook(id)/metadata(id)}
    Raises:
        AttributeError: if both books_list and books_features are given.
    """
    if books_list is not None and books_features is not None:
        raise AttributeError("only one of books_list and books_features should be identified")

    if os.path.isfile(HP.BOOKS_DATA_PATH):
        # NOTE: pickle must only be used on trusted files; this one is written by add_books.
        with open(HP.BOOKS_DATA_PATH, "rb") as pk:
            books_metadata = pickle.load(pk)
        assert isinstance(books_metadata, dict)
    else:
        # No dataset has been saved yet.
        books_metadata = dict()

    if books_list is not None:
        # Build a set once so that lists (as documented) work and lookups are O(1).
        wanted = set(books_list)
        books_metadata = {book_id: metadata for book_id, metadata in books_metadata.items()
                          if book_id in wanted}

    if books_features is not None:
        for feature, items in books_features.items():
            # The values may be lists, which have no issubset(); normalise them first.
            required = set(items)
            books_metadata = {book_id: metadata for book_id, metadata in books_metadata.items()
                              if required.issubset(metadata[feature])}

    if book_object:
        return create_gutenberg_books(books_metadata, dic=True)
    return books_metadata
def add_books(books_list):
    """
    Add books to dataset and save the dataset's metadata to HP.BOOKS_DATA_PATH.

    Args:
        books_list: an iterable of books id (int) or GutenbergBooks. elements of any other type are
            ignored. ids that are already in the dataset are skipped, while GutenbergBooks replace
            the stored entry with the same id.
    Raises:
        AssertionError: if one of the new ids is rejected by isvalid.
    """
    # Split the input in a single pass so one-shot iterables (generators) are handled too.
    gb_list = set()
    id_list = set()
    for book in books_list:
        if isinstance(book, GutenbergBook):
            gb_list.add(book)
        elif isinstance(book, int):
            id_list.add(book)

    books = get_books()
    # Only create metadata for ids that are not stored yet.
    id_list = id_list - set(books)
    assert all(isvalid(i) for i in id_list)
    new_metadata_id = create_metadata(id_list)
    new_books = gb_list | create_gutenberg_books(new_metadata_id)
    for book in new_books:
        books[book.id] = book

    path = os.path.dirname(HP.BOOKS_DATA_PATH)
    # dirname is '' when the data path is a bare filename; makedirs('') would raise.
    if path and not os.path.exists(path):
        os.makedirs(path)
    # Build the payload before opening the file so a failure does not leave a truncated dataset.
    metadata = create_metadata(books.values())
    with open(HP.BOOKS_DATA_PATH, "wb") as pk:
        pickle.dump(metadata, pk)
def get_bookshelves(bookshelves_list=None):
    """
    Load the bookshelves stored at HP.BOOK_SHELVES_PATH.

    Args:
        bookshelves_list: (Optional) iterable (set, list, ...) of bookshelves to return. names that are
            not stored are ignored. if it is None then it will return all bookshelves
    Returns:
        a dictionary of bookshelves {bookshelf: bookshelf_elements_id}. it is empty if no bookshelves
        file exists yet.
    """
    if not os.path.isfile(HP.BOOK_SHELVES_PATH):
        return dict()

    # NOTE: pickle must only be used on trusted files; this one is written by add_bookshelves.
    with open(HP.BOOK_SHELVES_PATH, "rb") as pk:
        bookshelves = pickle.load(pk)
    assert isinstance(bookshelves, dict)

    if bookshelves_list is not None:
        # A set makes list inputs (as documented) work; `list & set` would raise TypeError.
        wanted = set(bookshelves_list)
        bookshelves = {bookshelf: ids for bookshelf, ids in bookshelves.items() if bookshelf in wanted}
    return bookshelves
def add_bookshelves(new_bookshelves):
    """
    Merge new bookshelves into the stored ones and record each bookshelf on its books.

    Books that are not in the dataset yet are created, every listed book gets the bookshelf added
    through add_bookshelf, the books are saved with add_books and the merged bookshelves are written
    to HP.BOOK_SHELVES_PATH.

    Args:
        new_bookshelves: a dictionary {bookshelf_name: {id of bookshelf's books}}. an empty dictionary
            is a no-op.
    Returns:
        None
    """
    assert isinstance(new_bookshelves, dict)
    if not new_bookshelves:
        # Nothing to merge; this also avoids calling union() with no operands.
        return

    bookshelves = get_bookshelves()
    # set().union(...) accepts any iterables.
    new_ids = set().union(*new_bookshelves.values())

    books = get_books(new_ids)
    new_books = create_metadata(new_ids - set(books))
    new_books = create_gutenberg_books(new_books, dic=True)
    # Merge without `items() + items()`, which only works on Python 2 lists.
    books = dict(books)
    books.update(new_books)

    for bookshelf, bookshelf_ids in new_bookshelves.items():
        bookshelves[bookshelf] = bookshelves.get(bookshelf, set()) | set(bookshelf_ids)
        for book_id in bookshelf_ids:
            books[book_id].add_bookshelf(bookshelf)
    add_books(books.values())

    path = os.path.dirname(HP.BOOK_SHELVES_PATH)
    # dirname is '' when the path is a bare filename; makedirs('') would raise.
    if path and not os.path.exists(path):
        os.makedirs(path)
    with open(HP.BOOK_SHELVES_PATH, "wb") as f:
        pickle.dump(dict(bookshelves), f)
def get_paragraphs(paragraph_id=None, books=None, tags=None, num_sequential=1, paragraph_object=True, lowercase=False):
    """
    Get paragraphs from args.

    Args:
        paragraph_id: (Optional) a list of ints. can't be combined with books or tags.
        books: (Optional) a list of books id or GutenbergBooks
        tags: (Optional) a list of tags. if an element of list is a set,
            list or ... it means the tag should be at least one of those tags. for instance tags = [3, {4, 5}]
            means that paragraphs with tag 3 and 4 or 5
        num_sequential: the number of sequential paragraphs
        paragraph_object: if it is True outputs will be type of Paragraph
        lowercase: if it is True, then the output will be lowercase. it does not have effect if paragraph_object=True.
    Returns:
        a list of paragraphs or list of tuples of paragraphs if num_sequential > 1. a tuple is only
        produced when every paragraph of the sequence passed the filters.
    Raises:
        ValueError: if paragraph_id is combined with books or tags, or if num_sequential is not positive.
    """
    if paragraph_id is not None and (books is not None or tags is not None):
        raise ValueError("if paragraph_id is given, books and tags can't be accepted.")
    # Validate before the (potentially large) pickles are loaded.
    if not num_sequential >= 1:
        raise ValueError("num_sequential most be positive")

    # NOTE: pickle must only be used on trusted files.
    with open(HP.PARAGRAPH_METADATA_PATH, "rb") as pkl:
        met_data = pickle.load(pkl)
    with open(HP.PARAGRAPH_DATA_PATH, "rb") as pkl:
        text = pickle.load(pkl, encoding='latin1')
    pars = create_paragraphs(met_data, text)

    if paragraph_id is not None:
        wanted_ids = set(paragraph_id)  # O(1) membership instead of scanning a list per paragraph
        pars = {i: par for i, par in pars.items() if par.id in wanted_ids}
    if books is not None:
        # Accept ids and GutenbergBooks alike by reducing everything to ids.
        book_ids = {b for b in books if isinstance(b, int)} | {b.id for b in books if isinstance(b, GutenbergBook)}
        pars = {i: par for i, par in pars.items() if par.book_id in book_ids}
    if tags is not None:
        # Each entry becomes a group of alternatives; a paragraph must hit every group.
        tag_groups = [{tag} if isinstance(tag, int) else set(tag) for tag in tags]
        pars = {i: par for i, par in pars.items()
                if all(not par.tags.isdisjoint(group) for group in tag_groups)}

    if num_sequential == 1:
        if paragraph_object:
            return list(pars.values())
        return [par.text(lowercase=lowercase) for par in pars.values()]

    sequences = []
    for first in pars.values():
        sequence = [first]
        current = first
        for _ in range(1, num_sequential):
            next_id = current.next_id
            if next_id not in pars:
                # The chain leaves the filtered paragraphs: drop this sequence.
                break
            current = pars[next_id]
            sequence.append(current)
        else:
            # Reached only when the loop did not break, i.e. the sequence is complete.
            sequences.append(tuple(sequence))

    if paragraph_object:
        return sequences
    return [tuple(par.text(lowercase=lowercase) for par in sequence) for sequence in sequences]
def download_books(books, rewrite=False, ignore_invalid_books=True, Print=False):
    """
    Download all books in books and save the text in file HP.BOOKS_PATH/<book id>.txt .

    The text is fetched with load_etext, passed through strip_headers, stripped of surrounding
    whitespace and stored UTF-8 encoded.

    Args:
        books: an iterable of ids or GutenbergBooks
        rewrite: (Optional) if True rewrite the existing files
        ignore_invalid_books: (Optional) if True then it will ignore invalid books
        Print: (Optional) if True print the id and the progress of each book that is downloaded
    Raises:
        AttributeError: if a book id is invalid and ignore_invalid_books is False.
    """
    if not os.path.exists(HP.BOOKS_PATH):
        os.makedirs(HP.BOOKS_PATH)

    # Materialise once so generators work and the total is known for the progress output.
    books = list(books)
    books_num = len(books)

    for i, book in enumerate(books, 1):
        book_id = book.id if isinstance(book, GutenbergBook) else book
        if not isvalid(book_id):
            if ignore_invalid_books:
                continue
            raise AttributeError("invalid book id")

        path = HP.BOOKS_PATH + str(book_id) + ".txt"
        if (not rewrite) and os.path.isfile(path):
            continue

        if Print:
            print("downloading " + str(book_id))
            print(str(i) + '/' + str(books_num))

        text = strip_headers(load_etext(book_id)).strip().encode('UTF-8')
        # The text is encoded bytes, so the file must be opened in binary mode.
        with open(path, "wb") as f:
            f.write(text)
def get_paragraphs_from_book(book, Paragraph_Object=True):
    """
    Create a list of paragraphs from the downloaded text of a book.

    The text in HP.BOOKS_PATH/<book id>.txt is split on blank lines and each part is tokenized with
    pa.tokenize; parts whose tokenization is empty are dropped.

    Args:
        book: either an int (book_id) or GutenbergBook
        Paragraph_Object: if it is True, the outputs will be Paragraph objects, else the tokenized
            paragraphs themselves
    Returns:
        a list of Paragraphs (or of tokenized paragraphs if Paragraph_Object is False)
    Raises:
        TypeError: if book is neither an int nor a GutenbergBook.
        IOError: if the text file of the book does not exist.
    """
    if isinstance(book, GutenbergBook):
        book = book.id
    elif not isinstance(book, int):
        raise TypeError("book should be an int")

    path = HP.BOOKS_PATH + str(book) + ".txt"
    if not os.path.isfile(path):
        raise IOError("no such file directory as " + path)

    # download_books stores the text UTF-8 encoded, so decode it the same way
    # instead of relying on the platform default encoding.
    with open(path, "r", encoding="utf-8") as f:
        raw_paragraphs = f.read().split('\n\n')

    tokenized = []
    for raw in raw_paragraphs:
        try:
            tokens = pa.tokenize(raw)
        except UnicodeDecodeError:
            # Keep a placeholder so the position of the broken paragraph is preserved.
            tokens = [[['<utf8-error>']]]
        if tokens != []:
            tokenized.append(tokens)

    if not Paragraph_Object:
        return tokenized
    return [Paragraph(tokens) for tokens in tokenized]