-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparagraph.py
183 lines (149 loc) · 5.12 KB
/
paragraph.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
from collections import defaultdict
class Paragraph(object):
    """A tokenized paragraph plus optional linkage metadata and tags.

    The text is kept as a list of sentences, each sentence being a list of
    string tokens. ``id``, ``book_id``, ``next_id`` and ``prev_id`` are each
    either ``None`` or a positive integer. Tags are kept in a set owned by
    the instance.
    """

    def __init__(self, text, id=None, book_id=None, next_id=None, prev_id=None, tags=None):
        """
        Create a paragraph.

        Args:
            text: a list of sentences; each sentence is a list of str tokens.
            id, book_id, next_id, prev_id: None or a positive int.
            tags: a set of tags, or None (the default) for no tags.

        Raises:
            AssertionError: if any argument fails the checks above.
        """
        self._check_optional_id(id)
        self._check_optional_id(book_id)
        self._check_optional_id(next_id)
        self._check_optional_id(prev_id)

        # None replaces the former mutable ``set()`` default; both mean
        # "no tags".
        if tags is None:
            tags = set()
        assert isinstance(tags, set)

        assert isinstance(text, list)
        assert all(isinstance(sent, list) for sent in text)
        assert all(isinstance(word, str) for sent in text for word in sent)

        self._id = id
        self._text = text
        self._book_id = book_id
        # Copy so later mutation of the caller's set cannot change this
        # paragraph's tags behind its back.
        self._tags = set(tags)
        self._next_id = next_id
        self._prev_id = prev_id

    @staticmethod
    def _check_optional_id(value):
        """Assert that ``value`` is None or a positive int."""
        if value is not None:
            assert isinstance(value, int)
            assert value > 0

    @property
    def id(self):
        """The paragraph id, or None."""
        return self._id

    @property
    def has_book(self):
        """True when a book id is set."""
        return self._book_id is not None

    @property
    def book_id(self):
        """The id of the owning book, or None."""
        return self._book_id

    @property
    def next_id(self):
        """The id of the following paragraph, or None."""
        return self._next_id

    @next_id.setter
    def next_id(self, next_id):
        self._check_optional_id(next_id)
        self._next_id = next_id

    @property
    def prev_id(self):
        """The id of the preceding paragraph, or None."""
        return self._prev_id

    @prev_id.setter
    def prev_id(self, prev_id):
        self._check_optional_id(prev_id)
        self._prev_id = prev_id

    @property
    def tags(self):
        """A copy of the tag set; mutating it does not affect the paragraph."""
        return self._tags.copy()

    def add_tag(self, new_tags):
        """
        Add one tag or several tags.

        Args:
            new_tags: a single int tag, or an iterable of tags.
        """
        if isinstance(new_tags, int):
            new_tags = {new_tags}
        # Rebind rather than update in place, so the stored set object is
        # never mutated.
        self._tags = self._tags | set(new_tags)

    @property
    def metadata(self):
        """The metadata dictionary built by ``paragraph_metadata``."""
        return paragraph_metadata(self.id, self.book_id, self.prev_id, self.next_id, self.tags)

    def text(self, format="sentences", lowercase=False):
        """
        Return the text of the paragraph.

        Args:
            format: if it is "sentences" the output is a list of lists, each
                inner list holding the tokens of one sentence.
                if it is "words" the output is a flat list of tokens.
                if it is "text" the output is a single string: all tokens
                joined by single spaces.
            lowercase: a boolean. If true the output is lowercased.

        Returns:
            Depending on format: a list of lists of strings, a list of
            strings, or a string. Lists are freshly built on every call.

        Raises:
            ValueError: if format is not one of the three values above.
        """
        if format == "sentences":
            if lowercase:
                return [[word.lower() for word in sent] for sent in self._text]
            return [list(sent) for sent in self._text]

        if format not in ("words", "text"):
            raise ValueError('format should be one of ["sentences", "words", "text"]')

        # Single-pass flatten; sum(list_of_lists, []) re-copies the
        # accumulator for every sentence.
        words = [word for sent in self._text for word in sent]
        if format == "words":
            if lowercase:
                return [word.lower() for word in words]
            return words

        joined = " ".join(words)
        # str has no ``lowercase`` method; ``lower`` is the correct call.
        return joined.lower() if lowercase else joined
def create_paragraphs(paragraph_metadata, paragraph_text):
    """
    Build Paragraph objects from two mappings that share the same keys.

    Args:
        paragraph_metadata: mapping from a key to a metadata mapping. The
            metadata may hold "id", "book_id", "next_id", "prev_id" and
            "tags"; missing entries are treated as None / no tags.
            Note that inside this function the parameter name shadows the
            module-level ``paragraph_metadata`` helper.
        paragraph_text: mapping from the same keys to the paragraph text
            (a list of sentences, each a list of tokens).

    Returns:
        a dictionary from each key to the Paragraph built for it.

    Raises:
        AssertionError: if the two mappings do not have the same key set.
    """
    assert set(paragraph_metadata) == set(paragraph_text)
    pars = {}
    for key, met in paragraph_metadata.items():
        tags = met.get("tags")
        if tags is None:
            # Paragraph requires a set; the previous ``dict()`` placeholder
            # failed its isinstance check.
            tags = set()
        elif isinstance(tags, int):
            tags = {tags}
        else:
            # Accept any iterable of tags (e.g. a list) by converting to a set.
            tags = set(tags)
        pars[key] = Paragraph(
            text=paragraph_text[key],
            id=met.get("id"),
            book_id=met.get("book_id"),
            next_id=met.get("next_id"),
            prev_id=met.get("prev_id"),
            tags=tags,
        )
    return pars
def paragraph_metadata(id=None, book_id=None, prev_id=None, next_id=None, tags=None):
    """
    A helper for creating a paragraph metadata dictionary.

    Args:
        id, book_id, prev_id, next_id: integers, or None.
        tags: a single integer, or an iterable (list, set, generator, ...)
            of integers, or None.
        Any argument that is None is left out of the result.

    Returns:
        a dictionary holding only the keys that were given, inserted in the
        order "id", "book_id", "prev_id", "next_id", "tags". The "tags"
        value is always a set.

    Raises:
        TypeError: if id, book_id, prev_id or next_id is given but is not
            an int.
        AssertionError: if tags is an iterable containing a non-int.
    """
    res = dict()
    scalar_fields = (
        ("id", id),
        ("book_id", book_id),
        ("prev_id", prev_id),
        ("next_id", next_id),
    )
    for name, value in scalar_fields:
        if value is None:
            continue
        if not isinstance(value, int):
            raise TypeError(
                "{} must be an int, got {}".format(name, type(value).__name__)
            )
        res[name] = value

    if tags is not None:
        if isinstance(tags, int):
            res["tags"] = {tags}
        else:
            # Materialize once: validating and then re-iterating the same
            # object would produce an empty set for one-shot iterables
            # such as generators.
            tag_list = list(tags)
            assert all(isinstance(tag, int) for tag in tag_list)
            res["tags"] = set(tag_list)
    return res