-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdocuments.py
101 lines (74 loc) · 2.82 KB
/
documents.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import json
import typing
class Document(typing.NamedTuple):
doc_id: str
text: str
class DocumentCollection:
def __init__(self, documents: typing.List[Document] = None):
if documents is None:
documents = []
self.docs: typing.List[Document] = documents
def add_document(self, doc: Document):
self.docs.append(doc)
def get_all_docs(self) -> typing.List[Document]:
return self.docs
def get(self, doc_id) -> typing.Optional[Document]:
for doc in self.docs:
if doc.doc_id == doc_id:
return doc
return None
class DictDocumentCollection(DocumentCollection):
def __init__(self):
self.docs = dict()
def add_document(self, doc: Document):
self.docs[doc.doc_id] = doc
def get_all_docs(self) -> typing.List[Document]:
return list(self.docs.values())
def get(self, doc_id) -> typing.Optional[Document]:
return self.docs.get(doc_id)
def write(self, path: str):
json_data = {'docs': [doc._asdict() for doc in self.docs.values()]}
with open(path, 'w') as fp:
json.dump(obj=json_data, fp=fp)
@staticmethod
def read(path: str) -> 'DictDocumentCollection':
out = DictDocumentCollection()
with open(path) as fp:
collection_dict = json.load(fp)
doc_records = collection_dict['docs']
for record in doc_records:
doc = Document(doc_id=record['doc_id'], text=record['text'])
out.add_document(doc)
return out
class TransformedDocument(typing.NamedTuple):
doc_id: str
tokens: typing.List[str]
class TransformedDocumentCollection:
def __init__(self):
self.docs: typing.List[TransformedDocument] = []
def get_all_docs(self) -> typing.List[TransformedDocument]:
return self.docs
def add_document(self, doc: TransformedDocument):
self.docs.append(doc)
def write(self, path: str):
json_data = {'docs': [td._asdict() for td in self.docs]}
with open(path, 'w') as fp:
json.dump(obj=json_data, fp=fp)
@staticmethod
def read(path: str) -> 'TransformedDocumentCollection':
out = TransformedDocumentCollection()
with open(path) as fp:
collection_dict = json.load(fp)
doc_records = collection_dict['docs']
for record in doc_records:
doc = TransformedDocument(doc_id=record['doc_id'], tokens=record['tokens'])
out.add_document(doc)
return out
@staticmethod
def load(path: str) -> 'TransformedDocumentCollection':
with open(path) as fp:
data = json.load(fp)
docs = [TransformedDocument(**doc) for doc in data['docs']]
collection = TransformedDocumentCollection()
collection.docs = docs
return collection