-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.py
79 lines (61 loc) · 2.26 KB
/
index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import typing
from documents import TransformedDocument
class Index:
    """Base interface for document indexes.

    Both methods are placeholders that do nothing and return ``None``;
    subclasses override them with real behaviour.
    """

    def add_document(self, doc: TransformedDocument):
        """
        Add a document to the index (no-op in the base class).
        :param doc: Document to add.
        """
        return None

    def search(self, query: typing.List[str]) -> typing.List[str]:
        """
        Search the index (no-op in the base class).
        :param query: List of query terms.
        :return: ``None`` in the base class; subclasses return a list of doc_ids.
        """
        return None
class NaiveIndex(Index):
    """Index that stores each document's token set and scans every document per query."""

    def __init__(self):
        # doc_id -> set of the document's tokens
        self.docs = {}

    def add_document(self, doc: TransformedDocument):
        """
        Store the document's tokens as a set under its doc_id.
        An earlier entry with the same doc_id is replaced.
        :param doc: Document providing ``doc_id`` and ``tokens``.
        """
        self.docs[doc.doc_id] = set(doc.tokens)

    def search(self, query: typing.List[str]) -> typing.List[str]:
        """
        Find the documents that contain every query term.
        :param query: List of query terms.
        :return: List of doc_ids of matching documents, in the order the
            documents are stored in ``self.docs``.
        """
        wanted = set(query)
        return [
            doc_id
            for doc_id, terms in self.docs.items()
            if wanted.issubset(terms)
        ]
from collections import defaultdict
import json
from index import Index
class SimpleInvertedIndex(Index):
    """Inverted index: maps each term to the set of doc_ids whose tokens include it.

    The index can be saved to and loaded from a JSON file holding a list of
    ``{'term': ..., 'doc_ids': [...]}`` entries.
    """

    def __init__(self):
        # term -> set of doc_ids. defaultdict(set) lets add_document() add to
        # a term's set without first checking whether the term is known.
        self.doc_id_sets = defaultdict(set)

    def add_document(self, doc: TransformedDocument):
        """
        Record the document's doc_id under each of its tokens.
        :param doc: Document providing ``doc_id`` and ``tokens``.
        """
        for token in doc.tokens:
            self.doc_id_sets[token].add(doc.doc_id)

    def search(self, query: typing.List[str]) -> typing.List[str]:
        """
        Does search using the index.
        :param query: List of query terms.
        :return: Sorted list of doc_ids of the documents that contain every
            query term. An empty query, or a query with a term that is not in
            the index, yields an empty list.
        """
        postings = []
        for term in query:
            # Use .get() instead of [...]: subscripting a defaultdict inserts
            # an empty set for every unknown term, so a plain lookup would
            # grow the index (and pollute write() output) on each search.
            doc_ids = self.doc_id_sets.get(term)
            if not doc_ids:
                # A term with no documents makes the intersection empty.
                return []
            postings.append(doc_ids)

        if not postings:
            # Empty query: nothing to intersect.
            return []

        # Start from the smallest set so intermediate results stay small.
        postings.sort(key=len)
        matching_doc_ids = postings[0].intersection(*postings[1:])
        # Sets have no stable order; sort so results are deterministic.
        return sorted(matching_doc_ids)

    def write(self, file_path: str):
        """
        Save the index to a JSON file.
        :param file_path: Path of the file to create or overwrite.
        """
        data = [
            # Sorted doc_ids keep the written file reproducible between runs.
            {'term': term, 'doc_ids': sorted(doc_ids)}
            for term, doc_ids in self.doc_id_sets.items()
        ]
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(data, f)

    @staticmethod
    def read(file_path: str):
        """
        Load an index from a JSON file in the format produced by ``write``.
        :param file_path: Path of the file to read.
        :return: A new SimpleInvertedIndex holding the stored entries.
        """
        out = SimpleInvertedIndex()
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        for entry in data:
            out.doc_id_sets[entry['term']] = set(entry['doc_ids'])
        return out