-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest_indexing_process.py
68 lines (52 loc) · 2.45 KB
/
test_indexing_process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from unittest import TestCase
import indexing_process
from indexing_process import *
from documents import *
class FakeTokenizer(Tokenizer):
def tokenize(self, text):
return text.lower().split()
def create_document_collection():
docs = DocumentCollection()
docs.add_document(Document(doc_id='1', text='some text'))
docs.add_document(Document(doc_id='2', text='some other text'))
return docs
class TestTokenizerOnlyDocumentTransformer(TestCase):
def test_run(self):
document_transformer = TokenizerOnlyDocumentTransformer(FakeTokenizer())
transformed_docs = document_transformer.transform_documents(create_document_collection())
self.assertEqual([TransformedDocument(doc_id='1', tokens=['some', 'text']),
TransformedDocument(doc_id='2', tokens=['some', 'other', 'text'])],
transformed_docs.docs)
class FakeSource(indexing_process.Source):
def read_documents(self):
docs = DocumentCollection()
docs.add_document(Document(doc_id='1', text='some text'))
return docs
class FakeDocumentTransformer(indexing_process.DocumentTransformer):
def transform_documents(
self, document_collection: DocumentCollection) -> TransformedDocumentCollection:
docs = document_collection.get_all_docs()
out = TransformedDocumentCollection()
for d in docs:
transformed_doc = TransformedDocument(doc_id=d.doc_id, tokens=[d.text])
out.add_document(transformed_doc)
return out
class FakeIndex(Index):
def __init__(self):
self.docs = []
def add_document(self, doc: TransformedDocument):
self.docs.append(doc)
class TestIndexingProcess(TestCase):
def test_run(self):
process = indexing_process.IndexingProcess(
document_transformer=FakeDocumentTransformer(),
index=FakeIndex()
)
doc_collection, index = process.run(FakeSource())
self.assertEqual([Document(doc_id='1', text='some text')], doc_collection.docs)
self.assertEqual([TransformedDocument(doc_id='1', tokens=['some text'])], index.docs)
class TestTokenizerOnlyQueryTransformer(TestCase):
def test_transform_query(self):
query_transformer = TokenizerOnlyQueryTransformer(FakeTokenizer())
transformed_query = query_transformer.transform_query("here is a test query")
self.assertEqual(transformed_query, ['here', 'is', 'a', 'test', 'query'])