-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcounting.py
66 lines (50 loc) · 1.99 KB
/
counting.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import collections
# import typing
import typing
from documents import TransformedDocument, TransformedDocumentCollection
# def count_words(doc: TransformedDocument) -> typing.Dict[str, int]:
# out = dict()
# for token in doc.tokens:
# if token in out:
# out[token] += 1
# else:
# out[token] = 1
# return out
# def count_words(doc: TransformedDocument) -> typing.Dict[str, int]:
# out = collections.defaultdict(int)
# for token in doc.tokens:
# out[token] += 1
# return out
def count_words(doc: TransformedDocument) -> collections.Counter:
return collections.Counter(doc.tokens)
def count_words_in_collection(docs: TransformedDocumentCollection) -> collections.Counter:
totals = collections.Counter()
for doc in docs.get_all_docs():
totals.update(count_words(doc))
return totals
def document_counts(docs: TransformedDocumentCollection) -> collections.Counter:
"""
Compute number of documents each word occurs in.
:param docs: TransformedDocumentCollection to run over
:return: A counter mapping tokens to the number of documents each token occurs in.
"""
num_docs = collections.Counter()
for doc in docs.get_all_docs():
num_docs.update(collections.Counter(set(doc.tokens)))
return num_docs
def doc_tf_idf_scores(doc: TransformedDocument,
doc_frequencies: collections.Counter) -> typing.Dict[str, float]:
out = dict()
term_frequencies = count_words(doc)
for term, freq in term_frequencies:
weight = freq / doc_frequencies[term]
out[term] = weight
return out
def tf_idf_scores(docs: TransformedDocumentCollection):
doc_frequencies = document_counts(docs)
out = list()
for doc in docs.get_all_docs():
out.append(doc_tf_idf_scores(doc, doc_frequencies))
return out
def query_score(query: typing.List[str], doc_weights: typing.Dict[str, float]) -> float:
return sum([doc_weights[term] for term in query])