-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcheck_duplicates.py
126 lines (102 loc) · 2.38 KB
/
check_duplicates.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import pymongo
import config
import json
import mongo_connect as mc
def no_duplicates(articles=None):
    '''
    Verify that the database does not have duplicate articles.

    Two articles are treated as duplicates when their "title" values
    are equal.

    articles: optional iterable of article documents (dict-like objects
        with a "title" key). When omitted, the documents are read from
        mc.getData() (presumably a cursor over the article collection --
        confirm in mongo_connect).

    Returns True when every title is unique (also when there are no
    articles). On the first repeated title, prints that title and
    returns False without reading any further documents.

    Raises KeyError if a dict-like document has no "title" key.
    '''
    if articles is None:
        articles = mc.getData()

    seen_titles = set()
    # Iterate directly instead of calling .next() and catching
    # StopIteration, so any iterable (cursor, list, generator) works.
    for article in articles:
        title = article["title"]
        if title in seen_titles:
            print(title)
            return False
        seen_titles.add(title)
    return True
def get_duplicates(articles=None):
    '''
    Return a list of duplicate article titles.

    The first occurrence of a title is not reported; every later
    occurrence adds that title to the result once, so a title that
    appears three times is listed twice. Titles are listed in the order
    the repeats are encountered.

    articles: optional iterable of article documents (dict-like objects
        with a "title" key). When omitted, the documents are read from
        mc.getData() (presumably a cursor over the article collection --
        confirm in mongo_connect).

    Returns a list of titles (empty when nothing is repeated).

    Raises KeyError if a dict-like document has no "title" key.
    '''
    if articles is None:
        articles = mc.getData()

    seen_titles = set()
    duplicates = []
    # Iterate directly instead of calling .next() and catching
    # StopIteration, so any iterable (cursor, list, generator) works.
    for article in articles:
        title = article["title"]
        if title in seen_titles:
            duplicates.append(title)
        else:
            seen_titles.add(title)
    return duplicates
def count_duplicates_by_truthiness(articles=None):
    '''
    Count duplicate articles, split by the truthiness of their "truth" field.

    The first article seen with a given title is not counted. Each later
    article with the same title is counted once, under "true" when its
    own "truth" value is truthy and under "false" otherwise.

    articles: optional iterable of article documents (dict-like objects
        with "title" and "truth" keys). When omitted, the documents are
        read from mc.getData() (presumably a cursor over the article
        collection -- confirm in mongo_connect).

    Returns a tuple (true_duplicate_count, false_duplicate_count).

    Raises KeyError if a dict-like document lacks "title" or "truth";
    both keys are read for every document, not only for duplicates.
    '''
    if articles is None:
        articles = mc.getData()

    seen_titles = set()
    true_duplicate_count = 0
    false_duplicate_count = 0
    # Iterate directly instead of calling .next() and catching
    # StopIteration, so any iterable (cursor, list, generator) works.
    for article in articles:
        title = article["title"]
        truth = article["truth"]
        if title not in seen_titles:
            seen_titles.add(title)
        elif truth:
            true_duplicate_count += 1
        else:
            false_duplicate_count += 1
    return true_duplicate_count, false_duplicate_count
def count_articles(articles=None):
    '''
    Count the articles by walking through all of them.

    articles: optional iterable of article documents. When omitted, the
        documents are read from mc.getData() (presumably a cursor over
        the article collection -- confirm in mongo_connect).

    Returns the number of items yielded by the iterable (0 when empty).
    '''
    if articles is None:
        articles = mc.getData()

    # sum() over the iterable replaces the manual .next()/StopIteration
    # counter and works for cursors, lists and generators alike.
    return sum(1 for _ in articles)
def remove_duplicates(A):
    '''
    Delete one stored article for every title listed in A.

    A title that appears k times in A triggers k separate delete_one
    calls filtered on that title, e.g.
        A  = [Cat, Cat]
        db = [Cat, Dog, Cat, Cat] -> [Cat, Dog]

    A: iterable of article titles to delete (one deletion per entry).

    The collection is obtained from mc.getCollection() (presumably a
    PyMongo collection -- confirm in mongo_connect). Returns None.
    '''
    article_collection = mc.getCollection()
    for duplicate_title in A:
        title_filter = {"title": duplicate_title}
        article_collection.delete_one(title_filter)
def count_true_and_false(articles=None):
    '''
    Count how many articles have a truthy "truth" field and how many do not.

    articles: optional iterable of article documents (dict-like objects
        with a "truth" key). When omitted, the documents are read from
        mc.getData() (presumably a cursor over the article collection --
        confirm in mongo_connect).

    Returns a tuple (true_count, false_count).

    Raises KeyError if a dict-like document has no "truth" key.
    '''
    if articles is None:
        articles = mc.getData()

    true_count = 0
    false_count = 0
    # Iterate directly instead of calling .next() and catching
    # StopIteration, so any iterable (cursor, list, generator) works.
    for article in articles:
        if article["truth"]:
            true_count += 1
        else:
            false_count += 1
    return true_count, false_count
if __name__ == '__main__':
    # Gather the duplicate titles first; the deletion itself is the very
    # last step, after all of the statistics below have been printed.
    duplicate_titles = get_duplicates()

    duplicate_counts = count_duplicates_by_truthiness()
    print("Duplicates by truthiness: True: {}\tFalse: {}".format(
        duplicate_counts[0], duplicate_counts[1]))

    # Total number of articles.
    total_articles = count_articles()
    print(total_articles)

    overall_counts = count_true_and_false()
    print("Overall article truthiness: True: {}\tFalse: {}".format(
        overall_counts[0], overall_counts[1]))

    # One delete per collected title.
    remove_duplicates(duplicate_titles)