-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_nil_entites.py
41 lines (30 loc) · 1.27 KB
/
get_nil_entites.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# Compares two mappings (old and new) and finds the IDs that are in the new mapping but not in the old one.
import argparse
# Title prefixes dropped from the output. Per the original author this filter
# is not strictly necessary, since only the Name/Article namespace is taken
# into account downstream.
_SKIPPED_PREFIXES = ("Category:", "Wikipedia:", "Module:", "Template:", "Portal:")


def _iter_rows(path):
    """Yield the tab-separated columns of every non-blank line in *path*.

    The trailing newline is removed before splitting so that the last column
    never carries it, regardless of how many columns a line has.
    """
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.rstrip("\n")
            if line:
                yield line.split("\t")


def _find_nil_entities(old_path, new_path):
    """Return ``(id, title)`` pairs present in the new mapping but not the old.

    Only the first column (the id) of the old mapping is used. In the new
    mapping the first column is the id and the second the title; rows without
    a second column are skipped. If an id occurs several times in the new
    mapping, the last title wins. The result follows the order in which ids
    first appear in the new mapping, so the output is reproducible.
    """
    old_ids = {row[0] for row in _iter_rows(old_path)}

    new_titles = {}
    for row in _iter_rows(new_path):
        if len(row) < 2:
            # No title column: nothing useful could be written for this id.
            continue
        new_titles[row[0]] = row[1]

    return [
        (entity_id, title)
        for entity_id, title in new_titles.items()
        if entity_id not in old_ids
    ]


def main(argv=None):
    """Write the ids that are new in the new mapping to the output file.

    Args:
        argv: Command-line arguments to parse; ``None`` means ``sys.argv[1:]``.

    The output file receives one ``id<TAB>title`` line per new id whose title
    does not start with one of ``_SKIPPED_PREFIXES``.
    """
    parser = argparse.ArgumentParser(
        description="Find the ids that are in the new mapping but not in the old one."
    )
    parser.add_argument('-o', '--old_wikidata_mapping', required=True,
                        help="tab-separated old mapping (id in the first column)")
    parser.add_argument('-n', '--new_wikidata_mapping', required=True,
                        help="tab-separated new mapping (id, title, ...)")
    parser.add_argument('-w', '--output_file', required=True,
                        help="where to write the id<TAB>title lines")
    args = parser.parse_args(argv)

    nil_entities = _find_nil_entities(args.old_wikidata_mapping,
                                      args.new_wikidata_mapping)
    # The count is reported before the prefix filter is applied.
    print("Number of NILs: " + str(len(nil_entities)))

    with open(args.output_file, "w", encoding="utf-8") as f:
        for entity_id, enwiki in nil_entities:
            # Filters out some types (see _SKIPPED_PREFIXES).
            if enwiki.startswith(_SKIPPED_PREFIXES):
                continue
            # The title no longer carries a newline, so terminate explicitly.
            f.write(entity_id + '\t' + enwiki + '\n')


if __name__ == '__main__':
    main()