From e4e0c113ca0d105847ee400e1e73fc77bf5986c9 Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Fri, 9 Sep 2016 02:10:47 -0400 Subject: [PATCH] prevents duplicates in simstring database --- install.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/install.py b/install.py index 8858f66..8b60783 100644 --- a/install.py +++ b/install.py @@ -92,11 +92,15 @@ def parse_and_encode_ngrams(extracted_it, simstring_dir, cuisty_dir): mkdir(cuisty_dir) ss_db = SimstringDBWriter(simstring_dir) - cuisty_db = CuiSemTypesDB(cuisty_dir) + simstring_terms = set() + for i, (term, cui, stys, preferred) in enumerate(extracted_it, start=1): - ss_db.insert(term) + if term not in simstring_terms: + ss_db.insert(term) + simstring_terms.add(term) + cuisty_db.insert(term, cui, stys, preferred)