diff --git a/src/indexing/km_util.py b/src/indexing/km_util.py
index 24271bc..5d298df 100644
--- a/src/indexing/km_util.py
+++ b/src/indexing/km_util.py
@@ -57,6 +57,15 @@ def write_all_lines(path: str, items: 'list[str]') -> None:
 def get_tokens(text: str) -> 'list[str]':
     l_text = text.lower()
     tokens = tokenizer.tokenize(l_text)
+
+    # Split tokens on underscores so that "foo_bar" is indexed as "foo" and
+    # "bar". str.split('_') returns '' for a leading, trailing, or doubled
+    # underscore, so empty pieces are dropped rather than kept as tokens.
+    if '_' in text:
+        tokens = [
+            piece for token in tokens for piece in token.split('_') if piece
+        ]
+
     return tokens
 
 def sanitize_text(text: str) -> str:
diff --git a/src/tests/test_index_building.py b/src/tests/test_index_building.py
index 3f641f2..2876dbf 100755
--- a/src/tests/test_index_building.py
+++ b/src/tests/test_index_building.py
@@ -23,7 +23,7 @@ def delete_existing_index(data_dir):
     assert not os.path.exists(index_dir)
 
 def test_tokenization():
-    text = "The quick brown fox jumped over the lazy dog."
+    text = "The_quick brown fox jumped over the lazy dog."
     tokens = util.get_tokens(text)
 
     assert "the" in tokens