diff --git a/src/indexing/index.py b/src/indexing/index.py
index f3e21be..360c10f 100644
--- a/src/indexing/index.py
+++ b/src/indexing/index.py
@@ -54,7 +54,8 @@ def query_index(self, query: str) -> 'set[int]':
         result = self._query_disk(tokens)
 
         if len(result) < 10000 or len(tokens) > 1:
-            _place_in_mongo(query, result)
+            if os.path.exists(self._bin_path):
+                _place_in_mongo(query, result)
 
         self._query_cache[query] = result
 
diff --git a/src/server/app.py b/src/server/app.py
index d20ef10..98442b6 100644
--- a/src/server/app.py
+++ b/src/server/app.py
@@ -4,7 +4,7 @@
 from redis import Redis
 from rq import Queue
 from flask_restful import Api
-from workers.work import km_work, skim_work, triple_miner_work, update_index_work, clear_mongo_cache
+from workers.work import km_work, km_work_all_vs_all, triple_miner_work, update_index_work, clear_mongo_cache
 import logging
 from flask_bcrypt import Bcrypt
 
@@ -104,7 +104,7 @@ def _get_km_job():
 ## ******** SKiM Post/Get ********
 @_app.route('/skim/api/jobs/', methods=['POST'])
 def _post_skim_job():
-    return _post_generic(skim_work, request)
+    return _post_generic(km_work_all_vs_all, request)
 
 @_app.route('/skim/api/jobs/', methods=['GET'])
 def _get_skim_job():
diff --git a/src/tests/test_work.py b/src/tests/test_work.py
new file mode 100644
index 0000000..3ab2ed0
--- /dev/null
+++ b/src/tests/test_work.py
@@ -0,0 +1,39 @@
+import pytest
+import os
+import shutil
+from indexing.index import Index
+from indexing.index_builder import IndexBuilder
+from workers import kinderminer as km
+from indexing import km_util as util
+from .test_index_building import data_dir
+from workers import work
+import workers.loaded_index as li
+
+def test_skim_work(data_dir):
+    index_dir = util.get_index_dir(data_dir)
+
+    # delete the index if it exists already
+    if os.path.exists(index_dir):
+        shutil.rmtree(index_dir)
+    assert not os.path.exists(index_dir)
+
+    # build the index
+    indexer = IndexBuilder(data_dir)
+    indexer.build_index()
+    idx = Index(data_dir)
+
+    li.pubmed_path = data_dir
+    li.the_index = idx
+
+    # test SKiM with only A-B terms
+    result = work.km_work_all_vs_all({'a_terms': ['cancer'], 'b_terms': ['test']})
+    assert len(result) == 1
+    assert 'c_term' not in result[0]
+    assert result[0]['ab_count'] > 0
+
+    # test SKiM with A-B-C terms
+    result = work.km_work_all_vs_all({'a_terms': ['cancer'], 'b_terms': ['test'], 'c_terms': ['coffee'], 'top_n': 50, 'ab_fet_threshold': 0.8})
+    assert len(result) == 1
+    assert result[0]['c_term'] == 'coffee'
+    assert result[0]['ab_count'] > 0
+    assert result[0]['bc_count'] > 0
\ No newline at end of file
diff --git a/src/workers/work.py b/src/workers/work.py
index 77d0a9a..4c83714 100644
--- a/src/workers/work.py
+++ b/src/workers/work.py
@@ -1,4 +1,5 @@
 import math
+import sys
 import indexing.index
 from rq import get_current_job, Queue
 from rq.worker import Worker
@@ -49,15 +50,34 @@ def km_work(json: list):
     return return_val
 
 
-def skim_work(json: dict):
+def km_work_all_vs_all(json: dict):
     indexing.index._connect_to_mongo()
     return_val = []
+    km_only = False
 
     a_terms = json['a_terms']
     b_terms = json['b_terms']
-    c_terms = json['c_terms']
-    top_n = json['top_n']
-    ab_fet_threshold = json['ab_fet_threshold']
+
+    if 'c_terms' in json:
+        # SKiM query
+        c_terms = json['c_terms']
+
+        top_n = json['top_n']
+        ab_fet_threshold = json['ab_fet_threshold']
+    else:
+        # KM query
+        km_only = True
+        c_terms = ['__KM_ONLY__'] # dummy variable
+
+        if 'top_n' in json:
+            top_n = json['top_n']
+        else:
+            top_n = sys.maxsize
+
+        if 'ab_fet_threshold' in json:
+            ab_fet_threshold = json['ab_fet_threshold']
+        else:
+            ab_fet_threshold = math.inf
 
     if 'censor_year' in json:
         censor_year = json['censor_year']
@@ -92,33 +112,37 @@
     # take top N per a-b pair and run b-terms against c-terms
     for i, c_term in enumerate(c_terms):
         for ab in ab_results:
-            b_term = ab['b_term']
-            bc = km.kinderminer_search(b_term, c_term, li.the_index, censor_year, return_pmids)
-
             abc_result = {
                 'a_term': ab['a_term'],
                 'b_term': ab['b_term'],
-                'c_term': c_term,
                 'ab_pvalue': ab['pvalue'],
                 'ab_sort_ratio': ab['sort_ratio'],
                 'ab_pred_score': km.get_prediction_score(ab['pvalue'], ab['sort_ratio']),
 
-                'bc_pvalue': bc['pvalue'],
-                'bc_sort_ratio': bc['sort_ratio'],
-                'bc_pred_score': km.get_prediction_score(bc['pvalue'], bc['sort_ratio']),
-
                 'a_count': ab['len(a_term_set)'],
                 'b_count': ab['len(b_term_set)'],
-                'c_count': bc['len(b_term_set)'],
                 'ab_count': ab['len(a_b_intersect)'],
-                'bc_count': bc['len(a_b_intersect)'],
-                'total_count': bc['n_articles']
+                'total_count': ab['n_articles']
             }
 
             if return_pmids:
                 abc_result['ab_pmid_intersection'] = str(ab['pmid_intersection'])
-                abc_result['bc_pmid_intersection'] = str(bc['pmid_intersection'])
+
+            # add c-terms and b-c term KM info (SKiM)
+            if not km_only:
+                b_term = ab['b_term']
+                bc = km.kinderminer_search(b_term, c_term, li.the_index, censor_year, return_pmids)
+
+                abc_result['c_term'] = c_term
+                abc_result['bc_pvalue'] = bc['pvalue']
+                abc_result['bc_sort_ratio'] = bc['sort_ratio']
+                abc_result['bc_pred_score'] = km.get_prediction_score(bc['pvalue'], bc['sort_ratio'])
+                abc_result['c_count'] = bc['len(b_term_set)']
+                abc_result['bc_count'] = bc['len(a_b_intersect)']
+
+                if return_pmids:
+                    abc_result['bc_pmid_intersection'] = str(bc['pmid_intersection'])
 
             return_val.append(abc_result)
 
         _update_job_status('progress', i + 1)
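
For reference, a minimal sketch (not part of the diff) of the two payload shapes the renamed worker accepts. Values are copied from the new test; the exact behavior of optional keys such as 'censor_year' and any wrapping done by _post_generic are assumptions based only on the hunks above.

    # Hypothetical example payloads, illustrating the branch added to km_work_all_vs_all.

    # KM-only job: no 'c_terms', so the worker falls back to
    # top_n = sys.maxsize and ab_fet_threshold = math.inf.
    km_payload = {'a_terms': ['cancer'], 'b_terms': ['test']}

    # SKiM job: when 'c_terms' is present, 'top_n' and 'ab_fet_threshold'
    # are read unconditionally and must be supplied.
    skim_payload = {
        'a_terms': ['cancer'],
        'b_terms': ['test'],
        'c_terms': ['coffee'],
        'top_n': 50,
        'ab_fet_threshold': 0.8,
    }

    # Either dict can be passed directly to work.km_work_all_vs_all (as in the
    # test), and the /skim/api/jobs/ POST handler now dispatches to the same
    # function via _post_generic.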