Skip to content

Commit

Permalink
implement all vs all KM and unit test (#35)
Browse files Browse the repository at this point in the history
* implement all vs all KM and unit test

* don't cache results if no index is loaded
  • Loading branch information
rmillikin authored May 13, 2022
1 parent fa0b15a commit 63682cf
Show file tree
Hide file tree
Showing 4 changed files with 83 additions and 19 deletions.
3 changes: 2 additions & 1 deletion src/indexing/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@ def query_index(self, query: str) -> 'set[int]':
result = self._query_disk(tokens)

if len(result) < 10000 or len(tokens) > 1:
_place_in_mongo(query, result)
if os.path.exists(self._bin_path):
_place_in_mongo(query, result)

self._query_cache[query] = result

Expand Down
4 changes: 2 additions & 2 deletions src/server/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from redis import Redis
from rq import Queue
from flask_restful import Api
from workers.work import km_work, skim_work, triple_miner_work, update_index_work, clear_mongo_cache
from workers.work import km_work, km_work_all_vs_all, triple_miner_work, update_index_work, clear_mongo_cache
import logging
from flask_bcrypt import Bcrypt

Expand Down Expand Up @@ -104,7 +104,7 @@ def _get_km_job():
## ******** SKiM Post/Get ********
@_app.route('/skim/api/jobs/', methods=['POST'])
def _post_skim_job():
return _post_generic(skim_work, request)
return _post_generic(km_work_all_vs_all, request)

@_app.route('/skim/api/jobs/', methods=['GET'])
def _get_skim_job():
Expand Down
39 changes: 39 additions & 0 deletions src/tests/test_work.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import pytest
import os
import shutil
from indexing.index import Index
from indexing.index_builder import IndexBuilder
from workers import kinderminer as km
from indexing import km_util as util
from .test_index_building import data_dir
from workers import work
import workers.loaded_index as li

def test_skim_work(data_dir):
    """Rebuild the test index and run ``work.km_work_all_vs_all`` in both modes.

    The request is exercised twice: once without ``c_terms`` (KM-only, A-B
    results) and once with ``c_terms`` (SKiM, A-B-C results).

    Args:
        data_dir: fixture imported from ``test_index_building``; passed to
            ``util.get_index_dir``, ``IndexBuilder`` and ``Index`` as the
            data location.
    """
    index_dir = util.get_index_dir(data_dir)

    # delete the index if it exists already, so the build below starts clean
    if os.path.exists(index_dir):
        shutil.rmtree(index_dir)
    assert not os.path.exists(index_dir)

    # build the index
    indexer = IndexBuilder(data_dir)
    indexer.build_index()
    idx = Index(data_dir)

    # Point the worker's module-level loaded index at the freshly built one.
    # NOTE(review): these module globals are not restored after the test.
    li.pubmed_path = data_dir
    li.the_index = idx

    # KM-only request (no 'c_terms'): only A-B fields are expected back
    result = work.km_work_all_vs_all({'a_terms': ['cancer'], 'b_terms': ['test']})
    assert len(result) == 1
    assert 'c_term' not in result[0]
    assert result[0]['ab_count'] > 0

    # SKiM request with A-B-C terms: B-C fields are added to each result
    result = work.km_work_all_vs_all({'a_terms': ['cancer'], 'b_terms': ['test'], 'c_terms': ['coffee'], 'top_n': 50, 'ab_fet_threshold': 0.8})
    assert len(result) == 1
    assert result[0]['c_term'] == 'coffee'
    assert result[0]['ab_count'] > 0
    assert result[0]['bc_count'] > 0
56 changes: 40 additions & 16 deletions src/workers/work.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import math
import sys
import indexing.index
from rq import get_current_job, Queue
from rq.worker import Worker
Expand Down Expand Up @@ -49,15 +50,34 @@ def km_work(json: list):

return return_val

def skim_work(json: dict):
def km_work_all_vs_all(json: dict):
indexing.index._connect_to_mongo()
return_val = []
km_only = False

a_terms = json['a_terms']
b_terms = json['b_terms']
c_terms = json['c_terms']
top_n = json['top_n']
ab_fet_threshold = json['ab_fet_threshold']

if 'c_terms' in json:
# SKiM query
c_terms = json['c_terms']

top_n = json['top_n']
ab_fet_threshold = json['ab_fet_threshold']
else:
# KM query
km_only = True
c_terms = ['__KM_ONLY__'] # dummy variable

if 'top_n' in json:
top_n = json['top_n']
else:
top_n = sys.maxsize

if 'ab_fet_threshold' in json:
ab_fet_threshold = json['ab_fet_threshold']
else:
ab_fet_threshold = math.inf

if 'censor_year' in json:
censor_year = json['censor_year']
Expand Down Expand Up @@ -92,33 +112,37 @@ def skim_work(json: dict):
# take top N per a-b pair and run b-terms against c-terms
for i, c_term in enumerate(c_terms):
for ab in ab_results:
b_term = ab['b_term']
bc = km.kinderminer_search(b_term, c_term, li.the_index, censor_year, return_pmids)

abc_result = {
'a_term': ab['a_term'],
'b_term': ab['b_term'],
'c_term': c_term,

'ab_pvalue': ab['pvalue'],
'ab_sort_ratio': ab['sort_ratio'],
'ab_pred_score': km.get_prediction_score(ab['pvalue'], ab['sort_ratio']),

'bc_pvalue': bc['pvalue'],
'bc_sort_ratio': bc['sort_ratio'],
'bc_pred_score': km.get_prediction_score(bc['pvalue'], bc['sort_ratio']),

'a_count': ab['len(a_term_set)'],
'b_count': ab['len(b_term_set)'],
'c_count': bc['len(b_term_set)'],
'ab_count': ab['len(a_b_intersect)'],
'bc_count': bc['len(a_b_intersect)'],
'total_count': bc['n_articles']
'total_count': ab['n_articles']
}

if return_pmids:
abc_result['ab_pmid_intersection'] = str(ab['pmid_intersection'])
abc_result['bc_pmid_intersection'] = str(bc['pmid_intersection'])

# add c-terms and b-c term KM info (SKiM)
if not km_only:
b_term = ab['b_term']
bc = km.kinderminer_search(b_term, c_term, li.the_index, censor_year, return_pmids)

abc_result['c_term'] = c_term
abc_result['bc_pvalue'] = bc['pvalue']
abc_result['bc_sort_ratio'] = bc['sort_ratio']
abc_result['bc_pred_score'] = km.get_prediction_score(bc['pvalue'], bc['sort_ratio'])
abc_result['c_count'] = bc['len(b_term_set)']
abc_result['bc_count'] = bc['len(a_b_intersect)']

if return_pmids:
abc_result['bc_pmid_intersection'] = str(bc['pmid_intersection'])

return_val.append(abc_result)
_update_job_status('progress', i + 1)
Expand Down

0 comments on commit 63682cf

Please sign in to comment.