Skip to content

Commit

Permalink
Fix B-term crowdout (#50)
Browse the repository at this point in the history
* fix B-term crowdout

* add assert
  • Loading branch information
rmillikin authored Feb 2, 2023
1 parent bcb1cc5 commit 6f99bea
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 2 deletions.
5 changes: 4 additions & 1 deletion src/tests/test_work.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,4 +63,7 @@ def test_skim_work(data_dir):
assert result[0]['c_count'] == 10
assert result[0]['bc_count'] == 2
assert result[0]['ab_pmid_intersection'] == [34579798, 34579095, 34579733]
assert result[0]['bc_pmid_intersection'] == [34580748, 34578919]
assert result[0]['bc_pmid_intersection'] == [34580748, 34578919]

result = work.km_work_all_vs_all({'a_terms': ['cancer'], 'b_terms': ['carcinoma', 'downregulation'], 'c_terms': ['crop'], 'top_n': 1, 'ab_fet_threshold': 0.3, 'bc_fet_threshold': 0.3})
assert len(result) == 1
13 changes: 12 additions & 1 deletion src/workers/work.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ def km_work_all_vs_all(json: dict):
km.get_prediction_score(res['pvalue'], res['sort_ratio']),
reverse=True)

ab_results = ab_results[:top_n]
ab_results = ab_results[:top_n + 20]

# RAM efficiency. decache unneeded tokens/terms
b_terms_used = set([ab_res['b_term'] for ab_res in ab_results])
Expand Down Expand Up @@ -202,6 +202,17 @@ def km_work_all_vs_all(json: dict):

c_term_n += 1

if top_n < sys.maxsize:
# Sometimes A-B pairs with high prediction scores but no B-C pairs will
# crowd out lower-scoring A-B pairs that do have B-C pairs. We want to
# ignore the former in favor of including the latter. We kept 20 extra
# B-terms above as padding; now we filter the results down to the first
# N B-terms (in A-B rank order) that actually appear in the results.
top_n_b = [x['b_term'] for x in ab_results]
abc_bs = set([x['b_term'] for x in return_val])
top_n_b = set([x for x in top_n_b if x in abc_bs][:top_n])
return_val = [x for x in return_val if x['b_term'] in top_n_b]

_update_job_status('progress', 1.0000)
return return_val

Expand Down

0 comments on commit 6f99bea

Please sign in to comment.