From 22d5c8a12972257a336603a94bfcf0e0ae066ab1 Mon Sep 17 00:00:00 2001 From: Robert Millikin Date: Fri, 22 Sep 2023 13:36:21 -0500 Subject: [PATCH] implement chi square test (#56) --- src/tests/test_kinderminer.py | 13 +++++++++++++ src/workers/kinderminer.py | 20 +++++++++++++++++--- src/workers/work.py | 5 +++-- 3 files changed, 33 insertions(+), 5 deletions(-) diff --git a/src/tests/test_kinderminer.py b/src/tests/test_kinderminer.py index 7ab9f4a..72f730a 100755 --- a/src/tests/test_kinderminer.py +++ b/src/tests/test_kinderminer.py @@ -24,6 +24,19 @@ def test_fisher_exact_test(): sort_ratio = km.get_sort_ratio(table) assert sort_ratio == pytest.approx(15 / 59) +def test_chisq_pvalue(): + table = [[10, 3000], [2000, 10000000]] + pvalue = km.chi_square(table) + assert pvalue == pytest.approx(2.583e-30, abs=1e-30) + + table = [[1, 3000], [2000, 10000000]] + pvalue = km.chi_square(table) + assert pvalue == 1 + + table = [[0, 100], [0, 10000000]] + pvalue = km.chi_square(table) + assert pvalue == 1 + def test_text_sanitation(): text = 'Testing123****.' sanitized_text = index.sanitize_term(text) diff --git a/src/workers/kinderminer.py b/src/workers/kinderminer.py index 8275bc7..a15dfd1 100755 --- a/src/workers/kinderminer.py +++ b/src/workers/kinderminer.py @@ -20,6 +20,14 @@ def get_contingency_table(a_term_set: set, b_term_set: set, total_n: int): def fisher_exact(table) -> float: return scipy.stats.fisher_exact(table, fet_sided)[1] +def chi_square(table) -> float: + try: + return scipy.stats.chi2_contingency(table, fet_sided)[1] + except ValueError: + # default to a p-value of 1.0 + # this happens if the sum of a row or column is 0 + return 1.0 + def get_sort_ratio(table) -> float: denom = (table[0][0] + table[1][0]) if denom == 0: @@ -27,7 +35,9 @@ def get_sort_ratio(table) -> float: return table[0][0] / denom -def kinderminer_search(a_term: str, b_term: str, idx: Index, censor_year = math.inf, return_pmids = False, top_n_articles = math.inf) -> dict: +def kinderminer_search(a_term: str, b_term: str, idx: Index, censor_year = math.inf, + return_pmids = False, top_n_articles = math.inf, + scoring = 'fet') -> dict: """""" start_time = time.perf_counter() result = dict() @@ -48,8 +58,12 @@ def kinderminer_search(a_term: str, b_term: str, idx: Index, censor_year = math. n_a_and_b = table[0][0] n_articles = idx.n_articles(censor_year) - # perform fisher's exact test - pvalue = fisher_exact(table) + # perform statistical test (default fisher's exact test) + if scoring == 'chi-square': + pvalue = chi_square(table) + else: # 'fet' + pvalue = fisher_exact(table) + sort_ratio = get_sort_ratio(table) run_time = time.perf_counter() - start_time diff --git a/src/workers/work.py b/src/workers/work.py index e413067..780d557 100644 --- a/src/workers/work.py +++ b/src/workers/work.py @@ -67,6 +67,7 @@ def km_work_all_vs_all(json: dict): a_terms = json['a_terms'] b_terms = json['b_terms'] + scoring = json.get('scoring', 'fet') if 'c_terms' in json: # SKiM query @@ -107,7 +108,7 @@ def km_work_all_vs_all(json: dict): b_term = li.the_index.get_highest_priority_term(b_term_set, b_term_token_dict) b_term_set.remove(b_term) - res = km.kinderminer_search(a_term, b_term, li.the_index, censor_year, return_pmids, top_n_articles) + res = km.kinderminer_search(a_term, b_term, li.the_index, censor_year, return_pmids, top_n_articles, scoring) if res['pvalue'] <= ab_fet_threshold: ab_results.append(res) @@ -189,7 +190,7 @@ def km_work_all_vs_all(json: dict): if b_term == c_term: continue - bc = km.kinderminer_search(b_term, c_term, li.the_index, censor_year, return_pmids, top_n_articles) + bc = km.kinderminer_search(b_term, c_term, li.the_index, censor_year, return_pmids, top_n_articles, scoring) abc_result['c_term'] = c_term abc_result['bc_pvalue'] = bc['pvalue']