Commit 809310d (1 parent: 987ac58)
Showing 9 changed files with 289 additions and 5 deletions.
@@ -0,0 +1,36 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c18d1940-9f34-4d23-bad0-feec22fc11b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f2848639-6a7a-4861-b482-6b52ce873b51",
   "metadata": {},
   "outputs": [],
   "source": [
"actual = pd.read_csv('data/spacy_tokenized')" | ||
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "573",
   "language": "python",
   "name": "573"
  },
  "language_info": {
   "name": ""
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
@@ -0,0 +1,226 @@
#!/usr/bin/env python
# coding: utf-8

from __future__ import division, print_function, unicode_literals
import numpy as np
import spacy
import pandas as pd
import NER_Model, Negation_Analyser

import sys
from collections import defaultdict

def split_tag(chunk_tag):
    """
    split chunk tag into IOB prefix and chunk_type
    e.g.
    B-PER -> (B, PER)
    O -> (O, None)
    """
    if chunk_tag == 'O':
        return ('O', None)
    if "N-" in chunk_tag:
        # negation-aware tags such as I-N-PER keep the N- marker in the type
        return chunk_tag[0], "N-" + chunk_tag.split("-")[2]
    return chunk_tag.split('-', maxsplit=1)
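
# Illustration (derived from split_tag above): the negation-aware tags used
# later in this script keep their N- marker as part of the chunk type:
#   split_tag('I-N-PER') -> ('I', 'N-PER')
#   split_tag('I-PER')   -> ('I', 'PER')
#   split_tag('O')       -> ('O', None)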

def is_chunk_end(prev_tag, tag):
    """
    check if the previous chunk ended between the previous and current word
    e.g.
    (B-PER, I-PER) -> False
    (B-LOC, O) -> True
    Note: in case of contradicting tags, e.g. (B-PER, I-LOC)
    this is considered as (B-PER, B-LOC)
    """
    prefix1, chunk_type1 = split_tag(prev_tag)
    prefix2, chunk_type2 = split_tag(tag)

    if prefix1 == 'O':
        return False
    if prefix2 == 'O':
        return prefix1 != 'O'

    return chunk_type1 != chunk_type2


def is_chunk_start(prev_tag, tag):
    """
    check if a new chunk started between the previous and current word
    """
    prefix1, chunk_type1 = split_tag(prev_tag)
    prefix2, chunk_type2 = split_tag(tag)

    if prefix2 == 'O':
        return False
    if prefix1 == 'O':
        return prefix2 != 'O'

    return chunk_type1 != chunk_type2


def calc_metrics(tp, p, t, percent=True):
    """
    compute overall precision, recall and FB1 (default values are 0.0)
    if percent is True, return 100 * original decimal value
    """
    precision = tp / p if p else 0
    recall = tp / t if t else 0
    fb1 = 2 * precision * recall / (precision + recall) if precision + recall else 0
    if percent:
        return 100 * precision, 100 * recall, 100 * fb1
    return precision, recall, fb1
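
# Worked check with hypothetical counts: 8 correct chunks against 10 predicted
# and 12 true chunks gives (values rounded)
#   calc_metrics(8, 10, 12) -> (80.0, 66.67, 72.73)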

def count_chunks(true_seqs, pred_seqs):
    """
    true_seqs: a list of true tags
    pred_seqs: a list of predicted tags
    return:
    correct_chunks: a dict (counter),
        key = chunk types,
        value = number of correctly identified chunks per type
    true_chunks: a dict, number of true chunks per type
    pred_chunks: a dict, number of identified chunks per type
    correct_counts, true_counts, pred_counts: similar to above, but for tags
    """
    correct_chunks = defaultdict(int)
    true_chunks = defaultdict(int)
    pred_chunks = defaultdict(int)

    correct_counts = defaultdict(int)
    true_counts = defaultdict(int)
    pred_counts = defaultdict(int)

    prev_true_tag, prev_pred_tag = 'O', 'O'
    correct_chunk = None

    for true_tag, pred_tag in zip(true_seqs, pred_seqs):
        if true_tag == pred_tag:
            correct_counts[true_tag] += 1
        true_counts[true_tag] += 1
        pred_counts[pred_tag] += 1

        _, true_type = split_tag(true_tag)
        _, pred_type = split_tag(pred_tag)

        if correct_chunk is not None:
            true_end = is_chunk_end(prev_true_tag, true_tag)
            pred_end = is_chunk_end(prev_pred_tag, pred_tag)

            if pred_end and true_end:
                correct_chunks[correct_chunk] += 1
                correct_chunk = None
            elif pred_end != true_end or true_type != pred_type:
                correct_chunk = None

        true_start = is_chunk_start(prev_true_tag, true_tag)
        pred_start = is_chunk_start(prev_pred_tag, pred_tag)

        if true_start and pred_start and true_type == pred_type:
            correct_chunk = true_type
        if true_start:
            true_chunks[true_type] += 1
        if pred_start:
            pred_chunks[pred_type] += 1

        prev_true_tag, prev_pred_tag = true_tag, pred_tag
    if correct_chunk is not None:
        correct_chunks[correct_chunk] += 1

    return (correct_chunks, true_chunks, pred_chunks,
            correct_counts, true_counts, pred_counts)
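
# Tiny illustration on toy sequences: for true = pred = ['I-PER', 'O'],
# count_chunks returns (as defaultdicts)
#   correct_chunks = {'PER': 1}, true_chunks = {'PER': 1}, pred_chunks = {'PER': 1}
#   correct_counts = {'I-PER': 1, 'O': 1}, with true_counts and pred_counts identical.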

def get_result(correct_chunks, true_chunks, pred_chunks,
               correct_counts, true_counts, pred_counts, verbose=True):
    """
    if verbose, print overall performance, as well as performance per chunk type;
    otherwise, simply return overall prec, rec, f1 scores
    """
    # sum counts
    sum_correct_chunks = sum(correct_chunks.values())
    sum_true_chunks = sum(true_chunks.values())
    sum_pred_chunks = sum(pred_chunks.values())

    sum_correct_counts = sum(correct_counts.values())
    sum_true_counts = sum(true_counts.values())

    nonO_correct_counts = sum(v for k, v in correct_counts.items() if k != 'O')
    nonO_true_counts = sum(v for k, v in true_counts.items() if k != 'O')

    chunk_types = sorted(set(true_chunks) | set(pred_chunks))

    # compute overall precision, recall and FB1 (default values are 0.0)
    prec, rec, f1 = calc_metrics(sum_correct_chunks, sum_pred_chunks, sum_true_chunks)
    res = (prec, rec, f1)
    if not verbose:
        return res

    # print overall performance, and performance per chunk type
    print("processed %i tokens with %i phrases; " % (sum_true_counts, sum_true_chunks), end='')
    print("found: %i phrases; correct: %i.\n" % (sum_pred_chunks, sum_correct_chunks), end='')

    print("accuracy: %6.2f%%; (non-O)" % (100 * nonO_correct_counts / nonO_true_counts))
    print("accuracy: %6.2f%%; " % (100 * sum_correct_counts / sum_true_counts), end='')
    print("precision: %6.2f%%; recall: %6.2f%%; FB1: %6.2f" % (prec, rec, f1))

    # for each chunk type, compute precision, recall and FB1 (default values are 0.0)
    for t in chunk_types:
        prec, rec, f1 = calc_metrics(correct_chunks[t], pred_chunks[t], true_chunks[t])
        print("%17s: " % t, end='')
        print("precision: %6.2f%%; recall: %6.2f%%; FB1: %6.2f" %
              (prec, rec, f1), end='')
        print(" %d" % pred_chunks[t])

    # you could also generate LaTeX output for tables like in
    # http://cnts.uia.ac.be/conll2003/ner/example.tex
    # but that is not implemented here
    return res


def evaluate(true_seqs, pred_seqs, verbose=True):
    (correct_chunks, true_chunks, pred_chunks,
     correct_counts, true_counts, pred_counts) = count_chunks(true_seqs, pred_seqs)
    result = get_result(correct_chunks, true_chunks, pred_chunks,
                        correct_counts, true_counts, pred_counts, verbose=verbose)
    return result
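
# Minimal usage sketch on hypothetical toy sequences (IO scheme, as produced
# by the B- -> I- replacement below):
#   gold = ['I-PER', 'I-PER', 'O', 'I-LOC']
#   pred = ['I-PER', 'O', 'O', 'I-LOC']
#   evaluate(gold, pred)
# Only the LOC chunk matches exactly (the predicted PER chunk is too short),
# so overall precision, recall and FB1 all come out at 50.00.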

# Evaluate the NER model on the spaCy-tokenized validation data.
# The B- -> I- replacement collapses BIO tags to an IO scheme, and negated
# gold entities are re-tagged with an I-N- prefix before scoring.
validation_df = pd.read_csv('data/spacy_tokenized.csv')
validation_df = validation_df.replace('B-', 'I-', regex=True)
sentences = []
sent = []
tok_tag_act = []
tokens = []
actual_tags = []
predicted_tags = []
NER_Model.main()
for index, row in validation_df.iterrows():
    # a blank Tokens cell marks a sentence boundary: score the sentence
    # collected so far, then reset the buffers
    if pd.isna(row['Tokens']):
        print(' '.join(sent))
        print(f"Actual : {tok_tag_act}")
        predicted_ner = NER_Model.evaluate_results(sent)
        pred_tok_tag = Negation_Analyser.predict(predicted_ner, "tags")
        for tag in pred_tok_tag['token-tags']:
            predicted_tags.append(tag[1])
        print(f"Predicted : {pred_tok_tag}")
        sent = []
        tok_tag_act = []
    else:
        sent.append(row['Tokens'])
        if row['is_negative'] is True:
            tok_tag_act.append((row['Tokens'], row['Tags'].replace('I-', 'I-N-')))
            actual_tags.append(row['Tags'].replace('I-', 'I-N-'))
        else:
            actual_tags.append(row['Tags'])
            tok_tag_act.append((row['Tokens'], row['Tags']))

evaluate(actual_tags, predicted_tags)
@@ -26,4 +26,3 @@ dependencies:
- graphviz
- pip:
  - contextualSpellCheck
- spacy-model-en_core_web_sm