Initialization
kenchan0226 committed Jun 27, 2022
1 parent 5181e8e commit 13074f4
Showing 1 changed file with 190 additions and 0 deletions.
dataset_extractive_fragment_stat.py (190 additions, 0 deletions)
@@ -0,0 +1,190 @@
import os
import json
from os.path import join
import re
import argparse
import numpy as np
#import textstat
#from summaqa.summaqa import QG_masked
#from utils.cost import WordBasedDifficulty, WikiRelativeWordFrequency
import pickle as pkl
from utils.string_helper import _make_n_gram
from collections import Counter


def _make_n_gram_phrase(sequence, n=2):
    return (' '.join(sequence[i:i+n]) for i in range(len(sequence)-(n-1)))


def count_matched(phrase_list, phrase_dict):
    num_matched = 0
    for phrase in phrase_list:
        if phrase in phrase_dict:
            num_matched += 1
    return num_matched


def compute_extractive_fragment(A, S):
"""
:param A: word list of article
:param S: word list of summary
:return: F: a list of word list, each word list is an extractive fragment
"""
F = []
i = 0
j = 0
while i < len(S):
f = []
while j < len(A):
if S[i] == A[j]:
i_pie = i
j_pie = j
while S[i_pie] == A[j_pie]:
i_pie += 1
j_pie += 1
if i_pie >= len(S) or j_pie >= len(A):
break
if len(f) <= i_pie - i:
f = S[i:i_pie]
j = j_pie
else:
j += 1
i = i + max(len(f), 1)
j = 1
if len(f) > 0:
F.append(f)
return F
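

# Illustrative sanity check (not part of the original script): a minimal,
# hypothetical example of the fragment extraction above on a toy
# article/summary pair.
def _demo_compute_extractive_fragment():
    article = "the quick brown fox jumps over the lazy dog".split()
    summary = "the quick brown fox is lazy".split()
    fragments = compute_extractive_fragment(article, summary)
    # expected shared spans: ["the", "quick", "brown", "fox"] and ["lazy"]
    print(fragments)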


def compute_extractive_fragment_density(ext_fragment_list, S):
    # ext_fragment_list = compute_extractive_fragment(A, S)
    density = sum([len(f)**2 for f in ext_fragment_list]) / len(S)
    return density


def compute_extractive_fragment_coverage(ext_fragment_list, S):
    # ext_fragment_list = compute_extractive_fragment(A, S)
    coverage = sum([len(f) for f in ext_fragment_list]) / len(S)
    return coverage
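

# Illustrative sanity check (not part of the original script): density and
# coverage for a hypothetical fragment list against a 6-token summary.
def _demo_density_and_coverage():
    summary = "the quick brown fox is lazy".split()
    fragments = [["the", "quick", "brown", "fox"], ["lazy"]]
    print(compute_extractive_fragment_density(fragments, summary))  # (4**2 + 1**2) / 6 ~ 2.833
    print(compute_extractive_fragment_coverage(fragments, summary))  # (4 + 1) / 6 ~ 0.833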


def n_gram_novelty(pred_n_gram_counter, src_n_gram_counter):
    all_pred_n_grams = pred_n_gram_counter.keys()
    num_unique_pred_n_grams = len(all_pred_n_grams)
    num_unique_novel_pred_n_grams = 0
    num_pred_n_grams = sum(pred_n_gram_counter.values())
    num_novel_pred_n_grams = 0
    for n_gram, cnt in pred_n_gram_counter.items():
        if n_gram not in src_n_gram_counter:
            num_unique_novel_pred_n_grams += 1
            num_novel_pred_n_grams += cnt
    unique_novel_n_gram_fraction = num_unique_novel_pred_n_grams / num_unique_pred_n_grams
    novel_n_gram_fraction = num_novel_pred_n_grams / num_pred_n_grams
    return unique_novel_n_gram_fraction, novel_n_gram_fraction
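

# Illustrative sanity check (not part of the original script): bigram novelty
# of a hypothetical prediction against its source, using the local
# _make_n_gram_phrase helper to build the n-gram counters.
def _demo_n_gram_novelty():
    src_words = "the cat sat on the mat".split()
    pred_words = "the cat lay on the mat".split()
    src_bigrams = Counter(_make_n_gram_phrase(src_words, n=2))
    pred_bigrams = Counter(_make_n_gram_phrase(pred_words, n=2))
    # "cat lay" and "lay on" are unseen in the source, so both fractions are 2/5
    print(n_gram_novelty(pred_bigrams, src_bigrams))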


def _count_data(path):
""" count number of data in the given path"""
matcher = re.compile(r'[0-9]+\.json')
match = lambda name: bool(matcher.match(name))
names = os.listdir(path)
n_data = len(list(filter(match, names)))
return n_data

def main(data_dir, split):
    split_dir = join(data_dir, split)
    n_data = _count_data(split_dir)

    total_num_samples = 0
    total_num_summary_tokens = 0
    total_num_summary_sents = 0
    total_num_doc_tokens = 0
    total_num_doc_sents = 0

    all_ext_frag_density = []
    all_ext_frag_coverage = []
    abs_bin_counter = np.array([0, 0, 0])

    for i in range(n_data):
        js = json.load(open(join(split_dir, '{}.json'.format(i))))
        if js['article'] and js['abstract']:
            total_num_samples += 1
            doc_sent_list = js['article']
            num_doc_sents = len(doc_sent_list)
            doc_str = ' '.join(doc_sent_list).lower()
            doc_word_list = doc_str.split(' ')
            num_doc_tokens = len(doc_word_list)

            summary_sent_list = js['abstract']
            num_summary_sents = len(summary_sent_list)
            summary_str = ' '.join(summary_sent_list).lower()
            summary_word_list = summary_str.split(' ')
            num_summary_tokens = len(summary_word_list)

            # ext_fragment_list = compute_extractive_fragment(doc_word_list, summary_word_list)

            all_ext_frag_density.append(float(js["extractive_fragment_density"]))
            all_ext_frag_coverage.append(float(js["extractive_fragment_coverage"]))

            total_num_summary_tokens += num_summary_tokens
            total_num_doc_tokens += num_doc_tokens
            total_num_summary_sents += num_summary_sents
            total_num_doc_sents += num_doc_sents

            # assign the sample to an abstractiveness bin by its extractive
            # fragment density (lower density means a more abstractive summary)
            if js["extractive_fragment_density"] <= 1.3:
                abs_bin = 2
            elif js["extractive_fragment_density"] <= 3.3:
                abs_bin = 1
            else:
                abs_bin = 0
            abs_bin_counter[abs_bin] += 1
            """
            if js["extractive_fragment_density"] <= 2.0:
                abs_bin = 1
            else:
                abs_bin = 0
            abs_bin_counter[abs_bin] += 1
            """

    avg_summary_tokens = total_num_summary_tokens / total_num_samples

    print("avg # tokens in summary:\t{:.3f}".format(avg_summary_tokens))
    print("avg # tokens in document:\t{:.3f}".format(total_num_doc_tokens / total_num_samples))
    print("avg # sentences in summary:\t{:.3f}".format(total_num_summary_sents / total_num_samples))
    print("avg ext. frag. density:\t{:.3f}".format(sum(all_ext_frag_density) / total_num_samples))

    print("abs bin count")
    print(abs_bin_counter)
    abs_bin_counter_normalized = abs_bin_counter / total_num_samples
    print(abs_bin_counter_normalized)

#print("extractive fragment density")
#all_ext_frag_density = np.array(all_ext_frag_density)
#hist, bins = np.histogram(all_ext_frag_density, bins=[0, 1.5, 2.5, 100], density=False)
#print(bins)
#print(hist)
#print("extractive fragment coverage")
#hist, bins = np.histogram(all_ext_frag_density, bins=[0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 10], density=False)
#print(bins)
#print(hist)

    # save the per-sample extractive fragment density array as a pickle file
    all_ext_frag_density = np.array(all_ext_frag_density)
    with open(join(data_dir, 'ext_frag_density_{}.pkl'.format(split)), 'wb') as f:
        pkl.dump(all_ext_frag_density, f, pkl.HIGHEST_PROTOCOL)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Compute extractive fragment statistics for a dataset split.'
    )
    parser.add_argument('-data_dir', type=str, action='store',
                        help='The directory of the data.')
    parser.add_argument('-split', type=str, action='store',
                        help='train, val, or test.')

    args = parser.parse_args()

    main(args.data_dir, args.split)
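
# Example invocation (assumed data layout: <data_dir>/<split>/{0..N-1}.json,
# each file containing "article", "abstract", "extractive_fragment_density",
# and "extractive_fragment_coverage" fields):
#
#   python dataset_extractive_fragment_stat.py -data_dir ./data -split train
#
# The script prints corpus-level statistics and writes
# <data_dir>/ext_frag_density_<split>.pkl with the per-sample density values.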
