diff --git a/.gitattributes b/.gitattributes
index 4d7d7ba4..ace7876e 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -5,3 +5,4 @@ sadedegel/prebuilt/model/*.joblib filter=lfs diff=lfs merge=lfs -text
 sadedegel/bblock/data/bert/vocabulary.hdf5 filter=lfs diff=lfs merge=lfs -text
 sadedegel/bblock/data/icu/vocabulary.hdf5 filter=lfs diff=lfs merge=lfs -text
 sadedegel/bblock/data/simple/vocabulary.hdf5 filter=lfs diff=lfs merge=lfs -text
+sadedegel/summarize/model/*.joblib filter=lfs diff=lfs merge=lfs -text
diff --git a/prod.requirements.txt b/prod.requirements.txt
index e1401bfc..b0593dc0 100644
--- a/prod.requirements.txt
+++ b/prod.requirements.txt
@@ -14,4 +14,7 @@ sadedegel-icu
 requests
 rich
 cached-property
-h5py>=3.1.0,<=3.2.1
\ No newline at end of file
+h5py>=3.1.0,<=3.2.1
+
+lightgbm
+randomname
diff --git a/sadedegel/default.ini b/sadedegel/default.ini
index c7a74d75..6b380e5c 100644
--- a/sadedegel/default.ini
+++ b/sadedegel/default.ini
@@ -35,4 +35,4 @@ method = smooth
 [bm25]
 k1 = 1.25
 b = 0.75
-delta = 0
\ No newline at end of file
+delta = 0
diff --git a/sadedegel/summarize/README.md b/sadedegel/summarize/README.md
index 4b66d627..402cda1d 100644
--- a/sadedegel/summarize/README.md
+++ b/sadedegel/summarize/README.md
@@ -10,6 +10,64 @@
 by recoding the **Round** of each sentences in which it is eliminated. Later a sentence is eliminated,
 higher its relative score is within a given news document.
 
+## Summarizer Usage
+
+SadedeGel summarizers share the same interface.
+
+First, a `sadedegel.summarize.ExtractiveSummarizer` instance is constructed.
+```python
+from sadedegel.summarize import LengthSummarizer, TFIDFSummarizer, DecomposedKMeansSummarizer
+
+lsum = LengthSummarizer(normalize=True)
+tfidf_sum = TFIDFSummarizer(normalize=True)
+kmsum = DecomposedKMeansSummarizer(n_components=200, n_clusters=10)
+```
+
+Next, create a `sadedegel.Doc` instance from the single document to be summarized.
+```python
+from sadedegel import Doc
+
+d = Doc("ABD'li yayın organı New York Times, yaklaşık 3 ay içinde kullanıcı sayısını sıfırdan milyonlara çıkaran kelime oyunu Wordle’ı satın aldığını duyurdu. New York Times kısa bir süre önce de spor haberleri sitesi The Athletic'i satın almak için 550 milyon doları gözden çıkarmış ve bu satın alma ile birlikte 1.2 milyon abone kazanmıştı. ...")
+```
+
+To obtain a summary of k sentences (where k < n_sentences), call the instance with a `Doc` object or a `List[Sentences]`.
+
+```python
+summary1 = lsum(d, k=2)
+summary2 = tfidf_sum(d, k=4)
+summary3 = kmsum(d, k=5)
+```
+Alternatively, you can obtain the relevance scores of all sentences, which are used to rank them before selecting the top k sentences.
+
+```python
+relevance_scores = kmsum.predict(d)
+```
+
+### Supervised Ranker
+All sadedegel summarizers use either unsupervised or rule-based methods to rank sentences before extracting the top k as the summary. This release adds a ranker model trained on the **SadedeGel Annotated Corpus**, which contains documents where each sentence carries a relevance label assigned by human annotators through a process of repetitive elimination.
+
+The ranker uses document-sentence embedding pairs from transformer-based pre-trained models as features. Future releases will accommodate BoW-based and decomposition-based embeddings as well.
+The pre-trained embedding types supported by sadedegel are `bert_32k_cased`, `bert_128k_cased`, `bert_32k_uncased`, `bert_128k_uncased` and `distilbert`.
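+
+Under the hood, each sentence is scored from the concatenation of the document embedding and that sentence's own embedding. Below is a minimal sketch of this feature construction, mirroring `SupervisedSentenceRanker._get_pretrained_embeddings`; it assumes `d` is the `Doc` built above.
+
+```python
+import numpy as np
+
+sents = list(d)
+doc_emb = sents[0].document.get_pretrained_embedding(architecture="bert_32k_cased", do_sents=False)
+sent_embs = sents[0].document.get_pretrained_embedding(architecture="bert_32k_cased", do_sents=True)
+
+# One row per sentence: [document embedding | sentence embedding]
+features = np.hstack([np.vstack(len(sents) * [doc_emb]), sent_embs])
+```
+
+A ranker is then constructed with one of the supported embedding types.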
+
+```python
+from sadedegel.summarize import SupervisedSentenceRanker
+
+ranker = SupervisedSentenceRanker(vector_type="bert_32k_cased")
+```
+
+The Supervised Ranker can be tuned for optimal performance over an embedding type and a summarization percentage. The current ranker is optimized with `bert_128k_cased` for average summarization performance over 10%, 50% and 80% of the full document length.
+
+**Example**: Specific fine-tuning for short summaries with a smaller embedding extraction model.
+```python
+from sadedegel.summarize.supervised import RankerOptimizer
+
+fine_tuner = RankerOptimizer(vector_type="distilbert",
+                             summarization_perc=0.1,
+                             n_trials=20)
+
+fine_tuner.optimize()
+```
+
 ## Summarizer Performance
 
 Given this [Model Definition](#sadedegel-model),
@@ -28,6 +86,11 @@ ground truth human annotation
 (Best possible total `relevance` score that can be achieved
 
 ### Performance Table
 
+#### Release 0.21.1
+| Method | Parameter | ndcg (optimized for k=0.1) | ndcg (optimized for k=0.5) | ndcg (optimized for k=0.8) |
+|--------|-----------|----------------------------|----------------------------|----------------------------|
+| SupervisedSentenceRanker | `{"vector_type": "bert_128k_cased"}` | 0.7620 | 0.7269 | 0.8163 |
+
 #### Release 0.18
 
 By 0.18 we have significantly changed the way we evaluate our summarizers.
diff --git a/sadedegel/summarize/__init__.py b/sadedegel/summarize/__init__.py
index 27a6be1a..ba84cef1 100644
--- a/sadedegel/summarize/__init__.py
+++ b/sadedegel/summarize/__init__.py
@@ -4,3 +4,4 @@
 from .rank import TextRank, LexRankSummarizer # noqa: F401
 from .tf_idf import TFIDFSummarizer # noqa: F401
 from .bm25 import BM25Summarizer # noqa: F401
+from .supervised import SupervisedSentenceRanker, RankerOptimizer # noqa: F401
diff --git a/sadedegel/summarize/cluster.py b/sadedegel/summarize/cluster.py
index c277a40a..5195aa55 100644
--- a/sadedegel/summarize/cluster.py
+++ b/sadedegel/summarize/cluster.py
@@ -58,10 +58,9 @@ def _predict(self, sentences: List[Sentences]):
 
 class DecomposedKMeansSummarizer(ExtractiveSummarizer):
     """BERT embeddings are high in dimension and potentially carry redundant information that can cause
-    overfitting or curse of dimensionality effecting in clustering embeddings.
-
-    DecomposedKMeansSummarizer adds a PCA step (or any othe lsinear/non-linear dimensionality reduction technique)
-    before clustering to obtain highest variance in vector fed into clustering
+    overfitting or curse of dimensionality affecting the clustering of embeddings.
+    DecomposedKMeansSummarizer adds a PCA step (or any other linear/non-linear dimensionality reduction technique)
+    before clustering to retain the highest variance in the vectors fed into clustering.
     """
 
     tags = ExtractiveSummarizer.tags + ['cluster', 'ml']
diff --git a/sadedegel/summarize/model/ranker_bert_128k_cased.joblib b/sadedegel/summarize/model/ranker_bert_128k_cased.joblib
new file mode 100644
index 00000000..7bfc56a8
--- /dev/null
+++ b/sadedegel/summarize/model/ranker_bert_128k_cased.joblib
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bd8644d5a6a3c7d2a11ce1b0b1c26979b766456846bd0ed41f3ef83c69509373
+size 92756
diff --git a/sadedegel/summarize/supervised.py b/sadedegel/summarize/supervised.py
new file mode 100644
index 00000000..94308723
--- /dev/null
+++ b/sadedegel/summarize/supervised.py
@@ -0,0 +1,160 @@
+from os.path import dirname
+from pathlib import Path
+from itertools import tee
+import randomname
+
+import numpy as np
+from typing import List
+import joblib
+from rich.console import Console
+from rich.progress import track
+
+from ._base import ExtractiveSummarizer
+from ..bblock.util import __transformer_model_mapper__
+from ..bblock import Sentences
+from ..bblock.doc import DocBuilder
+from .util.supervised_tuning import optuna_handler, create_empty_model, fit_ranker, save_ranker
+
+
+__vector_types__ = list(__transformer_model_mapper__.keys()) + ["tfidf", "bm25"]
+console = Console()
+
+try:
+    import pandas as pd
+except ImportError:
+    console.log(("pandas package is not a general sadedegel dependency."
+                 " However, it is required to build the supervised ranker model."))
+
+
+def load_model(vector_type, debug=False):
+    name = f"ranker_{vector_type}.joblib"
+
+    if vector_type == "bert_128k_cased":
+        path = (Path(dirname(__file__)) / 'model' / name).absolute()
+    else:
+        path = Path(f"~/.sadedegel_data/models/{name}").expanduser()
+
+    if not debug:
+        try:
+            model = joblib.load(path)
+            console.log(f"Initializing ranker model ranker_{vector_type}...", style="blue")
+        except Exception as e:
+            raise FileNotFoundError(f"A model trained for {vector_type} is not found. Please optimize one with "
+                                    f"sadedegel.summarize.RankerOptimizer. {e}")
+    else:
+        model = name
+
+    return model
+
+
+class SupervisedSentenceRanker(ExtractiveSummarizer):
+    model = None
+    vector_type = None
+    debug = False
+    tags = ExtractiveSummarizer.tags + ["ml", "supervised", "rank"]
+
+    def __init__(self, normalize=True, vector_type="bert_128k_cased", **kwargs):
+        super().__init__(normalize)
+        self.debug = kwargs.get("debug", False)
+        self.init_model(vector_type, self.debug)
+
+    @classmethod
+    def init_model(cls, vector_type, debug):
+        db_switch = False
+        if vector_type not in __vector_types__:
+            raise ValueError(f"Not a valid vectorization for input sequence. Valid types are {__vector_types__}")
+        if cls.debug != debug:
+            cls.debug = debug
+            db_switch = True
+            if cls.debug:
+                console.log("SupervisedSentenceRanker: Switching debug mode ON.")
+            else:
+                console.log("SupervisedSentenceRanker: Switching debug mode OFF.")
+        if cls.vector_type is not None and not db_switch:
+            if cls.vector_type == vector_type:
+                return 0
+
+        cls.model = load_model(vector_type, debug)
+        cls.vector_type = vector_type
+
+    def _predict(self, sents: List[Sentences]) -> np.ndarray:
+        if self.vector_type not in ["tfidf", "bm25"]:
+            doc_sent_embeddings = self._get_pretrained_embeddings(sents)
+        else:
+            raise NotImplementedError("BoW interface for SupervisedSentenceRanker is not yet implemented.")
+
+        if self.model is not None:
+            scores = self.model.predict(doc_sent_embeddings)
+        else:
+            raise ValueError("A ranker model is not found.")
+
+        return scores
+
+    def _get_pretrained_embeddings(self, sents: List[Sentences]) -> np.ndarray:
+        doc_embedding = sents[0].document.get_pretrained_embedding(architecture=self.vector_type, do_sents=False)
+        doc_embedding = np.vstack(len(sents) * [doc_embedding])
+        sent_embeddings = sents[0].document.get_pretrained_embedding(architecture=self.vector_type, do_sents=True)
+
+        return np.hstack([doc_embedding, sent_embeddings])
+
+    def _get_bow_vectors(self, sents: List[Sentences]) -> np.ndarray:
+        pass
+
+
+class RankerOptimizer(SupervisedSentenceRanker):
+    def __init__(self, n_trials: int, vector_type: str, summarization_perc: float, **kwargs):
+        self.n_trials = n_trials
+        self.vector_type = vector_type
+        self.summarization_perc = summarization_perc
+
+    def optimize(self):
+        """Optimize the ranker model for a custom summarization percentage. Optimize and dump a new model.
+        """
+        run_name = randomname.get_name()
+        df, vecs = self._prepare_dataset()
+
+        optuna_handler(n_trials=self.n_trials, run_name=run_name,
+                       metadata=df, vectors=vecs, k=self.summarization_perc)
+
+        model = create_empty_model(run_name)
+        ranker = fit_ranker(ranker=model, vectors=vecs, metadata=df)
+        save_ranker(ranker, name=self.vector_type)
+
+    def _prepare_dataset(self):
+        try:
+            from sadedegel.dataset import load_raw_corpus, load_annotated_corpus
+        except Exception as e:
+            raise ValueError(f"Cannot import the raw and annotated corpora. {e}")
+
+        annot = load_annotated_corpus()
+        annot_, annot = tee(annot)
+
+        embs = []
+        metadata = []
+        Doc = DocBuilder()
+        for doc_id, doc in track(enumerate(annot), description="Processing documents", total=len(list(annot_))):
+
+            relevance_scores = doc["relevance"]
+            d = Doc.from_sentences(doc["sentences"])
+            sents = list(d)
+
+            for sent_id, sent in enumerate(sents):
+                instance = dict()
+                instance["doc_id"] = doc_id
+                instance["sent_id"] = sent_id
+                instance["relevance"] = relevance_scores[sent_id]
+
+                metadata.append(instance)
+
+            if self.vector_type not in ["tfidf", "bm25"]:
+                doc_sent_embeddings = self._get_pretrained_embeddings(sents)
+            else:
+                raise NotImplementedError("BoW interface for SupervisedSentenceRanker is not yet implemented.")
+
+            embs.append(doc_sent_embeddings)
+
+        df = pd.DataFrame.from_records(metadata)
+        vecs = np.vstack(embs)
+
+        return df, vecs
diff --git a/sadedegel/summarize/util/supervised_tuning.py b/sadedegel/summarize/util/supervised_tuning.py
new file mode 100644
index 00000000..24059a8b
--- /dev/null
+++ b/sadedegel/summarize/util/supervised_tuning.py
@@ -0,0 +1,185 @@
+import json
+import numpy as np
+import lightgbm as lgb
+from os import makedirs
+from pathlib import Path
+from functools import partial
+import warnings
+import joblib
+from rich.console import Console
+from rich.live import Live
+
+warnings.filterwarnings("ignore")
+
+console = Console()
+
+try:
+    import pandas as pd
+except ImportError:
+    console.log(("pandas package is not a general sadedegel dependency."
+                 " However, it is required to build the supervised ranker model."))
+
+try:
+    import optuna
+    optuna.logging.set_verbosity(optuna.logging.WARN)
+except ImportError:
+    console.log(("optuna package is not a general sadedegel dependency."
+                 " However, it is required to tune the supervised ranker model. Please install optuna to proceed."))
+
+
+def check_log_dir(data_home="~/.sadedegel_data"):
+    logs_path = Path(data_home).expanduser() / "logs"
+    if not logs_path.exists():
+        makedirs(logs_path)
+
+
+def create_json_if_not_exists(data_home="~/.sadedegel_data", json_name=None):
+    logs_path = Path(data_home).expanduser() / "logs"
+    if not (logs_path / json_name).exists():
+        with open(str(logs_path / json_name), "w") as jfile:
+            json.dump({}, jfile)
+
+
+def log_early_stop(trial, mean_rounds, run_name):
+    check_log_dir()
+    create_json_if_not_exists(json_name=f"{run_name}_early_stop.json")
+    path = Path(f"~/.sadedegel_data/logs/{run_name}_early_stop.json").expanduser()
+    with open(path, "r+") as jfile:
+        trial_round_dict = json.load(jfile)
+        trial_round_dict.update({trial: mean_rounds})
+        jfile.seek(0)
+        json.dump(trial_round_dict, jfile, indent=4)
+
+
+def log_best_params(study, run_name):
+    check_log_dir()
+    path = Path(f"~/.sadedegel_data/logs/{run_name}_best_trial.json").expanduser()
+
+    best_trial_dict = dict()
+    best_trial_dict["params"] = study.best_params
+    best_trial_dict["score"] = study.best_value
+    best_trial_dict["trial"] = study.best_trial.number
+
+    console.log(f"Optimization DONE. Best Score so far: {study.best_value}. Saving parameter space to ~/.sadedegel_data/logs")
+
+    with open(path, "w") as jfile:
+        json.dump(best_trial_dict, jfile)
+
+
+def parse_early_stop(run_name, best_trial):
+    path = Path(f"~/.sadedegel_data/logs/{run_name}_early_stop.json").expanduser()
+    with open(path, "r") as jfile:
+        trials_dict = json.load(jfile)
+    return int(trials_dict[str(best_trial)])
+
+
+def parse_best_trial(run_name):
+    path = Path(f"~/.sadedegel_data/logs/{run_name}_best_trial.json").expanduser()
+    with open(path, "r") as jfile:
+        best_params_dict = json.load(jfile)
+
+    return best_params_dict["params"], best_params_dict["trial"]
+
+
+def ranker_objective(trial, vectors, metadata, k, run_name, live):
+
+    if trial.number == 0:
+        live.update("Optuna tuning has started. Trial results will be reported live...", refresh=True)
+
+    param_grid = {
+        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
+        "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.9),
+        "num_leaves": trial.suggest_int("num_leaves", 4, 512),
+        "max_depth": trial.suggest_int("max_depth", 2, 20),
+        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 50, 1000),
+        "max_bin": trial.suggest_int("max_bin", 150, 300),
+        "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-3, 5),
+        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-3, 5),
+        "bagging_fraction": trial.suggest_loguniform("bagging_fraction", 0.6, 0.95),
+        "feature_fraction": trial.suggest_loguniform("feature_fraction", 0.3, 0.95),
+    }
+
+    summarization_perf = []
+    best_iters = []
+    uniq_docs = metadata.doc_id.unique()
+
+    # Leave-one-out cross validation over documents.
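+    # Each annotated document forms a single LambdaRank query group: the group-size arrays
+    # below hold one sentence count per document. One document is held out per fold and
+    # ndcg@k is computed on it, with k derived from the requested summarization percentage.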
+    for doc_id in uniq_docs:
+        train_docs = metadata.loc[metadata.doc_id != doc_id]
+        valid_docs = metadata.loc[metadata.doc_id == doc_id]
+
+        train_ixs = train_docs.index.tolist()
+        valid_ixs = valid_docs.index.tolist()
+
+        train_X, train_y = vectors[train_ixs], train_docs.relevance.values
+        valid_X, valid_y = vectors[valid_ixs], valid_docs.relevance.values
+
+        qids_train = train_docs.groupby("doc_id")["doc_id"].count().to_numpy()
+        qids_valid = valid_docs.groupby("doc_id")["doc_id"].count().to_numpy()
+
+        eval_at_perc = int(valid_X.shape[0] * k)
+
+        # Fit ranker model
+        ranker = lgb.LGBMRanker(objective="lambdarank", metric="ndcg", verbose=-100, **param_grid)
+        ranker.fit(X=train_X,
+                   y=train_y,
+                   group=qids_train,
+                   eval_set=[(valid_X, valid_y)],
+                   eval_group=[qids_valid],
+                   eval_at=eval_at_perc,
+                   verbose=-100,
+                   callbacks=[lgb.early_stopping(stopping_rounds=15, verbose=False)])
+
+        # Collect best early stopping for each validation document
+        best_iters.append(ranker.best_iteration_)
+        summarization_perf.append(ranker.best_score_["valid_0"][f"ndcg@{eval_at_perc}"])
+
+    log_early_stop(trial=trial.number,
+                   mean_rounds=np.mean(best_iters),
+                   run_name=run_name)
+
+    return np.mean(summarization_perf)
+
+
+def live_update_callback(study, trial, total_trials, live):
+    live.update(f"Trial: {trial.number + 1}/{total_trials} - Trial Score: {trial.value} - Best Score So Far: {study.best_value}", refresh=True)
+
+
+def optuna_handler(n_trials, run_name, metadata, vectors, k):
+    with Live(console=console, screen=True, auto_refresh=False) as live:
+        objective = partial(ranker_objective, run_name=run_name, metadata=metadata, vectors=vectors, k=k, live=live)
+        live_update = partial(live_update_callback, total_trials=n_trials, live=live)
+        study = optuna.create_study(direction="maximize", study_name="LGBM Ranker")
+        study.optimize(objective, n_trials=n_trials, callbacks=[live_update])
+
+    log_best_params(study, run_name=run_name)
+
+
+def create_empty_model(run_name: str):
+    params, trial = parse_best_trial(run_name=run_name)
+    num_rounds = parse_early_stop(run_name=run_name, best_trial=trial)
+    params["n_estimators"] = num_rounds
+
+    model = lgb.LGBMRanker(objective="lambdarank", metric="ndcg", verbose=-100, **params)
+
+    return model
+
+
+def fit_ranker(ranker: lgb.LGBMRanker, vectors: np.ndarray, metadata):
+    console.log("Fitting model with optimal parameter space.", style="cyan")
+
+    train_X, train_y = vectors, metadata.relevance.values
+    qids_train = metadata.groupby("doc_id")["doc_id"].count().to_numpy()
+    ranker.fit(train_X, train_y, group=qids_train)
+
+    return ranker
+
+
+def save_ranker(ranker: lgb.LGBMRanker, name: str):
+    basepath = Path("~/.sadedegel_data/models").expanduser()
+    if not basepath.exists():
+        makedirs(basepath)
+    path = Path(f"{basepath}/ranker_{name}.joblib").expanduser()
+    joblib.dump(ranker, path)
+
+    console.log(f"Model saved to ~/.sadedegel_data/models with name ranker_{name}")
diff --git a/tests/summarizer/context.py b/tests/summarizer/context.py
index fe40047c..1f8fcfc5 100644
--- a/tests/summarizer/context.py
+++ b/tests/summarizer/context.py
@@ -8,6 +8,7 @@
 from sadedegel.summarize import KMeansSummarizer,AutoKMeansSummarizer,DecomposedKMeansSummarizer, BM25Summarizer # noqa # pylint: disable=unused-import, wrong-import-position
 from sadedegel.summarize import TextRank # noqa # pylint: disable=unused-import, wrong
 from sadedegel.summarize import TFIDFSummarizer # noqa # pylint: disable=unused-import
+from sadedegel.summarize import SupervisedSentenceRanker, RankerOptimizer # noqa # pylint: disable=unused-import, wrong
 from sadedegel import Doc, tokenizer_context # noqa # pylint: disable=unused-import, wrong
 from sadedegel.bblock import BertTokenizer, SimpleTokenizer, ICUTokenizer # noqa # pylint: disable=unused-import, wrong
 from sadedegel.config import tf_context # noqa # pylint: disable=unused-import, wrong
diff --git a/tests/summarizer/test_supervised.py b/tests/summarizer/test_supervised.py
new file mode 100644
index 00000000..b42ce59b
--- /dev/null
+++ b/tests/summarizer/test_supervised.py
@@ -0,0 +1,53 @@
+import pkgutil # noqa: F401 # pylint: disable=unused-import
+
+
+from .context import SupervisedSentenceRanker, RankerOptimizer, Doc
+import numpy as np
+import pytest
+import lightgbm as lgb
+
+
+famous_quote = ("Merhaba dünya biz dostuz. Barış için geldik. Sizi lazerlerimizle buharlaştırmayacağız."
+                " Onun yerine kölemiz olacaksınız.")
+
+
+@pytest.mark.skipif('pkgutil.find_loader("transformers") is None')
+@pytest.mark.skipif('pkgutil.find_loader("pandas") is None')
+@pytest.mark.skipif('pkgutil.find_loader("optuna") is None')
+@pytest.mark.parametrize("normalize", [True, False])
+@pytest.mark.parametrize("text", [famous_quote])
+@pytest.mark.parametrize("vector", ["bert_128k_cased", "bert_32k_cased", "distilbert", "bert_32k_uncased", "bert_128k_uncased", "electra"])
+def test_ranker_init(normalize, text, vector):
+    if vector != "bert_128k_cased":
+        if vector == "electra":
+            with pytest.raises(ValueError, match=r".*Not a valid vectorization for input sequence.*"):
+                ranker = SupervisedSentenceRanker(vector_type=vector, debug=True)
+        else:
+            ranker = SupervisedSentenceRanker(vector_type=vector, debug=True)
+            assert ranker.model == f"ranker_{vector}.joblib"
+
+            ranker = SupervisedSentenceRanker(vector_type="bert_128k_cased", debug=True)
+            assert ranker.model == "ranker_bert_128k_cased.joblib"
+
+            ranker = SupervisedSentenceRanker(vector_type="bert_32k_cased")
+            assert isinstance(ranker.model, lgb.sklearn.LGBMRanker)
+    else:
+        ranker = SupervisedSentenceRanker(vector_type=vector)
+        assert isinstance(ranker.model, lgb.sklearn.LGBMRanker)
+
+
+@pytest.mark.skipif('pkgutil.find_loader("transformers") is None')
+@pytest.mark.skipif('pkgutil.find_loader("pandas") is None')
+@pytest.mark.skipif('pkgutil.find_loader("optuna") is None')
+@pytest.mark.parametrize("normalize", [True, False])
+@pytest.mark.parametrize("text", [famous_quote])
+def test_summary(normalize, text):
+    d = Doc(text)
+    ranker = SupervisedSentenceRanker(normalize=normalize, vector_type="bert_128k_cased")
+    relevance_scores = ranker.predict(d)
+    assert len(relevance_scores) == 4
+    if normalize:
+        assert np.isclose(np.sum(relevance_scores), 1.0)
+
+    for i in range(len(d)):
+        assert len(ranker(d, k=i+1)) == i+1