diff --git a/delft/applications/datasetTagger.py b/delft/applications/datasetTagger.py
index 1ff7b66..5535d1d 100644
--- a/delft/applications/datasetTagger.py
+++ b/delft/applications/datasetTagger.py
@@ -8,17 +8,19 @@ from delft.sequenceLabelling import Sequence
 from delft.sequenceLabelling.reader import load_data_and_labels_json_offsets
+from delft.utilities.Utilities import t_or_f
 from delft.utilities.misc import parse_number_ranges
 
-def configure(architecture, output_path=None, max_sequence_length=-1, batch_size=-1, embeddings_name=None,
-              max_epoch=-1, use_ELMo=False, patience=-1):
+def configure(architecture, output_path=None, max_sequence_length=-1,
+              batch_size=-1, embeddings_name=None,
+              max_epoch=-1, use_ELMo=False, patience=-1, early_stop=None):
     """
     Set up the default parameters based on the model type.
     """
     model_name = 'datasets'
     multiprocessing = True
-    early_stop = True
+    o_early_stop = True
 
     if "BERT" in architecture:
         # architectures with some transformer layer/embeddings inside
@@ -57,18 +59,26 @@ def configure(architecture, output_path=None, max_sequence_length=-1, batch_size
     if max_epoch == -1:
         max_epoch = 60
 
+    if early_stop is not None:
+        o_early_stop = early_stop
+
     if patience == -1:
         patience = 5
 
-    return batch_size, max_sequence_length, model_name, embeddings_name, max_epoch, multiprocessing, early_stop, patience
+    return batch_size, max_sequence_length, model_name, embeddings_name, max_epoch, multiprocessing, o_early_stop, patience
 
 
 # train a model with all available data
 def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None,
-          input_path=None, output_path=None, fold_count=1,
-          features_indices=None, max_sequence_length=-1,
-          batch_size=-1, max_epoch=-1, use_ELMo=False, patience=-1,
-          learning_rate=None, multi_gpu=False):
+          input_path=None, output_path=None, fold_count=1,
+          features_indices=None, max_sequence_length=-1,
+          batch_size=-1, use_ELMo=False,
+          max_epoch=-1,
+          patience=-1,
+          learning_rate=None,
+          early_stop=None,
+          multi_gpu=False):
+
     print('Loading data...')
     if input_path is None:
         x_all1 = y_all1 = x_all2 = y_all2 = x_all3 = y_all3 = []
@@ -98,7 +108,8 @@ def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None,
                         embeddings_name,
                         max_epoch,
                         use_ELMo,
-                        patience)
+                        patience,
+                        early_stop)
     model = Sequence(model_name,
                      recurrent_dropout=0.50,
                      embeddings_name=embeddings_name,
@@ -130,9 +141,14 @@ def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None,
 
 # split data, train a model and evaluate it
 def train_eval(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None,
-               input_path=None, output_path=None, fold_count=1,
-               features_indices=None, max_sequence_length=-1, batch_size=-1, max_epoch=-1, use_ELMo=False,
-               patience=-1, learning_rate=None, multi_gpu=False):
+               input_path=None, output_path=None, fold_count=1,
+               features_indices=None, max_sequence_length=-1, batch_size=-1, use_ELMo=False,
+               max_epoch=-1,
+               patience=-1,
+               learning_rate=None,
+               early_stop=None,
+               multi_gpu=False):
+
     print('Loading data...')
     if input_path is None:
         x_all1 = y_all1 = x_all2 = y_all2 = x_all3 = y_all3 = []
@@ -164,7 +180,8 @@ def train_eval(embeddings_name=None, architecture='BidLSTM_CRF', transformer=Non
                         embeddings_name,
                         max_epoch,
                         use_ELMo,
-                        patience=patience)
+                        patience=patience,
+                        early_stop=early_stop)
     model = Sequence(model_name,
                      recurrent_dropout=0.50,
                      embeddings_name=embeddings_name,
@@ -285,10 +302,17 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non
parser.add_argument("--patience", type=int, default=-1, help="patience, number of extra epochs to perform after " "the best epoch before stopping a training.") parser.add_argument("--learning-rate", type=float, default=None, help="Initial learning rate") + + parser.add_argument("--max-epoch", type=int, default=-1, + help="Maximum number of epochs.") + parser.add_argument("--early-stop", type=t_or_f, default=None, + help="Force early training termination when metrics scores are not improving " + + "after a number of epochs equals to the patience parameter.") parser.add_argument("--multi-gpu", default=False, help="Enable the support for distributed computing (the batch size needs to be set accordingly using --batch-size)", action="store_true") + args = parser.parse_args() action = args.action @@ -302,6 +326,8 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non use_ELMo = args.use_ELMo patience = args.patience learning_rate = args.learning_rate + max_epoch = args.max_epoch + early_stop = args.early_stop multi_gpu = args.multi_gpu if transformer is None and embeddings_name is None: @@ -310,16 +336,18 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non if action == "train": train(embeddings_name=embeddings_name, - architecture=architecture, - transformer=transformer, - input_path=input_path, - output_path=output, - max_sequence_length=max_sequence_length, - batch_size=batch_size, - use_ELMo=use_ELMo, - patience=patience, - learning_rate=learning_rate, - multi_gpu=multi_gpu) + architecture=architecture, + transformer=transformer, + input_path=input_path, + output_path=output, + max_sequence_length=max_sequence_length, + batch_size=batch_size, + use_ELMo=use_ELMo, + patience=patience, + learning_rate=learning_rate, + max_epoch=max_epoch, + early_stop=early_stop, + multi_gpu=multi_gpu) if action == "eval": if args.fold_count is not None and args.fold_count > 1: @@ -343,6 +371,8 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non use_ELMo=use_ELMo, patience=patience, learning_rate=learning_rate, + max_epoch=max_epoch, + early_stop=early_stop, multi_gpu=multi_gpu) if action == "tag": diff --git a/delft/applications/grobidTagger.py b/delft/applications/grobidTagger.py index 84a7d11..00d9bb9 100644 --- a/delft/applications/grobidTagger.py +++ b/delft/applications/grobidTagger.py @@ -8,13 +8,13 @@ from delft.sequenceLabelling import Sequence from delft.sequenceLabelling.reader import load_data_and_labels_crf_file -from delft.utilities.Utilities import longest_row +from delft.utilities.Utilities import longest_row, t_or_f MODEL_LIST = ['affiliation-address', 'citation', 'date', 'header', 'name-citation', 'name-header', 'software', 'figure', 'table', 'reference-segmenter', 'segmentation', 'funding-acknowledgement'] def configure(model, architecture, output_path=None, max_sequence_length=-1, batch_size=-1, - embeddings_name=None, max_epoch=-1, use_ELMo=False, patience=-1): + embeddings_name=None, max_epoch=-1, use_ELMo=False, patience=-1, early_stop=None): """ Set up the default parameters based on the model type. 
""" @@ -24,7 +24,7 @@ def configure(model, architecture, output_path=None, max_sequence_length=-1, bat model_name = 'grobid-' + model multiprocessing = True - early_stop = True + o_early_stop = True if architecture and "BERT" in architecture: # architectures with some transformer layer/embeddings inside @@ -56,7 +56,7 @@ def configure(model, architecture, output_path=None, max_sequence_length=-1, bat batch_size = 8 if max_sequence_length == -1: max_sequence_length = 512 - early_stop = False + o_early_stop = False if max_epoch == -1: max_epoch = 30 elif model.startswith("funding"): @@ -144,13 +144,17 @@ def configure(model, architecture, output_path=None, max_sequence_length=-1, bat if patience == -1: patience = 5 - return batch_size, max_sequence_length, model_name, embeddings_name, max_epoch, multiprocessing, early_stop, patience + if early_stop is not None: + o_early_stop = early_stop + + return batch_size, max_sequence_length, model_name, embeddings_name, max_epoch, multiprocessing, o_early_stop, patience # train a GROBID model with all available data -def train(model, embeddings_name=None, architecture=None, transformer=None, input_path=None, - output_path=None, features_indices=None, max_sequence_length=-1, batch_size=-1, max_epoch=-1, - use_ELMo=False, incremental=False, input_model_path=None, patience=-1, learning_rate=None, multi_gpu=False): + +def train(model, embeddings_name=None, architecture=None, transformer=None, input_path=None, + output_path=None, features_indices=None, max_sequence_length=-1, batch_size=-1, max_epoch=-1, + use_ELMo=False, incremental=False, input_model_path=None, patience=-1, learning_rate=None, early_stop=None, multi_gpu=False): print('Loading data...') if input_path == None: @@ -176,7 +180,8 @@ def train(model, embeddings_name=None, architecture=None, transformer=None, inpu embeddings_name, max_epoch, use_ELMo, - patience) + patience, early_stop) + model = Sequence(model_name, recurrent_dropout=0.50, embeddings_name=embeddings_name, @@ -217,7 +222,9 @@ def train(model, embeddings_name=None, architecture=None, transformer=None, inpu def train_eval(model, embeddings_name=None, architecture='BidLSTM_CRF', transformer=None, input_path=None, output_path=None, fold_count=1, features_indices=None, max_sequence_length=-1, batch_size=-1, max_epoch=-1, - use_ELMo=False, incremental=False, input_model_path=None, patience=-1, learning_rate=None, multi_gpu=False): + use_ELMo=False, incremental=False, input_model_path=None, patience=-1, + learning_rate=None, early_stop=None, multi_gpu=False): + print('Loading data...') if input_path is None: x_all, y_all, f_all = load_data_and_labels_crf_file('data/sequenceLabelling/grobid/'+model+'/'+model+'-060518.train') @@ -243,7 +250,8 @@ def train_eval(model, embeddings_name=None, architecture='BidLSTM_CRF', transfor embeddings_name, max_epoch, use_ELMo, - patience) + patience, + early_stop) model = Sequence(model_name, recurrent_dropout=0.50, embeddings_name=embeddings_name, @@ -410,6 +418,13 @@ class Tasks: parser.add_argument("--patience", type=int, default=-1, help="patience, number of extra epochs to perform after " "the best epoch before stopping a training.") parser.add_argument("--learning-rate", type=float, default=None, help="Initial learning rate") + + parser.add_argument("--max-epoch", type=int, default=-1, + help="Maximum number of epochs for training.") + parser.add_argument("--early-stop", type=t_or_f, default=None, + help="Force early training termination when metrics scores are not improving " + + "after a 
number of epochs equals to the patience parameter.") + parser.add_argument("--multi-gpu", default=False, help="Enable the support for distributed computing (the batch size needs to be set accordingly using --batch-size)", action="store_true") @@ -430,6 +445,8 @@ class Tasks: incremental = args.incremental patience = args.patience learning_rate = args.learning_rate + max_epoch = args.max_epoch + early_stop = args.early_stop multi_gpu = args.multi_gpu if architecture is None: @@ -441,19 +458,21 @@ class Tasks: if action == Tasks.TRAIN: train(model, - embeddings_name=embeddings_name, - architecture=architecture, - transformer=transformer, - input_path=input_path, - output_path=output, - max_sequence_length=max_sequence_length, - batch_size=batch_size, - use_ELMo=use_ELMo, - incremental=incremental, - input_model_path=input_model_path, - patience=patience, - learning_rate=learning_rate, - multi_gpu=multi_gpu) + embeddings_name=embeddings_name, + architecture=architecture, + transformer=transformer, + input_path=input_path, + output_path=output, + max_sequence_length=max_sequence_length, + batch_size=batch_size, + use_ELMo=use_ELMo, + incremental=incremental, + input_model_path=input_model_path, + patience=patience, + learning_rate=learning_rate, + max_epoch=max_epoch, + early_stop=early_stop, + multi_gpu=multi_gpu) if action == Tasks.EVAL: if args.fold_count is not None and args.fold_count > 1: @@ -479,6 +498,8 @@ class Tasks: incremental=incremental, input_model_path=input_model_path, learning_rate=learning_rate, + max_epoch=max_epoch, + early_stop=early_stop, multi_gpu=multi_gpu) if action == Tasks.TAG: diff --git a/delft/applications/insultTagger.py b/delft/applications/insultTagger.py index 310646e..f784d97 100644 --- a/delft/applications/insultTagger.py +++ b/delft/applications/insultTagger.py @@ -5,26 +5,43 @@ import argparse import time -def configure(architecture, embeddings_name): - batch_size = 20 +from delft.utilities.Utilities import t_or_f + + +def configure(architecture, embeddings_name, batch_size=-1, max_epoch=-1, early_stop=None): + maxlen = 300 patience = 5 - early_stop = True - max_epoch = 50 + o_early_stop = True + if max_epoch == -1: + max_epoch = 50 + + if batch_size == -1: + batch_size = 20 # default bert model parameters if architecture.find("BERT") != -1: - batch_size = 10 - early_stop = False - max_epoch = 3 + if batch_size == -1: + batch_size = 10 + o_early_stop = False + if max_epoch == -1: + max_epoch = 3 + embeddings_name = None - return batch_size, maxlen, patience, early_stop, max_epoch, embeddings_name + if early_stop is not None: + o_early_stop = early_stop -def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None, use_ELMo=False, learning_rate=None, - multi_gpu=False): - batch_size, maxlen, patience, early_stop, max_epoch, embeddings_name = configure(architecture, embeddings_name) + return batch_size, maxlen, patience, o_early_stop, max_epoch, embeddings_name +def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None, + use_ELMo=False, learning_rate=None, + batch_size=-1, max_epoch=-1, early_stop=None, multi_gpu=False): + batch_size, maxlen, patience, early_stop, max_epoch, embeddings_name = configure(architecture, + embeddings_name, + batch_size, + max_epoch, + early_stop) root = 'data/sequenceLabelling/toxic/' train_path = os.path.join(root, 'corrected.xml') @@ -116,20 +133,32 @@ def annotate(texts, output_format, architecture='BidLSTM_CRF', transformer=None, ) parser.add_argument("--use-ELMo", action="store_true", 
help="Use ELMo contextual embeddings") parser.add_argument("--learning-rate", type=float, default=None, help="Initial learning rate") + parser.add_argument("--max-epoch", type=int, default=-1, + help="Maximum number of epochs.") + parser.add_argument("--batch-size", type=int, default=-1, help="batch-size parameter to be used.") + parser.add_argument("--early-stop", type=t_or_f, default=None, + help="Force early training termination when metrics scores are not improving " + + "after a number of epochs equals to the patience parameter.") + parser.add_argument("--multi-gpu", default=False, help="Enable the support for distributed computing (the batch size needs to be set accordingly using --batch-size)", action="store_true") + args = parser.parse_args() if args.action not in ('train', 'tag'): - print('action not specifed, must be one of [train,tag]') + print('action not specified, must be one of [train,tag]') embeddings_name = args.embedding architecture = args.architecture transformer = args.transformer use_ELMo = args.use_ELMo learning_rate = args.learning_rate + + batch_size = args.batch_size + max_epoch = args.max_epoch + early_stop = args.early_stop multi_gpu = args.multi_gpu if transformer == None and embeddings_name == None: @@ -139,8 +168,12 @@ def annotate(texts, output_format, architecture='BidLSTM_CRF', transformer=None, if args.action == 'train': train(embeddings_name=embeddings_name, architecture=architecture, - transformer=transformer, use_ELMo=use_ELMo, + transformer=transformer, + use_ELMo=use_ELMo, learning_rate=learning_rate, + batch_size=batch_size, + max_epoch=max_epoch, + early_stop=early_stop, multi_gpu=multi_gpu) if args.action == 'tag': diff --git a/delft/applications/nerTagger.py b/delft/applications/nerTagger.py index 747aaf6..9206541 100644 --- a/delft/applications/nerTagger.py +++ b/delft/applications/nerTagger.py @@ -1,77 +1,90 @@ import os import numpy as np from delft.sequenceLabelling import Sequence -from delft.utilities.Utilities import stats +from delft.utilities.Utilities import stats, t_or_f from delft.utilities.numpy import shuffle_arrays from delft.sequenceLabelling.reader import load_data_and_labels_conll, load_data_and_labels_lemonde, load_data_and_labels_ontonotes from sklearn.model_selection import train_test_split import argparse import time -def configure(architecture, dataset_type, lang, embeddings_name, use_ELMo, max_sequence_length=-1, batch_size=-1, - patience=-1): +def configure(architecture, dataset_type, lang, embeddings_name, + use_ELMo, max_sequence_length=-1, batch_size=-1, + patience=-1, max_epoch=-1, early_stop=None): - max_epoch = 60 - early_stop = True - multiprocessing = True + o_max_epoch = 60 + o_early_stop = True + o_multiprocessing = True + o_max_sequence_length = 300 + o_patience = 5 + o_batch_size = 32 # general RNN word embeddings input if embeddings_name is None: - embeddings_name = 'glove-840B' + o_embeddings_name = 'glove-840B' if lang == 'en': if dataset_type == 'conll2012': - embeddings_name = 'fasttext-crawl' + o_embeddings_name = 'fasttext-crawl' elif lang == 'fr': - embeddings_name = 'wiki.fr' + o_embeddings_name = 'wiki.fr' + else: + o_embeddings_name = embeddings_name if lang == 'fr': - multiprocessing = False + o_multiprocessing = False if architecture == "BidLSTM_CNN_CRF": - word_lstm_units = 200 - max_epoch = 30 - recurrent_dropout = 0.5 + o_word_lstm_units = 200 + o_max_epoch = 30 + o_recurrent_dropout = 0.5 else: - word_lstm_units = 100 - max_epoch = 50 - recurrent_dropout = 0.5 + o_word_lstm_units = 100 + 
o_max_epoch = 50 + o_recurrent_dropout = 0.5 if use_ELMo: # following should be done for predicting if max sequence length permits, it also boosts the runtime with ELMo embeddings signicantly # but requires more GPU memory - batch_size = 128 - max_sequence_length = 150 + o_batch_size = 128 + o_max_sequence_length = 150 # default bert model parameters if architecture.find("BERT") != -1: - batch_size = 32 - early_stop = True - max_sequence_length = 150 - max_epoch = 50 - embeddings_name = None + o_batch_size = 32 + o_early_stop = True + o_max_sequence_length = 150 + o_max_epoch = 50 + o_embeddings_name = None if dataset_type == 'conll2012': - multiprocessing = False + o_multiprocessing = False + + if patience > 0: + o_patience = patience - if patience == -1: - patience = 5 + if batch_size > 0: + o_batch_size = batch_size - if batch_size == -1: - batch_size = 32 + if max_sequence_length > 0: + o_max_sequence_length = max_sequence_length - if max_sequence_length == -1: - max_sequence_length = 300 + if max_epoch > 0: + o_max_epoch = max_epoch - return batch_size, max_sequence_length, patience, recurrent_dropout, early_stop, max_epoch, embeddings_name, word_lstm_units, multiprocessing + if early_stop is not None: + o_early_stop = early_stop + + return o_batch_size, o_max_sequence_length, o_patience, o_recurrent_dropout, o_early_stop, o_max_epoch, o_embeddings_name, o_word_lstm_units, o_multiprocessing # train a model with all available for a given dataset def train(dataset_type='conll2003', lang='en', embeddings_name=None, architecture='BidLSTM_CRF', - transformer=None, data_path=None, use_ELMo=False, max_sequence_length=-1, batch_size=-1, patience=-1, - learning_rate=None, multi_gpu=False): + + transformer=None, data_path=None, use_ELMo=False, max_sequence_length=-1, + batch_size=-1, patience=-1, learning_rate=None, max_epoch=-1, early_stop=None, multi_gpu=False): batch_size, max_sequence_length, patience, recurrent_dropout, early_stop, max_epoch, embeddings_name, word_lstm_units, multiprocessing = \ - configure(architecture, dataset_type, lang, embeddings_name, use_ELMo, max_sequence_length, batch_size, patience) + configure(architecture, dataset_type, lang, embeddings_name, use_ELMo, max_sequence_length, batch_size, patience, max_epoch, early_stop) if (dataset_type == 'conll2003') and (lang == 'en'): print('Loading data...') @@ -197,11 +210,13 @@ def train_eval(embeddings_name=None, batch_size=-1, max_sequence_length=-1, learning_rate=None, + max_epoch=-1, + early_stop=None, multi_gpu=False): batch_size, max_sequence_length, patience, recurrent_dropout, early_stop, max_epoch, embeddings_name, word_lstm_units, multiprocessing = \ configure(architecture, dataset_type, lang, embeddings_name, use_ELMo, - max_sequence_length=max_sequence_length, batch_size=batch_size, patience=patience) + max_sequence_length=max_sequence_length, batch_size=batch_size, patience=patience, max_epoch=max_epoch, early_stop=early_stop) if (dataset_type == 'conll2003') and (lang == 'en'): print('Loading CoNLL 2003 data...') @@ -616,15 +631,23 @@ def annotate(output_format, parser.add_argument("--patience", type=int, default=-1, help="patience, number of extra epochs to perform after " "the best epoch before stopping a training.") parser.add_argument("--learning-rate", type=float, default=None, help="Initial learning rate") + + parser.add_argument("--max-epoch", type=int, default=-1, + help="Maximum number of epochs.") + parser.add_argument("--early-stop", type=t_or_f, default=None, + help="Force early training 
termination when metrics scores are not improving " + + "after a number of epochs equals to the patience parameter.") + parser.add_argument("--multi-gpu", default=False, help="Enable the support for distributed computing (the batch size needs to be set accordingly using --batch-size)", action="store_true") + args = parser.parse_args() action = args.action if action not in ('train', 'tag', 'eval', 'train_eval'): - print('action not specifed, must be one of [train, train_eval, eval, tag]') + print('action not specified, must be one of [train, train_eval, eval, tag]') lang = args.lang dataset_type = args.dataset_type train_with_validation_set = args.train_with_validation_set @@ -640,6 +663,8 @@ def annotate(output_format, max_sequence_length = args.max_sequence_length batch_size = args.batch_size learning_rate = args.learning_rate + max_epoch = args.max_epoch + early_stop = args.early_stop multi_gpu = args.multi_gpu # name of embeddings refers to the file delft/resources-registry.json @@ -661,6 +686,8 @@ def annotate(output_format, batch_size=batch_size, patience=patience, learning_rate=learning_rate, + max_epoch=max_epoch, + early_stop=early_stop, multi_gpu=multi_gpu ) @@ -681,6 +708,8 @@ def annotate(output_format, batch_size=batch_size, patience=patience, learning_rate=learning_rate, + max_epoch=max_epoch, + early_stop=early_stop, multi_gpu=multi_gpu ) diff --git a/delft/utilities/Utilities.py b/delft/utilities/Utilities.py index 6c2ddd8..1821413 100644 --- a/delft/utilities/Utilities.py +++ b/delft/utilities/Utilities.py @@ -617,3 +617,13 @@ def longest_row(array): convert_conll2012_to_iob2(data_path, output_path) elif dataset_type == 'ontonotes': ontonotes_conll2012_names(data_path, output_path) + + +def t_or_f(arg): + ua = str(arg).upper() + if 'TRUE'.startswith(ua): + return True + elif 'FALSE'.startswith(ua): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected. Omit this option to use default values.') \ No newline at end of file diff --git a/doc/ner.md b/doc/ner.md index fb5f076..b439877 100644 --- a/doc/ner.md +++ b/doc/ner.md @@ -34,7 +34,7 @@ Results with transformer fine-tuning for CoNLL-2003 NER dataset, including a fin | --- | --- | --- | --- | | BERT | bert-base-cased | DeLFT | 91.19 | | BERT_CRF | bert-base-cased +CRF| DeLFT | 91.25 | -| BERT_ChainCRF | bert-base-cased +CRF| DeLFT | | +| BERT_ChainCRF | bert-base-cased +CRF| DeLFT | 91.22 | | BERT | roberta-base | DeLFT | 91.64 | Note: DeLFT uses `BERT` as architecture name for transformers in general, but the transformer model could be in principle any transformer variants preset in HuggingFace Hub. DeLFT supports 2 implementations of a CRF layer to be combined with RNN and transformer architectures: `CRF` based on TensorFlow Addons and `ChainCRF` a custom implementation. Both should produce similar accuracy results, but `ChainCRF` is significantly faster and robust.
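The following stand-alone sketch is not part of the patch; it is an illustration (the helper resolve_early_stop is invented for the example) of the behaviour the diff introduces: t_or_f converts the --early-stop string into a boolean, and the o_early_stop pattern in each configure() keeps the model-specific default unless the option was explicitly given.

import argparse

def t_or_f(arg):
    # Same prefix-matching rule as the helper added to delft/utilities/Utilities.py:
    # 't', 'true', 'TRUE' map to True; 'f', 'false', 'FALSE' map to False; anything else is rejected.
    ua = str(arg).upper()
    if 'TRUE'.startswith(ua):
        return True
    elif 'FALSE'.startswith(ua):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')

def resolve_early_stop(model_default, cli_value):
    # Illustrative helper (not in the patch): keep the model-specific default
    # unless --early-stop was passed on the command line (cli_value is not None).
    o_early_stop = model_default
    if cli_value is not None:
        o_early_stop = cli_value
    return o_early_stop

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--early-stop", type=t_or_f, default=None)
    args = parser.parse_args()
    # For a transformer-style model whose default is early_stop=False,
    # the default survives unless the flag is given explicitly.
    print(resolve_early_stop(False, args.early_stop))

Note that t_or_f uses prefix matching, so single letters such as 't' or 'F' are accepted, and omitting the option leaves the per-model default untouched.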
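Assuming the usual DeLFT command-line layout (positional model/action arguments as in the existing applications; data paths and available resources depend on the local setup), the new options could be exercised along these lines — both invocations are illustrative, not taken from the patch:

    python3 delft/applications/grobidTagger.py citation train --architecture BidLSTM_CRF --max-epoch 80 --early-stop false --patience 10
    python3 delft/applications/nerTagger.py train_eval --dataset-type conll2003 --architecture BERT_CRF --transformer bert-base-cased --max-epoch 20 --early-stop true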