From 78f5ac66cf2110f2d6b61921e3acff800041c56b Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 24 Aug 2023 13:42:19 +0900 Subject: [PATCH 1/6] add parameter --max-epoch to disable early_stop and override the default values --- delft/applications/datasetTagger.py | 14 ++++- delft/applications/grobidTagger.py | 13 ++++- delft/applications/insultTagger.py | 40 ++++++++++--- delft/applications/nerTagger.py | 90 +++++++++++++++++------------ 4 files changed, 107 insertions(+), 50 deletions(-) diff --git a/delft/applications/datasetTagger.py b/delft/applications/datasetTagger.py index c414b62a..bd8d37af 100644 --- a/delft/applications/datasetTagger.py +++ b/delft/applications/datasetTagger.py @@ -10,7 +10,8 @@ from delft.sequenceLabelling.reader import load_data_and_labels_json_offsets from delft.utilities.misc import parse_number_ranges -def configure(architecture, output_path=None, max_sequence_length=-1, batch_size=-1, embeddings_name=None, +def configure(architecture, output_path=None, max_sequence_length=-1, + batch_size=-1, embeddings_name=None, max_epoch=-1, use_ELMo=False, patience=-1): """ Set up the default parameters based on the model type. @@ -56,6 +57,8 @@ def configure(architecture, output_path=None, max_sequence_length=-1, batch_size if max_epoch == -1: max_epoch = 60 + else: + early_stop = False if patience == -1: patience = 5 @@ -285,6 +288,8 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non parser.add_argument("--patience", type=int, default=-1, help="patience, number of extra epochs to perform after " "the best epoch before stopping a training.") parser.add_argument("--learning-rate", type=float, default=None, help="Initial learning rate") + parser.add_argument("--max-epoch", type=int, default=-1, + help="Maximum number of epochs. 
If specified, it is assumed that earlyStop=False.") args = parser.parse_args() @@ -299,6 +304,7 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non use_ELMo = args.use_ELMo patience = args.patience learning_rate = args.learning_rate + max_epoch = args.max_epoch if transformer is None and embeddings_name is None: # default word embeddings @@ -314,7 +320,8 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non batch_size=batch_size, use_ELMo=use_ELMo, patience=patience, - learning_rate=learning_rate) + learning_rate=learning_rate, + max_epoch=max_epoch) if action == "eval": if args.fold_count is not None and args.fold_count > 1: @@ -337,7 +344,8 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non batch_size=batch_size, use_ELMo=use_ELMo, patience=patience, - learning_rate=learning_rate) + learning_rate=learning_rate, + max_epoch=max_epoch) if action == "tag": someTexts = [] diff --git a/delft/applications/grobidTagger.py b/delft/applications/grobidTagger.py index 0aee465c..54a1f182 100644 --- a/delft/applications/grobidTagger.py +++ b/delft/applications/grobidTagger.py @@ -26,6 +26,10 @@ def configure(model, architecture, output_path=None, max_sequence_length=-1, bat multiprocessing = True early_stop = True + # If max_epoch is set we disable the early_stop + if max_epoch > 0: + early_stop = False + if architecture and "BERT" in architecture: # architectures with some transformer layer/embeddings inside @@ -396,6 +400,8 @@ class Tasks: parser.add_argument("--patience", type=int, default=-1, help="patience, number of extra epochs to perform after " "the best epoch before stopping a training.") parser.add_argument("--learning-rate", type=float, default=None, help="Initial learning rate") + parser.add_argument("--max-epoch", type=int, default=-1, + help="Maximum number of epochs. 
If specified, it is assumed that earlyStop=False.") @@ -415,6 +421,7 @@ class Tasks: incremental = args.incremental patience = args.patience learning_rate = args.learning_rate + max_epoch = args.max_epoch if architecture is None: raise ValueError("A model architecture has to be specified: " + str(architectures)) @@ -436,7 +443,8 @@ class Tasks: incremental=incremental, input_model_path=input_model_path, patience=patience, - learning_rate=learning_rate) + learning_rate=learning_rate, + max_epoch=max_epoch) if action == Tasks.EVAL: if args.fold_count is not None and args.fold_count > 1: @@ -461,7 +469,8 @@ class Tasks: use_ELMo=use_ELMo, incremental=incremental, input_model_path=input_model_path, - learning_rate=learning_rate) + learning_rate=learning_rate, + max_epoch=max_epoch) if action == Tasks.TAG: someTexts = [] diff --git a/delft/applications/insultTagger.py b/delft/applications/insultTagger.py index 0d00a1b2..2be93985 100644 --- a/delft/applications/insultTagger.py +++ b/delft/applications/insultTagger.py @@ -5,24 +5,35 @@ import argparse import time -def configure(architecture, embeddings_name): - batch_size = 20 +def configure(architecture, embeddings_name, batch_size=-1, max_epoch=-1): + maxlen = 300 patience = 5 early_stop = True - max_epoch = 50 + if max_epoch == -1: + max_epoch = 50 + else: + early_stop = False + + if batch_size == -1: + batch_size = 20 # default bert model parameters if architecture.find("BERT") != -1: - batch_size = 10 + if batch_size == -1: + batch_size = 10 early_stop = False - max_epoch = 3 + if max_epoch == -1: + max_epoch = 3 + else: + early_stop = False + embeddings_name = None return batch_size, maxlen, patience, early_stop, max_epoch, embeddings_name -def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None, use_ELMo=False, learning_rate=None): - batch_size, maxlen, patience, early_stop, max_epoch, embeddings_name = configure(architecture, embeddings_name) +def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None, use_ELMo=False, learning_rate=None, batch_size=-1, max_epoch=-1): + batch_size, maxlen, patience, early_stop, max_epoch, embeddings_name = configure(architecture, embeddings_name, batch_size, max_epoch) root = 'data/sequenceLabelling/toxic/' @@ -115,7 +126,10 @@ def annotate(texts, output_format, architecture='BidLSTM_CRF', transformer=None, ) parser.add_argument("--use-ELMo", action="store_true", help="Use ELMo contextual embeddings") parser.add_argument("--learning-rate", type=float, default=None, help="Initial learning rate") - + parser.add_argument("--max-epoch", type=int, default=-1, + help="Maximum number of epochs. 
If specified, it is assumed that earlyStop=False.") + parser.add_argument("--batch-size", type=int, default=-1, help="batch-size parameter to be used.") + args = parser.parse_args() if args.action not in ('train', 'tag'): @@ -126,13 +140,21 @@ def annotate(texts, output_format, architecture='BidLSTM_CRF', transformer=None, transformer = args.transformer use_ELMo = args.use_ELMo learning_rate = args.learning_rate + batch_size = args.batch_size + max_epoch = args.max_epoch if transformer == None and embeddings_name == None: # default word embeddings embeddings_name = "glove-840B" if args.action == 'train': - train(embeddings_name=embeddings_name, architecture=architecture, transformer=transformer, use_ELMo=use_ELMo, learning_rate=learning_rate) + train(embeddings_name=embeddings_name, + architecture=architecture, + transformer=transformer, + use_ELMo=use_ELMo, + learning_rate=learning_rate, + batch_size=batch_size, + max_epoch=max_epoch) if args.action == 'tag': someTexts = ['This is a gentle test.', diff --git a/delft/applications/nerTagger.py b/delft/applications/nerTagger.py index 36a4deab..43abeb62 100644 --- a/delft/applications/nerTagger.py +++ b/delft/applications/nerTagger.py @@ -8,69 +8,81 @@ import argparse import time -def configure(architecture, dataset_type, lang, embeddings_name, use_ELMo, max_sequence_length=-1, batch_size=-1, - patience=-1): +def configure(architecture, dataset_type, lang, embeddings_name, + use_ELMo, max_sequence_length=-1, batch_size=-1, + patience=-1, max_epoch=-1): - max_epoch = 60 - early_stop = True - multiprocessing = True + o_max_epoch = 60 + o_early_stop = True + o_multiprocessing = True + o_max_sequence_length = 300 + o_patience = 5 + o_batch_size = 32 # general RNN word embeddings input if embeddings_name is None: - embeddings_name = 'glove-840B' + o_embeddings_name = 'glove-840B' if lang == 'en': if dataset_type == 'conll2012': - embeddings_name = 'fasttext-crawl' + o_embeddings_name = 'fasttext-crawl' elif lang == 'fr': - embeddings_name = 'wiki.fr' + o_embeddings_name = 'wiki.fr' + else: + o_embeddings_name = embeddings_name if lang == 'fr': - multiprocessing = False + o_multiprocessing = False if architecture == "BidLSTM_CNN_CRF": - word_lstm_units = 200 - max_epoch = 30 - recurrent_dropout = 0.5 + o_word_lstm_units = 200 + o_max_epoch = 30 + o_recurrent_dropout = 0.5 else: - word_lstm_units = 100 - max_epoch = 50 - recurrent_dropout = 0.5 + o_word_lstm_units = 100 + o_max_epoch = 50 + o_recurrent_dropout = 0.5 if use_ELMo: # following should be done for predicting if max sequence length permits, it also boosts the runtime with ELMo embeddings signicantly # but requires more GPU memory - batch_size = 128 - max_sequence_length = 150 + o_batch_size = 128 + o_max_sequence_length = 150 # default bert model parameters if architecture.find("BERT") != -1: - batch_size = 32 - early_stop = True - max_sequence_length = 150 - max_epoch = 50 - embeddings_name = None + o_batch_size = 32 + o_early_stop = True + o_max_sequence_length = 150 + o_max_epoch = 50 + o_embeddings_name = None if dataset_type == 'conll2012': - multiprocessing = False + o_multiprocessing = False + + if patience > 0: + o_patience = patience + + if batch_size > 0: + o_batch_size = batch_size - if patience == -1: - patience = 5 + if max_sequence_length > 0: + o_max_sequence_length = max_sequence_length - if batch_size == -1: - batch_size = 32 + if max_epoch > 0: + o_max_epoch = max_epoch + o_early_stop = False - if max_sequence_length == -1: - max_sequence_length = 300 - return 
batch_size, max_sequence_length, patience, recurrent_dropout, early_stop, max_epoch, embeddings_name, word_lstm_units, multiprocessing + return o_batch_size, o_max_sequence_length, o_patience, o_recurrent_dropout, o_early_stop, o_max_epoch, o_embeddings_name, o_word_lstm_units, o_multiprocessing # train a model with all available for a given dataset def train(dataset_type='conll2003', lang='en', embeddings_name=None, architecture='BidLSTM_CRF', - transformer=None, data_path=None, use_ELMo=False, max_sequence_length=-1, batch_size=-1, patience=-1, learning_rate=None): + transformer=None, data_path=None, use_ELMo=False, max_sequence_length=-1, + batch_size=-1, patience=-1, learning_rate=None, max_epoch=-1): batch_size, max_sequence_length, patience, recurrent_dropout, early_stop, max_epoch, embeddings_name, word_lstm_units, multiprocessing = \ - configure(architecture, dataset_type, lang, embeddings_name, use_ELMo, max_sequence_length, batch_size, patience) + configure(architecture, dataset_type, lang, embeddings_name, use_ELMo, max_sequence_length, batch_size, patience, max_epoch) if (dataset_type == 'conll2003') and (lang == 'en'): print('Loading data...') @@ -195,11 +207,12 @@ def train_eval(embeddings_name=None, patience=-1, batch_size=-1, max_sequence_length=-1, - learning_rate=None): + learning_rate=None, + max_epoch=-1): batch_size, max_sequence_length, patience, recurrent_dropout, early_stop, max_epoch, embeddings_name, word_lstm_units, multiprocessing = \ configure(architecture, dataset_type, lang, embeddings_name, use_ELMo, - max_sequence_length=max_sequence_length, batch_size=batch_size, patience=patience) + max_sequence_length=max_sequence_length, batch_size=batch_size, patience=patience, max_epoch=max_epoch) if (dataset_type == 'conll2003') and (lang == 'en'): print('Loading CoNLL 2003 data...') @@ -613,6 +626,8 @@ def annotate(output_format, parser.add_argument("--patience", type=int, default=-1, help="patience, number of extra epochs to perform after " "the best epoch before stopping a training.") parser.add_argument("--learning-rate", type=float, default=None, help="Initial learning rate") + parser.add_argument("--max-epoch", type=int, default=-1, + help="Maximum number of epochs. 
If specified, it is assumed that earlyStop=False.") args = parser.parse_args() @@ -634,6 +649,7 @@ def annotate(output_format, max_sequence_length = args.max_sequence_length batch_size = args.batch_size learning_rate = args.learning_rate + max_epoch = args.max_epoch # name of embeddings refers to the file delft/resources-registry.json # be sure to use here the same name as in the registry ('glove-840B', 'fasttext-crawl', 'word2vec'), @@ -653,7 +669,8 @@ def annotate(output_format, max_sequence_length=max_sequence_length, batch_size=batch_size, patience=patience, - learning_rate=learning_rate + learning_rate=learning_rate, + max_epoch=max_epoch ) if action == 'train_eval': @@ -672,7 +689,8 @@ def annotate(output_format, max_sequence_length=max_sequence_length, batch_size=batch_size, patience=patience, - learning_rate=learning_rate + learning_rate=learning_rate, + max_epoch=max_epoch ) if action == 'eval': From 963e57a09ee0e7b9d3a253655714ba2a7b33e5e3 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 25 Aug 2023 11:07:26 +0900 Subject: [PATCH 2/6] add --early-stop true/false and decouple from max_epoch --- delft/applications/datasetTagger.py | 26 +++++++++++++-------- delft/applications/grobidTagger.py | 36 +++++++++++++++++------------ delft/applications/insultTagger.py | 29 ++++++++++++++--------- delft/applications/nerTagger.py | 19 ++++++++++----- 4 files changed, 69 insertions(+), 41 deletions(-) diff --git a/delft/applications/datasetTagger.py b/delft/applications/datasetTagger.py index bd8d37af..49d5a8b4 100644 --- a/delft/applications/datasetTagger.py +++ b/delft/applications/datasetTagger.py @@ -12,14 +12,14 @@ def configure(architecture, output_path=None, max_sequence_length=-1, batch_size=-1, embeddings_name=None, - max_epoch=-1, use_ELMo=False, patience=-1): + max_epoch=-1, use_ELMo=False, patience=-1, early_stop=None): """ Set up the default parameters based on the model type. 
""" model_name = 'datasets' multiprocessing = True - early_stop = True + o_early_stop = True if "BERT" in architecture: # architectures with some transformer layer/embeddings inside @@ -57,13 +57,14 @@ def configure(architecture, output_path=None, max_sequence_length=-1, if max_epoch == -1: max_epoch = 60 - else: - early_stop = False + + if early_stop is not None: + o_early_stop = early_stop if patience == -1: patience = 5 - return batch_size, max_sequence_length, model_name, embeddings_name, max_epoch, multiprocessing, early_stop, patience + return batch_size, max_sequence_length, model_name, embeddings_name, max_epoch, multiprocessing, o_early_stop, patience # train a model with all available data @@ -71,7 +72,7 @@ def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None, input_path=None, output_path=None, fold_count=1, features_indices=None, max_sequence_length=-1, batch_size=-1, max_epoch=-1, use_ELMo=False, patience=-1, - learning_rate=None): + learning_rate=None, early_stop=None): print('Loading data...') if input_path is None: x_all1 = y_all1 = x_all2 = y_all2 = x_all3 = y_all3 = [] @@ -101,7 +102,8 @@ def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None, embeddings_name, max_epoch, use_ELMo, - patience) + patience, + early_stop) model = Sequence(model_name, recurrent_dropout=0.50, embeddings_name=embeddings_name, @@ -290,6 +292,9 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non parser.add_argument("--learning-rate", type=float, default=None, help="Initial learning rate") parser.add_argument("--max-epoch", type=int, default=-1, help="Maximum number of epochs. If specified, it is assumed that earlyStop=False.") + parser.add_argument("--early-stop", type=bool, default=None, + help="Force training early termination when evaluation scores at the end of " + "n epochs are not changing.") args = parser.parse_args() @@ -305,6 +310,7 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non patience = args.patience learning_rate = args.learning_rate max_epoch = args.max_epoch + early_stop = args.early_stop if transformer is None and embeddings_name is None: # default word embeddings @@ -321,7 +327,8 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non use_ELMo=use_ELMo, patience=patience, learning_rate=learning_rate, - max_epoch=max_epoch) + max_epoch=max_epoch, + early_stop=early_stop) if action == "eval": if args.fold_count is not None and args.fold_count > 1: @@ -345,7 +352,8 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non use_ELMo=use_ELMo, patience=patience, learning_rate=learning_rate, - max_epoch=max_epoch) + max_epoch=max_epoch, + early_stop=early_stop) if action == "tag": someTexts = [] diff --git a/delft/applications/grobidTagger.py b/delft/applications/grobidTagger.py index 54a1f182..4ba6e9e1 100644 --- a/delft/applications/grobidTagger.py +++ b/delft/applications/grobidTagger.py @@ -14,7 +14,7 @@ def configure(model, architecture, output_path=None, max_sequence_length=-1, batch_size=-1, - embeddings_name=None, max_epoch=-1, use_ELMo=False, patience=-1): + embeddings_name=None, max_epoch=-1, use_ELMo=False, patience=-1, early_stop=None): """ Set up the default parameters based on the model type. 
""" @@ -24,11 +24,7 @@ def configure(model, architecture, output_path=None, max_sequence_length=-1, bat model_name = 'grobid-' + model multiprocessing = True - early_stop = True - - # If max_epoch is set we disable the early_stop - if max_epoch > 0: - early_stop = False + o_early_stop = True if architecture and "BERT" in architecture: # architectures with some transformer layer/embeddings inside @@ -60,7 +56,7 @@ def configure(model, architecture, output_path=None, max_sequence_length=-1, bat batch_size = 8 if max_sequence_length == -1: max_sequence_length = 512 - early_stop = False + o_early_stop = False if max_epoch == -1: max_epoch = 30 @@ -135,13 +131,16 @@ def configure(model, architecture, output_path=None, max_sequence_length=-1, bat if patience == -1: patience = 5 - return batch_size, max_sequence_length, model_name, embeddings_name, max_epoch, multiprocessing, early_stop, patience + if early_stop is not None: + o_early_stop = early_stop + + return batch_size, max_sequence_length, model_name, embeddings_name, max_epoch, multiprocessing, o_early_stop, patience # train a GROBID model with all available data def train(model, embeddings_name=None, architecture=None, transformer=None, input_path=None, output_path=None, features_indices=None, max_sequence_length=-1, batch_size=-1, max_epoch=-1, - use_ELMo=False, incremental=False, input_model_path=None, patience=-1, learning_rate=None): + use_ELMo=False, incremental=False, input_model_path=None, patience=-1, learning_rate=None, early_stop=None): print('Loading data...') if input_path == None: @@ -207,7 +206,8 @@ def train(model, embeddings_name=None, architecture=None, transformer=None, inpu def train_eval(model, embeddings_name=None, architecture='BidLSTM_CRF', transformer=None, input_path=None, output_path=None, fold_count=1, features_indices=None, max_sequence_length=-1, batch_size=-1, max_epoch=-1, - use_ELMo=False, incremental=False, input_model_path=None, patience=-1, learning_rate=None): + use_ELMo=False, incremental=False, input_model_path=None, patience=-1, + learning_rate=None, early_stop=None): print('Loading data...') if input_path is None: x_all, y_all, f_all = load_data_and_labels_crf_file('data/sequenceLabelling/grobid/'+model+'/'+model+'-060518.train') @@ -233,7 +233,8 @@ def train_eval(model, embeddings_name=None, architecture='BidLSTM_CRF', transfor embeddings_name, max_epoch, use_ELMo, - patience) + patience, + early_stop) model = Sequence(model_name, recurrent_dropout=0.50, embeddings_name=embeddings_name, @@ -401,9 +402,11 @@ class Tasks: "the best epoch before stopping a training.") parser.add_argument("--learning-rate", type=float, default=None, help="Initial learning rate") parser.add_argument("--max-epoch", type=int, default=-1, - help="Maximum number of epochs. 
If specified, it is assumed that earlyStop=False.") + help="Maximum number of epochs for training.") + parser.add_argument("--early-stop", type=bool, default=None, + help="Force training early termination when evaluation scores at the end of " + "n epochs are not changing.") - args = parser.parse_args() @@ -422,6 +425,7 @@ class Tasks: patience = args.patience learning_rate = args.learning_rate max_epoch = args.max_epoch + early_stop = args.early_stop if architecture is None: raise ValueError("A model architecture has to be specified: " + str(architectures)) @@ -444,7 +448,8 @@ class Tasks: input_model_path=input_model_path, patience=patience, learning_rate=learning_rate, - max_epoch=max_epoch) + max_epoch=max_epoch, + early_stop=early_stop) if action == Tasks.EVAL: if args.fold_count is not None and args.fold_count > 1: @@ -470,7 +475,8 @@ class Tasks: incremental=incremental, input_model_path=input_model_path, learning_rate=learning_rate, - max_epoch=max_epoch) + max_epoch=max_epoch, + early_stop=early_stop) if action == Tasks.TAG: someTexts = [] diff --git a/delft/applications/insultTagger.py b/delft/applications/insultTagger.py index 2be93985..b3243e66 100644 --- a/delft/applications/insultTagger.py +++ b/delft/applications/insultTagger.py @@ -5,15 +5,13 @@ import argparse import time -def configure(architecture, embeddings_name, batch_size=-1, max_epoch=-1): +def configure(architecture, embeddings_name, batch_size=-1, max_epoch=-1, early_stop=None): maxlen = 300 patience = 5 - early_stop = True + o_early_stop = True if max_epoch == -1: max_epoch = 50 - else: - early_stop = False if batch_size == -1: batch_size = 20 @@ -22,18 +20,22 @@ def configure(architecture, embeddings_name, batch_size=-1, max_epoch=-1): if architecture.find("BERT") != -1: if batch_size == -1: batch_size = 10 - early_stop = False + o_early_stop = False if max_epoch == -1: max_epoch = 3 - else: - early_stop = False embeddings_name = None - return batch_size, maxlen, patience, early_stop, max_epoch, embeddings_name + if early_stop is not None: + o_early_stop = early_stop + + return batch_size, maxlen, patience, o_early_stop, max_epoch, embeddings_name -def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None, use_ELMo=False, learning_rate=None, batch_size=-1, max_epoch=-1): - batch_size, maxlen, patience, early_stop, max_epoch, embeddings_name = configure(architecture, embeddings_name, batch_size, max_epoch) +def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None, + use_ELMo=False, learning_rate=None, + batch_size=-1, max_epoch=-1, early_stop=None): + batch_size, maxlen, patience, early_stop, max_epoch, embeddings_name = configure( + architecture, embeddings_name, batch_size, max_epoch, early_stop) root = 'data/sequenceLabelling/toxic/' @@ -129,6 +131,9 @@ def annotate(texts, output_format, architecture='BidLSTM_CRF', transformer=None, parser.add_argument("--max-epoch", type=int, default=-1, help="Maximum number of epochs. 
If specified, it is assumed that earlyStop=False.") parser.add_argument("--batch-size", type=int, default=-1, help="batch-size parameter to be used.") + parser.add_argument("--early-stop", type=bool, default=None, + help="Force training early termination when evaluation scores at the end of " + "n epochs are not changing.") args = parser.parse_args() @@ -142,6 +147,7 @@ def annotate(texts, output_format, architecture='BidLSTM_CRF', transformer=None, learning_rate = args.learning_rate batch_size = args.batch_size max_epoch = args.max_epoch + early_stop = args.early_stop if transformer == None and embeddings_name == None: # default word embeddings @@ -154,7 +160,8 @@ def annotate(texts, output_format, architecture='BidLSTM_CRF', transformer=None, use_ELMo=use_ELMo, learning_rate=learning_rate, batch_size=batch_size, - max_epoch=max_epoch) + max_epoch=max_epoch, + early_stop=early_stop) if args.action == 'tag': someTexts = ['This is a gentle test.', diff --git a/delft/applications/nerTagger.py b/delft/applications/nerTagger.py index 43abeb62..e7cb9d50 100644 --- a/delft/applications/nerTagger.py +++ b/delft/applications/nerTagger.py @@ -10,7 +10,7 @@ def configure(architecture, dataset_type, lang, embeddings_name, use_ELMo, max_sequence_length=-1, batch_size=-1, - patience=-1, max_epoch=-1): + patience=-1, max_epoch=-1, early_stop=None): o_max_epoch = 60 o_early_stop = True @@ -70,8 +70,9 @@ def configure(architecture, dataset_type, lang, embeddings_name, if max_epoch > 0: o_max_epoch = max_epoch - o_early_stop = False + if early_stop is not None: + o_early_stop = early_stop return o_batch_size, o_max_sequence_length, o_patience, o_recurrent_dropout, o_early_stop, o_max_epoch, o_embeddings_name, o_word_lstm_units, o_multiprocessing @@ -79,10 +80,10 @@ def configure(architecture, dataset_type, lang, embeddings_name, # train a model with all available for a given dataset def train(dataset_type='conll2003', lang='en', embeddings_name=None, architecture='BidLSTM_CRF', transformer=None, data_path=None, use_ELMo=False, max_sequence_length=-1, - batch_size=-1, patience=-1, learning_rate=None, max_epoch=-1): + batch_size=-1, patience=-1, learning_rate=None, max_epoch=-1, early_stop=None): batch_size, max_sequence_length, patience, recurrent_dropout, early_stop, max_epoch, embeddings_name, word_lstm_units, multiprocessing = \ - configure(architecture, dataset_type, lang, embeddings_name, use_ELMo, max_sequence_length, batch_size, patience, max_epoch) + configure(architecture, dataset_type, lang, embeddings_name, use_ELMo, max_sequence_length, batch_size, patience, max_epoch, early_stop) if (dataset_type == 'conll2003') and (lang == 'en'): print('Loading data...') @@ -628,6 +629,9 @@ def annotate(output_format, parser.add_argument("--learning-rate", type=float, default=None, help="Initial learning rate") parser.add_argument("--max-epoch", type=int, default=-1, help="Maximum number of epochs. 
If specified, it is assumed that earlyStop=False.") + parser.add_argument("--early-stop", type=bool, default=None, + help="Force training early termination when evaluation scores at the end of " + "n epochs are not changing.") args = parser.parse_args() @@ -650,6 +654,7 @@ def annotate(output_format, batch_size = args.batch_size learning_rate = args.learning_rate max_epoch = args.max_epoch + early_stop = args.early_stop # name of embeddings refers to the file delft/resources-registry.json # be sure to use here the same name as in the registry ('glove-840B', 'fasttext-crawl', 'word2vec'), @@ -670,7 +675,8 @@ def annotate(output_format, batch_size=batch_size, patience=patience, learning_rate=learning_rate, - max_epoch=max_epoch + max_epoch=max_epoch, + early_stop=early_stop ) if action == 'train_eval': @@ -690,7 +696,8 @@ def annotate(output_format, batch_size=batch_size, patience=patience, learning_rate=learning_rate, - max_epoch=max_epoch + max_epoch=max_epoch, + early_stop=early_stop ) if action == 'eval': From 1910551dbac6174d701b73b2a064a140c5ea8cfb Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 25 Aug 2023 11:45:37 +0900 Subject: [PATCH 3/6] read true/false value correctly --- delft/applications/datasetTagger.py | 8 +++++--- delft/applications/grobidTagger.py | 4 ++-- delft/applications/insultTagger.py | 7 +++++-- delft/applications/nerTagger.py | 9 +++++---- delft/utilities/Utilities.py | 10 ++++++++++ 5 files changed, 27 insertions(+), 11 deletions(-) diff --git a/delft/applications/datasetTagger.py b/delft/applications/datasetTagger.py index 49d5a8b4..1a09bae7 100644 --- a/delft/applications/datasetTagger.py +++ b/delft/applications/datasetTagger.py @@ -8,6 +8,7 @@ from delft.sequenceLabelling import Sequence from delft.sequenceLabelling.reader import load_data_and_labels_json_offsets +from delft.utilities.Utilities import t_or_f from delft.utilities.misc import parse_number_ranges def configure(architecture, output_path=None, max_sequence_length=-1, @@ -137,7 +138,7 @@ def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None, def train_eval(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None, input_path=None, output_path=None, fold_count=1, features_indices=None, max_sequence_length=-1, batch_size=-1, max_epoch=-1, use_ELMo=False, - patience=-1, learning_rate=None): + patience=-1, learning_rate=None, early_stop=None): print('Loading data...') if input_path is None: x_all1 = y_all1 = x_all2 = y_all2 = x_all3 = y_all3 = [] @@ -169,7 +170,8 @@ def train_eval(embeddings_name=None, architecture='BidLSTM_CRF', transformer=Non embeddings_name, max_epoch, use_ELMo, - patience=patience) + patience=patience, + early_stop=early_stop) model = Sequence(model_name, recurrent_dropout=0.50, embeddings_name=embeddings_name, @@ -292,7 +294,7 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non parser.add_argument("--learning-rate", type=float, default=None, help="Initial learning rate") parser.add_argument("--max-epoch", type=int, default=-1, help="Maximum number of epochs. 
If specified, it is assumed that earlyStop=False.") - parser.add_argument("--early-stop", type=bool, default=None, + parser.add_argument("--early-stop", type=t_or_f, default=None, help="Force training early termination when evaluation scores at the end of " "n epochs are not changing.") diff --git a/delft/applications/grobidTagger.py b/delft/applications/grobidTagger.py index 4ba6e9e1..9fd05f23 100644 --- a/delft/applications/grobidTagger.py +++ b/delft/applications/grobidTagger.py @@ -8,7 +8,7 @@ from delft.sequenceLabelling import Sequence from delft.sequenceLabelling.reader import load_data_and_labels_crf_file -from delft.utilities.Utilities import longest_row +from delft.utilities.Utilities import longest_row, t_or_f MODEL_LIST = ['affiliation-address', 'citation', 'date', 'header', 'name-citation', 'name-header', 'software', 'figure', 'table', 'reference-segmenter'] @@ -403,7 +403,7 @@ class Tasks: parser.add_argument("--learning-rate", type=float, default=None, help="Initial learning rate") parser.add_argument("--max-epoch", type=int, default=-1, help="Maximum number of epochs for training.") - parser.add_argument("--early-stop", type=bool, default=None, + parser.add_argument("--early-stop", type=t_or_f, default=None, help="Force training early termination when evaluation scores at the end of " "n epochs are not changing.") diff --git a/delft/applications/insultTagger.py b/delft/applications/insultTagger.py index b3243e66..d4cd4095 100644 --- a/delft/applications/insultTagger.py +++ b/delft/applications/insultTagger.py @@ -5,6 +5,9 @@ import argparse import time +from delft.utilities.Utilities import t_or_f + + def configure(architecture, embeddings_name, batch_size=-1, max_epoch=-1, early_stop=None): maxlen = 300 @@ -131,14 +134,14 @@ def annotate(texts, output_format, architecture='BidLSTM_CRF', transformer=None, parser.add_argument("--max-epoch", type=int, default=-1, help="Maximum number of epochs. 
If specified, it is assumed that earlyStop=False.") parser.add_argument("--batch-size", type=int, default=-1, help="batch-size parameter to be used.") - parser.add_argument("--early-stop", type=bool, default=None, + parser.add_argument("--early-stop", type=t_or_f, default=None, help="Force training early termination when evaluation scores at the end of " "n epochs are not changing.") args = parser.parse_args() if args.action not in ('train', 'tag'): - print('action not specifed, must be one of [train,tag]') + print('action not specified, must be one of [train,tag]') embeddings_name = args.embedding architecture = args.architecture diff --git a/delft/applications/nerTagger.py b/delft/applications/nerTagger.py index e7cb9d50..5c14e92e 100644 --- a/delft/applications/nerTagger.py +++ b/delft/applications/nerTagger.py @@ -1,7 +1,7 @@ import os import numpy as np from delft.sequenceLabelling import Sequence -from delft.utilities.Utilities import stats +from delft.utilities.Utilities import stats, t_or_f from delft.utilities.numpy import shuffle_arrays from delft.sequenceLabelling.reader import load_data_and_labels_conll, load_data_and_labels_lemonde, load_data_and_labels_ontonotes from sklearn.model_selection import train_test_split @@ -209,11 +209,12 @@ def train_eval(embeddings_name=None, batch_size=-1, max_sequence_length=-1, learning_rate=None, - max_epoch=-1): + max_epoch=-1, + early_stop=None): batch_size, max_sequence_length, patience, recurrent_dropout, early_stop, max_epoch, embeddings_name, word_lstm_units, multiprocessing = \ configure(architecture, dataset_type, lang, embeddings_name, use_ELMo, - max_sequence_length=max_sequence_length, batch_size=batch_size, patience=patience, max_epoch=max_epoch) + max_sequence_length=max_sequence_length, batch_size=batch_size, patience=patience, max_epoch=max_epoch, early_stop=early_stop) if (dataset_type == 'conll2003') and (lang == 'en'): print('Loading CoNLL 2003 data...') @@ -629,7 +630,7 @@ def annotate(output_format, parser.add_argument("--learning-rate", type=float, default=None, help="Initial learning rate") parser.add_argument("--max-epoch", type=int, default=-1, help="Maximum number of epochs. If specified, it is assumed that earlyStop=False.") - parser.add_argument("--early-stop", type=bool, default=None, + parser.add_argument("--early-stop", type=t_or_f, default=None, help="Force training early termination when evaluation scores at the end of " "n epochs are not changing.") diff --git a/delft/utilities/Utilities.py b/delft/utilities/Utilities.py index 6c2ddd8a..18214134 100644 --- a/delft/utilities/Utilities.py +++ b/delft/utilities/Utilities.py @@ -617,3 +617,13 @@ def longest_row(array): convert_conll2012_to_iob2(data_path, output_path) elif dataset_type == 'ontonotes': ontonotes_conll2012_names(data_path, output_path) + + +def t_or_f(arg): + ua = str(arg).upper() + if 'TRUE'.startswith(ua): + return True + elif 'FALSE'.startswith(ua): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected. 
Omit this option to use default values.') \ No newline at end of file From d98a71a47e6d41ce50accdf58835ffe54990e4e7 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 25 Aug 2023 11:50:37 +0900 Subject: [PATCH 4/6] miss param --- delft/applications/grobidTagger.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/delft/applications/grobidTagger.py b/delft/applications/grobidTagger.py index 9fd05f23..bc7e2185 100644 --- a/delft/applications/grobidTagger.py +++ b/delft/applications/grobidTagger.py @@ -166,7 +166,8 @@ def train(model, embeddings_name=None, architecture=None, transformer=None, inpu embeddings_name, max_epoch, use_ELMo, - patience) + patience, early_stop) + model = Sequence(model_name, recurrent_dropout=0.50, embeddings_name=embeddings_name, From 300cf60047ae7c5180e974b2e7102f66f0bf404d Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 25 Aug 2023 11:56:18 +0900 Subject: [PATCH 5/6] typo --- delft/applications/nerTagger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/delft/applications/nerTagger.py b/delft/applications/nerTagger.py index 5c14e92e..33407f01 100644 --- a/delft/applications/nerTagger.py +++ b/delft/applications/nerTagger.py @@ -638,7 +638,7 @@ def annotate(output_format, action = args.action if action not in ('train', 'tag', 'eval', 'train_eval'): - print('action not specifed, must be one of [train, train_eval, eval, tag]') + print('action not specified, must be one of [train, train_eval, eval, tag]') lang = args.lang dataset_type = args.dataset_type train_with_validation_set = args.train_with_validation_set From 194767d43ae0a7e8d67f75320868aa3859cbdfdd Mon Sep 17 00:00:00 2001 From: Patrice Lopez Date: Wed, 29 Nov 2023 17:21:13 +0100 Subject: [PATCH 6/6] rephrase improving versus changing --- delft/applications/datasetTagger.py | 4 ++-- delft/applications/grobidTagger.py | 4 ++-- delft/applications/insultTagger.py | 4 ++-- delft/applications/nerTagger.py | 4 ++-- doc/ner.md | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/delft/applications/datasetTagger.py b/delft/applications/datasetTagger.py index ec6d46d2..5535d1db 100644 --- a/delft/applications/datasetTagger.py +++ b/delft/applications/datasetTagger.py @@ -306,8 +306,8 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non parser.add_argument("--max-epoch", type=int, default=-1, help="Maximum number of epochs.") parser.add_argument("--early-stop", type=t_or_f, default=None, - help="Force training early termination when evaluation scores at the end of " - "n epochs are not changing.") + help="Force early training termination when metrics scores are not improving " + + "after a number of epochs equals to the patience parameter.") parser.add_argument("--multi-gpu", default=False, help="Enable the support for distributed computing (the batch size needs to be set accordingly using --batch-size)", action="store_true") diff --git a/delft/applications/grobidTagger.py b/delft/applications/grobidTagger.py index d4a7e30b..00d9bb91 100644 --- a/delft/applications/grobidTagger.py +++ b/delft/applications/grobidTagger.py @@ -422,8 +422,8 @@ class Tasks: parser.add_argument("--max-epoch", type=int, default=-1, help="Maximum number of epochs for training.") parser.add_argument("--early-stop", type=t_or_f, default=None, - help="Force training early termination when evaluation scores at the end of " - "n epochs are not changing.") + help="Force early training termination when metrics scores are not improving " + + "after a 
number of epochs equals to the patience parameter.") parser.add_argument("--multi-gpu", default=False, help="Enable the support for distributed computing (the batch size needs to be set accordingly using --batch-size)", diff --git a/delft/applications/insultTagger.py b/delft/applications/insultTagger.py index 3bae351c..f784d97f 100644 --- a/delft/applications/insultTagger.py +++ b/delft/applications/insultTagger.py @@ -137,8 +137,8 @@ def annotate(texts, output_format, architecture='BidLSTM_CRF', transformer=None, help="Maximum number of epochs.") parser.add_argument("--batch-size", type=int, default=-1, help="batch-size parameter to be used.") parser.add_argument("--early-stop", type=t_or_f, default=None, - help="Force training early termination when evaluation scores at the end of " - "n epochs are not changing.") + help="Force early training termination when metrics scores are not improving " + + "after a number of epochs equals to the patience parameter.") parser.add_argument("--multi-gpu", default=False, help="Enable the support for distributed computing (the batch size needs to be set accordingly using --batch-size)", diff --git a/delft/applications/nerTagger.py b/delft/applications/nerTagger.py index 808dcdfd..9206541b 100644 --- a/delft/applications/nerTagger.py +++ b/delft/applications/nerTagger.py @@ -635,8 +635,8 @@ def annotate(output_format, parser.add_argument("--max-epoch", type=int, default=-1, help="Maximum number of epochs.") parser.add_argument("--early-stop", type=t_or_f, default=None, - help="Force training early termination when evaluation scores at the end of " - "n epochs are not changing.") + help="Force early training termination when metrics scores are not improving " + + "after a number of epochs equals to the patience parameter.") parser.add_argument("--multi-gpu", default=False, help="Enable the support for distributed computing (the batch size needs to be set accordingly using --batch-size)", diff --git a/doc/ner.md b/doc/ner.md index fb5f0761..b439877e 100644 --- a/doc/ner.md +++ b/doc/ner.md @@ -34,7 +34,7 @@ Results with transformer fine-tuning for CoNLL-2003 NER dataset, including a fin | --- | --- | --- | --- | | BERT | bert-base-cased | DeLFT | 91.19 | | BERT_CRF | bert-base-cased +CRF| DeLFT | 91.25 | -| BERT_ChainCRF | bert-base-cased +CRF| DeLFT | | +| BERT_ChainCRF | bert-base-cased +CRF| DeLFT | 91.22 | | BERT | roberta-base | DeLFT | 91.64 | Note: DeLFT uses `BERT` as architecture name for transformers in general, but the transformer model could be in principle any transformer variants preset in HuggingFace Hub. DeLFT supports 2 implementations of a CRF layer to be combined with RNN and transformer architectures: `CRF` based on TensorFlow Addons and `ChainCRF` a custom implementation. Both should produce similar accuracy results, but `ChainCRF` is significantly faster and robust.
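
Usage note (illustrative, not part of the patch series): taken together, the patches above make `--max-epoch` and `--early-stop` independent command-line options, with `t_or_f` parsing the true/false string and `-1`/`None` meaning "keep the architecture default". The sketch below mirrors that precedence logic in isolation; the function name and the default numbers are placeholders for illustration, not the exact values used by each DeLFT application.

```python
# Minimal sketch of the precedence introduced by these patches: defaults depend on
# the architecture, explicit CLI values (-1 / None = "not set") override them, and
# early_stop is decoupled from max_epoch. Numbers here are illustrative only.

def resolve_training_params(architecture: str, max_epoch: int = -1,
                            early_stop=None, patience: int = -1):
    o_max_epoch = 50 if "BERT" in architecture else 60   # assumed defaults
    o_early_stop = True
    o_patience = 5

    if max_epoch > 0:            # --max-epoch given on the command line
        o_max_epoch = max_epoch
    if patience > 0:             # --patience given
        o_patience = patience
    if early_stop is not None:   # --early-stop true/false given (parsed by t_or_f)
        o_early_stop = early_stop

    return o_max_epoch, o_early_stop, o_patience

# Equivalent of "--max-epoch 100 --early-stop false" with a BidLSTM_CRF model:
print(resolve_training_params("BidLSTM_CRF", max_epoch=100, early_stop=False))
# -> (100, False, 5)
```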