From 6fce63f4b2f5983ee6e8caf6dc7592e31b1b512e Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 31 Mar 2020 14:55:16 +0900 Subject: [PATCH 01/27] adding a new generic text classifier and fixing some compatibility issues --- citationClassifier.py | 12 +- delft/textClassification/data_generator.py | 4 +- delft/textClassification/models.py | 2 +- textClassifier.py | 165 +++++++++++++++++++++ 4 files changed, 173 insertions(+), 10 deletions(-) create mode 100644 textClassifier.py diff --git a/citationClassifier.py b/citationClassifier.py index bf59b80b..f8238c1b 100644 --- a/citationClassifier.py +++ b/citationClassifier.py @@ -1,13 +1,11 @@ -import json -from delft.utilities.Embeddings import Embeddings -from delft.utilities.Utilities import split_data_and_labels -from delft.textClassification.reader import load_citation_sentiment_corpus -import delft.textClassification -from delft.textClassification import Classifier import argparse -import keras.backend as K +import json import time + +from delft.textClassification import Classifier from delft.textClassification.models import modelTypes +from delft.textClassification.reader import load_citation_sentiment_corpus +from delft.utilities.Utilities import split_data_and_labels list_classes = ["negative", "neutral", "positive"] class_weights = {0: 25., diff --git a/delft/textClassification/data_generator.py b/delft/textClassification/data_generator.py index c3a608e7..6b4c7c74 100644 --- a/delft/textClassification/data_generator.py +++ b/delft/textClassification/data_generator.py @@ -1,6 +1,6 @@ import numpy as np # seed is fixed for reproducibility -from delft.utilities.numpy import shuffle_pair_with_view +from delft.utilities.numpy import shuffle_triple_with_view np.random.seed(7) from tensorflow import set_random_seed @@ -41,7 +41,7 @@ def on_epoch_end(self): # shuffle dataset at each epoch if self.shuffle: - self.x, self.y = shuffle_pair_with_view(self.x, self.y) + self.x, self.y, _ = shuffle_triple_with_view(self.x, self.y) def __data_generation(self, index): 'Generates data containing batch_size samples' diff --git a/delft/textClassification/models.py b/delft/textClassification/models.py index 85834e9d..e7f31449 100644 --- a/delft/textClassification/models.py +++ b/delft/textClassification/models.py @@ -726,7 +726,7 @@ def train_model(model, list_classes, batch_size, max_epoch, use_roc_auc, class_w # we distinguish 1-class and multiclass problems if len(list_classes) is 1: - total_loss = log_loss(val_y, y_pred, labels=[0,1]) + total_loss = log_loss(val_y, y_pred, labels=[0, 1]) if len(np.unique(val_y)) == 1: # roc_auc_score sklearn implementation is not working in this case, it needs more balanced batches # a simple fix is to return the r2_score instead in this case (which is a regression score and not a loss) diff --git a/textClassifier.py b/textClassifier.py new file mode 100644 index 00000000..0d202045 --- /dev/null +++ b/textClassifier.py @@ -0,0 +1,165 @@ +import argparse +import json +import time + +from sklearn.preprocessing import LabelEncoder, OneHotEncoder + +from delft.textClassification import Classifier +from delft.textClassification.models import modelTypes +from delft.textClassification.reader import load_texts_and_classes +from delft.utilities.Utilities import split_data_and_labels + + +def train(model_name, input_file, embeddings_name, fold_count, use_ELMo=False, use_BERT=False, architecture="gru"): + batch_size, maxlen = configure(architecture, use_BERT, use_ELMo) + + print('loading ' + model_name + ' training 
corpus...') + xtr, y = load_texts_and_classes(input_file) + + list_classes = list(set([y_[0] for y_ in y])) + + y_one_hot = get_one_hot(y) + + model = Classifier(model_name, model_type=architecture, list_classes=list_classes, max_epoch=100, + fold_number=fold_count, patience=10, + use_roc_auc=True, embeddings_name=embeddings_name, use_ELMo=use_ELMo, use_BERT=use_BERT, + batch_size=batch_size, maxlen=maxlen, + class_weights=None) + + if fold_count == 1: + model.train(xtr, y_one_hot) + else: + model.train_nfold(xtr, y_one_hot) + # saving the model + model.save() + + +def get_one_hot(y): + label_encoder = LabelEncoder() + integer_encoded = label_encoder.fit_transform(y) + onehot_encoder = OneHotEncoder(sparse=False) + integer_encoded = integer_encoded.reshape(len(integer_encoded), 1) + y2 = onehot_encoder.fit_transform(integer_encoded) + return y2 + + +def configure(architecture, use_BERT=False, use_ELMo=False): + batch_size = 256 + if use_ELMo: + batch_size = 20 + elif use_BERT: + batch_size = 50 + maxlen = 120 + # default bert model parameters + if architecture.find("bert") != -1: + batch_size = 32 + return batch_size, maxlen + + +def train_and_eval(model_name, input_file, embeddings_name, fold_count, use_ELMo=False, use_BERT=False, + architecture="gru"): + batch_size, maxlen = configure(architecture, use_BERT, use_ELMo) + maxlen = 150 + + print('loading ' + model_name + ' corpus...') + xtr, y = load_texts_and_classes(input_file) + + list_classes = list(set([y_[0] for y_ in y])) + + y_one_hot = get_one_hot(y) + + model = Classifier(model_name, model_type=architecture, list_classes=list_classes, max_epoch=100, + fold_number=fold_count, patience=10, + use_roc_auc=True, embeddings_name=embeddings_name, use_ELMo=use_ELMo, use_BERT=use_BERT, + batch_size=batch_size, maxlen=maxlen, + class_weights=None) + + # segment train and eval sets + x_train, y_train, x_test, y_test = split_data_and_labels(xtr, y_one_hot, 0.9) + + if fold_count == 1: + model.train(x_train, y_train) + else: + model.train_nfold(x_train, y_train) + model.eval(x_test, y_test) + + # saving the model + model.save() + + +# classify a list of texts +def classify(model_name, texts, output_format, architecture="gru"): + # load model + model = Classifier(model_name, model_type=architecture) + model.load() + start_time = time.time() + result = model.predict(texts, output_format) + runtime = round(time.time() - start_time, 3) + if output_format is 'json': + result["runtime"] = runtime + else: + print("runtime: %s seconds " % (runtime)) + return result + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Sentiment classification of citation passages") + + parser.add_argument("action") + parser.add_argument("--fold-count", type=int, default=1) + parser.add_argument("--name", type=str, required=True, help="The name of the model") + parser.add_argument("--input", type=str, required=True, help="The file to be used for training/evaluation") + parser.add_argument("--architecture", default='gru', + help="type of model architecture to be used, one of " + str(modelTypes)) + parser.add_argument("--use-ELMo", action="store_true", help="Use ELMo contextual embeddings") + parser.add_argument("--use-BERT", action="store_true", help="Use BERT contextual embeddings") + parser.add_argument( + "--embedding", default='word2vec', + help=( + "The desired pre-trained word embeddings using their descriptions in the file" + " embedding-registry.json." 
+ " Be sure to use here the same name as in the registry ('glove-840B', 'fasttext-crawl', 'word2vec')," + " and that the path in the registry to the embedding file is correct on your system." + ) + ) + + args = parser.parse_args() + + if args.action not in ('train', 'train_eval', 'classify'): + print('action not specifed, must be one of [train,train_eval,classify]') + + embeddings_name = args.embedding + use_ELMo = args.use_ELMo + use_BERT = args.use_BERT + input_file = args.input + model_name = args.name + + architecture = args.architecture + if architecture not in modelTypes: + print('unknown model architecture, must be one of ' + str(modelTypes)) + + if args.action == 'train': + if args.fold_count < 1: + raise ValueError("fold-count should be equal or more than 1") + + train(model_name, input_file, embeddings_name, args.fold_count, use_ELMo=use_ELMo, use_BERT=use_BERT, + architecture=architecture) + + if args.action == 'train_eval': + if args.fold_count < 1: + raise ValueError("fold-count should be equal or more than 1") + + y_test = train_and_eval(model_name, input_file, embeddings_name, args.fold_count, use_ELMo=use_ELMo, + use_BERT=use_BERT, architecture=architecture) + + if args.action == 'classify': + someTexts = [ + 'One successful strategy [15] computes the set-similarity involving (multi-word) keyphrases about the mentions and the entities, collected from the KG.', + 'Unfortunately, fewer than half of the OCs in the DAML02 OC catalog (Dias et al. 2002) are suitable for use with the isochrone-fitting method because of the lack of a prominent main sequence, in addition to an absence of radial velocity and proper-motion data.', + 'However, we found that the pairwise approach LambdaMART [41] achieved the best performance on our datasets among most learning to rank algorithms.'] + result = classify(model_name, someTexts, "json", architecture=architecture) + print(json.dumps(result, sort_keys=False, indent=4, ensure_ascii=False)) + + # See https://github.com/tensorflow/tensorflow/issues/3388 + # K.clear_session() From db99fdfad9a787daa1f69d5c14260e42fd98fc34 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 10 Apr 2020 13:28:43 +0900 Subject: [PATCH 02/27] use the right shuffle method --- delft/textClassification/data_generator.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/delft/textClassification/data_generator.py b/delft/textClassification/data_generator.py index 6b4c7c74..fece0d7e 100644 --- a/delft/textClassification/data_generator.py +++ b/delft/textClassification/data_generator.py @@ -1,6 +1,6 @@ import numpy as np # seed is fixed for reproducibility -from delft.utilities.numpy import shuffle_triple_with_view +from delft.utilities.numpy import shuffle_pair_with_view np.random.seed(7) from tensorflow import set_random_seed @@ -14,8 +14,8 @@ class DataGenerator(keras.utils.Sequence): 'Generates data for Keras' def __init__(self, x, y, batch_size=256, maxlen=300, list_classes=[], embeddings=(), shuffle=True): 'Initialization' - self.x = x - self.y = y + self.original_x = self.x = x + self.original_y = self.y = y self.batch_size = batch_size self.maxlen = maxlen self.embeddings = embeddings @@ -36,12 +36,12 @@ def __getitem__(self, index): def on_epoch_end(self): # If we are predicting, we don't need to shuffle - if self.y is None: + if self.original_y is None: return # shuffle dataset at each epoch if self.shuffle: - self.x, self.y, _ = shuffle_triple_with_view(self.x, self.y) + self.x, self.y = shuffle_pair_with_view(self.original_x, 
self.original_y) def __data_generation(self, index): 'Generates data containing batch_size samples' From b07790913f8e2dc52fb7c2d0efe4c5d70ff67881 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 10 Apr 2020 17:09:15 +0900 Subject: [PATCH 03/27] avoid clean operations on cache when it's disable --- delft/utilities/Embeddings.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/delft/utilities/Embeddings.py b/delft/utilities/Embeddings.py index cbbcfa47..25593cbe 100644 --- a/delft/utilities/Embeddings.py +++ b/delft/utilities/Embeddings.py @@ -90,6 +90,7 @@ def __init__(self, name, path='./embedding-registry.json', lang='en', extension= self.make_embeddings_simple(name) self.static_embed_size = self.embed_size self.bilm = None + self.cache_training = False self.use_cache = use_cache # below init for using ELMo embeddings @@ -99,7 +100,8 @@ def __init__(self, name, path='./embedding-registry.json', lang='en', extension= self.embed_size = ELMo_embed_size + self.embed_size description = self._get_description('elmo-'+self.lang) self.env_ELMo = None - if description and description["cache-training"] and self.use_cache: + self.cache_training = description and description["cache-training"] and self.use_cache + if self.cache_training: self.embedding_ELMo_cache = os.path.join(description["path-cache"], "cache") # clean possible remaining cache self.clean_ELMo_cache() @@ -114,11 +116,12 @@ def __init__(self, name, path='./embedding-registry.json', lang='en', extension= #self.session = tf.Session() self.graph = tf.get_default_graph() #self.session.run(tf.global_variables_initializer()) - self.make_BERT() - self.embed_size = BERT_embed_size + self.embed_size + # self.make_BERT() + # self.embed_size = BERT_embed_size + self.embed_size description = self._get_description('bert-base-'+self.lang) self.env_BERT = None - if description and description["cache-training"] and self.use_cache: + self.cache_training = description and description["cache-training"] and self.use_cache + if self.cache_training: self.embedding_BERT_cache = os.path.join(description["path-cache"], "cache") # clean possible remaining cache self.clean_BERT_cache() @@ -734,6 +737,9 @@ def clean_BERT_cache(self): """ Delete BERT embeddings cache, this takes place normally after the completion of a training """ + if not self.cache_training: + return + # if cache subdirectory does not exist, we create it if not os.path.exists(self.embedding_BERT_cache): os.makedirs(self.embedding_BERT_cache) From cbd5f1483daadd312c8f64af0540c3cec7a58884 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 13 Apr 2020 13:27:50 +0900 Subject: [PATCH 04/27] commented too much code --- delft/utilities/Embeddings.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/delft/utilities/Embeddings.py b/delft/utilities/Embeddings.py index 25593cbe..c7d9ee28 100644 --- a/delft/utilities/Embeddings.py +++ b/delft/utilities/Embeddings.py @@ -116,8 +116,8 @@ def __init__(self, name, path='./embedding-registry.json', lang='en', extension= #self.session = tf.Session() self.graph = tf.get_default_graph() #self.session.run(tf.global_variables_initializer()) - # self.make_BERT() - # self.embed_size = BERT_embed_size + self.embed_size + self.make_BERT() + self.embed_size = BERT_embed_size + self.embed_size description = self._get_description('bert-base-'+self.lang) self.env_BERT = None self.cache_training = description and description["cache-training"] and self.use_cache From 86d0583a611a8aafdc2e71687a24d648ed354f56 Mon 
Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 9 Jul 2021 11:51:40 +0900 Subject: [PATCH 05/27] read tsv with a parser --- delft/textClassification/reader.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/delft/textClassification/reader.py b/delft/textClassification/reader.py index c6b4e0e2..00752f27 100644 --- a/delft/textClassification/reader.py +++ b/delft/textClassification/reader.py @@ -1,3 +1,5 @@ +import csv + import numpy as np import xml import gzip @@ -26,15 +28,14 @@ def load_texts_and_classes(filepath): classes = [] with open(filepath) as f: - for line in f: - line = line.strip() - if (len(line) is 0): + tsvreader = csv.reader(f, delimiter="\t") + for line in tsvreader: + if len(line) is 0: continue - pieces = line.split('\t') - if (len(pieces) < 3): + if len(line) < 3: print("Warning: number of fields in the data file too low for line:", line) - texts.append(pieces[1]) - classes.append(pieces[2:]) + texts.append(line[1]) + classes.append(line[2:]) return np.asarray(texts), np.asarray(classes) From 1aa8026ed58b5809bb24186511f9d1f09da46daf Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 9 Jul 2021 16:46:35 +0900 Subject: [PATCH 06/27] use an advanced splitter --- textClassifier.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/textClassifier.py b/textClassifier.py index 0d202045..3e4ef2c9 100644 --- a/textClassifier.py +++ b/textClassifier.py @@ -2,6 +2,7 @@ import json import time +from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder, OneHotEncoder from delft.textClassification import Classifier @@ -75,7 +76,7 @@ def train_and_eval(model_name, input_file, embeddings_name, fold_count, use_ELMo class_weights=None) # segment train and eval sets - x_train, y_train, x_test, y_test = split_data_and_labels(xtr, y_one_hot, 0.9) + x_train, x_test, y_train, y_test = train_test_split(xtr, y_one_hot, test_size=0.1) if fold_count == 1: model.train(x_train, y_train) From 56cdf08845807890ad5a8ad672badd22148eca96 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 12 Jul 2021 15:41:20 +0900 Subject: [PATCH 07/27] minor updates --- textClassifier.py | 48 +++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/textClassifier.py b/textClassifier.py index 3e4ef2c9..0fc4a915 100644 --- a/textClassifier.py +++ b/textClassifier.py @@ -11,6 +11,29 @@ from delft.utilities.Utilities import split_data_and_labels + +def get_one_hot(y): + label_encoder = LabelEncoder() + integer_encoded = label_encoder.fit_transform(y) + onehot_encoder = OneHotEncoder(sparse=False) + integer_encoded = integer_encoded.reshape(len(integer_encoded), 1) + y2 = onehot_encoder.fit_transform(integer_encoded) + return y2 + + +def configure(architecture, use_BERT=False, use_ELMo=False): + batch_size = 256 + if use_ELMo: + batch_size = 20 + elif use_BERT: + batch_size = 50 + maxlen = 300 + # default bert model parameters + if architecture.find("bert") != -1: + batch_size = 32 + maxlen = 300 + return batch_size, maxlen + def train(model_name, input_file, embeddings_name, fold_count, use_ELMo=False, use_BERT=False, architecture="gru"): batch_size, maxlen = configure(architecture, use_BERT, use_ELMo) @@ -35,32 +58,9 @@ def train(model_name, input_file, embeddings_name, fold_count, use_ELMo=False, u model.save() -def get_one_hot(y): - label_encoder = LabelEncoder() - integer_encoded = label_encoder.fit_transform(y) - onehot_encoder = OneHotEncoder(sparse=False) - 
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1) - y2 = onehot_encoder.fit_transform(integer_encoded) - return y2 - - -def configure(architecture, use_BERT=False, use_ELMo=False): - batch_size = 256 - if use_ELMo: - batch_size = 20 - elif use_BERT: - batch_size = 50 - maxlen = 120 - # default bert model parameters - if architecture.find("bert") != -1: - batch_size = 32 - return batch_size, maxlen - - def train_and_eval(model_name, input_file, embeddings_name, fold_count, use_ELMo=False, use_BERT=False, architecture="gru"): batch_size, maxlen = configure(architecture, use_BERT, use_ELMo) - maxlen = 150 print('loading ' + model_name + ' corpus...') xtr, y = load_texts_and_classes(input_file) @@ -105,7 +105,7 @@ def classify(model_name, texts, output_format, architecture="gru"): if __name__ == "__main__": parser = argparse.ArgumentParser( - description="Sentiment classification of citation passages") + description="General classification of text ") parser.add_argument("action") parser.add_argument("--fold-count", type=int, default=1) From e02bc6a487e0c958296222a5bd3e278225029edc Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 16 Jul 2021 15:03:41 +0900 Subject: [PATCH 08/27] improve memory usage --- delft/sequenceLabelling/data_generator.py | 2 +- delft/sequenceLabelling/preprocess.py | 6 +++--- delft/sequenceLabelling/reader.py | 23 +++++++++++++---------- delft/textClassification/reader.py | 18 +++++++++--------- delft/utilities/Utilities.py | 5 ++++- 5 files changed, 30 insertions(+), 24 deletions(-) diff --git a/delft/sequenceLabelling/data_generator.py b/delft/sequenceLabelling/data_generator.py index 97920f11..43d8bce5 100644 --- a/delft/sequenceLabelling/data_generator.py +++ b/delft/sequenceLabelling/data_generator.py @@ -146,7 +146,7 @@ def __data_generation(self, index): else: batches = self.preprocessor.transform(x_tokenized, extend=extend) - batch_c = np.asarray(batches[0]) + batch_c = np.asarray(batches[0], dtype='object') batch_l = batches[1] return batch_x, batch_c, batch_f, batch_a, batch_l, batch_y diff --git a/delft/sequenceLabelling/preprocess.py b/delft/sequenceLabelling/preprocess.py index 0a8c1da5..14f21c33 100644 --- a/delft/sequenceLabelling/preprocess.py +++ b/delft/sequenceLabelling/preprocess.py @@ -217,7 +217,7 @@ def transform(self, X, extend=False): out.append([0] * features_count) features_vector_padded, _ = pad_sequences(features_vector, [0] * features_count) - output = np.asarray(features_vector_padded) + output = np.asarray(features_vector_padded, dtype='object') return output @@ -347,12 +347,12 @@ def pad_sequence(self, char_ids, labels=None): labels_one_hot = None if labels: labels_padded, _ = pad_sequences(labels, 0) - labels_asarray = np.asarray(labels_padded) + labels_asarray = np.asarray(labels_padded, dtype='object') labels_one_hot = dense_to_one_hot(labels_asarray, len(self.vocab_tag), nlevels=2) if self.use_char_feature: char_ids, word_lengths = pad_sequences(char_ids, pad_tok=0, nlevels=2, max_char_length=self.max_char_length) - char_ids = np.asarray(char_ids) + char_ids = np.asarray(char_ids, dtype='object') return [char_ids], labels_one_hot else: return labels_one_hot diff --git a/delft/sequenceLabelling/reader.py b/delft/sequenceLabelling/reader.py index 4915d0b8..927b8098 100644 --- a/delft/sequenceLabelling/reader.py +++ b/delft/sequenceLabelling/reader.py @@ -87,10 +87,10 @@ def characters(self, content): self.accumulated += content def getSents(self): - return np.asarray(self.sents) + return np.asarray(self.sents, 
dtype='object') def getAllLabels(self): - return np.asarray(self.allLabels) + return np.asarray(self.allLabels, dtype='object') def clear(self): # clear the accumulator for re-use self.accumulated = "" @@ -220,10 +220,10 @@ def characters(self, content): self.accumulated += content def getSents(self): - return np.asarray(self.sents) + return np.asarray(self.sents, dtype='object') def getAllLabels(self): - return np.asarray(self.allLabels) + return np.asarray(self.allLabels, dtype='object') def clear(self): # clear the accumulator for re-use self.accumulated = "" @@ -307,7 +307,7 @@ def load_data_and_labels_crf_file(filepath): with open(filepath) as f: sents, labels, featureSets = load_data_and_labels_crf_content(f) - return np.asarray(sents), np.asarray(labels), np.asarray(featureSets) + return np.asarray(sents, dtype='object'), np.asarray(labels, dtype='object'), np.asarray(featureSets, dtype='object') def load_data_and_labels_crf_content(the_file): sents = [] @@ -363,7 +363,7 @@ def load_data_and_labels_crf_string(crfString): labels = [] featureSets = [] tokens, tags, features = [], [], [] - for line in crfString.splitlines(): + for line in crfString.splitlines(): line = line.strip(' \t') if len(line) == 0: if len(tokens) != 0: @@ -428,7 +428,10 @@ def load_data_crf_string(crfString): #print('sents:', len(sents)) #print('featureSets:', len(featureSets)) - return sents, featureSets + return ( + np.asarray(sents, dtype='object'), + np.asarray(featureSets, dtype='object') + ) def _translate_tags_grobid_to_IOB(tag): @@ -500,7 +503,7 @@ def load_data_and_labels_conll(filename): words.append(word) tags.append(tag) - return np.asarray(sents), np.asarray(labels) + return np.asarray(sents, dtype='object'), np.asarray(labels, dtype='object') def load_data_and_labels_lemonde(filepathXml): @@ -598,8 +601,8 @@ def load_data_and_labels_ontonotes(ontonotesRoot, lang='en'): total_tokens += len(sentence) print('nb total tokens:', total_tokens) - final_tokens = np.asarray(tokens) - final_label = np.asarray(labels) + final_tokens = np.asarray(tokens, dtype='object') + final_label = np.asarray(labels, dtype='object') return final_tokens, final_label diff --git a/delft/textClassification/reader.py b/delft/textClassification/reader.py index 00752f27..42833a0f 100644 --- a/delft/textClassification/reader.py +++ b/delft/textClassification/reader.py @@ -37,7 +37,7 @@ def load_texts_and_classes(filepath): texts.append(line[1]) classes.append(line[2:]) - return np.asarray(texts), np.asarray(classes) + return np.asarray(texts, dtype='object'), np.asarray(classes, dtype='object') def load_texts_and_classes_pandas(filepath): @@ -67,7 +67,7 @@ def load_texts_and_classes_pandas(filepath): classes = df.iloc[:,2:] classes_list = classes.values.tolist() - return np.asarray(texts_list), np.asarray(classes_list) + return np.asarray(texts_list, dtype='object'), np.asarray(classes_list, dtype='object') def load_texts_pandas(filepath): @@ -94,7 +94,7 @@ def load_texts_pandas(filepath): for j in range(0, df.shape[0]): texts_list.append(df.iloc[j,1]) - return np.asarray(texts_list) + return np.asarray(texts_list, dtype='object') def load_citation_sentiment_corpus(filepath): @@ -145,7 +145,7 @@ def load_citation_sentiment_corpus(filepath): polarity.append(0) polarities.append(polarity) - return np.asarray(texts), np.asarray(polarities) + return np.asarray(texts, dtype='object'), np.asarray(polarities, dtype='object') def load_dataseer_corpus_csv(filepath): @@ -195,7 +195,7 @@ def map_boolean(x): # otherwise we have the list of 
datatypes, and optionally subtypes and leaf datatypes datatypes = df.iloc[:,2] datatypes_list = datatypes.values.tolist() - datatypes_list = np.asarray(datatypes_list) + datatypes_list = np.asarray(datatypes_list, dtype='object') datatypes_list_lower = np.char.lower(datatypes_list) list_classes_datatypes = np.unique(datatypes_list_lower) datatypes_final = normalize_classes(datatypes_list_lower, list_classes_datatypes) @@ -207,7 +207,7 @@ def map_boolean(x): df = df[~df.datatype.str.contains("no_dataset")] datasubtypes = df.iloc[:,3] datasubtypes_list = datasubtypes.values.tolist() - datasubtypes_list = np.asarray(datasubtypes_list) + datasubtypes_list = np.asarray(datasubtypes_list, dtype='object') datasubtypes_list_lower = np.char.lower(datasubtypes_list) list_classes_datasubtypes = np.unique(datasubtypes_list_lower) datasubtypes_final = normalize_classes(datasubtypes_list_lower, list_classes_datasubtypes) @@ -225,10 +225,10 @@ def map_boolean(x): ''' if df.shape[1] == 3: - return np.asarray(texts_list), datatypes_final, None, None, list_classes_datatypes.tolist(), None, None + return np.asarray(texts_list, dtype='object'), datatypes_final, None, None, list_classes_datatypes.tolist(), None, None #elif df.shape[1] == 4: else: - return np.asarray(texts_list), datatypes_final, datasubtypes_final, None, list_classes_datatypes.tolist(), list_classes_datasubtypes.tolist(), None + return np.asarray(texts_list, dtype='object'), datatypes_final, datasubtypes_final, None, list_classes_datatypes.tolist(), list_classes_datasubtypes.tolist(), None ''' else: return np.asarray(texts_list), datatypes_final, datasubtypes_final, leafdatatypes_final, list_classes_datatypes.tolist(), list_classes_datasubtypes.tolist(), list_classes_leafdatatypes.tolist() @@ -270,7 +270,7 @@ def load_software_use_corpus_json(json_gz_file_path): list_possible_classes = np.unique(classes_list) classes_list_final = normalize_classes(classes_list, list_possible_classes) - texts_list_final = np.asarray(texts_list) + texts_list_final = np.asarray(texts_list, dtype='object') texts_list_final, classes_list_final, _ = shuffle_triple_with_view(texts_list_final, classes_list_final) diff --git a/delft/utilities/Utilities.py b/delft/utilities/Utilities.py index 5a13454f..2ecef965 100644 --- a/delft/utilities/Utilities.py +++ b/delft/utilities/Utilities.py @@ -105,7 +105,10 @@ def split_data_and_labels(x, y, ratio): else: x2.append(x[i]) y2.append(y[i]) - return np.asarray(x1),np.asarray(y1),np.asarray(x2),np.asarray(y2) + return np.asarray(x1, dtype='object'),\ + np.asarray(y1, dtype='object'),\ + np.asarray(x2, dtype='object'),\ + np.asarray(y2, dtype='object') url_regex = re.compile(r"https?:\/\/[a-zA-Z0-9_\-\.]+(?:com|org|fr|de|uk|se|net|edu|gov|int|mil|biz|info|br|ca|cn|in|jp|ru|au|us|ch|it|nl|no|es|pl|ir|cz|kr|co|gr|za|tw|hu|vn|be|mx|at|tr|dk|me|ar|fi|nz)\/?\b") From 6943f1d9640330e473ac151af6b3bcd091928030 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 24 Aug 2021 16:45:59 +0900 Subject: [PATCH 09/27] enforce the use of quotes (temporary) --- delft/textClassification/reader.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/delft/textClassification/reader.py b/delft/textClassification/reader.py index 42833a0f..d9015243 100644 --- a/delft/textClassification/reader.py +++ b/delft/textClassification/reader.py @@ -28,7 +28,8 @@ def load_texts_and_classes(filepath): classes = [] with open(filepath) as f: - tsvreader = csv.reader(f, delimiter="\t") + # TODO not in the original - need to revert it or add an 
option + tsvreader = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_ALL) for line in tsvreader: if len(line) is 0: continue From b0cf98603f991c05b45f6a8ba5505d381ce7d423 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 20 Sep 2022 17:33:42 +0900 Subject: [PATCH 10/27] cleanup after merging --- delft/applications/citationClassifier.py | 5 +- .../applications/textClassifier.py | 94 ++++++++++--------- delft/textClassification/reader.py | 60 +++++++----- 3 files changed, 87 insertions(+), 72 deletions(-) rename textClassifier.py => delft/applications/textClassifier.py (62%) diff --git a/delft/applications/citationClassifier.py b/delft/applications/citationClassifier.py index 4d954c9b..ff55c2d9 100644 --- a/delft/applications/citationClassifier.py +++ b/delft/applications/citationClassifier.py @@ -33,7 +33,6 @@ def configure(architecture): return batch_size, maxlen, patience, early_stop, max_epoch - def train(embeddings_name, fold_count, architecture="gru", transformer=None): batch_size, maxlen, patience, early_stop, max_epoch = configure(architecture) @@ -84,7 +83,7 @@ def classify(texts, output_format, architecture="gru", embeddings_name=None, tra start_time = time.time() result = model.predict(texts, output_format) runtime = round(time.time() - start_time, 3) - if output_format == 'json': + if output_format is 'json': result["runtime"] = runtime else: print("runtime: %s seconds " % (runtime)) @@ -122,7 +121,7 @@ def classify(texts, output_format, architecture="gru", embeddings_name=None, tra args = parser.parse_args() if args.action not in ('train', 'train_eval', 'classify'): - print('action not specifed, must be one of [train,train_eval,classify]') + print('action not specified, must be one of [train,train_eval,classify]') embeddings_name = args.embedding transformer = args.transformer diff --git a/textClassifier.py b/delft/applications/textClassifier.py similarity index 62% rename from textClassifier.py rename to delft/applications/textClassifier.py index 0fc4a915..b760a41a 100644 --- a/textClassifier.py +++ b/delft/applications/textClassifier.py @@ -6,11 +6,10 @@ from sklearn.preprocessing import LabelEncoder, OneHotEncoder from delft.textClassification import Classifier -from delft.textClassification.models import modelTypes -from delft.textClassification.reader import load_texts_and_classes -from delft.utilities.Utilities import split_data_and_labels - +from delft.textClassification.models import architectures +from delft.textClassification.reader import load_texts_and_classes_generic +pretrained_transformers_examples = [ 'bert-base-cased', 'bert-large-cased', 'allenai/scibert_scivocab_cased' ] def get_one_hot(y): label_encoder = LabelEncoder() @@ -21,34 +20,36 @@ def get_one_hot(y): return y2 -def configure(architecture, use_BERT=False, use_ELMo=False): +def configure(architecture): batch_size = 256 - if use_ELMo: - batch_size = 20 - elif use_BERT: - batch_size = 50 - maxlen = 300 + maxlen = 150 + patience = 5 + early_stop = True + max_epoch = 60 + # default bert model parameters - if architecture.find("bert") != -1: + if architecture == "bert": batch_size = 32 - maxlen = 300 - return batch_size, maxlen + early_stop = False + max_epoch = 3 + + return batch_size, maxlen, patience, early_stop, max_epoch -def train(model_name, input_file, embeddings_name, fold_count, use_ELMo=False, use_BERT=False, architecture="gru"): - batch_size, maxlen = configure(architecture, use_BERT, use_ELMo) +def train(model_name, input_file, embeddings_name, fold_count, architecture=None, 
transformer=None, + x_index=0, y_index=1): + batch_size, maxlen, patience, early_stop, max_epoch = configure(architecture) print('loading ' + model_name + ' training corpus...') - xtr, y = load_texts_and_classes(input_file) + xtr, y = load_texts_and_classes_generic(input_file, x_index, y_index) list_classes = list(set([y_[0] for y_ in y])) y_one_hot = get_one_hot(y) - model = Classifier(model_name, model_type=architecture, list_classes=list_classes, max_epoch=100, - fold_number=fold_count, patience=10, - use_roc_auc=True, embeddings_name=embeddings_name, use_ELMo=use_ELMo, use_BERT=use_BERT, - batch_size=batch_size, maxlen=maxlen, - class_weights=None) + model = Classifier(model_name, architecture=architecture, list_classes=list_classes, max_epoch=max_epoch, + fold_number=fold_count, patience=patience, transformer_name=transformer, + use_roc_auc=True, embeddings_name=embeddings_name, early_stop=early_stop, + batch_size=batch_size, maxlen=maxlen, class_weights=None) if fold_count == 1: model.train(xtr, y_one_hot) @@ -58,12 +59,12 @@ def train(model_name, input_file, embeddings_name, fold_count, use_ELMo=False, u model.save() -def train_and_eval(model_name, input_file, embeddings_name, fold_count, use_ELMo=False, use_BERT=False, +def train_and_eval(model_name, input_file, embeddings_name, fold_count,transformer=None, architecture="gru"): - batch_size, maxlen = configure(architecture, use_BERT, use_ELMo) + batch_size, maxlen = configure(architecture) print('loading ' + model_name + ' corpus...') - xtr, y = load_texts_and_classes(input_file) + xtr, y = load_texts_and_classes_generic(input_file) list_classes = list(set([y_[0] for y_ in y])) @@ -89,14 +90,14 @@ def train_and_eval(model_name, input_file, embeddings_name, fold_count, use_ELMo # classify a list of texts -def classify(model_name, texts, output_format, architecture="gru"): +def classify(texts, output_format, architecture="gru", transformer=None): # load model - model = Classifier(model_name, model_type=architecture) + model = Classifier(model_name, architecture=architecture, list_classes=list_classes, embeddings_name=embeddings_name, transformer_name=transformer) model.load() start_time = time.time() result = model.predict(texts, output_format) runtime = round(time.time() - start_time, 3) - if output_format is 'json': + if output_format == 'json': result["runtime"] = runtime else: print("runtime: %s seconds " % (runtime)) @@ -111,48 +112,55 @@ def classify(model_name, texts, output_format, architecture="gru"): parser.add_argument("--fold-count", type=int, default=1) parser.add_argument("--name", type=str, required=True, help="The name of the model") parser.add_argument("--input", type=str, required=True, help="The file to be used for training/evaluation") - parser.add_argument("--architecture", default='gru', - help="type of model architecture to be used, one of " + str(modelTypes)) - parser.add_argument("--use-ELMo", action="store_true", help="Use ELMo contextual embeddings") - parser.add_argument("--use-BERT", action="store_true", help="Use BERT contextual embeddings") + parser.add_argument("--x-index", type=int, required=True, help="Column index for the text assuming a TSV file") + parser.add_argument("--y-index", type=int, required=True, help="Column index for the classes assuming a TSV file") + parser.add_argument("--architecture", default='gru', choices=architectures, + help="type of model architecture to be used, one of " + str(architectures)) parser.add_argument( "--embedding", default='word2vec', help=( "The desired 
pre-trained word embeddings using their descriptions in the file" " embedding-registry.json." " Be sure to use here the same name as in the registry ('glove-840B', 'fasttext-crawl', 'word2vec')," - " and that the path in the registry to the embedding file is correct on your system." - ) + " and that the path in the registry to the embedding file is correct on your system.")) + + parser.add_argument( + "--transformer", + default=None, + help="The desired pre-trained transformer to be used in the selected architecture. " + \ + "For local loading use, delft/resources-registry.json, and be sure to use here the same name as in the registry, e.g. " + \ + str(pretrained_transformers_examples) + \ + " and that the path in the registry to the model path is correct on your system. " + \ + "HuggingFace transformers hub will be used otherwise to fetch the model, see https://huggingface.co/models " + \ + "for model names" ) args = parser.parse_args() if args.action not in ('train', 'train_eval', 'classify'): - print('action not specifed, must be one of [train,train_eval,classify]') + print('action not specified, must be one of [train,train_eval,classify]') embeddings_name = args.embedding - use_ELMo = args.use_ELMo - use_BERT = args.use_BERT input_file = args.input model_name = args.name - + transformer = args.transformer architecture = args.architecture - if architecture not in modelTypes: - print('unknown model architecture, must be one of ' + str(modelTypes)) + x_index = args.x_index + y_index = args.y_index if args.action == 'train': if args.fold_count < 1: raise ValueError("fold-count should be equal or more than 1") - train(model_name, input_file, embeddings_name, args.fold_count, use_ELMo=use_ELMo, use_BERT=use_BERT, - architecture=architecture) + train(model_name, input_file, embeddings_name, args.fold_count, architecture=architecture, + transformer=transformer, x_index=x_index, y_index=y_index) if args.action == 'train_eval': if args.fold_count < 1: raise ValueError("fold-count should be equal or more than 1") - y_test = train_and_eval(model_name, input_file, embeddings_name, args.fold_count, use_ELMo=use_ELMo, - use_BERT=use_BERT, architecture=architecture) + y_test = train_and_eval(model_name, input_file, embeddings_name, args.fold_count, architecture=architecture, + transformer=transformer) if args.action == 'classify': someTexts = [ diff --git a/delft/textClassification/reader.py b/delft/textClassification/reader.py index 38699c5f..f86a3996 100644 --- a/delft/textClassification/reader.py +++ b/delft/textClassification/reader.py @@ -6,7 +6,8 @@ import pandas as pd from delft.utilities.numpy import shuffle_triple_with_view -def load_texts_and_classes2(filepath): + +def load_texts_and_classes_generic(filepath, text_index, classes_index): """ Load texts and classes from a file in the following simple tab-separated format: @@ -25,17 +26,17 @@ def load_texts_and_classes2(filepath): classes = [] with open(filepath) as f: - # TODO not in the original - need to revert it or add an option + # TODO not in the original - need to revert it or add an option tsvreader = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_ALL) for line in tsvreader: - if len(line) is 0: + if len(line) == 0: continue if len(line) < 3: print("Warning: number of fields in the data file too low for line:", line) - texts.append(line[1]) - classes.append(line[2:]) + texts.append(line[text_index]) + classes.append(line[classes_index]) - return np.asarray(texts, dtype='object'), np.asarray(classes, dtype='object') + return 
np.asarray(texts, dtype=object), np.asarray(classes, dtype=object) def load_texts_and_classes(filepath): @@ -88,13 +89,13 @@ def load_texts_and_classes_pandas(filepath): """ df = pd.read_csv(filepath) - df.iloc[:,1].fillna('MISSINGVALUE', inplace=True) + df.iloc[:, 1].fillna('MISSINGVALUE', inplace=True) texts_list = [] for j in range(0, df.shape[0]): - texts_list.append(df.iloc[j,1]) + texts_list.append(df.iloc[j, 1]) - classes = df.iloc[:,2:] + classes = df.iloc[:, 2:] classes_list = classes.values.tolist() return np.asarray(texts_list, dtype=object), np.asarray(classes_list, dtype=object) @@ -118,11 +119,11 @@ def load_texts_pandas(filepath): """ df = pd.read_csv(filepath) - df.iloc[:,1].fillna('MISSINGVALUE', inplace=True) + df.iloc[:, 1].fillna('MISSINGVALUE', inplace=True) texts_list = [] for j in range(0, df.shape[0]): - texts_list.append(df.iloc[j,1]) + texts_list.append(df.iloc[j, 1]) return np.asarray(texts_list, dtype=object) @@ -157,7 +158,7 @@ def load_citation_sentiment_corpus(filepath): continue text = pieces[3] # remove start/end quotes - text = text[1:len(text)-1] + text = text[1:len(text) - 1] texts.append(text) polarity = [] @@ -198,7 +199,7 @@ def load_dataseer_corpus_csv(filepath): df = df[pd.notnull(df['datatype'])] if 'reuse' in df.columns: df = df[pd.notnull(df['reuse'])] - df.iloc[:,1].fillna('NA', inplace=True) + df.iloc[:, 1].fillna('NA', inplace=True) # shuffle, note that this is important for the reuse prediction, the following shuffle in place # and reset the index @@ -206,26 +207,28 @@ def load_dataseer_corpus_csv(filepath): texts_list = [] for j in range(0, df.shape[0]): - texts_list.append(df.iloc[j,1]) + texts_list.append(df.iloc[j, 1]) - if 'reuse' in df.columns: + if 'reuse' in df.columns: # we simply get the reuse boolean value for the examples - datareuses = df.iloc[:,2] + datareuses = df.iloc[:, 2] reuse_list = datareuses.values.tolist() reuse_list = np.asarray(reuse_list) + # map boolean values to [0,1] def map_boolean(x): - return [1.0,0.0] if x else [0.0,1.0] + return [1.0, 0.0] if x else [0.0, 1.0] + reuse_list = np.array(list(map(map_boolean, reuse_list))) print(reuse_list) return np.asarray(texts_list), reuse_list, None, None, ["not_reuse", "reuse"], None, None # otherwise we have the list of datatypes, and optionally subtypes and leaf datatypes - datatypes = df.iloc[:,2] + datatypes = df.iloc[:, 2] datatypes_list = datatypes.values.tolist() datatypes_list = np.asarray(datatypes_list, dtype=object) datatypes_list_lower = np.char.lower(datatypes_list) - list_classes_datatypes = np.unique(datatypes_list_lower) + list_classes_datatypes = np.unique(datatypes_list_lower) datatypes_final = normalize_classes(datatypes_list_lower, list_classes_datatypes) print(df.shape, df.shape[0], df.shape[1]) @@ -233,7 +236,7 @@ def map_boolean(x): if df.shape[1] > 3: # remove possible row with 'no_dataset' df = df[~df.datatype.str.contains("no_dataset")] - datasubtypes = df.iloc[:,3] + datasubtypes = df.iloc[:, 3] datasubtypes_list = datasubtypes.values.tolist() datasubtypes_list = np.asarray(datasubtypes_list, dtype=object) datasubtypes_list_lower = np.char.lower(datasubtypes_list) @@ -253,15 +256,18 @@ def map_boolean(x): ''' if df.shape[1] == 3: - return np.asarray(texts_list, dtype=object), datatypes_final, None, None, list_classes_datatypes.tolist(), None, None - #elif df.shape[1] == 4: + return np.asarray(texts_list, + dtype=object), datatypes_final, None, None, list_classes_datatypes.tolist(), None, None + # elif df.shape[1] == 4: else: - return 
np.asarray(texts_list, dtype=object), datatypes_final, datasubtypes_final, None, list_classes_datatypes.tolist(), list_classes_datasubtypes.tolist(), None + return np.asarray(texts_list, + dtype=object), datatypes_final, datasubtypes_final, None, list_classes_datatypes.tolist(), list_classes_datasubtypes.tolist(), None ''' else: return np.asarray(texts_list), datatypes_final, datasubtypes_final, leafdatatypes_final, list_classes_datatypes.tolist(), list_classes_datasubtypes.tolist(), list_classes_leafdatatypes.tolist() ''' + def load_software_use_corpus_json(json_gz_file_path): """ Load texts and classes from the corresponding Softcite corpus export in gzipped json format @@ -281,7 +287,7 @@ def load_software_use_corpus_json(json_gz_file_path): data = json.loads(fin.read().decode('utf-8')) if not "documents" in data: print("There is no usable classified text in the corpus file", json_gz_file_path) - return None, None + return None, None for document in data["documents"]: for segment in document["texts"]: if "entity_spans" in segment: @@ -352,8 +358,8 @@ def load_software_context_corpus_json(json_gz_file_path): classes_list.append(classes) - #list_possible_classes = np.unique(classes_list) - #classes_list_final = normalize_classes(classes_list, list_possible_classes) + # list_possible_classes = np.unique(classes_list) + # classes_list_final = normalize_classes(classes_list, list_possible_classes) texts_list_final = np.asarray(texts_list) classes_list_final = np.asarray(classes_list) @@ -367,12 +373,14 @@ def normalize_classes(y, list_classes): ''' Replace string values of classes by their index in the list of classes ''' + def f(x): return np.where(list_classes == x) intermediate = np.array([f(xi)[0] for xi in y]) return np.array([vectorize(xi, len(list_classes)) for xi in intermediate]) + def vectorize(index, size): ''' Create a numpy array of the provided size, where value at indicated index is 1, 0 otherwise From 19561a62c434e5c1adce99bf4ce4ea0962705409 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 20 Sep 2022 18:04:58 +0900 Subject: [PATCH 11/27] more cleanup --- delft/applications/citationClassifier.py | 2 +- delft/applications/textClassifier.py | 19 +++++++++---------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/delft/applications/citationClassifier.py b/delft/applications/citationClassifier.py index ff55c2d9..97420b6f 100644 --- a/delft/applications/citationClassifier.py +++ b/delft/applications/citationClassifier.py @@ -83,7 +83,7 @@ def classify(texts, output_format, architecture="gru", embeddings_name=None, tra start_time = time.time() result = model.predict(texts, output_format) runtime = round(time.time() - start_time, 3) - if output_format is 'json': + if output_format == 'json': result["runtime"] = runtime else: print("runtime: %s seconds " % (runtime)) diff --git a/delft/applications/textClassifier.py b/delft/applications/textClassifier.py index b760a41a..b43bdff1 100644 --- a/delft/applications/textClassifier.py +++ b/delft/applications/textClassifier.py @@ -60,21 +60,20 @@ def train(model_name, input_file, embeddings_name, fold_count, architecture=None def train_and_eval(model_name, input_file, embeddings_name, fold_count,transformer=None, - architecture="gru"): - batch_size, maxlen = configure(architecture) + architecture="gru", x_index=0, y_index=1): + batch_size, maxlen, patience, early_stop, max_epoch = configure(architecture) print('loading ' + model_name + ' corpus...') - xtr, y = load_texts_and_classes_generic(input_file) + xtr, y = 
load_texts_and_classes_generic(input_file, x_index, y_index) list_classes = list(set([y_[0] for y_ in y])) y_one_hot = get_one_hot(y) - model = Classifier(model_name, model_type=architecture, list_classes=list_classes, max_epoch=100, - fold_number=fold_count, patience=10, - use_roc_auc=True, embeddings_name=embeddings_name, use_ELMo=use_ELMo, use_BERT=use_BERT, - batch_size=batch_size, maxlen=maxlen, - class_weights=None) + model = Classifier(model_name, architecture=architecture, list_classes=list_classes, max_epoch=max_epoch, + fold_number=fold_count, patience=patience, transformer_name=transformer, + use_roc_auc=True, embeddings_name=embeddings_name, early_stop=early_stop, + batch_size=batch_size, maxlen=maxlen, class_weights=None) # segment train and eval sets x_train, x_test, y_train, y_test = train_test_split(xtr, y_one_hot, test_size=0.1) @@ -92,7 +91,7 @@ def train_and_eval(model_name, input_file, embeddings_name, fold_count,transform # classify a list of texts def classify(texts, output_format, architecture="gru", transformer=None): # load model - model = Classifier(model_name, architecture=architecture, list_classes=list_classes, embeddings_name=embeddings_name, transformer_name=transformer) + model = Classifier(model_name, architecture=architecture, embeddings_name=embeddings_name, transformer_name=transformer) model.load() start_time = time.time() result = model.predict(texts, output_format) @@ -167,7 +166,7 @@ def classify(texts, output_format, architecture="gru", transformer=None): 'One successful strategy [15] computes the set-similarity involving (multi-word) keyphrases about the mentions and the entities, collected from the KG.', 'Unfortunately, fewer than half of the OCs in the DAML02 OC catalog (Dias et al. 2002) are suitable for use with the isochrone-fitting method because of the lack of a prominent main sequence, in addition to an absence of radial velocity and proper-motion data.', 'However, we found that the pairwise approach LambdaMART [41] achieved the best performance on our datasets among most learning to rank algorithms.'] - result = classify(model_name, someTexts, "json", architecture=architecture) + result = classify(model_name, someTexts, "json") print(json.dumps(result, sort_keys=False, indent=4, ensure_ascii=False)) # See https://github.com/tensorflow/tensorflow/issues/3388 From 7ba819eb2d0ed24573834716f4984e6bb3538a15 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 21 Sep 2022 14:39:27 +0900 Subject: [PATCH 12/27] add column x and column y --- delft/applications/textClassifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/delft/applications/textClassifier.py b/delft/applications/textClassifier.py index b43bdff1..961baecd 100644 --- a/delft/applications/textClassifier.py +++ b/delft/applications/textClassifier.py @@ -159,7 +159,7 @@ def classify(texts, output_format, architecture="gru", transformer=None): raise ValueError("fold-count should be equal or more than 1") y_test = train_and_eval(model_name, input_file, embeddings_name, args.fold_count, architecture=architecture, - transformer=transformer) + transformer=transformer, x_index=x_index, y_index=y_index) if args.action == 'classify': someTexts = [ From e6bf93bded3b85cf3c0832716aec64415814c34a Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 26 Sep 2022 17:17:34 +0900 Subject: [PATCH 13/27] improve generalisation with a binary / multiclass classification --- delft/applications/textClassifier.py | 89 ++++++++++++++++++---------- delft/textClassification/reader.py | 
19 +++--- delft/textClassification/wrapper.py | 14 ++--- 3 files changed, 78 insertions(+), 44 deletions(-) diff --git a/delft/applications/textClassifier.py b/delft/applications/textClassifier.py index 961baecd..f45e4315 100644 --- a/delft/applications/textClassifier.py +++ b/delft/applications/textClassifier.py @@ -9,7 +9,8 @@ from delft.textClassification.models import architectures from delft.textClassification.reader import load_texts_and_classes_generic -pretrained_transformers_examples = [ 'bert-base-cased', 'bert-large-cased', 'allenai/scibert_scivocab_cased' ] +pretrained_transformers_examples = ['bert-base-cased', 'bert-large-cased', 'allenai/scibert_scivocab_cased'] + def get_one_hot(y): label_encoder = LabelEncoder() @@ -35,36 +36,53 @@ def configure(architecture): return batch_size, maxlen, patience, early_stop, max_epoch + def train(model_name, input_file, embeddings_name, fold_count, architecture=None, transformer=None, - x_index=0, y_index=1): + x_index=0, y_indexes=[1]): batch_size, maxlen, patience, early_stop, max_epoch = configure(architecture) print('loading ' + model_name + ' training corpus...') - xtr, y = load_texts_and_classes_generic(input_file, x_index, y_index) + xtr, y = load_texts_and_classes_generic(input_file, x_index, y_indexes) list_classes = list(set([y_[0] for y_ in y])) - y_one_hot = get_one_hot(y) - model = Classifier(model_name, architecture=architecture, list_classes=list_classes, max_epoch=max_epoch, fold_number=fold_count, patience=patience, transformer_name=transformer, use_roc_auc=True, embeddings_name=embeddings_name, early_stop=early_stop, batch_size=batch_size, maxlen=maxlen, class_weights=None) + y_ = get_one_hot(y) + if fold_count == 1: - model.train(xtr, y_one_hot) + model.train(xtr, y_) else: - model.train_nfold(xtr, y_one_hot) + model.train_nfold(xtr, y_) # saving the model model.save() -def train_and_eval(model_name, input_file, embeddings_name, fold_count,transformer=None, - architecture="gru", x_index=0, y_index=1): +def eval(model_name, input_file, architecture=None, x_index=0, y_indexes=[1]): + # model_name += model_name + '-' + architecture + + print('loading ' + model_name + ' evaluation corpus...') + + xtr, y = load_texts_and_classes_generic(input_file, x_index, y_indexes) + print(len(xtr), 'evaluation sequences') + + model = Classifier(model_name) + model.load() + + y_ = get_one_hot(y) + + model.eval(xtr, y_) + + +def train_and_eval(model_name, input_file, embeddings_name, fold_count, transformer=None, + architecture="gru", x_index=0, y_indexes=[1]): batch_size, maxlen, patience, early_stop, max_epoch = configure(architecture) print('loading ' + model_name + ' corpus...') - xtr, y = load_texts_and_classes_generic(input_file, x_index, y_index) + xtr, y = load_texts_and_classes_generic(input_file, x_index, y_indexes) list_classes = list(set([y_[0] for y_ in y])) @@ -82,6 +100,8 @@ def train_and_eval(model_name, input_file, embeddings_name, fold_count,transform model.train(x_train, y_train) else: model.train_nfold(x_train, y_train) + model.model_config.fold_number=1 + model.eval(x_test, y_test) # saving the model @@ -91,7 +111,8 @@ def train_and_eval(model_name, input_file, embeddings_name, fold_count,transform # classify a list of texts def classify(texts, output_format, architecture="gru", transformer=None): # load model - model = Classifier(model_name, architecture=architecture, embeddings_name=embeddings_name, transformer_name=transformer) + model = Classifier(model_name, architecture=architecture, 
embeddings_name=embeddings_name, + transformer_name=transformer) model.load() start_time = time.time() result = model.predict(texts, output_format) @@ -108,11 +129,13 @@ def classify(texts, output_format, architecture="gru", transformer=None): description="General classification of text ") parser.add_argument("action") + parser.add_argument("model", help="The name of the model") parser.add_argument("--fold-count", type=int, default=1) - parser.add_argument("--name", type=str, required=True, help="The name of the model") parser.add_argument("--input", type=str, required=True, help="The file to be used for training/evaluation") - parser.add_argument("--x-index", type=int, required=True, help="Column index for the text assuming a TSV file") - parser.add_argument("--y-index", type=int, required=True, help="Column index for the classes assuming a TSV file") + parser.add_argument("--x-index", type=int, required=True, help="Index of the columns for the X value " + "(assuming a TSV file)") + parser.add_argument("--y-indexes", type=str, required=True, help="Index(es) of the columns for the Y (classes) " + "separated by comma, without spaces (assuming a TSV file)") parser.add_argument("--architecture", default='gru', choices=architectures, help="type of model architecture to be used, one of " + str(architectures)) parser.add_argument( @@ -127,41 +150,47 @@ def classify(texts, output_format, architecture="gru", transformer=None): "--transformer", default=None, help="The desired pre-trained transformer to be used in the selected architecture. " + \ - "For local loading use, delft/resources-registry.json, and be sure to use here the same name as in the registry, e.g. " + \ - str(pretrained_transformers_examples) + \ - " and that the path in the registry to the model path is correct on your system. " + \ - "HuggingFace transformers hub will be used otherwise to fetch the model, see https://huggingface.co/models " + \ - "for model names" + "For local loading use, delft/resources-registry.json, and be sure to use here the " + "same name as in the registry, e.g. " + \ + str(pretrained_transformers_examples) + \ + " and that the path in the registry to the model path is correct on your system. 
" + \ + "HuggingFace transformers hub will be used otherwise to fetch the model, " + "see https://huggingface.co/models " + \ + "for model names" ) args = parser.parse_args() - if args.action not in ('train', 'train_eval', 'classify'): - print('action not specified, must be one of [train,train_eval,classify]') + if args.action not in ('train', 'train_eval', 'eval', 'classify'): + print('action not specified, must be one of [train, train_eval, eval, classify]') embeddings_name = args.embedding input_file = args.input - model_name = args.name + model_name = args.model transformer = args.transformer architecture = args.architecture x_index = args.x_index - y_index = args.y_index + y_indexes = [int(index) for index in args.y_indexes.split(",")] - if args.action == 'train': - if args.fold_count < 1: - raise ValueError("fold-count should be equal or more than 1") + if transformer is None and embeddings_name is None: + # default word embeddings + embeddings_name = "glove-840B" + if args.action == 'train': train(model_name, input_file, embeddings_name, args.fold_count, architecture=architecture, - transformer=transformer, x_index=x_index, y_index=y_index) + transformer=transformer, x_index=x_index, y_indexes=y_indexes) + + elif args.action == 'eval': + eval(model_name, input_file, architecture=architecture, x_index=x_index, y_indexes=y_indexes) - if args.action == 'train_eval': + elif args.action == 'train_eval': if args.fold_count < 1: raise ValueError("fold-count should be equal or more than 1") y_test = train_and_eval(model_name, input_file, embeddings_name, args.fold_count, architecture=architecture, - transformer=transformer, x_index=x_index, y_index=y_index) + transformer=transformer, x_index=x_index, y_indexes=y_indexes) - if args.action == 'classify': + elif args.action == 'classify': someTexts = [ 'One successful strategy [15] computes the set-similarity involving (multi-word) keyphrases about the mentions and the entities, collected from the KG.', 'Unfortunately, fewer than half of the OCs in the DAML02 OC catalog (Dias et al. 
2002) are suitable for use with the isochrone-fitting method because of the lack of a prominent main sequence, in addition to an absence of radial velocity and proper-motion data.', diff --git a/delft/textClassification/reader.py b/delft/textClassification/reader.py index f86a3996..a53a0041 100644 --- a/delft/textClassification/reader.py +++ b/delft/textClassification/reader.py @@ -7,7 +7,7 @@ from delft.utilities.numpy import shuffle_triple_with_view -def load_texts_and_classes_generic(filepath, text_index, classes_index): +def load_texts_and_classes_generic(filepath, text_index: int, classes_indexes: list): """ Load texts and classes from a file in the following simple tab-separated format: @@ -22,10 +22,11 @@ def load_texts_and_classes_generic(filepath, text_index, classes_index): tuple(numpy array, numpy array): texts and classes """ - texts = [] - classes = [] + x = [] + y = [] with open(filepath) as f: + first = True # TODO not in the original - need to revert it or add an option tsvreader = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_ALL) for line in tsvreader: @@ -33,10 +34,14 @@ def load_texts_and_classes_generic(filepath, text_index, classes_index): continue if len(line) < 3: print("Warning: number of fields in the data file too low for line:", line) - texts.append(line[text_index]) - classes.append(line[classes_index]) - - return np.asarray(texts, dtype=object), np.asarray(classes, dtype=object) + classes = [line[i] for i in classes_indexes] + if first: + print("Sample input", "x: ", line[text_index], "y: ", classes) + first = False + x.append(line[text_index]) + y.append(classes) + + return np.asarray(x, dtype=object), np.asarray(y, dtype=object) def load_texts_and_classes(filepath): diff --git a/delft/textClassification/wrapper.py b/delft/textClassification/wrapper.py index a2033187..9aad5b30 100644 --- a/delft/textClassification/wrapper.py +++ b/delft/textClassification/wrapper.py @@ -246,7 +246,7 @@ def eval(self, x_test, y_test, use_main_thread_only=False): bert_data = True if self.model_config.fold_number == 1: - if self.model != None: + if self.model is not None: self.model.print_summary() test_generator = DataGenerator(x_test, None, batch_size=self.model_config.batch_size, maxlen=self.model_config.maxlen, list_classes=self.model_config.list_classes, @@ -324,7 +324,7 @@ def vectorize(index, size): total_accuracy += accuracy f1 = f1_score(y_test[:, j], result_binary[:, j], average='micro') total_f1 += f1 - loss = log_loss(y_test[:, j], result[:, j], labels=[0,1]) + loss = log_loss(y_test[:, j], result[:, j], labels=[0, 1]) total_loss += loss if len(np.unique(y_test[:, j])) == 1: # roc_auc_score sklearn implementation is not working in this case, it needs more balanced batches @@ -397,7 +397,7 @@ def vectorize(index, size): ''' def save(self, dir_path='data/models/textClassification/'): - # create subfolder for the model if not already exists + # create sub-folder for the model if not already exists directory = os.path.join(dir_path, self.model_config.model_name) if not os.path.exists(directory): os.makedirs(directory) @@ -406,20 +406,20 @@ def save(self, dir_path='data/models/textClassification/'): print('model config file saved') if self.model_config.fold_number == 1: - if self.model != None: + if self.model is not None: self.model.save(os.path.join(directory, self.weight_file)) print('model saved') else: print('Error: model has not been built') else: - if self.models == None: - print('Error: nfolds models have not been built') + if self.models is None: + 
print('Error: n-folds models have not been built') else: # fold models having a transformer layers are already saved if self.model_config.transformer_name is None: for i in range(0, self.model_config.fold_number): self.models[i].save(os.path.join(directory, "model{0}_weights.hdf5".format(i))) - print('nfolds model saved') + print('n-folds model saved') # save pretrained transformer config and tokenizer if used in the model and if single fold (otherwise it is saved in the nfold process) if self.transformer_name is not None and self.model_config.fold_number == 1: From 39332c017618fe55eb48d9b51ce0e9671ebbf723 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 28 Sep 2022 10:16:12 +0900 Subject: [PATCH 14/27] remove early stop for bert --- delft/applications/textClassifier.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/delft/applications/textClassifier.py b/delft/applications/textClassifier.py index f45e4315..0ca4b780 100644 --- a/delft/applications/textClassifier.py +++ b/delft/applications/textClassifier.py @@ -31,8 +31,6 @@ def configure(architecture): # default bert model parameters if architecture == "bert": batch_size = 32 - early_stop = False - max_epoch = 3 return batch_size, maxlen, patience, early_stop, max_epoch From 7014a08872c0f29a6c737cbb4a3aaf7b6207db8e Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 28 Sep 2022 12:27:13 +0900 Subject: [PATCH 15/27] limit to single class and some cosmetics --- delft/applications/textClassifier.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/delft/applications/textClassifier.py b/delft/applications/textClassifier.py index 0ca4b780..d7090db1 100644 --- a/delft/applications/textClassifier.py +++ b/delft/applications/textClassifier.py @@ -11,6 +11,8 @@ pretrained_transformers_examples = ['bert-base-cased', 'bert-large-cased', 'allenai/scibert_scivocab_cased'] +actions = ['train', 'train_eval', 'eval', 'classify'] + def get_one_hot(y): label_encoder = LabelEncoder() @@ -31,6 +33,8 @@ def configure(architecture): # default bert model parameters if architecture == "bert": batch_size = 32 + # early_stop = False + # max_epoch = 3 return batch_size, maxlen, patience, early_stop, max_epoch @@ -98,7 +102,6 @@ def train_and_eval(model_name, input_file, embeddings_name, fold_count, transfor model.train(x_train, y_train) else: model.train_nfold(x_train, y_train) - model.model_config.fold_number=1 model.eval(x_test, y_test) @@ -126,14 +129,15 @@ def classify(texts, output_format, architecture="gru", transformer=None): parser = argparse.ArgumentParser( description="General classification of text ") - parser.add_argument("action") + parser.add_argument("action", help="the action", choices=actions) parser.add_argument("model", help="The name of the model") parser.add_argument("--fold-count", type=int, default=1) parser.add_argument("--input", type=str, required=True, help="The file to be used for training/evaluation") parser.add_argument("--x-index", type=int, required=True, help="Index of the columns for the X value " "(assuming a TSV file)") parser.add_argument("--y-indexes", type=str, required=True, help="Index(es) of the columns for the Y (classes) " - "separated by comma, without spaces (assuming a TSV file)") + "separated by comma, without spaces (assuming " + "a TSV file)") parser.add_argument("--architecture", default='gru', choices=architectures, help="type of model architecture to be used, one of " + str(architectures)) parser.add_argument( @@ -159,9 +163,6 @@ def classify(texts, output_format, 
architecture="gru", transformer=None): args = parser.parse_args() - if args.action not in ('train', 'train_eval', 'eval', 'classify'): - print('action not specified, must be one of [train, train_eval, eval, classify]') - embeddings_name = args.embedding input_file = args.input model_name = args.model @@ -169,6 +170,9 @@ def classify(texts, output_format, architecture="gru", transformer=None): architecture = args.architecture x_index = args.x_index y_indexes = [int(index) for index in args.y_indexes.split(",")] + if len(y_indexes) > 1: + print("At the moment we support just one value per class. Taking the first value only. ") + y_indexes = y_indexes[0] if transformer is None and embeddings_name is None: # default word embeddings From 5e223ea01e4992987b0c09c42a6c554a6834b140 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 24 Oct 2022 10:57:31 +0900 Subject: [PATCH 16/27] add output in csv when in not in json, y-indexes mandatory --- delft/applications/textClassifier.py | 41 +++++++++++++++++++--------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/delft/applications/textClassifier.py b/delft/applications/textClassifier.py index d7090db1..2de0039d 100644 --- a/delft/applications/textClassifier.py +++ b/delft/applications/textClassifier.py @@ -1,7 +1,10 @@ import argparse +import csv import json +import sys import time +import numpy as np from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder, OneHotEncoder @@ -112,8 +115,7 @@ def train_and_eval(model_name, input_file, embeddings_name, fold_count, transfor # classify a list of texts def classify(texts, output_format, architecture="gru", transformer=None): # load model - model = Classifier(model_name, architecture=architecture, embeddings_name=embeddings_name, - transformer_name=transformer) + model = Classifier(model_name) model.load() start_time = time.time() result = model.predict(texts, output_format) @@ -135,7 +137,7 @@ def classify(texts, output_format, architecture="gru", transformer=None): parser.add_argument("--input", type=str, required=True, help="The file to be used for training/evaluation") parser.add_argument("--x-index", type=int, required=True, help="Index of the columns for the X value " "(assuming a TSV file)") - parser.add_argument("--y-indexes", type=str, required=True, help="Index(es) of the columns for the Y (classes) " + parser.add_argument("--y-indexes", type=str, required=False, help="Index(es) of the columns for the Y (classes) " "separated by comma, without spaces (assuming " "a TSV file)") parser.add_argument("--architecture", default='gru', choices=architectures, @@ -169,10 +171,16 @@ def classify(texts, output_format, architecture="gru", transformer=None): transformer = args.transformer architecture = args.architecture x_index = args.x_index - y_indexes = [int(index) for index in args.y_indexes.split(",")] - if len(y_indexes) > 1: - print("At the moment we support just one value per class. Taking the first value only. ") - y_indexes = y_indexes[0] + + if args.action != "classify": + if args.y_indexes is None: + print("--y-indexes is mandatory") + sys.exit(-1) + y_indexes = [int(index) for index in args.y_indexes.split(",")] + + if len(y_indexes) > 1: + print("At the moment we support just one value per class. Taking the first value only. 
") + y_indexes = y_indexes[0] if transformer is None and embeddings_name is None: # default word embeddings @@ -193,12 +201,19 @@ def classify(texts, output_format, architecture="gru", transformer=None): transformer=transformer, x_index=x_index, y_indexes=y_indexes) elif args.action == 'classify': - someTexts = [ - 'One successful strategy [15] computes the set-similarity involving (multi-word) keyphrases about the mentions and the entities, collected from the KG.', - 'Unfortunately, fewer than half of the OCs in the DAML02 OC catalog (Dias et al. 2002) are suitable for use with the isochrone-fitting method because of the lack of a prominent main sequence, in addition to an absence of radial velocity and proper-motion data.', - 'However, we found that the pairwise approach LambdaMART [41] achieved the best performance on our datasets among most learning to rank algorithms.'] - result = classify(model_name, someTexts, "json") - print(json.dumps(result, sort_keys=False, indent=4, ensure_ascii=False)) + lines = [] + with open(input_file, 'r') as f: + tsvreader = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_ALL) + for line in tsvreader: + if len(line) == 0: + continue + lines.append(line[x_index]) + + result = classify(lines, "csv") + + result_binary = [np.argmax(line) for line in result] + for x in result_binary: + print(x) # See https://github.com/tensorflow/tensorflow/issues/3388 # K.clear_session() From 9da4fde73f41eafd7042b0d99792d4d505dbf502 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 19 Dec 2022 14:55:58 +0900 Subject: [PATCH 17/27] make reader selecting tsv or csv --- delft/textClassification/reader.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/delft/textClassification/reader.py b/delft/textClassification/reader.py index eb6c45f0..80591a2c 100644 --- a/delft/textClassification/reader.py +++ b/delft/textClassification/reader.py @@ -25,10 +25,13 @@ def load_texts_and_classes_generic(filepath, text_index: int, classes_indexes: l x = [] y = [] + delimiter = "\t" + if filepath.endswith(".csv"): + delimiter = "," + with open(filepath) as f: first = True - # TODO not in the original - need to revert it or add an option - tsvreader = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_ALL) + tsvreader = csv.reader(f, delimiter=delimiter) for line in tsvreader: if len(line) == 0: continue @@ -442,7 +445,7 @@ def load_software_dataset_context_corpus_json(json_gz_file_path): data = json.loads(fin.read().decode('utf-8')) if not "documents" in data: print("There is no usable classified text in the corpus file", json_gz_file_path) - return None, None + return None, None for document in data["documents"]: for segment in document["texts"]: if "class_attributes" not in segment: @@ -466,13 +469,13 @@ def load_software_dataset_context_corpus_json(json_gz_file_path): if "shared" in classification and classification["shared"]["value"]: classes.append(1.0) else: - classes.append(0.0) + classes.append(0.0) classes_list.append(classes) texts_list_final = np.asarray(texts_list) classes_list_final = np.asarray(classes_list) - + texts_list_final, classes_list_final, _ = shuffle_triple_with_view(texts_list_final, classes_list_final) return texts_list_final, classes_list_final From 371bff8f7b6bc6d9b336daa43cc26c089d6d356b Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 19 Dec 2022 16:12:31 +0900 Subject: [PATCH 18/27] add patience in parameters --- delft/applications/textClassifier.py | 28 +++++++++++++++++++++------- delft/utilities/misc.py | 5 +++-- 2 files 
changed, 24 insertions(+), 9 deletions(-) diff --git a/delft/applications/textClassifier.py b/delft/applications/textClassifier.py index 2de0039d..d8c601af 100644 --- a/delft/applications/textClassifier.py +++ b/delft/applications/textClassifier.py @@ -26,10 +26,10 @@ def get_one_hot(y): return y2 -def configure(architecture): +def configure(architecture, batch_size_, max_sequence_length_, patience_): batch_size = 256 - maxlen = 150 - patience = 5 + maxlen = 150 if max_sequence_length_ == -1 else max_sequence_length_ + patience = 5 if patience_ == -1 else patience_ early_stop = True max_epoch = 60 @@ -39,12 +39,16 @@ def configure(architecture): # early_stop = False # max_epoch = 3 + batch_size = batch_size_ if batch_size_ != -1 else batch_size + return batch_size, maxlen, patience, early_stop, max_epoch def train(model_name, input_file, embeddings_name, fold_count, architecture=None, transformer=None, - x_index=0, y_indexes=[1]): - batch_size, maxlen, patience, early_stop, max_epoch = configure(architecture) + x_index=0, y_indexes=[1], batch_size=-1, max_sequence_length=-1, patience=-1): + + batch_size, maxlen, patience, early_stop, max_epoch = configure(architecture, batch_size, + max_sequence_length, patience) print('loading ' + model_name + ' training corpus...') xtr, y = load_texts_and_classes_generic(input_file, x_index, y_indexes) @@ -163,6 +167,11 @@ def classify(texts, output_format, architecture="gru", transformer=None): "for model names" ) + parser.add_argument("--max-sequence-length", type=int, default=-1, help="max-sequence-length parameter to be used.") + parser.add_argument("--batch-size", type=int, default=-1, help="batch-size parameter to be used.") + parser.add_argument("--patience", type=int, default=5, help="patience, number of epoques to count before stopping the training (only used in train or train_eval).") + + args = parser.parse_args() embeddings_name = args.embedding @@ -171,6 +180,9 @@ def classify(texts, output_format, architecture="gru", transformer=None): transformer = args.transformer architecture = args.architecture x_index = args.x_index + patience = args.patience + batch_size = args.batch_size + max_sequence_length = args.max_sequence_length if args.action != "classify": if args.y_indexes is None: @@ -188,7 +200,8 @@ def classify(texts, output_format, architecture="gru", transformer=None): if args.action == 'train': train(model_name, input_file, embeddings_name, args.fold_count, architecture=architecture, - transformer=transformer, x_index=x_index, y_indexes=y_indexes) + transformer=transformer, x_index=x_index, y_indexes=y_indexes, batch_size=batch_size, + max_sequence_length= max_sequence_length, patience=patience) elif args.action == 'eval': eval(model_name, input_file, architecture=architecture, x_index=x_index, y_indexes=y_indexes) @@ -198,7 +211,8 @@ def classify(texts, output_format, architecture="gru", transformer=None): raise ValueError("fold-count should be equal or more than 1") y_test = train_and_eval(model_name, input_file, embeddings_name, args.fold_count, architecture=architecture, - transformer=transformer, x_index=x_index, y_indexes=y_indexes) + transformer=transformer, x_index=x_index, y_indexes=y_indexes, batch_size=batch_size, + max_sequence_length= max_sequence_length, patience=patience) elif args.action == 'classify': lines = [] diff --git a/delft/utilities/misc.py b/delft/utilities/misc.py index a67a707e..afc2bffb 100644 --- a/delft/utilities/misc.py +++ b/delft/utilities/misc.py @@ -51,8 +51,9 @@ def print_parameters(model_config, 
training_config): print("---") print("max_epoch:", training_config.max_epoch) print("early_stop:", training_config.early_stop) + print("patience:", training_config.patience) print("batch_size (training):", model_config.batch_size) - + if hasattr(model_config, 'max_sequence_length'): print("max_sequence_length:", model_config.max_sequence_length) @@ -65,7 +66,7 @@ def print_parameters(model_config, training_config): if hasattr(model_config, 'use_ELMo'): print("use_ELMo: ", model_config.use_ELMo) - if hasattr(training_config, 'class_weights') and training_config.class_weights != None and hasattr(model_config, 'list_classes'): + if hasattr(training_config, 'class_weights') and training_config.class_weights is not None and hasattr(model_config, 'list_classes'): list_classes = model_config.list_classes weight_summary = "" for indx, class_name in enumerate(model_config.list_classes): From 77a6be0ed5be45ac1761a560dcd3226c43fdabcf Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 19 Dec 2022 17:46:02 +0900 Subject: [PATCH 19/27] cleanup useless removed spaces --- delft/applications/citationClassifier.py | 1 + delft/sequenceLabelling/preprocess.py | 42 +++++------ delft/sequenceLabelling/reader.py | 10 +-- delft/sequenceLabelling/wrapper.py | 3 +- delft/textClassification/models.py | 96 ++++++++++++------------ delft/textClassification/reader.py | 63 +++++++--------- delft/textClassification/wrapper.py | 2 +- delft/utilities/Embeddings.py | 4 +- delft/utilities/misc.py | 2 +- 9 files changed, 108 insertions(+), 115 deletions(-) diff --git a/delft/applications/citationClassifier.py b/delft/applications/citationClassifier.py index 97420b6f..7ced0bba 100644 --- a/delft/applications/citationClassifier.py +++ b/delft/applications/citationClassifier.py @@ -33,6 +33,7 @@ def configure(architecture): return batch_size, maxlen, patience, early_stop, max_epoch + def train(embeddings_name, fold_count, architecture="gru", transformer=None): batch_size, maxlen, patience, early_stop, max_epoch = configure(architecture) diff --git a/delft/sequenceLabelling/preprocess.py b/delft/sequenceLabelling/preprocess.py index 109e9055..7c1acfda 100644 --- a/delft/sequenceLabelling/preprocess.py +++ b/delft/sequenceLabelling/preprocess.py @@ -215,7 +215,7 @@ def empty_features_vector(self) -> Iterable[int]: class BERTPreprocessor(object): """ Generic BERT preprocessor for a sequence labelling data set. - Input are pre-tokenized texts, possibly with features and labels to re-align with the sub-tokenization. + Input are pre-tokenized texts, possibly with features and labels to re-align with the sub-tokenization. 
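A short, self-contained sketch of that re-alignment, assuming the transformers package and a downloadable "bert-base-cased" fast tokenizer (one of the examples listed for --transformer): the word-level label is kept on the first sub-token of each word and blanked for the remaining sub-tokens, mirroring the convention in convert_single_text(); the toy words and labels are made up.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")   # fast tokenizer
words = ["Transformers", "are", "fine-tuned"]                  # pre-tokenized input
labels = ["B-soft", "O", "O"]                                  # illustrative tags

encoded = tokenizer(words, is_split_into_words=True, add_special_tokens=True)

aligned = []
previous_word_idx = None
for word_idx in encoded.word_ids():
    if word_idx is None:                  # special tokens ([CLS], [SEP])
        aligned.append("")
    elif word_idx != previous_word_idx:   # first sub-token keeps the word label
        aligned.append(labels[word_idx])
    else:                                 # trailing sub-tokens get an empty label
        aligned.append("")
    previous_word_idx = word_idx

for token, label in zip(encoded.tokens(), aligned):
    print(token, label)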
Rely on transformers library tokenizer """ @@ -240,27 +240,27 @@ def tokenize_and_align_features_and_labels(self, texts, chars, text_features, te target_features = None if text_features is not None: target_features = [] - + target_labels = None if text_labels is not None: - target_labels = [] - + target_labels = [] + for i, text in enumerate(texts): - + local_chars = chars[i] features = None if text_features is not None: features = text_features[i] - + label_list = None if text_labels is not None: label_list = text_labels[i] - input_ids, token_type_ids, attention_mask, chars_block, feature_blocks, target_tags, tokens = self.convert_single_text(text, - local_chars, - features, - label_list, + input_ids, token_type_ids, attention_mask, chars_block, feature_blocks, target_tags, tokens = self.convert_single_text(text, + local_chars, + features, + label_list, maxlen) target_ids.append(input_ids) target_type_ids.append(token_type_ids) @@ -270,9 +270,9 @@ def tokenize_and_align_features_and_labels(self, texts, chars, text_features, te if target_features is not None: target_features.append(feature_blocks) - + if target_labels is not None: - target_labels.append(target_tags) + target_labels.append(target_tags) return target_ids, target_type_ids, target_attention_mask, target_chars, target_features, target_labels, input_tokens @@ -299,7 +299,7 @@ def convert_single_text(self, text_tokens, chars_tokens, features_tokens, label_ chars_tokens = [] while len(chars_tokens) < len(text_tokens): chars_tokens.append(self.empty_char_vector) - + # sub-tokenization encoded_result = self.tokenizer(text_tokens, add_special_tokens=True, is_split_into_words=True, max_length=max_seq_length, truncation=True, return_offsets_mapping=True) @@ -315,7 +315,7 @@ def convert_single_text(self, text_tokens, chars_tokens, features_tokens, label_ chars_blocks = [] feature_blocks = [] - # trick to support sentence piece tokenizer like GPT2, roBERTa, CamemBERT, etc. which encode prefixed + # trick to support sentence piece tokenizer like GPT2, roBERTa, CamemBERT, etc. 
which encode prefixed # spaces in the tokens (the encoding symbol for this space varies from one model to another) new_input_ids = [] new_attention_mask = [] @@ -323,9 +323,9 @@ def convert_single_text(self, text_tokens, chars_tokens, features_tokens, label_ new_offsets = [] for i in range(0, len(input_ids)): if len(self.tokenizer.decode(input_ids[i])) != 0: - # if a decoded token has a length of 0, it is typically a space added for sentence piece/camembert/GPT2 + # if a decoded token has a length of 0, it is typically a space added for sentence piece/camembert/GPT2 # which happens to be then sometimes a single token for unknown reason when with is_split_into_words=True - # we need to skip this but also remove it from attention_mask, token_type_ids and offsets to stay + # we need to skip this but also remove it from attention_mask, token_type_ids and offsets to stay # in sync new_input_ids.append(input_ids[i]) new_attention_mask.append(attention_mask[i]) @@ -352,12 +352,12 @@ def convert_single_text(self, text_tokens, chars_tokens, features_tokens, label_ feature_blocks.append(features_tokens[word_idx]) chars_blocks.append(chars_tokens[word_idx]) else: - # propagate the data to the new sub-token or + # propagate the data to the new sub-token or # dummy/empty input for sub-tokens label_ids.append("") chars_blocks.append(self.empty_char_vector) - # 2 possibilities, either empty features for sub-tokens or repeating the - # feature vector of the prefix sub-token + # 2 possibilities, either empty features for sub-tokens or repeating the + # feature vector of the prefix sub-token #feature_blocks.append(self.empty_features_vector) feature_blocks.append(features_tokens[word_idx]) @@ -382,7 +382,7 @@ def convert_single_text(self, text_tokens, chars_tokens, features_tokens, label_ def convert_single_text_bert(self, text_tokens, chars_tokens, features_tokens, label_tokens, max_seq_length): """ - Converts a single sequence input into a single BERT input format and align other channel input to this + Converts a single sequence input into a single BERT input format and align other channel input to this new sub-tokenization The original BERT implementation works as follow: @@ -431,7 +431,7 @@ def convert_single_text_bert(self, text_tokens, chars_tokens, features_tokens, l for text_token, label_token, chars_token, features_token in zip(text_tokens, label_tokens, chars_tokens, features_tokens): text_sub_tokens = self.tokenizer.tokenize(text_token, add_special_tokens=False) - + # we mark added sub-tokens with the "##" prefix in order to restore token back correctly, # otherwise the BERT tokenizer do not mark them all with this prefix # (note: to be checked if it's the same with the non-original BERT tokenizer) diff --git a/delft/sequenceLabelling/reader.py b/delft/sequenceLabelling/reader.py index 526f396a..fc244077 100644 --- a/delft/sequenceLabelling/reader.py +++ b/delft/sequenceLabelling/reader.py @@ -579,7 +579,7 @@ def load_data_and_labels_conll(filename): def load_data_and_labels_conll_with_document_context(filename, max_context_window=400): """ - Load data and label from a file. In this alternative, we do not segment by sentence and + Load data and label from a file. In this alternative, we do not segment by sentence and we keep a maximum of document context according to a context window size. 
Args: @@ -610,7 +610,7 @@ def load_data_and_labels_conll_with_document_context(filename, max_context_windo """ - # TBD: ideally, for consistency, the tokenization in the CoNLL files should not be enforced, + # TBD: ideally, for consistency, the tokenization in the CoNLL files should not be enforced, # only the standard DeLFT tokenization should be used, in line with the word embeddings documents, sents, labels = [], [], [] with open(filename, encoding="UTF-8") as f: @@ -746,7 +746,7 @@ def load_data_and_labels_ontonotes(ontonotesRoot, lang='en'): def load_data_and_labels_json_offsets(jsonCorpus, tokenizer=None): """ - Load data and labels from json corpus where annotations are expressed with offsets. + Load data and labels from json corpus where annotations are expressed with offsets. This requires a tokenizer passed as parameter. If tokenizer is None, we use the generic Indo-European tokenizer. @@ -773,7 +773,7 @@ def load_data_and_labels_json_offsets(jsonCorpus, tokenizer=None): }, ] } - } + } Returns: tuple(numpy array, numpy array): data and labels @@ -789,7 +789,7 @@ def load_data_and_labels_json_offsets(jsonCorpus, tokenizer=None): corpus_file = gzip.open(jsonCorpus, "rt") else: corpus_file = open(jsonCorpus, "rt") - + jsonDocuments = json.load(corpus_file) if "documents" in jsonDocuments: for jsonDocument in jsonDocuments["documents"]: diff --git a/delft/sequenceLabelling/wrapper.py b/delft/sequenceLabelling/wrapper.py index 5a4baa69..f3f92264 100644 --- a/delft/sequenceLabelling/wrapper.py +++ b/delft/sequenceLabelling/wrapper.py @@ -54,8 +54,6 @@ import transformers transformers.logging.set_verbosity(transformers.logging.ERROR) -from tensorflow.keras.utils import plot_model - class Sequence(object): # number of parallel worker for the data generator @@ -173,6 +171,7 @@ def train(self, x_train, y_train, f_train=None, x_valid=None, y_valid=None, f_va self.model.print_summary() # uncomment to plot graph + # from tensorflow.keras.utils import plot_model #plot_model(self.model, # to_file='data/models/textClassification/'+self.model_config.model_name+'_'+self.model_config.architecture+'.png') diff --git a/delft/textClassification/models.py b/delft/textClassification/models.py index acf7b6f2..e5a4c308 100644 --- a/delft/textClassification/models.py +++ b/delft/textClassification/models.py @@ -106,16 +106,16 @@ def print_summary(self): self.model.base_model.summary() self.model.summary() - def train_model(self, - list_classes, - batch_size, - max_epoch, - use_roc_auc, - class_weights, - training_generator, - validation_generator, - val_y, - multiprocessing=True, + def train_model(self, + list_classes, + batch_size, + max_epoch, + use_roc_auc, + class_weights, + training_generator, + validation_generator, + val_y, + multiprocessing=True, patience=5, callbacks=None): @@ -154,14 +154,14 @@ def train_model(self, epochs=max_epoch, callbacks=callbacks) y_pred = self.model.predict( - validation_generator, + validation_generator, use_multiprocessing=multiprocessing, workers=nb_workers) total_loss = 0.0 total_roc_auc = 0.0 - # we distinguish 1-class and multiclass problems + # we distinguish 1-class and multiclass problems if len(list_classes) == 1: total_loss = log_loss(val_y, y_pred, labels=[0,1]) if len(np.unique(val_y)) == 1: @@ -169,7 +169,7 @@ def train_model(self, # a simple fix is to return the r2_score instead in this case (which is a regression score and not a loss) roc_auc = r2_score(val_y, y_pred) if roc_auc < 0: - roc_auc = 0 + roc_auc = 0 else: total_roc_auc = roc_auc_score(val_y, 
y_pred) else: @@ -181,7 +181,7 @@ def train_model(self, # a simple fix is to return the r2_score instead in this case (which is a regression score and not a loss) roc_auc = r2_score(val_y[:, j], y_pred[:, j]) if roc_auc < 0: - roc_auc = 0 + roc_auc = 0 else: roc_auc = roc_auc_score(val_y[:, j], y_pred[:, j]) total_roc_auc += roc_auc @@ -223,20 +223,20 @@ def predict(self, predict_generator, use_main_thread_only=False): if use_main_thread_only: # worker at 0 means the training will be executed in the main thread - nb_workers = 0 + nb_workers = 0 multiprocessing = False - + y = self.model.predict( - predict_generator, + predict_generator, use_multiprocessing=multiprocessing, workers=nb_workers) return y def compile(self, train_size): - # default compilation of the model. + # default compilation of the model. # train_size gives the number of steps for the traning, to be used for learning rate scheduler/decay - self.model.compile(loss='binary_crossentropy', - optimizer='adam', + self.model.compile(loss='binary_crossentropy', + optimizer='adam', metrics=['accuracy']) def init_transformer(self, config, load_pretrained_weights=True, local_path=None): @@ -306,19 +306,19 @@ def train_folds(X, y, model_config, training_config, embeddings, models=None, ca print('\n------------------------ fold ' + str(fold_id) + '--------------------------------------') training_generator = DataGenerator(train_x, train_y, batch_size=training_config.batch_size, - maxlen=model_config.maxlen, list_classes=model_config.list_classes, + maxlen=model_config.maxlen, list_classes=model_config.list_classes, embeddings=embeddings, bert_data=bert_data, shuffle=True, transformer_tokenizer=foldModel.transformer_tokenizer) validation_generator = None if training_config.early_stop: - validation_generator = DataGenerator(val_x, val_y, batch_size=training_config.batch_size, - maxlen=model_config.maxlen, list_classes=model_config.list_classes, + validation_generator = DataGenerator(val_x, val_y, batch_size=training_config.batch_size, + maxlen=model_config.maxlen, list_classes=model_config.list_classes, embeddings=embeddings, bert_data=bert_data, shuffle=False, transformer_tokenizer=foldModel.transformer_tokenizer) - foldModel.train_model(model_config.list_classes, training_config.batch_size, max_epoch, use_roc_auc, - class_weights, training_generator, validation_generator, val_y, multiprocessing=training_config.multiprocessing, + foldModel.train_model(model_config.list_classes, training_config.batch_size, max_epoch, use_roc_auc, + class_weights, training_generator, validation_generator, val_y, multiprocessing=training_config.multiprocessing, patience=training_config.patience, callbacks=callbacks) - + if model_config.transformer_name is None: if incremental: models[fold_id] = foldModel @@ -358,7 +358,7 @@ def predict_folds(models, predict_generator, model_config, training_config, use_ model = models[0] # load new weight from disk model_path = os.path.join("data/models/textClassification/", model_config.model_name, "model{0}_weights.hdf5".format(fold_id)) - model.load(model_path) + model.load(model_path) else: model = models[fold_id] @@ -371,7 +371,7 @@ def predict_folds(models, predict_generator, model_config, training_config, use_ y_predicts **= (1. 
/ len(y_predicts_list)) - return y_predicts + return y_predicts class lstm(BaseModel): @@ -380,7 +380,7 @@ class lstm(BaseModel): """ name = 'lstm' - # default parameters + # default parameters parameters = { 'max_features': 200000, 'maxlen': 300, @@ -418,7 +418,7 @@ class bidLstm_simple(BaseModel): """ name = 'bidLstm_simple' - # default parameters + # default parameters parameters = { 'max_features': 200000, 'maxlen': 300, @@ -431,7 +431,7 @@ class bidLstm_simple(BaseModel): 'dense_size': 256 } - # bidirectional LSTM + # bidirectional LSTM def __init__(self, model_config, training_config): super().__init__(model_config, training_config) self.update_parameters(model_config, training_config) @@ -457,7 +457,7 @@ class cnn(BaseModel): """ name = 'cnn' - # default parameters + # default parameters parameters = { 'max_features': 200000, 'maxlen': 250, @@ -475,9 +475,9 @@ def __init__(self, model_config, training_config): super().__init__(model_config, training_config) self.update_parameters(model_config, training_config) nb_classes = len(model_config.list_classes) - + input_layer = Input(shape=(self.parameters["maxlen"], self.parameters["embed_size"]), ) - x = Dropout(self.parameters["dropout_rate"])(input_layer) + x = Dropout(self.parameters["dropout_rate"])(input_layer) x = Conv1D(filters=self.parameters["recurrent_units"], kernel_size=2, padding='same', activation='relu')(x) x = MaxPooling1D(pool_size=2)(x) x = Conv1D(filters=self.parameters["recurrent_units"], kernel_size=2, padding='same', activation='relu')(x) @@ -498,7 +498,7 @@ class cnn2(BaseModel): """ name = 'cnn2' - # default parameters + # default parameters parameters = { 'max_features': 200000, 'maxlen': 250, @@ -517,7 +517,7 @@ def __init__(self, model_config, training_config): nb_classes = len(model_config.list_classes) input_layer = Input(shape=(self.parameters["maxlen"], self.parameters["embed_size"]), ) - x = Dropout(self.parameters["dropout_rate"])(input_layer) + x = Dropout(self.parameters["dropout_rate"])(input_layer) x = Conv1D(filters=self.parameters["recurrent_units"], kernel_size=2, padding='same', activation='relu')(x) x = Conv1D(filters=self.parameters["recurrent_units"], kernel_size=2, padding='same', activation='relu')(x) x = Conv1D(filters=self.parameters["recurrent_units"], kernel_size=2, padding='same', activation='relu')(x) @@ -535,7 +535,7 @@ class cnn3(BaseModel): """ name = 'cnn3' - # default parameters + # default parameters parameters = { 'max_features': 200000, 'maxlen': 300, @@ -577,7 +577,7 @@ class lstm_cnn(BaseModel): """ name = 'lstm_cnn' - # default parameters + # default parameters parameters = { 'max_features': 200000, 'maxlen': 250, @@ -623,7 +623,7 @@ class gru(BaseModel): """ name = 'gru' - # default parameters + # default parameters parameters = { 'max_features': 200000, 'maxlen': 300, @@ -636,7 +636,7 @@ class gru(BaseModel): 'dense_size': 32 } - # 2 bid. GRU + # 2 bid. 
GRU def __init__(self, model_config, training_config): super().__init__(model_config, training_config) self.update_parameters(model_config, training_config) @@ -655,7 +655,7 @@ def __init__(self, model_config, training_config): output_layer = Dense(nb_classes, activation="sigmoid")(x) self.model = Model(inputs=input_layer, outputs=output_layer) - + def compile(self, train_size): self.model.compile(loss='binary_crossentropy', optimizer=RMSprop(clipvalue=1, clipnorm=1), @@ -668,7 +668,7 @@ class gru_simple(BaseModel): """ name = 'gru_simple' - # default parameters + # default parameters parameters = { 'max_features': 200000, 'maxlen': 300, @@ -710,7 +710,7 @@ class gru_lstm(BaseModel): """ name = 'gru_lstm' - # default parameters + # default parameters parameters = { 'max_features': 200000, 'maxlen': 300, @@ -755,7 +755,7 @@ class dpcnn(BaseModel): """ name = 'dpcnn' - # default parameters + # default parameters parameters = { 'max_features': 200000, 'maxlen': 300, @@ -809,13 +809,13 @@ def __init__(self, model_config, training_config): class bert(BaseModel): """ - A Keras implementation of a BERT classifier for fine-tuning, with BERT layer to be + A Keras implementation of a BERT classifier for fine-tuning, with BERT layer to be instanciated with a pre-trained BERT model """ name = 'bert' - bert_config = None + bert_config = None - # default parameters + # default parameters parameters = { 'dense_size': 512, 'max_seq_len': 512, @@ -847,7 +847,7 @@ def __init__(self, model_config, training_config, load_pretrained_weights=True, def compile(self, train_size): #optimizer = Adam(learning_rate=2e-5, clipnorm=1) optimizer, lr_schedule = create_optimizer( - init_lr=2e-5, + init_lr=2e-5, num_train_steps=train_size, weight_decay_rate=0.01, num_warmup_steps=0.1*train_size, diff --git a/delft/textClassification/reader.py b/delft/textClassification/reader.py index 80591a2c..c5fb5123 100644 --- a/delft/textClassification/reader.py +++ b/delft/textClassification/reader.py @@ -97,13 +97,13 @@ def load_texts_and_classes_pandas(filepath): """ df = pd.read_csv(filepath) - df.iloc[:, 1].fillna('MISSINGVALUE', inplace=True) + df.iloc[:,1].fillna('MISSINGVALUE', inplace=True) texts_list = [] for j in range(0, df.shape[0]): - texts_list.append(df.iloc[j, 1]) + texts_list.append(df.iloc[j,1]) - classes = df.iloc[:, 2:] + classes = df.iloc[:,2:] classes_list = classes.values.tolist() return np.asarray(texts_list, dtype=object), np.asarray(classes_list, dtype=object) @@ -127,11 +127,11 @@ def load_texts_pandas(filepath): """ df = pd.read_csv(filepath) - df.iloc[:, 1].fillna('MISSINGVALUE', inplace=True) + df.iloc[:,1].fillna('MISSINGVALUE', inplace=True) texts_list = [] for j in range(0, df.shape[0]): - texts_list.append(df.iloc[j, 1]) + texts_list.append(df.iloc[j,1]) return np.asarray(texts_list, dtype=object) @@ -166,7 +166,7 @@ def load_citation_sentiment_corpus(filepath): continue text = pieces[3] # remove start/end quotes - text = text[1:len(text) - 1] + text = text[1:len(text)-1] texts.append(text) polarity = [] @@ -255,9 +255,9 @@ def load_dataseer_corpus_csv(filepath): df = df[pd.notnull(df['text'])] if 'datatype' in df.columns: df = df[pd.notnull(df['datatype'])] - if 'reuse' in df.columns: + if 'reuse' in df.columns: df = df[pd.notnull(df['reuse'])] - df.iloc[:, 1].fillna('NA', inplace=True) + df.iloc[:,1].fillna('NA', inplace=True) # shuffle, note that this is important for the reuse prediction, the following shuffle in place # and reset the index @@ -265,28 +265,26 @@ def 
load_dataseer_corpus_csv(filepath): texts_list = [] for j in range(0, df.shape[0]): - texts_list.append(df.iloc[j, 1]) + texts_list.append(df.iloc[j,1]) - if 'reuse' in df.columns: + if 'reuse' in df.columns: # we simply get the reuse boolean value for the examples - datareuses = df.iloc[:, 2] + datareuses = df.iloc[:,2] reuse_list = datareuses.values.tolist() reuse_list = np.asarray(reuse_list) - # map boolean values to [0,1] def map_boolean(x): - return [1.0, 0.0] if x else [0.0, 1.0] - + return [1.0,0.0] if x else [0.0,1.0] reuse_list = np.array(list(map(map_boolean, reuse_list))) print(reuse_list) return np.asarray(texts_list), reuse_list, None, None, ["not_reuse", "reuse"], None, None # otherwise we have the list of datatypes, and optionally subtypes and leaf datatypes - datatypes = df.iloc[:, 2] + datatypes = df.iloc[:,2] datatypes_list = datatypes.values.tolist() datatypes_list = np.asarray(datatypes_list, dtype=object) datatypes_list_lower = np.char.lower(datatypes_list) - list_classes_datatypes = np.unique(datatypes_list_lower) + list_classes_datatypes = np.unique(datatypes_list_lower) datatypes_final = normalize_classes(datatypes_list_lower, list_classes_datatypes) print(df.shape, df.shape[0], df.shape[1]) @@ -294,7 +292,7 @@ def map_boolean(x): if df.shape[1] > 3: # remove possible row with 'no_dataset' df = df[~df.datatype.str.contains("no_dataset")] - datasubtypes = df.iloc[:, 3] + datasubtypes = df.iloc[:,3] datasubtypes_list = datasubtypes.values.tolist() datasubtypes_list = np.asarray(datasubtypes_list, dtype=object) datasubtypes_list_lower = np.char.lower(datasubtypes_list) @@ -314,18 +312,15 @@ def map_boolean(x): ''' if df.shape[1] == 3: - return np.asarray(texts_list, - dtype=object), datatypes_final, None, None, list_classes_datatypes.tolist(), None, None - # elif df.shape[1] == 4: + return np.asarray(texts_list, dtype=object), datatypes_final, None, None, list_classes_datatypes.tolist(), None, None + #elif df.shape[1] == 4: else: - return np.asarray(texts_list, - dtype=object), datatypes_final, datasubtypes_final, None, list_classes_datatypes.tolist(), list_classes_datasubtypes.tolist(), None + return np.asarray(texts_list, dtype=object), datatypes_final, datasubtypes_final, None, list_classes_datatypes.tolist(), list_classes_datasubtypes.tolist(), None ''' else: return np.asarray(texts_list), datatypes_final, datasubtypes_final, leafdatatypes_final, list_classes_datatypes.tolist(), list_classes_datasubtypes.tolist(), list_classes_leafdatatypes.tolist() ''' - def load_software_use_corpus_json(json_gz_file_path): """ Load texts and classes from the corresponding Softcite corpus export in gzipped json format @@ -345,7 +340,7 @@ def load_software_use_corpus_json(json_gz_file_path): data = json.loads(fin.read().decode('utf-8')) if not "documents" in data: print("There is no usable classified text in the corpus file", json_gz_file_path) - return None, None + return None, None for document in data["documents"]: for segment in document["texts"]: if "entity_spans" in segment: @@ -376,7 +371,7 @@ def load_software_context_corpus_json(json_gz_file_path): Classification of the software usage is multiclass/multilabel Returns: - tuple(numpy array, numpy array): + tuple(numpy array, numpy array): texts, classes_list """ @@ -388,7 +383,7 @@ def load_software_context_corpus_json(json_gz_file_path): data = json.loads(fin.read().decode('utf-8')) if not "documents" in data: print("There is no usable classified text in the corpus file", json_gz_file_path) - return None, None + return 
None, None for document in data["documents"]: for segment in document["texts"]: if "entity_spans" in segment: @@ -412,16 +407,16 @@ def load_software_context_corpus_json(json_gz_file_path): if "shared" in entity_span and entity_span["shared"]: classes.append(1.0) else: - classes.append(0.0) + classes.append(0.0) classes_list.append(classes) - # list_possible_classes = np.unique(classes_list) - # classes_list_final = normalize_classes(classes_list, list_possible_classes) + #list_possible_classes = np.unique(classes_list) + #classes_list_final = normalize_classes(classes_list, list_possible_classes) texts_list_final = np.asarray(texts_list) classes_list_final = np.asarray(classes_list) - + texts_list_final, classes_list_final, _ = shuffle_triple_with_view(texts_list_final, classes_list_final) return texts_list_final, classes_list_final @@ -445,7 +440,7 @@ def load_software_dataset_context_corpus_json(json_gz_file_path): data = json.loads(fin.read().decode('utf-8')) if not "documents" in data: print("There is no usable classified text in the corpus file", json_gz_file_path) - return None, None + return None, None for document in data["documents"]: for segment in document["texts"]: if "class_attributes" not in segment: @@ -469,13 +464,13 @@ def load_software_dataset_context_corpus_json(json_gz_file_path): if "shared" in classification and classification["shared"]["value"]: classes.append(1.0) else: - classes.append(0.0) + classes.append(0.0) classes_list.append(classes) texts_list_final = np.asarray(texts_list) classes_list_final = np.asarray(classes_list) - + texts_list_final, classes_list_final, _ = shuffle_triple_with_view(texts_list_final, classes_list_final) return texts_list_final, classes_list_final @@ -485,14 +480,12 @@ def normalize_classes(y, list_classes): ''' Replace string values of classes by their index in the list of classes ''' - def f(x): return np.where(list_classes == x) intermediate = np.array([f(xi)[0] for xi in y]) return np.array([vectorize(xi, len(list_classes)) for xi in intermediate]) - def vectorize(index, size): ''' Create a numpy array of the provided size, where value at indicated index is 1, 0 otherwise diff --git a/delft/textClassification/wrapper.py b/delft/textClassification/wrapper.py index 610d6486..c524fbac 100644 --- a/delft/textClassification/wrapper.py +++ b/delft/textClassification/wrapper.py @@ -338,7 +338,7 @@ def vectorize(index, size): total_accuracy += accuracy f1 = f1_score(y_test[:, j], result_binary[:, j], average='micro') total_f1 += f1 - loss = log_loss(y_test[:, j], result[:, j], labels=[0, 1]) + loss = log_loss(y_test[:, j], result[:, j], labels=[0,1]) total_loss += loss if len(np.unique(y_test[:, j])) == 1: # roc_auc_score sklearn implementation is not working in this case, it needs more balanced batches diff --git a/delft/utilities/Embeddings.py b/delft/utilities/Embeddings.py index feba4266..ac16fdb1 100644 --- a/delft/utilities/Embeddings.py +++ b/delft/utilities/Embeddings.py @@ -441,7 +441,7 @@ def get_sentence_vector_only_ELMo(self, token_list): def get_sentence_vector_with_ELMo(self, token_list): """ - Return a concatenation of standard embeddings (e.g. Glove) and ELMo embeddings + Return a concatenation of standard embeddings (e.g. 
Glove) and ELMo embeddings for a full sentence """ if not self.use_ELMo: @@ -506,7 +506,7 @@ def get_ELMo_lmdb_vector(self, token_list, max_size_sentence): def cache_ELMo_lmdb_vector(self, token_list, ELMo_vector): """ - Cache in LMDB the ELMo embeddings for a given sequence + Cache in LMDB the ELMo embeddings for a given sequence """ if self.env_ELMo is None: # db cache not available, we don't cache ELMo stuff diff --git a/delft/utilities/misc.py b/delft/utilities/misc.py index afc2bffb..83f73efd 100644 --- a/delft/utilities/misc.py +++ b/delft/utilities/misc.py @@ -53,7 +53,7 @@ def print_parameters(model_config, training_config): print("early_stop:", training_config.early_stop) print("patience:", training_config.patience) print("batch_size (training):", model_config.batch_size) - + if hasattr(model_config, 'max_sequence_length'): print("max_sequence_length:", model_config.max_sequence_length) From 830e092da34a8d5af67cba4ad9c804bf90e98dc5 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 19 Dec 2022 17:49:05 +0900 Subject: [PATCH 20/27] cleanup useless removed spaces 2 --- delft/sequenceLabelling/data_generator.py | 62 +++++++++++------------ delft/sequenceLabelling/preprocess.py | 14 ++--- 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/delft/sequenceLabelling/data_generator.py b/delft/sequenceLabelling/data_generator.py index 35ad4513..15d661b4 100644 --- a/delft/sequenceLabelling/data_generator.py +++ b/delft/sequenceLabelling/data_generator.py @@ -13,8 +13,8 @@ class BaseGenerator(keras.utils.Sequence): Abstract class for data generator. Generate batch of data to feed sequence labeling model, both for training and prediction. - - This generator is for input based on word embeddings. We keep embeddings application outside the + + This generator is for input based on word embeddings. We keep embeddings application outside the model to make it considerably more compact and avoid duplication of embeddings layers. """ def __init__(self, x, y, @@ -87,8 +87,8 @@ def __data_generation(self, index): class DataGenerator(BaseGenerator): - """ - This generator is for input based on word embeddings. We keep embeddings application outside the + """ + This generator is for input based on word embeddings. We keep embeddings application outside the model to make it considerably more compact and avoid duplication of embeddings layers. 
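A rough sketch of this pattern, not the DeLFT class itself: a keras.utils.Sequence that looks up pre-trained vectors batch by batch, so the embedding matrix never becomes part of the compiled model; the class name, shapes and padding assumption are illustrative.

import numpy as np
from tensorflow import keras


class EmbeddingBatchGenerator(keras.utils.Sequence):
    # illustrative generator: embeddings are applied here, outside the model

    def __init__(self, token_ids, labels, embedding_matrix, batch_size=32):
        self.token_ids = token_ids                 # already padded integer sequences
        self.labels = labels
        self.embedding_matrix = embedding_matrix   # (vocab_size, embed_size)
        self.batch_size = batch_size

    def __len__(self):
        return int(np.ceil(len(self.token_ids) / self.batch_size))

    def __getitem__(self, index):
        start = index * self.batch_size
        batch_ids = np.asarray(self.token_ids[start:start + self.batch_size])
        batch_x = self.embedding_matrix[batch_ids]   # (batch, maxlen, embed_size)
        batch_y = np.asarray(self.labels[start:start + self.batch_size])
        return batch_x, batch_y

Such a generator can be handed directly to fit() and predict(), which is essentially how the training and prediction generators above are consumed.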
""" def __init__(self, x, y, @@ -104,15 +104,15 @@ def __init__(self, x, y, output_input_offsets=False, use_chain_crf=False): - super().__init__(x, y, - batch_size=batch_size, + super().__init__(x, y, + batch_size=batch_size, preprocessor=preprocessor, - bert_preprocessor=bert_preprocessor, - char_embed_size=char_embed_size, - embeddings=embeddings, - max_sequence_length=max_sequence_length, - tokenize=tokenize, - shuffle=shuffle, + bert_preprocessor=bert_preprocessor, + char_embed_size=char_embed_size, + embeddings=embeddings, + max_sequence_length=max_sequence_length, + tokenize=tokenize, + shuffle=shuffle, features=features, output_input_offsets=output_input_offsets, use_chain_crf=use_chain_crf) @@ -188,11 +188,11 @@ def __data_generation(self, index): # truncation of sequence at max_sequence_length sub_f = truncate_batch_values(sub_f, self.max_sequence_length) batch_f = self.preprocessor.transform_features(sub_f, extend=extend) - + batch_a = np.zeros((max_iter, max_length_x), dtype=np.int32) if self.preprocessor.return_casing: for i in range(0, max_iter): - batch_a[i] = to_casing_single(x_tokenized[i], max_length_x) + batch_a[i] = to_casing_single(x_tokenized[i], max_length_x) if self.y is not None: if self.use_chain_crf: @@ -212,9 +212,9 @@ def __data_generation(self, index): class DataGeneratorTransformers(BaseGenerator): """ Generate batch of data to feed sequence labeling model, both for training and prediction. - - This generator is for input based on transformer embeddings. We keep embeddings application - outside the model so that we can serialize the model more easily. + + This generator is for input based on transformer embeddings. We keep embeddings application + outside the model so that we can serialize the model more easily. """ def __init__(self, x, y, batch_size=24, @@ -229,15 +229,15 @@ def __init__(self, x, y, output_input_offsets=False, use_chain_crf=False): - super().__init__(x, y, - batch_size=batch_size, - preprocessor=preprocessor, - bert_preprocessor=bert_preprocessor, - char_embed_size=char_embed_size, - embeddings=embeddings, - max_sequence_length=max_sequence_length, - tokenize=tokenize, - shuffle=shuffle, + super().__init__(x, y, + batch_size=batch_size, + preprocessor=preprocessor, + bert_preprocessor=bert_preprocessor, + char_embed_size=char_embed_size, + embeddings=embeddings, + max_sequence_length=max_sequence_length, + tokenize=tokenize, + shuffle=shuffle, features=features, output_input_offsets=output_input_offsets, use_chain_crf=use_chain_crf) @@ -258,10 +258,10 @@ def __getitem__(self, index): return_data = [batch_x] - if self.preprocessor.return_chars: + if self.preprocessor.return_chars: return_data += [batch_c] - if self.preprocessor.return_features: + if self.preprocessor.return_features: return_data += [batch_f] return_data += [batch_x_types] @@ -303,7 +303,7 @@ def __data_generation(self, index): # generate data batch_y = None - + # tag embeddings if self.y is not None: # note: tags are always already "tokenized" by input token @@ -353,8 +353,8 @@ def __data_generation(self, index): if self.preprocessor.return_features: batch_f = np.asarray(truncate_batch_values(input_features, max_length_x), dtype=np.int32) - else: - batch_f = np.zeros((batch_x.shape[0:2]), dtype=np.int32) + else: + batch_f = np.zeros((batch_x.shape[0:2]), dtype=np.int32) return batch_x, batch_x_types, batch_x_masks, batch_c, batch_f, batch_l, batch_input_offsets, batch_y diff --git a/delft/sequenceLabelling/preprocess.py b/delft/sequenceLabelling/preprocess.py index 
7c1acfda..439f5415 100644 --- a/delft/sequenceLabelling/preprocess.py +++ b/delft/sequenceLabelling/preprocess.py @@ -433,7 +433,7 @@ def convert_single_text_bert(self, text_tokens, chars_tokens, features_tokens, l text_sub_tokens = self.tokenizer.tokenize(text_token, add_special_tokens=False) # we mark added sub-tokens with the "##" prefix in order to restore token back correctly, - # otherwise the BERT tokenizer do not mark them all with this prefix + # otherwise the BERT tokenizer do not mark them all with this prefix # (note: to be checked if it's the same with the non-original BERT tokenizer) text_sub_tokens_marked = self.tokenizer.tokenize(text_token, add_special_tokens=False) for i in range(len(text_sub_tokens_marked)): @@ -442,7 +442,7 @@ def convert_single_text_bert(self, text_tokens, chars_tokens, features_tokens, l tok = text_sub_tokens_marked[i] if not tok.startswith("##"): text_sub_tokens_marked[i] = "##" + tok - + label_sub_tokens = [label_token] + [label_token] * (len(text_sub_tokens) - 1) chars_sub_tokens = [chars_token] + [chars_token] * (len(text_sub_tokens) - 1) feature_sub_tokens = [features_token] + [features_token] * (len(text_sub_tokens) - 1) @@ -494,7 +494,7 @@ def convert_single_text_bert(self, text_tokens, chars_tokens, features_tokens, l for token in tokens_marked: input_tokens_marked.append(token) - + input_tokens.append(self.tokenizer.sep_token) input_tokens_marked.append(self.tokenizer.sep_token) segment_ids.append(0) @@ -634,7 +634,7 @@ def transform(self, X, y=None, extend=False, label_indices=False): y: list of list of tags Returns: - numpy array: sentences with char sequences and length + numpy array: sentences with char sequences and length numpy array: sequence of tags, either one hot encoded (default) or as indices if label_indices parameter is true, we encode tags with index integer, otherwise output hot one encoded tags @@ -897,9 +897,9 @@ def to_vector_elmo(tokens, embeddings, maxlen, lowercase=False, num_norm=False, def to_vector_simple_with_elmo(tokens, embeddings, maxlen, lowercase=False, num_norm=False, extend=False): """ - Given a list of tokens convert it to a sequence of word embedding - vectors based on the concatenation of the provided static embeddings and - the ELMo contextualized embeddings, introducing and + Given a list of tokens convert it to a sequence of word embedding + vectors based on the concatenation of the provided static embeddings and + the ELMo contextualized embeddings, introducing and padding token vector when appropriate """ subtokens = get_subtokens(tokens, maxlen, extend, lowercase) From 03bd7dd5a3128fd6be2a2ef434e7a045ad466cfd Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 19 Dec 2022 18:14:41 +0900 Subject: [PATCH 21/27] fix tests --- tests/sequence_labelling/preprocess_test.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/sequence_labelling/preprocess_test.py b/tests/sequence_labelling/preprocess_test.py index bb4d5f04..85eae2dd 100644 --- a/tests/sequence_labelling/preprocess_test.py +++ b/tests/sequence_labelling/preprocess_test.py @@ -78,7 +78,7 @@ def _to_dense(a: np.array): def all_close(a: np.array, b: np.array): - return np.allclose(_to_dense(a), _to_dense(b)) + return np.allclose(_to_dense(a).astype("float"), _to_dense(b).astype("float")) class TestFeaturesPreprocessor: @@ -95,7 +95,7 @@ def test_should_fit_single_value_feature(self): features_transformed = preprocessor.fit_transform(features_batch) features_length = len(preprocessor.features_indices) assert 
features_length == 1 - assert all_close(features_transformed, [[[1]]]) + assert all_close(features_transformed, np.array([[[1]]], dtype=object)) def test_should_fit_single_multiple_value_features(self): preprocessor = FeaturesPreprocessor() @@ -112,14 +112,14 @@ def test_should_fit_multiple_single_value_features(self): features_transformed = preprocessor.fit_transform(features_batch) features_length = len(preprocessor.features_indices) assert features_length == 2 - assert all_close(features_transformed, [[[1, 13]]]) + assert all_close(features_transformed, np.asarray([[[1, 13]]], dtype=object)) def test_should_transform_unseen_to_zero(self): preprocessor = FeaturesPreprocessor() features_batch = [[[FEATURE_VALUE_1]]] preprocessor.fit(features_batch) features_transformed = preprocessor.transform([[[FEATURE_VALUE_2]]]) - assert all_close(features_transformed, [[[0]]]) + assert all_close(features_transformed, np.asarray([[[0]]], dtype=object)) def test_should_select_features(self): preprocessor = FeaturesPreprocessor(features_indices=[1]) @@ -131,7 +131,7 @@ def test_should_select_features(self): features_transformed = preprocessor.fit_transform(features_batch) features_length = len(preprocessor.features_indices) assert features_length == 1 - assert all_close(features_transformed, [[[1], [2], [3]]]) + assert all_close(features_transformed, np.asarray([[[1], [2], [3]]], dtype=object)) def test_serialize_to_json(self, tmp_path): preprocessor = FeaturesPreprocessor(features_indices=[1]) From 198ba69b09540c10837dac811276309b90c4353a Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 20 Dec 2022 15:44:08 +0900 Subject: [PATCH 22/27] missing parameters --- delft/applications/textClassifier.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/delft/applications/textClassifier.py b/delft/applications/textClassifier.py index d8c601af..9414c3dc 100644 --- a/delft/applications/textClassifier.py +++ b/delft/applications/textClassifier.py @@ -87,8 +87,10 @@ def eval(model_name, input_file, architecture=None, x_index=0, y_indexes=[1]): def train_and_eval(model_name, input_file, embeddings_name, fold_count, transformer=None, - architecture="gru", x_index=0, y_indexes=[1]): - batch_size, maxlen, patience, early_stop, max_epoch = configure(architecture) + architecture="gru", x_index=0, y_indexes=[1], batch_size=-1, + max_sequence_length=-1, patience=-1): + + batch_size, maxlen, patience, early_stop, max_epoch = configure(architecture, batch_size, max_sequence_length, patience) print('loading ' + model_name + ' corpus...') xtr, y = load_texts_and_classes_generic(input_file, x_index, y_indexes) From 175fe89321fd3bd0693165ccdae213e1eb9328c1 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 22 Dec 2022 11:17:01 +0900 Subject: [PATCH 23/27] add classification from input file, cleanup --- delft/applications/textClassifier.py | 20 ++++++++------------ delft/textClassification/reader.py | 9 +++++---- delft/textClassification/wrapper.py | 7 +++---- 3 files changed, 16 insertions(+), 20 deletions(-) diff --git a/delft/applications/textClassifier.py b/delft/applications/textClassifier.py index 9414c3dc..5a3a3aa9 100644 --- a/delft/applications/textClassifier.py +++ b/delft/applications/textClassifier.py @@ -119,17 +119,19 @@ def train_and_eval(model_name, input_file, embeddings_name, fold_count, transfor # classify a list of texts -def classify(texts, output_format, architecture="gru", transformer=None): - # load model +def classify(model_name, texts, output_format='json'): model = 
Classifier(model_name) model.load() + start_time = time.time() result = model.predict(texts, output_format) runtime = round(time.time() - start_time, 3) + if output_format == 'json': result["runtime"] = runtime else: print("runtime: %s seconds " % (runtime)) + return result @@ -140,7 +142,7 @@ def classify(texts, output_format, architecture="gru", transformer=None): parser.add_argument("action", help="the action", choices=actions) parser.add_argument("model", help="The name of the model") parser.add_argument("--fold-count", type=int, default=1) - parser.add_argument("--input", type=str, required=True, help="The file to be used for training/evaluation") + parser.add_argument("--input", type=str, required=True, help="The file to be used for train, train_eval, eval and, classify") parser.add_argument("--x-index", type=int, required=True, help="Index of the columns for the X value " "(assuming a TSV file)") parser.add_argument("--y-indexes", type=str, required=False, help="Index(es) of the columns for the Y (classes) " @@ -217,15 +219,9 @@ def classify(texts, output_format, architecture="gru", transformer=None): max_sequence_length= max_sequence_length, patience=patience) elif args.action == 'classify': - lines = [] - with open(input_file, 'r') as f: - tsvreader = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_ALL) - for line in tsvreader: - if len(line) == 0: - continue - lines.append(line[x_index]) - - result = classify(lines, "csv") + lines, _ = load_texts_and_classes_generic(input_file, x_index, None) + + result = classify(model_name, lines, "csv") result_binary = [np.argmax(line) for line in result] diff --git a/delft/textClassification/reader.py b/delft/textClassification/reader.py index c5fb5123..93557dcf 100644 --- a/delft/textClassification/reader.py +++ b/delft/textClassification/reader.py @@ -35,14 +35,15 @@ def load_texts_and_classes_generic(filepath, text_index: int, classes_indexes: l for line in tsvreader: if len(line) == 0: continue - if len(line) < 3: - print("Warning: number of fields in the data file too low for line:", line) - classes = [line[i] for i in classes_indexes] + + classes = [line[i] for i in classes_indexes] if classes_indexes is not None else None if first: print("Sample input", "x: ", line[text_index], "y: ", classes) first = False x.append(line[text_index]) - y.append(classes) + + if classes_indexes is not None: + y.append(classes) return np.asarray(x, dtype=object), np.asarray(y, dtype=object) diff --git a/delft/textClassification/wrapper.py b/delft/textClassification/wrapper.py index c524fbac..7e136c40 100644 --- a/delft/textClassification/wrapper.py +++ b/delft/textClassification/wrapper.py @@ -34,7 +34,7 @@ from delft.textClassification.models import predict_folds from delft.textClassification.data_generator import DataGenerator -from delft.utilities.Transformer import Transformer, TRANSFORMER_CONFIG_FILE_NAME, DEFAULT_TRANSFORMER_TOKENIZER_DIR +from delft.utilities.Transformer import TRANSFORMER_CONFIG_FILE_NAME, DEFAULT_TRANSFORMER_TOKENIZER_DIR from delft.utilities.Embeddings import Embeddings, load_resource_registry @@ -42,9 +42,7 @@ from sklearn.model_selection import train_test_split import transformers -transformers.logging.set_verbosity(transformers.logging.ERROR) - -from tensorflow.keras.utils import plot_model +transformers.logging.set_verbosity(transformers.logging.ERROR) class Classifier(object): @@ -166,6 +164,7 @@ def train(self, x_train, y_train, vocab_init=None, incremental=False, callbacks= validation_generator = None # uncomment to plot 
graph + # from tensorflow.keras.utils import plot_model #plot_model(self.model, # to_file='data/models/textClassification/'+self.model_config.model_name+'_'+self.model_config.architecture+'.png') self.model.train_model( From ee6eca0b44f06e00c21a2267695468e5f6893053 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 5 Jan 2023 14:54:30 +0900 Subject: [PATCH 24/27] add consistency in the application script --- delft/applications/textClassifier.py | 44 ++++++++++++++-------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/delft/applications/textClassifier.py b/delft/applications/textClassifier.py index 5a3a3aa9..d497a23a 100644 --- a/delft/applications/textClassifier.py +++ b/delft/applications/textClassifier.py @@ -44,9 +44,8 @@ def configure(architecture, batch_size_, max_sequence_length_, patience_): return batch_size, maxlen, patience, early_stop, max_epoch -def train(model_name, input_file, embeddings_name, fold_count, architecture=None, transformer=None, +def train(model_name, architecture, input_file, embeddings_name, fold_count, transformer=None, x_index=0, y_indexes=[1], batch_size=-1, max_sequence_length=-1, patience=-1): - batch_size, maxlen, patience, early_stop, max_epoch = configure(architecture, batch_size, max_sequence_length, patience) @@ -70,7 +69,7 @@ def train(model_name, input_file, embeddings_name, fold_count, architecture=None model.save() -def eval(model_name, input_file, architecture=None, x_index=0, y_indexes=[1]): +def eval(model_name, architecture, input_file, x_index=0, y_indexes=[1]): # model_name += model_name + '-' + architecture print('loading ' + model_name + ' evaluation corpus...') @@ -78,7 +77,7 @@ def eval(model_name, input_file, architecture=None, x_index=0, y_indexes=[1]): xtr, y = load_texts_and_classes_generic(input_file, x_index, y_indexes) print(len(xtr), 'evaluation sequences') - model = Classifier(model_name) + model = Classifier(model_name, architecture=architecture) model.load() y_ = get_one_hot(y) @@ -86,11 +85,11 @@ def eval(model_name, input_file, architecture=None, x_index=0, y_indexes=[1]): model.eval(xtr, y_) -def train_and_eval(model_name, input_file, embeddings_name, fold_count, transformer=None, - architecture="gru", x_index=0, y_indexes=[1], batch_size=-1, - max_sequence_length=-1, patience=-1): - - batch_size, maxlen, patience, early_stop, max_epoch = configure(architecture, batch_size, max_sequence_length, patience) +def train_and_eval(model_name, architecture, input_file, embeddings_name, fold_count, transformer=None, + x_index=0, y_indexes=[1], batch_size=-1, + max_sequence_length=-1, patience=-1): + batch_size, maxlen, patience, early_stop, max_epoch = configure(architecture, batch_size, max_sequence_length, + patience) print('loading ' + model_name + ' corpus...') xtr, y = load_texts_and_classes_generic(input_file, x_index, y_indexes) @@ -119,8 +118,8 @@ def train_and_eval(model_name, input_file, embeddings_name, fold_count, transfor # classify a list of texts -def classify(model_name, texts, output_format='json'): - model = Classifier(model_name) +def classify(model_name, architecture, texts, output_format='json'): + model = Classifier(model_name, architecture=architecture) model.load() start_time = time.time() @@ -142,12 +141,13 @@ def classify(model_name, texts, output_format='json'): parser.add_argument("action", help="the action", choices=actions) parser.add_argument("model", help="The name of the model") parser.add_argument("--fold-count", type=int, default=1) - parser.add_argument("--input", 
type=str, required=True, help="The file to be used for train, train_eval, eval and, classify") + parser.add_argument("--input", type=str, required=True, + help="The file to be used for train, train_eval, eval and, classify") parser.add_argument("--x-index", type=int, required=True, help="Index of the columns for the X value " "(assuming a TSV file)") parser.add_argument("--y-indexes", type=str, required=False, help="Index(es) of the columns for the Y (classes) " - "separated by comma, without spaces (assuming " - "a TSV file)") + "separated by comma, without spaces (assuming " + "a TSV file)") parser.add_argument("--architecture", default='gru', choices=architectures, help="type of model architecture to be used, one of " + str(architectures)) parser.add_argument( @@ -173,8 +173,8 @@ def classify(model_name, texts, output_format='json'): parser.add_argument("--max-sequence-length", type=int, default=-1, help="max-sequence-length parameter to be used.") parser.add_argument("--batch-size", type=int, default=-1, help="batch-size parameter to be used.") - parser.add_argument("--patience", type=int, default=5, help="patience, number of epoques to count before stopping the training (only used in train or train_eval).") - + parser.add_argument("--patience", type=int, default=5, + help="patience, number of epoques to count before stopping the training (only used in train or train_eval).") args = parser.parse_args() @@ -203,20 +203,20 @@ def classify(model_name, texts, output_format='json'): embeddings_name = "glove-840B" if args.action == 'train': - train(model_name, input_file, embeddings_name, args.fold_count, architecture=architecture, + train(model_name, architecture, input_file, embeddings_name, args.fold_count, transformer=transformer, x_index=x_index, y_indexes=y_indexes, batch_size=batch_size, - max_sequence_length= max_sequence_length, patience=patience) + max_sequence_length=max_sequence_length, patience=patience) elif args.action == 'eval': - eval(model_name, input_file, architecture=architecture, x_index=x_index, y_indexes=y_indexes) + eval(model_name, architecture, input_file, x_index=x_index, y_indexes=y_indexes) elif args.action == 'train_eval': if args.fold_count < 1: raise ValueError("fold-count should be equal or more than 1") - y_test = train_and_eval(model_name, input_file, embeddings_name, args.fold_count, architecture=architecture, - transformer=transformer, x_index=x_index, y_indexes=y_indexes, batch_size=batch_size, - max_sequence_length= max_sequence_length, patience=patience) + train_and_eval(model_name, architecture, input_file, embeddings_name, args.fold_count, + transformer=transformer, x_index=x_index, y_indexes=y_indexes, batch_size=batch_size, + max_sequence_length=max_sequence_length, patience=patience) elif args.action == 'classify': lines, _ = load_texts_and_classes_generic(input_file, x_index, None) From a4d13b43e1a5c1fd0397db0f94f6e0e7f6bb2b67 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Wed, 17 Jan 2024 13:24:41 +0900 Subject: [PATCH 25/27] enable multi-gpu and other parameters --- delft/applications/textClassifier.py | 104 +++++++++++++++++++++------ delft/textClassification/wrapper.py | 59 +++++++++++++-- 2 files changed, 134 insertions(+), 29 deletions(-) diff --git a/delft/applications/textClassifier.py b/delft/applications/textClassifier.py index d497a23a..208255f3 100644 --- a/delft/applications/textClassifier.py +++ b/delft/applications/textClassifier.py @@ -1,6 +1,4 @@ import argparse -import csv -import json import sys import time @@ -11,6 +9,7 
@@ from delft.textClassification import Classifier from delft.textClassification.models import architectures from delft.textClassification.reader import load_texts_and_classes_generic +from delft.utilities.Utilities import t_or_f pretrained_transformers_examples = ['bert-base-cased', 'bert-large-cased', 'allenai/scibert_scivocab_cased'] @@ -26,12 +25,12 @@ def get_one_hot(y): return y2 -def configure(architecture, batch_size_, max_sequence_length_, patience_): + +def configure(architecture, max_sequence_length_=-1, batch_size_=-1, max_epoch_=-1, patience_=-1, early_stop=True): batch_size = 256 maxlen = 150 if max_sequence_length_ == -1 else max_sequence_length_ patience = 5 if patience_ == -1 else patience_ - early_stop = True - max_epoch = 60 + max_epoch = 60 if max_epoch_ == -1 else max_epoch_ # default bert model parameters if architecture == "bert": @@ -44,25 +43,50 @@ def configure(architecture, batch_size_, max_sequence_length_, patience_): return batch_size, maxlen, patience, early_stop, max_epoch -def train(model_name, architecture, input_file, embeddings_name, fold_count, transformer=None, - x_index=0, y_indexes=[1], batch_size=-1, max_sequence_length=-1, patience=-1): +def train(model_name, + architecture, + input_file, + embeddings_name, + fold_count, + transformer=None, + x_index=0, + y_indexes=[1], + batch_size=-1, + max_sequence_length=-1, + patience=-1, + incremental=False, + learning_rate=None, + multi_gpu=False, + max_epoch=50, + early_stop=True + ): + batch_size, maxlen, patience, early_stop, max_epoch = configure(architecture, batch_size, - max_sequence_length, patience) + max_sequence_length, patience, learning_rate) print('loading ' + model_name + ' training corpus...') xtr, y = load_texts_and_classes_generic(input_file, x_index, y_indexes) list_classes = list(set([y_[0] for y_ in y])) - model = Classifier(model_name, architecture=architecture, list_classes=list_classes, max_epoch=max_epoch, - fold_number=fold_count, patience=patience, transformer_name=transformer, - use_roc_auc=True, embeddings_name=embeddings_name, early_stop=early_stop, - batch_size=batch_size, maxlen=maxlen, class_weights=None) + model = Classifier(model_name, + architecture=architecture, + list_classes=list_classes, + max_epoch=max_epoch, + fold_number=fold_count, + patience=patience, + transformer_name=transformer, + use_roc_auc=True, + embeddings_name=embeddings_name, + early_stop=early_stop, + batch_size=batch_size, + maxlen=maxlen, + class_weights=None) y_ = get_one_hot(y) if fold_count == 1: - model.train(xtr, y_) + model.train(xtr, y_, incremental=incremental) else: model.train_nfold(xtr, y_) # saving the model @@ -87,7 +111,7 @@ def eval(model_name, architecture, input_file, x_index=0, y_indexes=[1]): def train_and_eval(model_name, architecture, input_file, embeddings_name, fold_count, transformer=None, x_index=0, y_indexes=[1], batch_size=-1, - max_sequence_length=-1, patience=-1): + max_sequence_length=-1, patience=-1, multi_gpu=False): batch_size, maxlen, patience, early_stop, max_epoch = configure(architecture, batch_size, max_sequence_length, patience) @@ -107,9 +131,9 @@ def train_and_eval(model_name, architecture, input_file, embeddings_name, fold_c x_train, x_test, y_train, y_test = train_test_split(xtr, y_one_hot, test_size=0.1) if fold_count == 1: - model.train(x_train, y_train) + model.train(x_train, y_train, multi_gpu=multi_gpu) else: - model.train_nfold(x_train, y_train) + model.train_nfold(x_train, y_train, multi_gpu=multi_gpu) model.eval(x_test, y_test) @@ -173,8 +197,19 @@ 
def classify(model_name, architecture, texts, output_format='json'): parser.add_argument("--max-sequence-length", type=int, default=-1, help="max-sequence-length parameter to be used.") parser.add_argument("--batch-size", type=int, default=-1, help="batch-size parameter to be used.") - parser.add_argument("--patience", type=int, default=5, - help="patience, number of epoques to count before stopping the training (only used in train or train_eval).") + parser.add_argument("--patience", type=int, default=-1, help="patience, number of extra epochs to perform after " + "the best epoch before stopping a training.") + parser.add_argument("--learning-rate", type=float, default=None, help="Initial learning rate") + parser.add_argument("--incremental", action="store_true", help="training is incremental, starting from existing model if present") + parser.add_argument("--max-epoch", type=int, default=-1, + help="Maximum number of epochs for training.") + parser.add_argument("--early-stop", type=t_or_f, default=None, + help="Force early training termination when metrics scores are not improving " + + "after a number of epochs equals to the patience parameter.") + + parser.add_argument("--multi-gpu", default=False, + help="Enable the support for distributed computing (the batch size needs to be set accordingly using --batch-size)", + action="store_true") args = parser.parse_args() @@ -186,7 +221,12 @@ def classify(model_name, architecture, texts, output_format='json'): x_index = args.x_index patience = args.patience batch_size = args.batch_size + incremental = args.incremental max_sequence_length = args.max_sequence_length + learning_rate = args.learning_rate + max_epoch = args.max_epoch + early_stop = args.early_stop + multi_gpu = args.multi_gpu if args.action != "classify": if args.y_indexes is None: @@ -204,8 +244,17 @@ def classify(model_name, architecture, texts, output_format='json'): if args.action == 'train': train(model_name, architecture, input_file, embeddings_name, args.fold_count, - transformer=transformer, x_index=x_index, y_indexes=y_indexes, batch_size=batch_size, - max_sequence_length=max_sequence_length, patience=patience) + transformer=transformer, + x_index=x_index, + y_indexes=y_indexes, + batch_size=batch_size, + incremental=incremental, + max_sequence_length=max_sequence_length, + patience=patience, + learning_rate=learning_rate, + max_epoch=max_epoch, + early_stop=early_stop, + multi_gpu=multi_gpu) elif args.action == 'eval': eval(model_name, architecture, input_file, x_index=x_index, y_indexes=y_indexes) @@ -214,9 +263,18 @@ def classify(model_name, architecture, texts, output_format='json'): if args.fold_count < 1: raise ValueError("fold-count should be equal or more than 1") - train_and_eval(model_name, architecture, input_file, embeddings_name, args.fold_count, - transformer=transformer, x_index=x_index, y_indexes=y_indexes, batch_size=batch_size, - max_sequence_length=max_sequence_length, patience=patience) + train_and_eval(model_name, + architecture, + input_file, + embeddings_name, + args.fold_count, + transformer=transformer, + x_index=x_index, + y_indexes=y_indexes, + batch_size=batch_size, + max_sequence_length=max_sequence_length, + patience=patience, + multi_gpu=multi_gpu) elif args.action == 'classify': lines, _ = load_texts_and_classes_generic(input_file, x_index, None) diff --git a/delft/textClassification/wrapper.py b/delft/textClassification/wrapper.py index dcfbeb74..4a5eecad 100644 --- a/delft/textClassification/wrapper.py +++ 
b/delft/textClassification/wrapper.py @@ -1,5 +1,7 @@ import os +from packaging import version + from delft.sequenceLabelling.trainer import LogLearningRateCallback # ask tensorflow to be quiet and not print hundred lines of logs from delft.utilities.misc import print_parameters @@ -136,10 +138,25 @@ def __init__(self, class_weights=class_weights, multiprocessing=multiprocessing) - def train(self, x_train, y_train, vocab_init=None, incremental=False, callbacks=None): + def train(self, x_train, y_train, vocab_init=None, incremental=False, callbacks=None, multi_gpu=False): + if multi_gpu: + strategy = tf.distribute.MirroredStrategy() + print('Running with multi-gpu. Number of devices: {}'.format(strategy.num_replicas_in_sync)) + + # This trick avoid an exception being through when the --multi-gpu approach is used on a single GPU system. + # It might be removed with TF 2.10 https://github.com/tensorflow/tensorflow/issues/50487 + import atexit + atexit.register(strategy._extended._collective_ops._pool.close) # type: ignore + + with strategy.scope(): + self.train_(x_train, y_train, vocab_init, incremental, callbacks) + else: + self.train_(x_train, y_train, vocab_init, incremental, callbacks) + + def train_(self, x_train, y_train, vocab_init=None, incremental=False, callbacks=None): if incremental: - if self.model == None and self.models == None: + if self.model is None and self.models is None: print("error: you must load a model first for an incremental training") return print("Incremental training from loaded model", self.model_config.model_name) @@ -193,7 +210,22 @@ def train(self, x_train, y_train, vocab_init=None, incremental=False, callbacks= callbacks=callbacks) - def train_nfold(self, x_train, y_train, vocab_init=None, incremental=False, callbacks=None): + def train_nfold(self, x_train, y_train, vocab_init=None, incremental=False, callbacks=None, multi_gpu=False): + if multi_gpu: + strategy = tf.distribute.MirroredStrategy() + print('Running with multi-gpu. Number of devices: {}'.format(strategy.num_replicas_in_sync)) + + # This trick avoid an exception being through when the --multi-gpu approach is used on a single GPU system. + # It might be removed with TF 2.10 https://github.com/tensorflow/tensorflow/issues/50487 + import atexit + atexit.register(strategy._extended._collective_ops._pool.close) # type: ignore + + with strategy.scope(): + self.train_nfold_(x_train, y_train,vocab_init, incremental, callbacks) + else: + self.train_nfold_(x_train, y_train, vocab_init, incremental, callbacks) + + def train_nfold_(self, x_train, y_train, vocab_init=None, incremental=False, callbacks=None): if incremental: if self.models == None: print("error: you must load a model first for an incremental training") @@ -203,8 +235,23 @@ def train_nfold(self, x_train, y_train, vocab_init=None, incremental=False, call else: self.models = train_folds(x_train, y_train, self.model_config, self.training_config, self.embeddings, None, callbacks=callbacks) + def predict(self, texts, output_format='json', use_main_thread_only=False, batch_size=None, multi_gpu=False): + if multi_gpu: + strategy = tf.distribute.MirroredStrategy() + print('Running with multi-gpu. Number of devices: {}'.format(strategy.num_replicas_in_sync)) + + # This trick avoid an exception being through when the --multi-gpu approach is used on a single GPU system. 
+ # It might be removed with TF 2.10 https://github.com/tensorflow/tensorflow/issues/50487 + if version.parse(tf.__version__) < version.parse('2.10.0'): + import atexit + atexit.register(strategy._extended._collective_ops._pool.close) # type: ignore - def predict(self, texts, output_format='json', use_main_thread_only=False, batch_size=None): + with strategy.scope(): + return self.predict_(texts, output_format, use_main_thread_only, batch_size) + else: + return self.predict_(texts, output_format, use_main_thread_only, batch_size) + + def predict_(self, texts, output_format='json', use_main_thread_only=False, batch_size=None): bert_data = False if self.transformer_name != None: bert_data = True @@ -216,7 +263,7 @@ def predict(self, texts, output_format='json', use_main_thread_only=False, batch print("---") if self.model_config.fold_number == 1: - if self.model != None: + if self.model is not None: predict_generator = DataGenerator(texts, None, batch_size=self.model_config.batch_size, maxlen=self.model_config.maxlen, list_classes=self.model_config.list_classes, @@ -226,7 +273,7 @@ def predict(self, texts, output_format='json', use_main_thread_only=False, batch else: raise (OSError('Could not find a model.')) else: - if self.models != None: + if self.models is not None: # just a warning: n classifiers using BERT layer for prediction might be heavy in term of model sizes predict_generator = DataGenerator(texts, None, batch_size=self.model_config.batch_size, From 4e6ed0dc8115d2c1b282754415c683ef977c33f8 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 18 Jan 2024 18:12:43 +0900 Subject: [PATCH 26/27] put some order with the parameters --- delft/applications/textClassifier.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/delft/applications/textClassifier.py b/delft/applications/textClassifier.py index 208255f3..79a3e6fc 100644 --- a/delft/applications/textClassifier.py +++ b/delft/applications/textClassifier.py @@ -61,8 +61,12 @@ def train(model_name, early_stop=True ): - batch_size, maxlen, patience, early_stop, max_epoch = configure(architecture, batch_size, - max_sequence_length, patience, learning_rate) + batch_size, maxlen, patience, early_stop, max_epoch = configure(architecture, + max_sequence_length, + batch_size, + max_epoch, + patience, + early_stop=early_stop) print('loading ' + model_name + ' training corpus...') xtr, y = load_texts_and_classes_generic(input_file, x_index, y_indexes) @@ -81,12 +85,13 @@ def train(model_name, early_stop=early_stop, batch_size=batch_size, maxlen=maxlen, - class_weights=None) + class_weights=None, + learning_rate=learning_rate) y_ = get_one_hot(y) if fold_count == 1: - model.train(xtr, y_, incremental=incremental) + model.train(xtr, y_, incremental=incremental, multi_gpu=multi_gpu) else: model.train_nfold(xtr, y_) # saving the model From 92d957314457dec564f9c575d8ee2404e1f83fdf Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 18 Jan 2024 18:17:14 +0900 Subject: [PATCH 27/27] fix multigpu tricks for tf < 2.10 --- delft/applications/textClassifier.py | 2 +- delft/sequenceLabelling/wrapper.py | 10 ++++++---- delft/textClassification/wrapper.py | 10 ++++++---- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/delft/applications/textClassifier.py b/delft/applications/textClassifier.py index 79a3e6fc..979b5427 100644 --- a/delft/applications/textClassifier.py +++ b/delft/applications/textClassifier.py @@ -93,7 +93,7 @@ def train(model_name, if fold_count == 1: model.train(xtr, y_, 
incremental=incremental, multi_gpu=multi_gpu) else: - model.train_nfold(xtr, y_) + model.train_nfold(xtr, y_, multi_gpu=multi_gpu) # saving the model model.save() diff --git a/delft/sequenceLabelling/wrapper.py b/delft/sequenceLabelling/wrapper.py index f2b2d802..538e483d 100644 --- a/delft/sequenceLabelling/wrapper.py +++ b/delft/sequenceLabelling/wrapper.py @@ -151,8 +151,9 @@ def train(self, x_train, y_train, f_train=None, x_valid=None, y_valid=None, f_va # This trick avoid an exception being through when the --multi-gpu approach is used on a single GPU system. # It might be removed with TF 2.10 https://github.com/tensorflow/tensorflow/issues/50487 - import atexit - atexit.register(strategy._extended._collective_ops._pool.close) # type: ignore + if version.parse(tf.__version__) < version.parse('2.10.0'): + import atexit + atexit.register(strategy._extended._collective_ops._pool.close) # type: ignore with strategy.scope(): self.train_(x_train, y_train, f_train, x_valid, y_valid, f_valid, incremental, callbacks) @@ -219,8 +220,9 @@ def train_nfold(self, x_train, y_train, x_valid=None, y_valid=None, f_train=None # This trick avoid an exception being through when the --multi-gpu approach is used on a single GPU system. # It might be removed with TF 2.10 https://github.com/tensorflow/tensorflow/issues/50487 - import atexit - atexit.register(strategy._extended._collective_ops._pool.close) # type: ignore + if version.parse(tf.__version__) < version.parse('2.10.0'): + import atexit + atexit.register(strategy._extended._collective_ops._pool.close) # type: ignore with strategy.scope(): self.train_nfold_(x_train, y_train, x_valid, y_valid, f_train, f_valid, incremental, callbacks) diff --git a/delft/textClassification/wrapper.py b/delft/textClassification/wrapper.py index 4a5eecad..b44b633d 100644 --- a/delft/textClassification/wrapper.py +++ b/delft/textClassification/wrapper.py @@ -145,8 +145,9 @@ def train(self, x_train, y_train, vocab_init=None, incremental=False, callbacks= # This trick avoid an exception being through when the --multi-gpu approach is used on a single GPU system. # It might be removed with TF 2.10 https://github.com/tensorflow/tensorflow/issues/50487 - import atexit - atexit.register(strategy._extended._collective_ops._pool.close) # type: ignore + if version.parse(tf.__version__) < version.parse('2.10.0'): + import atexit + atexit.register(strategy._extended._collective_ops._pool.close) # type: ignore with strategy.scope(): self.train_(x_train, y_train, vocab_init, incremental, callbacks) @@ -217,8 +218,9 @@ def train_nfold(self, x_train, y_train, vocab_init=None, incremental=False, call # This trick avoid an exception being through when the --multi-gpu approach is used on a single GPU system. # It might be removed with TF 2.10 https://github.com/tensorflow/tensorflow/issues/50487 - import atexit - atexit.register(strategy._extended._collective_ops._pool.close) # type: ignore + if version.parse(tf.__version__) < version.parse('2.10.0'): + import atexit + atexit.register(strategy._extended._collective_ops._pool.close) # type: ignore with strategy.scope(): self.train_nfold_(x_train, y_train,vocab_init, incremental, callbacks)
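
A short note on the test fix in [PATCH 21/27]: the expected values are wrapped as object-dtype arrays and all_close now casts both sides to float before calling np.allclose, which typically raises a TypeError on object-dtype input (np.isfinite is not defined for it). A minimal, self-contained illustration of that pattern, echoing one of the updated assertions rather than reproducing the test code:

    import numpy as np

    expected = np.asarray([[[1, 13]]], dtype=object)  # object dtype, as in the updated fixtures
    actual = [[[1, 13]]]

    # Casting both sides to float sidesteps the TypeError that np.isfinite raises for
    # object-dtype arrays; this mirrors the astype("float") added to all_close.
    assert np.allclose(np.asarray(expected, dtype=float), np.asarray(actual, dtype=float))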
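The classify action introduced in [PATCH 23/27] (and made architecture-aware in [PATCH 24/27]) reduces to: read the text column from the TSV, run the saved classifier, and take the argmax of each score vector. A hedged sketch of the equivalent programmatic flow; the file path, the "my-model" name and the gru architecture are placeholders and must correspond to a model previously saved by the train action:

    import numpy as np

    from delft.textClassification import Classifier
    from delft.textClassification.reader import load_texts_and_classes_generic

    # Passing None as the class-column indexes returns only the texts, as the classify action does.
    texts, _ = load_texts_and_classes_generic("data/input.tsv", 0, None)

    model = Classifier("my-model", architecture="gru")  # placeholder name and architecture
    model.load()

    # With the "csv" output format the result is one score vector per input text.
    scores = model.predict(texts, output_format="csv")
    predictions = [np.argmax(row) for row in scores]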
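[PATCH 25/27] parses --early-stop with type=t_or_f instead of type=bool, presumably because argparse hands the raw string to the type callable and bool("False") is truthy. The actual helper lives in delft.utilities.Utilities and is not shown in the diff; the function below is a hypothetical equivalent, purely to illustrate the idea:

    def t_or_f(value):
        # argparse's type= callable receives a string, so boolean spellings are mapped explicitly.
        text = str(value).strip().lower()
        if text in ("true", "t", "yes", "y", "1"):
            return True
        if text in ("false", "f", "no", "n", "0"):
            return False
        raise ValueError("expected a boolean-like value, got %r" % value)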
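Finally, the multi-GPU support threaded through train, train_nfold and predict in [PATCH 25/27] and [PATCH 27/27] follows the standard tf.distribute.MirroredStrategy pattern, with the collective-ops pool closed at exit only on TensorFlow < 2.10 (the shutdown issue tracked in tensorflow/tensorflow#50487). A minimal standalone sketch of that pattern, independent of the DeLFT wrappers; run_fn is a placeholder for any callable that builds and fits a model:

    import atexit

    import tensorflow as tf
    from packaging import version


    def run_with_optional_multi_gpu(run_fn, multi_gpu=False):
        if not multi_gpu:
            return run_fn()

        strategy = tf.distribute.MirroredStrategy()
        print('Running with multi-gpu. Number of devices: {}'.format(strategy.num_replicas_in_sync))

        # Workaround for an exception thrown at shutdown on single-GPU systems; no longer
        # needed from TF 2.10 on (https://github.com/tensorflow/tensorflow/issues/50487).
        if version.parse(tf.__version__) < version.parse('2.10.0'):
            atexit.register(strategy._extended._collective_ops._pool.close)  # type: ignore

        # Variables created inside the scope are mirrored across all visible GPUs.
        with strategy.scope():
            return run_fn()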