
Commit 3671cd4

Merge branch 'master' of github.com:kermitt2/delft

kermitt2 committed Nov 28, 2023
2 parents e6d2a52 + f8adbf5
Showing 6 changed files with 160 additions and 60 deletions.
38 changes: 22 additions & 16 deletions delft/applications/datasetTagger.py
@@ -68,7 +68,7 @@ def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None,
input_path=None, output_path=None, fold_count=1,
features_indices=None, max_sequence_length=-1,
batch_size=-1, max_epoch=-1, use_ELMo=False, patience=-1,
learning_rate=None):
learning_rate=None, multi_gpu=False):
print('Loading data...')
if input_path is None:
x_all1 = y_all1 = x_all2 = y_all2 = x_all3 = y_all3 = []
@@ -116,7 +116,7 @@ def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None,
learning_rate=learning_rate)

start_time = time.time()
model.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid)
model.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid, multi_gpu=multi_gpu)
runtime = round(time.time() - start_time, 3)

print("training runtime: %s seconds " % runtime)
@@ -132,7 +132,7 @@ def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None,
def train_eval(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None,
input_path=None, output_path=None, fold_count=1,
features_indices=None, max_sequence_length=-1, batch_size=-1, max_epoch=-1, use_ELMo=False,
patience=-1, learning_rate=None):
patience=-1, learning_rate=None, multi_gpu=False):
print('Loading data...')
if input_path is None:
x_all1 = y_all1 = x_all2 = y_all2 = x_all3 = y_all3 = []
@@ -207,7 +207,7 @@ def eval_(input_path=None, architecture=None):


# annotate a list of texts
def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=None, use_ELMo=False):
def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=None, use_ELMo=False, multi_gpu=False):
annotations = []

# load model
@@ -221,7 +221,7 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non

start_time = time.time()

annotations = model.tag(texts, output_format, features=features)
annotations = model.tag(texts, output_format, features=features, multi_gpu=multi_gpu)
runtime = round(time.time() - start_time, 3)

if output_format == 'json':
@@ -285,6 +285,9 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non
parser.add_argument("--patience", type=int, default=-1, help="patience, number of extra epochs to perform after "
"the best epoch before stopping a training.")
parser.add_argument("--learning-rate", type=float, default=None, help="Initial learning rate")
parser.add_argument("--multi-gpu", default=False,
help="Enable the support for distributed computing (the batch size needs to be set accordingly using --batch-size)",
action="store_true")

args = parser.parse_args()

@@ -299,22 +302,24 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non
use_ELMo = args.use_ELMo
patience = args.patience
learning_rate = args.learning_rate
multi_gpu = args.multi_gpu

if transformer is None and embeddings_name is None:
# default word embeddings
embeddings_name = "glove-840B"

if action == "train":
train(embeddings_name=embeddings_name,
architecture=architecture,
transformer=transformer,
input_path=input_path,
output_path=output,
max_sequence_length=max_sequence_length,
batch_size=batch_size,
use_ELMo=use_ELMo,
patience=patience,
learning_rate=learning_rate)
architecture=architecture,
transformer=transformer,
input_path=input_path,
output_path=output,
max_sequence_length=max_sequence_length,
batch_size=batch_size,
use_ELMo=use_ELMo,
patience=patience,
learning_rate=learning_rate,
multi_gpu=multi_gpu)

if action == "eval":
if args.fold_count is not None and args.fold_count > 1:
@@ -337,7 +342,8 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non
batch_size=batch_size,
use_ELMo=use_ELMo,
patience=patience,
learning_rate=learning_rate)
learning_rate=learning_rate,
multi_gpu=multi_gpu)

if action == "tag":
someTexts = []
@@ -347,7 +353,7 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non
someTexts.append("We also compare ShanghaiTechRGBD with other RGB-D crowd counting datasets in , and we can see that ShanghaiTechRGBD is the most challenging RGB-D crowd counting dataset in terms of the number of images and heads.")
someTexts.append("Insulin levels of all samples were measured by ELISA kit (Mercodia)")

result = annotate_text(someTexts, "json", architecture=architecture, use_ELMo=use_ELMo)
result = annotate_text(someTexts, "json", architecture=architecture, use_ELMo=use_ELMo, multi_gpu=multi_gpu)
print(json.dumps(result, sort_keys=False, indent=4, ensure_ascii=False))
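The Sequence wrapper code that actually consumes the new multi_gpu keyword is not among the files shown here. In TensorFlow/Keras projects such a flag typically just wraps model building and training in a tf.distribute.MirroredStrategy scope, which would also explain the help text's advice to adjust --batch-size: the global batch is split across the replicas. The following is a minimal sketch of that pattern, offered as an assumption rather than a description of DeLFT's actual implementation:

import tensorflow as tf

def run_training(build_and_fit, multi_gpu=False):
    # build_and_fit: zero-argument callable that builds the Keras model and
    # calls fit(); the model must be constructed inside the strategy scope.
    if multi_gpu:
        strategy = tf.distribute.MirroredStrategy()
        print("running on %d replica(s)" % strategy.num_replicas_in_sync)
        with strategy.scope():
            return build_and_fit()
    return build_and_fit()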


38 changes: 23 additions & 15 deletions delft/applications/grobidTagger.py
@@ -148,9 +148,9 @@ def configure(model, architecture, output_path=None, max_sequence_length=-1, bat


# train a GROBID model with all available data
def train(model, embeddings_name=None, architecture=None, transformer=None, input_path=None,
output_path=None, features_indices=None, max_sequence_length=-1, batch_size=-1, max_epoch=-1,
use_ELMo=False, incremental=False, input_model_path=None, patience=-1, learning_rate=None):
def train(model, embeddings_name=None, architecture=None, transformer=None, input_path=None,
output_path=None, features_indices=None, max_sequence_length=-1, batch_size=-1, max_epoch=-1,
use_ELMo=False, incremental=False, input_model_path=None, patience=-1, learning_rate=None, multi_gpu=False):

print('Loading data...')
if input_path == None:
@@ -201,7 +201,8 @@ def train(model, embeddings_name=None, architecture=None, transformer=None, inpu
model.load()

start_time = time.time()
model.train(x_train, y_train, f_train, x_valid, y_valid, f_valid, incremental=incremental)
model.train(x_train, y_train, f_train, x_valid, y_valid, f_valid, incremental=incremental, multi_gpu=multi_gpu)

runtime = round(time.time() - start_time, 3)
print("training runtime: %s seconds " % (runtime))

@@ -216,7 +217,7 @@ def train(model, embeddings_name=None, architecture=None, transformer=None, inpu
def train_eval(model, embeddings_name=None, architecture='BidLSTM_CRF', transformer=None,
input_path=None, output_path=None, fold_count=1,
features_indices=None, max_sequence_length=-1, batch_size=-1, max_epoch=-1,
use_ELMo=False, incremental=False, input_model_path=None, patience=-1, learning_rate=None):
use_ELMo=False, incremental=False, input_model_path=None, patience=-1, learning_rate=None, multi_gpu=False):
print('Loading data...')
if input_path is None:
x_all, y_all, f_all = load_data_and_labels_crf_file('data/sequenceLabelling/grobid/'+model+'/'+model+'-060518.train')
@@ -270,9 +271,9 @@ def train_eval(model, embeddings_name=None, architecture='BidLSTM_CRF', transfor
start_time = time.time()

if fold_count == 1:
model.train(x_train, y_train, f_train=f_train, x_valid=x_valid, y_valid=y_valid, f_valid=f_valid, incremental=incremental)
model.train(x_train, y_train, f_train=f_train, x_valid=x_valid, y_valid=y_valid, f_valid=f_valid, incremental=incremental, multi_gpu=multi_gpu)
else:
model.train_nfold(x_train, y_train, f_train=f_train, x_valid=x_valid, y_valid=y_valid, f_valid=f_valid, incremental=incremental)
model.train_nfold(x_train, y_train, f_train=f_train, x_valid=x_valid, y_valid=y_valid, f_valid=f_valid, incremental=incremental, multi_gpu=multi_gpu)

runtime = round(time.time() - start_time, 3)
print("training runtime: %s seconds " % runtime)
@@ -321,7 +322,7 @@ def eval_(model, input_path=None, architecture='BidLSTM_CRF', use_ELMo=False):

# annotate a list of texts, this is relevant only of models taking only text as input
# (so not text with layout information)
def annotate_text(texts, model, output_format, architecture='BidLSTM_CRF', features=None, use_ELMo=False):
def annotate_text(texts, model, output_format, architecture='BidLSTM_CRF', features=None, use_ELMo=False, multi_gpu=False):
annotations = []

# load model
@@ -335,7 +336,7 @@ def annotate_text(texts, model, output_format, architecture='BidLSTM_CRF', featu

start_time = time.time()

annotations = model.tag(texts, output_format, features=features)
annotations = model.tag(texts, output_format, features=features, multi_gpu=multi_gpu)
runtime = round(time.time() - start_time, 3)

if output_format == 'json':
@@ -409,6 +410,9 @@ class Tasks:
parser.add_argument("--patience", type=int, default=-1, help="patience, number of extra epochs to perform after "
"the best epoch before stopping a training.")
parser.add_argument("--learning-rate", type=float, default=None, help="Initial learning rate")
parser.add_argument("--multi-gpu", default=False,
help="Enable the support for distributed computing (the batch size needs to be set accordingly using --batch-size)",
action="store_true")

args = parser.parse_args()

@@ -426,6 +430,7 @@ class Tasks:
incremental = args.incremental
patience = args.patience
learning_rate = args.learning_rate
multi_gpu = args.multi_gpu

if architecture is None:
raise ValueError("A model architecture has to be specified: " + str(architectures))
@@ -436,18 +441,19 @@ class Tasks:

if action == Tasks.TRAIN:
train(model,
embeddings_name=embeddings_name,
architecture=architecture,
embeddings_name=embeddings_name,
architecture=architecture,
transformer=transformer,
input_path=input_path,
input_path=input_path,
output_path=output,
max_sequence_length=max_sequence_length,
batch_size=batch_size,
use_ELMo=use_ELMo,
incremental=incremental,
input_model_path=input_model_path,
patience=patience,
learning_rate=learning_rate)
learning_rate=learning_rate,
multi_gpu=multi_gpu)

if action == Tasks.EVAL:
if args.fold_count is not None and args.fold_count > 1:
@@ -472,7 +478,8 @@ class Tasks:
use_ELMo=use_ELMo,
incremental=incremental,
input_model_path=input_model_path,
learning_rate=learning_rate)
learning_rate=learning_rate,
multi_gpu=multi_gpu)

if action == Tasks.TAG:
someTexts = []
@@ -482,6 +489,7 @@ class Tasks:
someTexts.append("March the 27th, 2001")
someTexts.append(" on April 27, 2001. . ")
someTexts.append('2018')
someTexts.append('2023 July the 22nd')
elif model == 'citation':
someTexts.append("N. Al-Dhahir and J. Cioffi, \“On the uniform ADC bit precision and clip level computation for a Gaussian signal,\” IEEE Trans. Signal Processing, pp. 434–438, Feb. 1996.")
someTexts.append("T. Steinherz, E. Rivlin, N. Intrator, Off-line cursive script word recognition—a survey, Int. J. Doc. Anal. Recognition 2(3) (1999) 1–33.")
@@ -498,7 +506,7 @@ class Tasks:
someTexts.append("The statistical analysis was performed using IBM SPSS Statistics v. 20 (SPSS Inc, 2003, Chicago, USA).")

if architecture.find("FEATURE") == -1:
result = annotate_text(someTexts, model, "json", architecture=architecture, use_ELMo=use_ELMo)
result = annotate_text(someTexts, model, "json", architecture=architecture, use_ELMo=use_ELMo, multi_gpu=multi_gpu)
print(json.dumps(result, sort_keys=False, indent=4, ensure_ascii=False))
else:
print("The model " + architecture + " cannot be used without supplying features as input and it's disabled. "
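To make the new keyword concrete, a hypothetical programmatic call to the updated train() of delft/applications/grobidTagger.py could look as follows, following the signature shown above; the model name, batch size and learning rate are illustrative values, not taken from this commit:

from delft.applications.grobidTagger import train

# "date" is one of the GROBID models exercised in the tagging examples above;
# the numeric values below are placeholders, not recommended settings.
train("date",
      architecture="BidLSTM_CRF",
      embeddings_name="glove-840B",   # the script's default word embeddings
      batch_size=64,                  # global batch, split across the GPUs
      learning_rate=0.001,
      multi_gpu=True)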
21 changes: 15 additions & 6 deletions delft/applications/insultTagger.py
@@ -21,7 +21,8 @@ def configure(architecture, embeddings_name):

return batch_size, maxlen, patience, early_stop, max_epoch, embeddings_name

def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None, use_ELMo=False, learning_rate=None):
def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None, use_ELMo=False, learning_rate=None,
multi_gpu=False):
batch_size, maxlen, patience, early_stop, max_epoch, embeddings_name = configure(architecture, embeddings_name)

root = 'data/sequenceLabelling/toxic/'
@@ -42,15 +43,15 @@ def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None, us
model = Sequence(model_name, max_epoch=max_epoch, batch_size=batch_size, max_sequence_length=maxlen,
embeddings_name=embeddings_name, architecture=architecture, patience=patience, early_stop=early_stop,
transformer_name=transformer, use_ELMo=use_ELMo, learning_rate=learning_rate)
model.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid)
model.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid, multi_gpu=multi_gpu)
print('training done')

# saving the model (must be called after eval for multiple fold training)
model.save()


# annotate a list of texts, provides results in a list of offset mentions
def annotate(texts, output_format, architecture='BidLSTM_CRF', transformer=None, use_ELMo=False):
def annotate(texts, output_format, architecture='BidLSTM_CRF', transformer=None, use_ELMo=False, multi_gpu=False):
annotations = []

model_name = 'insult-' + architecture
@@ -63,7 +64,7 @@ def annotate(texts, output_format, architecture='BidLSTM_CRF', transformer=None,

start_time = time.time()

annotations = model.tag(texts, output_format)
annotations = model.tag(texts, output_format, multi_gpu=multi_gpu)
runtime = round(time.time() - start_time, 3)

if output_format == 'json':
@@ -115,7 +116,10 @@ def annotate(texts, output_format, architecture='BidLSTM_CRF', transformer=None,
)
parser.add_argument("--use-ELMo", action="store_true", help="Use ELMo contextual embeddings")
parser.add_argument("--learning-rate", type=float, default=None, help="Initial learning rate")

parser.add_argument("--multi-gpu", default=False,
help="Enable the support for distributed computing (the batch size needs to be set accordingly using --batch-size)",
action="store_true")

args = parser.parse_args()

if args.action not in ('train', 'tag'):
@@ -126,13 +130,18 @@ def annotate(texts, output_format, architecture='BidLSTM_CRF', transformer=None,
transformer = args.transformer
use_ELMo = args.use_ELMo
learning_rate = args.learning_rate
multi_gpu = args.multi_gpu

if transformer == None and embeddings_name == None:
# default word embeddings
embeddings_name = "glove-840B"

if args.action == 'train':
train(embeddings_name=embeddings_name, architecture=architecture, transformer=transformer, use_ELMo=use_ELMo, learning_rate=learning_rate)
train(embeddings_name=embeddings_name,
architecture=architecture,
transformer=transformer, use_ELMo=use_ELMo,
learning_rate=learning_rate,
multi_gpu=multi_gpu)

if args.action == 'tag':
someTexts = ['This is a gentle test.',
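Tagging can likewise be driven programmatically through the updated annotate() of delft/applications/insultTagger.py. A small illustrative example, reusing the test sentence from the diff and assuming a trained insult-BidLSTM_CRF model is already saved locally:

from delft.applications.insultTagger import annotate

texts = ["This is a gentle test."]
# With a single visible GPU the multi_gpu path behaves like ordinary tagging;
# it only spreads work when several GPUs are available.
results = annotate(texts, "json", architecture="BidLSTM_CRF", multi_gpu=True)
print(results)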