# training.py — Flair sequence-tagger training script (91 lines / 3.34 KB).
# NOTE(review): the original lines here were GitHub page-scrape residue
# (notification banner and gutter line numbers), not part of the program.
import sys
from typing import List
import torch
from flair.data import Corpus
from flair.datasets import ColumnCorpus
# define columns
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings, TransformerWordEmbeddings
from custom_embeddings import NoContextBertEmbeddings, LIWC2015Embeddings, LIWC2007Embeddings
if __name__ == '__main__':
    # Require exactly one CLI argument: the name under which the trained
    # model is saved. SystemExit prints the usage line without a traceback
    # (the original raised TypeError, which dumps a stack trace on misuse).
    if len(sys.argv) != 2:
        raise SystemExit('Usage: python training.py <model_name>')
    model_name = sys.argv[1]

    # Release any cached GPU memory left over from a previous run.
    torch.cuda.empty_cache()

    # Column layout of the CoNLL-style data files:
    # token text, propaganda label, document id, sentence id.
    columns = {0: 'text', 1: 'propaganda', 2: 'doc_id', 3: 'sentence_id'}

    # Folder containing train.txt / test.txt / validate.txt.
    data_folder = './data'

    # Build the corpus lazily (in_memory=False) from the column files.
    corpus: Corpus = ColumnCorpus(
        data_folder,
        columns,
        in_memory=False,
        train_file='train.txt',
        test_file='test.txt',
        dev_file='validate.txt',
    )

    # Label dictionary for the 'propaganda' column; printed for inspection.
    tag_type = 'propaganda'
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary)

    # Token embeddings to stack; extend this list to try other embedding
    # types (GloVe, Flair, LIWC, transformer embeddings, ...).
    embedding_types: List[TokenEmbeddings] = [
        NoContextBertEmbeddings(),
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    # Sequence tagger: BiLSTM without CRF, with per-label loss weights
    # ('1' weighted 8x — presumably to counter label imbalance; confirm
    # against the dataset's label distribution).
    from flair.models import SequenceTagger
    tagger: SequenceTagger = SequenceTagger(
        hidden_size=200,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=tag_type,
        train_initial_hidden_state=True,
        loss_weights={'0': 1, '1': 8},
        use_crf=False,
        dropout=0.2,
    )

    # Train and checkpoint under resources/taggers/<model_name>.
    from flair.trainers import ModelTrainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(
        'resources/taggers/' + model_name,
        learning_rate=0.05,
        anneal_factor=0.2,
        train_with_dev=False,
        mini_batch_size=10,
        max_epochs=5,
        checkpoint=True,
        embeddings_storage_mode='gpu',
        patience=0,
        monitor_test=True,
    )
    # Optional post-training plots (weights / loss curves) can be produced
    # with flair.visual.training_curves.Plotter on the files written to the
    # model folder.