train_model.py
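"""Fine-tune a causal language model (default: distilgpt2) with the Hugging Face
Trainer API, using a TokenizedDataset built by the data_tokenization module."""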
import argparse
import os
from typing import Any

import numpy as np
from transformers import (
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from data_tokenization import TokenizedDataset

# import torch


def compute_perplexity(nll_loss):
    # Perplexity is exp of the negative log-likelihood, clipped to a finite range.
    return np.clip(np.exp(nll_loss), 10**-6, 10**9)
def train_causal_lang_model(
    path_project: str,
    folder_to_save: str,
    path_to_dataset: str = "",
    model_type: str = "distilgpt2",
    tokenized_dataset_obj: TokenizedDataset | None = None,
    block_size: int = 1024,
    test_size: float = 0.2,
    save_model: bool = False,
) -> tuple[Any, Trainer, TokenizedDataset, tuple[Any, Any]]:
    # TODO: receive the training arguments as options (e.g. from a config file)
    # get tokenized dataset for model training
    if not tokenized_dataset_obj:
        data_obj = TokenizedDataset(
            path_to_dataset,
            test_size_ratio=test_size,
            model_type=model_type,
            block_size=block_size,
        )
    else:
        data_obj = tokenized_dataset_obj
    # get training and evaluation datasets, i.e. the train / test split
    lm_dataset_all_hf = data_obj.get_causal_LM_hf_dataset()
    # get the tokenizer used to build the dataset
    tokenizer = data_obj.get_tokenizer()
    # data collator: pads sequences shorter than the context window and builds
    # the labels for causal (non-masked) language modelling
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    # get pre-trained model from the Hugging Face Hub
    model = AutoModelForCausalLM.from_pretrained(model_type)
    # TODO: allow the training args to be passed in instead of hard-coded.
    # TODO: allow automatic hyper-parameter selection?
    # NOTE: these TrainingArguments were optimized for an NVIDIA T4 GPU.
    # define training arguments for the model
    training_args = TrainingArguments(
        output_dir=os.path.join(path_project, folder_to_save),
        overwrite_output_dir=True,
        save_strategy="steps",
        save_steps=0.25,
        save_total_limit=3,
        evaluation_strategy="steps",
        logging_strategy="steps",
        logging_steps=10,  # 10 ; 0.1
        learning_rate=1e-4,  # 2e-5
        push_to_hub=False,
        per_device_train_batch_size=4,  # 8 , 16
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=8,  # 16
        num_train_epochs=35,  # 20 ; 15
        warmup_steps=150,
    )
    # removed regularization: weight_decay=0.01,
    # build the Trainer object
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=lm_dataset_all_hf["train"],
        eval_dataset=lm_dataset_all_hf["test"],
        data_collator=data_collator,
    )
    # train the model (runs on GPU automatically if one is available)
    train_result = trainer.train()
    # final train set metrics
    metrics_train = train_result.metrics
    # compute_perplexity(metrics_train["train_loss"])
    # final test set metrics
    metrics_eval = trainer.evaluate()
    # compute_perplexity(metrics_eval["eval_loss"])
    if save_model:
        try:
            # folder_to_save example: "experiments/"
            model.save_pretrained(
                os.path.join(path_project, folder_to_save + "finetuned_model")
            )
            tokenizer.save_pretrained(
                os.path.join(path_project, folder_to_save + "finetuned_model")
            )
        except Exception as err:
            raise RuntimeError(
                "Error while saving model and tokenizer, please check paths or inputs."
            ) from err
    return model, trainer, data_obj, (metrics_train, metrics_eval)
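
# A minimal usage sketch (paths below are illustrative placeholders, not part of
# this repo): call the training function directly from Python and derive a
# perplexity from the final evaluation loss.
#
#   model, trainer, data_obj, (m_train, m_eval) = train_causal_lang_model(
#       path_project="/path/to/project",
#       folder_to_save="experiments/",
#       path_to_dataset="data/corpus.txt",
#       save_model=True,
#   )
#   print("eval perplexity:", compute_perplexity(m_eval["eval_loss"]))
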
def main():
    parser = argparse.ArgumentParser(
        description="training-transformer-causal-language-modelling"
    )
    parser.add_argument("--path-project", type=str, help="project filepath")
    parser.add_argument(
        "--folder-to-save",
        type=str,
        help="folder (inside the project) for checkpoints and outputs",
    )
    parser.add_argument(
        "--path-to-dataset", type=str, help="path to the dataset to fine-tune on"
    )
    parser.add_argument(
        "--model-type", default="distilgpt2", type=str, help="Hugging Face model name"
    )
    parser.add_argument(
        "--block-size", default=1024, type=int, help="context (block) size in tokens"
    )
    parser.add_argument(
        "--test-size", default=0.2, type=float, help="test split ratio"
    )
    parser.add_argument(
        "--save-model",
        action="store_true",
        default=False,
        help="save the fine-tuned model and tokenizer",
    )
    args = parser.parse_args()
    train_causal_lang_model(
        path_project=args.path_project,
        folder_to_save=args.folder_to_save,
        path_to_dataset=args.path_to_dataset,
        model_type=args.model_type,
        tokenized_dataset_obj=None,
        block_size=args.block_size,
        test_size=args.test_size,
        save_model=args.save_model,
    )


if __name__ == "__main__":
    main()
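
# Example command-line invocation (paths and dataset name are placeholders):
#   python train_model.py \
#       --path-project /path/to/project \
#       --folder-to-save experiments/ \
#       --path-to-dataset data/corpus.txt \
#       --model-type distilgpt2 \
#       --block-size 1024 \
#       --test-size 0.2 \
#       --save-model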