diff --git a/tuning/sft_trainer.py b/tuning/sft_trainer.py
index fea9a8b81..79d9748e6 100644
--- a/tuning/sft_trainer.py
+++ b/tuning/sft_trainer.py
@@ -67,7 +67,10 @@
write_termination_log,
)
from tuning.utils.logging import set_log_level
-from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize
+from tuning.utils.tokenizer_data_utils import (
+ set_special_tokens_dict,
+ tokenizer_and_embedding_resize,
+)
def train(
@@ -260,42 +263,9 @@ def train(
tokenizer.chat_template = data_args.chat_template
# Add special tokens only when a custom tokenizer is not passed
- special_tokens_dict = {}
- if not model_args.tokenizer_name_or_path:
- # TODO: understand if we need to hardcode these here or just use defaults in model
- if isinstance(tokenizer, (LlamaTokenizer, LlamaTokenizerFast)):
- special_tokens_dict["bos_token"] = ""
- special_tokens_dict["eos_token"] = ""
- special_tokens_dict["unk_token"] = ""
- special_tokens_dict["pad_token"] = ""
- elif isinstance(tokenizer, (GPT2Tokenizer, GPTNeoXTokenizerFast)):
- special_tokens_dict["pad_token"] = ""
-
- # add special tokens only when a custom tokenizer is not passed
- if not model_args.tokenizer_name_or_path:
- # TODO: we need to change this, perhaps follow what open instruct does?
- if tokenizer.pad_token is None:
- logger.warning("PAD token set to default, missing in tokenizer")
- special_tokens_dict["pad_token"] = configs.DEFAULT_PAD_TOKEN
- if tokenizer.eos_token is None:
- logger.warning("EOS token set to default, missing in tokenizer")
- special_tokens_dict["eos_token"] = configs.DEFAULT_EOS_TOKEN
- if tokenizer.bos_token is None:
- logger.warning("BOS token set to default, missing in tokenizer")
- special_tokens_dict["bos_token"] = configs.DEFAULT_BOS_TOKEN
- if tokenizer.unk_token is None:
- logger.warning("UNK token set to default, missing in tokenizer")
- special_tokens_dict["unk_token"] = configs.DEFAULT_UNK_TOKEN
- if tokenizer.pad_token == tokenizer.eos_token:
- logger.warning(
- "PAD token set to default, to make it different from eos token"
- )
- if tokenizer.eos_token != configs.DEFAULT_PAD_TOKEN:
- tokenizer.pad_token = configs.DEFAULT_PAD_TOKEN
- special_tokens_dict["pad_token"] = configs.DEFAULT_PAD_TOKEN
- else:
- tokenizer.eos_token = configs.DEFAULT_EOS_TOKEN
- special_tokens_dict["eos_token"] = configs.DEFAULT_EOS_TOKEN
+ special_tokens_dict = set_special_tokens_dict(
+ model_args=model_args, tokenizer=tokenizer
+ )
# TODO: lower priority but understand if resizing impacts inference quality and why its needed.
# It makes sense if we manipulate tokenizer that we also save it and provide it to inference.
diff --git a/tuning/utils/tokenizer_data_utils.py b/tuning/utils/tokenizer_data_utils.py
index ef0662d59..61ae69c84 100644
--- a/tuning/utils/tokenizer_data_utils.py
+++ b/tuning/utils/tokenizer_data_utils.py
@@ -14,10 +14,71 @@
# Standard
from typing import Dict
+import logging
import math
# Third Party
import transformers
+from tuning.config import configs
+
+logger = logging.getLogger(__name__)
+
+
+def set_special_tokens_dict(
+ model_args: configs.ModelArguments, tokenizer: transformers.PreTrainedTokenizer
+) -> dict:
+ """Creates a special tokens dictionary and sets the special tokens,
+ depending on the tokenizer.
+
+ Args:
+ model_args: configs.ModelArguments.
+ tokenizer: transformers.PreTrainedTokenizer.
+
+ Returns:
+ dict: Special tokens to be added to the tokenizer.
+ """
+
+ special_tokens_dict = {}
+ if not model_args.tokenizer_name_or_path:
+ # TODO: understand if we need to hardcode these here or just use defaults in model
+ if isinstance(
+ tokenizer, (transformers.LlamaTokenizer, transformers.LlamaTokenizerFast)
+ ):
+ special_tokens_dict["bos_token"] = ""
+ special_tokens_dict["eos_token"] = ""
+ special_tokens_dict["unk_token"] = ""
+ special_tokens_dict["pad_token"] = ""
+ elif isinstance(
+ tokenizer, (transformers.GPT2Tokenizer, transformers.GPTNeoXTokenizerFast)
+ ):
+ special_tokens_dict["pad_token"] = ""
+
+ # add special tokens only when a custom tokenizer is not passed
+ if not model_args.tokenizer_name_or_path:
+ # TODO: we need to change this, perhaps follow what open instruct does?
+ if tokenizer.pad_token is None:
+ logger.warning("PAD token set to default, missing in tokenizer")
+ special_tokens_dict["pad_token"] = configs.DEFAULT_PAD_TOKEN
+ if tokenizer.eos_token is None:
+ logger.warning("EOS token set to default, missing in tokenizer")
+ special_tokens_dict["eos_token"] = configs.DEFAULT_EOS_TOKEN
+ if tokenizer.bos_token is None:
+ logger.warning("BOS token set to default, missing in tokenizer")
+ special_tokens_dict["bos_token"] = configs.DEFAULT_BOS_TOKEN
+ if tokenizer.unk_token is None:
+ logger.warning("UNK token set to default, missing in tokenizer")
+ special_tokens_dict["unk_token"] = configs.DEFAULT_UNK_TOKEN
+ if tokenizer.pad_token == tokenizer.eos_token:
+ logger.warning(
+ "PAD token set to default, to make it different from eos token"
+ )
+ if tokenizer.eos_token != configs.DEFAULT_PAD_TOKEN:
+ tokenizer.pad_token = configs.DEFAULT_PAD_TOKEN
+ special_tokens_dict["pad_token"] = configs.DEFAULT_PAD_TOKEN
+ else:
+ tokenizer.eos_token = configs.DEFAULT_EOS_TOKEN
+ special_tokens_dict["eos_token"] = configs.DEFAULT_EOS_TOKEN
+ return special_tokens_dict
def tokenizer_and_embedding_resize(
@@ -30,10 +91,10 @@ def tokenizer_and_embedding_resize(
Args:
special_tokens_dict: Dict containing special tokens to be added.
tokenizer: transformers.PreTrainedTokenizer.
- model: transformers.PreTrainedModel
+ model: transformers.PreTrainedModel.
multiple_of: int , embeddings are resized to multiple of this.
Return:
- dict: Metadata on number of added tokens
+ dict: Metadata on number of added tokens.
"""
num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
embedding_size = int(multiple_of * math.ceil(len(tokenizer) / multiple_of))
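
A rough usage sketch, not part of the diff above: it assumes ModelArguments exposes model_name_or_path, leaves tokenizer_name_or_path unset, and uses a placeholder checkpoint name and an illustrative multiple_of value; the two helpers are the ones in tuning/utils/tokenizer_data_utils.py.

from transformers import AutoModelForCausalLM, AutoTokenizer

from tuning.config import configs
from tuning.utils.tokenizer_data_utils import (
    set_special_tokens_dict,
    tokenizer_and_embedding_resize,
)

# Hypothetical checkpoint; any causal LM path would do here.
model_args = configs.ModelArguments(model_name_or_path="my-org/my-base-model")
tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
model = AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path)

# tokenizer_name_or_path is unset, so defaults are filled in for any missing
# BOS/EOS/UNK/PAD tokens and PAD is kept distinct from EOS.
special_tokens_dict = set_special_tokens_dict(
    model_args=model_args, tokenizer=tokenizer
)

# Add the tokens and resize the embeddings (illustratively to a multiple of 8).
resize_metadata = tokenizer_and_embedding_resize(
    special_tokens_dict=special_tokens_dict,
    tokenizer=tokenizer,
    model=model,
    multiple_of=8,
)
print(resize_metadata)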