diff --git a/tuning/sft_trainer.py b/tuning/sft_trainer.py
index fea9a8b81..79d9748e6 100644
--- a/tuning/sft_trainer.py
+++ b/tuning/sft_trainer.py
@@ -67,7 +67,10 @@
     write_termination_log,
 )
 from tuning.utils.logging import set_log_level
-from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize
+from tuning.utils.tokenizer_data_utils import (
+    tokenizer_and_embedding_resize,
+    set_special_tokens_dict,
+)
 
 
 def train(
@@ -260,42 +263,9 @@ def train(
         tokenizer.chat_template = data_args.chat_template
 
     # Add special tokens only when a custom tokenizer is not passed
-    special_tokens_dict = {}
-    if not model_args.tokenizer_name_or_path:
-        # TODO: understand if we need to hardcode these here or just use defaults in model
-        if isinstance(tokenizer, (LlamaTokenizer, LlamaTokenizerFast)):
-            special_tokens_dict["bos_token"] = "<s>"
-            special_tokens_dict["eos_token"] = "</s>"
-            special_tokens_dict["unk_token"] = "<unk>"
-            special_tokens_dict["pad_token"] = "<pad>"
-        elif isinstance(tokenizer, (GPT2Tokenizer, GPTNeoXTokenizerFast)):
-            special_tokens_dict["pad_token"] = "<pad>"
-
-    # add special tokens only when a custom tokenizer is not passed
-    if not model_args.tokenizer_name_or_path:
-        # TODO: we need to change this, perhaps follow what open instruct does?
-        if tokenizer.pad_token is None:
-            logger.warning("PAD token set to default, missing in tokenizer")
-            special_tokens_dict["pad_token"] = configs.DEFAULT_PAD_TOKEN
-        if tokenizer.eos_token is None:
-            logger.warning("EOS token set to default, missing in tokenizer")
-            special_tokens_dict["eos_token"] = configs.DEFAULT_EOS_TOKEN
-        if tokenizer.bos_token is None:
-            logger.warning("BOS token set to default, missing in tokenizer")
-            special_tokens_dict["bos_token"] = configs.DEFAULT_BOS_TOKEN
-        if tokenizer.unk_token is None:
-            logger.warning("UNK token set to default, missing in tokenizer")
-            special_tokens_dict["unk_token"] = configs.DEFAULT_UNK_TOKEN
-        if tokenizer.pad_token == tokenizer.eos_token:
-            logger.warning(
-                "PAD token set to default, to make it different from eos token"
-            )
-            if tokenizer.eos_token != configs.DEFAULT_PAD_TOKEN:
-                tokenizer.pad_token = configs.DEFAULT_PAD_TOKEN
-                special_tokens_dict["pad_token"] = configs.DEFAULT_PAD_TOKEN
-            else:
-                tokenizer.eos_token = configs.DEFAULT_EOS_TOKEN
-                special_tokens_dict["eos_token"] = configs.DEFAULT_EOS_TOKEN
+    special_tokens_dict = set_special_tokens_dict(
+        model_args=model_args, tokenizer=tokenizer
+    )
 
     # TODO: lower priority but understand if resizing impacts inference quality and why its needed.
     # It makes sense if we manipulate tokenizer that we also save it and provide it to inference.
diff --git a/tuning/utils/tokenizer_data_utils.py b/tuning/utils/tokenizer_data_utils.py
index ef0662d59..61ae69c84 100644
--- a/tuning/utils/tokenizer_data_utils.py
+++ b/tuning/utils/tokenizer_data_utils.py
@@ -14,10 +14,71 @@
 
 # Standard
 from typing import Dict
+import logging
 import math
 
 # Third Party
 import transformers
+from tuning.config import configs
+
+logger = logging.getLogger(__name__)
+
+
+def set_special_tokens_dict(
+    model_args: configs.ModelArguments, tokenizer: transformers.PreTrainedTokenizer
+) -> dict:
+    """Creates a special tokens dictionary and sets the special tokens,
+    depending on the tokenizer.
+
+    Args:
+        model_args: configs.ModelArguments.
+        tokenizer: transformers.PreTrainedTokenizer.
+
+    Returns:
+        dict: Special tokens for the tokenizer.
+    """
+    special_tokens_dict = {}
+    if not model_args.tokenizer_name_or_path:
+        # TODO: understand if we need to hardcode these here or just use defaults in model
+        if isinstance(
+            tokenizer, (transformers.LlamaTokenizer, transformers.LlamaTokenizerFast)
+        ):
+            special_tokens_dict["bos_token"] = "<s>"
+            special_tokens_dict["eos_token"] = "</s>"
+            special_tokens_dict["unk_token"] = "<unk>"
+            special_tokens_dict["pad_token"] = "<pad>"
+        elif isinstance(
+            tokenizer, (transformers.GPT2Tokenizer, transformers.GPTNeoXTokenizerFast)
+        ):
+            special_tokens_dict["pad_token"] = "<pad>"
+
+    # add special tokens only when a custom tokenizer is not passed
+    if not model_args.tokenizer_name_or_path:
+        # TODO: we need to change this, perhaps follow what open instruct does?
+        if tokenizer.pad_token is None:
+            logger.warning("PAD token set to default, missing in tokenizer")
+            special_tokens_dict["pad_token"] = configs.DEFAULT_PAD_TOKEN
+        if tokenizer.eos_token is None:
+            logger.warning("EOS token set to default, missing in tokenizer")
+            special_tokens_dict["eos_token"] = configs.DEFAULT_EOS_TOKEN
+        if tokenizer.bos_token is None:
+            logger.warning("BOS token set to default, missing in tokenizer")
+            special_tokens_dict["bos_token"] = configs.DEFAULT_BOS_TOKEN
+        if tokenizer.unk_token is None:
+            logger.warning("UNK token set to default, missing in tokenizer")
+            special_tokens_dict["unk_token"] = configs.DEFAULT_UNK_TOKEN
+        if tokenizer.pad_token == tokenizer.eos_token:
+            logger.warning(
+                "PAD token set to default, to make it different from eos token"
+            )
+            if tokenizer.eos_token != configs.DEFAULT_PAD_TOKEN:
+                tokenizer.pad_token = configs.DEFAULT_PAD_TOKEN
+                special_tokens_dict["pad_token"] = configs.DEFAULT_PAD_TOKEN
+            else:
+                tokenizer.eos_token = configs.DEFAULT_EOS_TOKEN
+                special_tokens_dict["eos_token"] = configs.DEFAULT_EOS_TOKEN
+    return special_tokens_dict
 
 
 def tokenizer_and_embedding_resize(
@@ -30,10 +91,10 @@ def tokenizer_and_embedding_resize(
     Args:
         special_tokens_dict: Dict containing special tokens to be added.
         tokenizer: transformers.PreTrainedTokenizer.
-        model: transformers.PreTrainedModel
+        model: transformers.PreTrainedModel.
         multiple_of: int , embeddings are resized to multiple of this.
     Return:
-        dict: Metadata on number of added tokens
+        dict: Metadata on number of added tokens.
     """
    num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
    embedding_size = int(multiple_of * math.ceil(len(tokenizer) / multiple_of))