fix: Refactored special tokens function
Refactored the special tokens setup code into its own function in the tuning.utils.tokenizer_data_utils file. Imported the new function into sft_trainer and called it to set the special tokens.

Signed-off-by: Luka Dojcinovic <56648891+Luka-D@users.noreply.github.com>
Luka-D committed Feb 12, 2025
1 parent 381fdd5 commit 2df4780
Showing 2 changed files with 70 additions and 39 deletions.
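
For orientation, the sketch below shows how the two halves of this change fit together: set_special_tokens_dict builds the dict and tokenizer_and_embedding_resize applies it. This is a minimal, illustrative example, not code from this commit; the "gpt2" checkpoint, the SimpleNamespace stand-in for configs.ModelArguments, and multiple_of=1 are assumptions made to keep the snippet self-contained.

# Minimal usage sketch (not part of this commit). "gpt2" and the SimpleNamespace
# stand-in for configs.ModelArguments are illustrative assumptions.
from types import SimpleNamespace

from transformers import AutoModelForCausalLM, AutoTokenizer

from tuning.utils.tokenizer_data_utils import (
    set_special_tokens_dict,
    tokenizer_and_embedding_resize,
)

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# set_special_tokens_dict only reads tokenizer_name_or_path; leaving it unset
# triggers the default special-token handling shown in the diff below.
model_args = SimpleNamespace(tokenizer_name_or_path=None)

special_tokens_dict = set_special_tokens_dict(
    model_args=model_args, tokenizer=tokenizer
)

# Add the tokens to the tokenizer and resize the embedding matrix to match.
metadata = tokenizer_and_embedding_resize(
    special_tokens_dict=special_tokens_dict,
    tokenizer=tokenizer,
    model=model,
    multiple_of=1,  # keep the embedding size at exactly len(tokenizer)
)
print(metadata)
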
44 changes: 7 additions & 37 deletions tuning/sft_trainer.py
@@ -67,7 +67,10 @@
write_termination_log,
)
from tuning.utils.logging import set_log_level
from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize
from tuning.utils.tokenizer_data_utils import (
tokenizer_and_embedding_resize,
set_special_tokens_dict,
)


def train(
@@ -260,42 +263,9 @@ def train(
tokenizer.chat_template = data_args.chat_template

# Add special tokens only when a custom tokenizer is not passed
special_tokens_dict = {}
if not model_args.tokenizer_name_or_path:
# TODO: understand if we need to hardcode these here or just use defaults in model
if isinstance(tokenizer, (LlamaTokenizer, LlamaTokenizerFast)):
special_tokens_dict["bos_token"] = "<s>"
special_tokens_dict["eos_token"] = "</s>"
special_tokens_dict["unk_token"] = "<unk>"
special_tokens_dict["pad_token"] = "<pad>"
elif isinstance(tokenizer, (GPT2Tokenizer, GPTNeoXTokenizerFast)):
special_tokens_dict["pad_token"] = "<pad>"

# add special tokens only when a custom tokenizer is not passed
if not model_args.tokenizer_name_or_path:
# TODO: we need to change this, perhaps follow what open instruct does?
if tokenizer.pad_token is None:
logger.warning("PAD token set to default, missing in tokenizer")
special_tokens_dict["pad_token"] = configs.DEFAULT_PAD_TOKEN
if tokenizer.eos_token is None:
logger.warning("EOS token set to default, missing in tokenizer")
special_tokens_dict["eos_token"] = configs.DEFAULT_EOS_TOKEN
if tokenizer.bos_token is None:
logger.warning("BOS token set to default, missing in tokenizer")
special_tokens_dict["bos_token"] = configs.DEFAULT_BOS_TOKEN
if tokenizer.unk_token is None:
logger.warning("UNK token set to default, missing in tokenizer")
special_tokens_dict["unk_token"] = configs.DEFAULT_UNK_TOKEN
if tokenizer.pad_token == tokenizer.eos_token:
logger.warning(
"PAD token set to default, to make it different from eos token"
)
if tokenizer.eos_token != configs.DEFAULT_PAD_TOKEN:
tokenizer.pad_token = configs.DEFAULT_PAD_TOKEN
special_tokens_dict["pad_token"] = configs.DEFAULT_PAD_TOKEN
else:
tokenizer.eos_token = configs.DEFAULT_EOS_TOKEN
special_tokens_dict["eos_token"] = configs.DEFAULT_EOS_TOKEN
special_tokens_dict = set_special_tokens_dict(
model_args=model_args, tokenizer=tokenizer
)

# TODO: lower priority but understand if resizing impacts inference quality and why its needed.
# It makes sense if we manipulate tokenizer that we also save it and provide it to inference.
65 changes: 63 additions & 2 deletions tuning/utils/tokenizer_data_utils.py
@@ -14,10 +14,71 @@

# Standard
from typing import Dict
import logging
import math

# Third Party
import transformers

# Local
from tuning.config import configs

logger = logging.getLogger(__name__)


def set_special_tokens_dict(
model_args: configs.ModelArguments, tokenizer: transformers.PreTrainedTokenizer
) -> dict:
"""Creates a special tokens dictionary and sets the special tokens,
depending on the tokenizer.
Args:
model_args: configs.ModelArguments.
tokenizer: transformers.PreTrainedTokenizer.
Returns:
dict: Special tokens for the tokenizer.
"""

special_tokens_dict = {}
if not model_args.tokenizer_name_or_path:
# TODO: understand if we need to hardcode these here or just use defaults in model
if isinstance(
tokenizer, (transformers.LlamaTokenizer, transformers.LlamaTokenizerFast)
):
special_tokens_dict["bos_token"] = "<s>"
special_tokens_dict["eos_token"] = "</s>"
special_tokens_dict["unk_token"] = "<unk>"
special_tokens_dict["pad_token"] = "<pad>"
elif isinstance(
tokenizer, (transformers.GPT2Tokenizer, transformers.GPTNeoXTokenizerFast)
):
special_tokens_dict["pad_token"] = "<pad>"

# add special tokens only when a custom tokenizer is not passed
if not model_args.tokenizer_name_or_path:
# TODO: we need to change this, perhaps follow what open instruct does?
if tokenizer.pad_token is None:
logger.warning("PAD token set to default, missing in tokenizer")
special_tokens_dict["pad_token"] = configs.DEFAULT_PAD_TOKEN
if tokenizer.eos_token is None:
logger.warning("EOS token set to default, missing in tokenizer")
special_tokens_dict["eos_token"] = configs.DEFAULT_EOS_TOKEN
if tokenizer.bos_token is None:
logger.warning("BOS token set to default, missing in tokenizer")
special_tokens_dict["bos_token"] = configs.DEFAULT_BOS_TOKEN
if tokenizer.unk_token is None:
logger.warning("UNK token set to default, missing in tokenizer")
special_tokens_dict["unk_token"] = configs.DEFAULT_UNK_TOKEN
if tokenizer.pad_token == tokenizer.eos_token:
logger.warning(
"PAD token set to default, to make it different from eos token"
)
if tokenizer.eos_token != configs.DEFAULT_PAD_TOKEN:
tokenizer.pad_token = configs.DEFAULT_PAD_TOKEN
special_tokens_dict["pad_token"] = configs.DEFAULT_PAD_TOKEN
else:
tokenizer.eos_token = configs.DEFAULT_EOS_TOKEN
special_tokens_dict["eos_token"] = configs.DEFAULT_EOS_TOKEN
return special_tokens_dict


def tokenizer_and_embedding_resize(
@@ -30,10 +91,10 @@ def tokenizer_and_embedding_resize(
Args:
special_tokens_dict: Dict containing special tokens to be added.
tokenizer: transformers.PreTrainedTokenizer.
model: transformers.PreTrainedModel
model: transformers.PreTrainedModel.
multiple_of: int, embeddings are resized to a multiple of this.
Return:
dict: Metadata on number of added tokens
dict: Metadata on number of added tokens.
"""
num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
embedding_size = int(multiple_of * math.ceil(len(tokenizer) / multiple_of))
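
As a further illustration of the behavior that moved into set_special_tokens_dict, the snippet below exercises its PAD/EOS collision branch. It is an illustrative check, not a test from this commit; "gpt2" and the SimpleNamespace stand-in for configs.ModelArguments are assumptions, since the function only reads the tokenizer_name_or_path attribute.

# Illustrative check of the PAD == EOS branch (not part of this commit).
from types import SimpleNamespace

from transformers import AutoTokenizer

from tuning.config import configs
from tuning.utils.tokenizer_data_utils import set_special_tokens_dict

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # force the PAD == EOS collision

model_args = SimpleNamespace(tokenizer_name_or_path=None)  # no custom tokenizer path
special_tokens = set_special_tokens_dict(model_args=model_args, tokenizer=tokenizer)

# Because EOS differs from the default PAD token, the helper re-points PAD at
# configs.DEFAULT_PAD_TOKEN instead of changing EOS.
assert special_tokens["pad_token"] == configs.DEFAULT_PAD_TOKEN
assert tokenizer.pad_token == configs.DEFAULT_PAD_TOKEN
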
