feat: add special tokens
Signed-off-by: yashasvi <yashasvi@ibm.com>
YashasviChaurasia committed Feb 20, 2025
1 parent fb3ace8 commit 0b27853
Showing 2 changed files with 15 additions and 0 deletions.
8 changes: 8 additions & 0 deletions tuning/config/configs.py
@@ -122,6 +122,14 @@ class DataArguments:
Passed in conjunction with response_template"
},
)
add_special_tokens: List[str] = field(
default=None,
metadata={
"help": "List of special tokens to be added to the tokenizer's vocabulary. \
Used to add Special Tokens to Tokenizer's Vocabulary,\
Add special tokens as new tokens and increase vocabulary and model embedding size."
},
)


@dataclass
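The new add_special_tokens field is a plain List[str]. Below is a minimal sketch of how it might be populated in Python; the token strings, and constructing DataArguments directly, are illustrative assumptions rather than part of this commit.

# Illustrative only: the token strings are placeholders, and this assumes the
# remaining DataArguments fields have defaults so they can be omitted here.
from tuning.config.configs import DataArguments

data_args = DataArguments(
    add_special_tokens=["<|system|>", "<|user|>", "<|assistant|>"],
)

Because the field lives on the dataclass, an argument parser built from DataArguments (e.g. HfArgumentParser) would typically expose it on the command line as well, though that wiring is outside this diff.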
7 changes: 7 additions & 0 deletions tuning/sft_trainer.py
@@ -297,6 +297,13 @@ def train(
tokenizer.eos_token = configs.DEFAULT_EOS_TOKEN
special_tokens_dict["eos_token"] = configs.DEFAULT_EOS_TOKEN

# add user-specified special tokens to the tokenizer vocabulary
if data_args.add_special_tokens:
logger.info(
"Adding user-defined special tokens: %s ", data_args.add_special_tokens
)
special_tokens_dict["additional_special_tokens"] = data_args.add_special_tokens

# TODO: lower priority but understand if resizing impacts inference quality and why it's needed.
# It makes sense if we manipulate tokenizer that we also save it and provide it to inference.
added_tokens_dict = tokenizer_and_embedding_resize(
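special_tokens_dict, now carrying the user-supplied tokens under additional_special_tokens, is handed to tokenizer_and_embedding_resize. That helper's body is not part of this diff; the sketch below shows the standard Hugging Face add-and-resize pattern it presumably follows, with the signature and return value as assumptions.

# Sketch of the usual pattern, not the repository's actual helper.
def tokenizer_and_embedding_resize(special_tokens_dict, tokenizer, model):
    # Register the new special tokens; returns how many were actually added.
    num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
    # Grow the embedding matrix so the new token ids have rows to map to,
    # which is why vocabulary additions also change the model embedding size.
    model.resize_token_embeddings(len(tokenizer))
    return {"num_new_tokens": num_new_tokens}

As the in-code TODO notes, the resize is needed because newly added tokens receive ids beyond the original embedding table, and the modified tokenizer should be saved alongside the model so inference stays consistent with training.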
