From 42a9c3244bd86093ba0b501c955e309f5a64cdee Mon Sep 17 00:00:00 2001
From: Luka Dojcinovic
Date: Tue, 18 Feb 2025 13:02:25 -0500
Subject: [PATCH] Added unit tests for setting special tokens

Added unit tests for set_special_tokens_dict() for LlamaTokenizerFast,
GPT2TokenizerFast and GPTNeoXTokenizerFast.

Signed-off-by: Luka Dojcinovic <56648891+Luka-D@users.noreply.github.com>
---
 tests/utils/test_tokenizer_data_utils.py | 46 ++++++++++++++++++++++--
 1 file changed, 44 insertions(+), 2 deletions(-)

diff --git a/tests/utils/test_tokenizer_data_utils.py b/tests/utils/test_tokenizer_data_utils.py
index e24c90099..f9712d05e 100644
--- a/tests/utils/test_tokenizer_data_utils.py
+++ b/tests/utils/test_tokenizer_data_utils.py
@@ -1,13 +1,55 @@
 # Third party
-# Third Party
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 # First Party
 from tests.artifacts.testdata import MODEL_NAME
+from tuning.config import configs
 
 # Local
 # First party
-from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize
+from tuning.utils.tokenizer_data_utils import (
+    tokenizer_and_embedding_resize,
+    set_special_tokens_dict,
+)
+
+
+def test_setting_special_tokens_with_LlamaTokenizerFast():
+    # For LlamaTokenizerFast, Missing PAD Token
+    tokenizer = AutoTokenizer.from_pretrained("Maykeye/TinyLLama-v0", legacy=True)
+    model_args = configs.ModelArguments()
+    special_tokens_dict = set_special_tokens_dict(model_args, tokenizer)
+    print(tokenizer)
+    print("Special Tokens", special_tokens_dict)
+    assert special_tokens_dict != {
+        "bos_token": "",
+        "eos_token": "",
+        "unk_token": "",
+        "pad_token": "",
+    }
+
+
+def test_setting_special_tokens_with_GPT2TokenizerFast():
+    # For GPT2TokenizerFast, PAD token = EOS Token
+    tokenizer = AutoTokenizer.from_pretrained("ibm-granite/granite-3.1-8b-base")
+    model_args = configs.ModelArguments()
+    special_tokens_dict = set_special_tokens_dict(model_args, tokenizer)
+    print(tokenizer)
+    print("Special Tokens", special_tokens_dict)
+    assert special_tokens_dict == {
+        "pad_token": "",
+    }
+
+
+def test_setting_special_tokens_with_GPTNeoXTokenizerFast():
+    # For GPTNeoXTokenizerFast, Missing PAD Token
+    tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
+    model_args = configs.ModelArguments()
+    special_tokens_dict = set_special_tokens_dict(model_args, tokenizer)
+    print(tokenizer)
+    print("Special Tokens", special_tokens_dict)
+    assert special_tokens_dict == {
+        "pad_token": "",
+    }
 
 
 def test_tokenizer_and_embedding_resize_return_values():
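
For context, here is a minimal sketch of the behaviour these tests exercise, assuming set_special_tokens_dict(model_args, tokenizer) returns the special tokens to add rather than mutating the tokenizer in place. The helper name sketch_set_special_tokens_dict and the token values such as "<pad>" below are illustrative assumptions, not the implementation in tuning/utils/tokenizer_data_utils.py.

# Illustrative sketch only: approximates what the new tests expect from
# set_special_tokens_dict(); token values such as "<pad>" are assumptions.
from transformers import (
    GPT2TokenizerFast,
    GPTNeoXTokenizerFast,
    LlamaTokenizerFast,
)


def sketch_set_special_tokens_dict(model_args, tokenizer) -> dict:
    # model_args may carry user overrides in the real helper; unused here.
    special_tokens_dict = {}
    if isinstance(tokenizer, LlamaTokenizerFast):
        # Llama-style tokenizers ship without a PAD token, so the helper is
        # expected to propose a full set of special tokens.
        special_tokens_dict.update(
            {
                "bos_token": "<s>",
                "eos_token": "</s>",
                "unk_token": "<unk>",
                "pad_token": "<pad>",
            }
        )
    elif isinstance(tokenizer, (GPT2TokenizerFast, GPTNeoXTokenizerFast)):
        # GPT-2 / GPT-NeoX style tokenizers only lack a PAD token; whether the
        # real helper adds a new token or reuses the EOS token is up to it.
        special_tokens_dict["pad_token"] = "<pad>"
    return special_tokens_dict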