Merge with main before adding unit tests
Signed-off-by: Abhishek <maurya.abhishek@ibm.com>
Abhishek-TAMU committed Feb 13, 2025
1 parent adafe9a commit 22f17ab
Showing 5 changed files with 14 additions and 20 deletions.
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -29,12 +29,12 @@ classifiers=[
 dependencies = [
 "numpy>=1.26.4,<2.0",
 "accelerate>=0.20.3,!=0.34,<1.1",
-"transformers>=4.45,<4.46",
+"transformers>=4.46,<4.48.2",
 "torch>=2.2.0,<2.5",
 "sentencepiece>=0.1.99,<0.3",
 "tokenizers>=0.13.3,<1.0",
 "tqdm>=4.66.2,<5.0",
-"trl>=0.9.3,<0.12",
+"trl>=0.13,<0.15",
 "peft>=0.8.0,<0.14",
 "protobuf>=5.28.0,<6.0.0",
 "datasets>=2.15.0,<3.0",
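Note on the dependency bumps above: transformers moves to the >=4.46,<4.48.2 range and trl to >=0.13,<0.15. A minimal sketch for checking an installed environment against the new pins, assuming the packaging library is available (the ranges are copied from the diff; the script itself is not part of the repo):

# Sketch: verify installed versions satisfy the updated pins; not part of the repo.
from importlib.metadata import PackageNotFoundError, version

from packaging.specifiers import SpecifierSet
from packaging.version import Version

PINS = {
    "transformers": SpecifierSet(">=4.46,<4.48.2"),
    "trl": SpecifierSet(">=0.13,<0.15"),
}

for name, spec in PINS.items():
    try:
        installed = Version(version(name))
    except PackageNotFoundError:
        print(f"{name}: not installed")
        continue
    status = "ok" if installed in spec else f"outside {spec}"
    print(f"{name} {installed}: {status}")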
2 changes: 1 addition & 1 deletion tests/build/test_launch_script.py
@@ -46,7 +46,7 @@
     "num_train_epochs": 5,
     "per_device_train_batch_size": 4,
     "per_device_eval_batch_size": 4,
-    "gradient_accumulation_steps": 4,
+    "gradient_accumulation_steps": 1,
     "learning_rate": 0.00001,
     "weight_decay": 0,
     "warmup_ratio": 0.03,
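For context on the gradient_accumulation_steps change above: the effective train batch size per optimizer step is per_device_train_batch_size * gradient_accumulation_steps * number of devices, so on one device this test config drops from 4 * 4 = 16 to 4 * 1 = 4 samples per step. A minimal sketch of that arithmetic (the helper name is illustrative, not part of the repo):

# Sketch: effective batch size per optimizer step; helper name is illustrative only.
def effective_batch_size(per_device_batch: int, grad_accum_steps: int, num_devices: int = 1) -> int:
    return per_device_batch * grad_accum_steps * num_devices

print(effective_batch_size(4, 4))  # previous test config -> 16
print(effective_batch_size(4, 1))  # updated test config -> 4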
7 changes: 0 additions & 7 deletions tests/data/test_data_preprocessing.py
@@ -667,13 +667,6 @@ def test_get_data_collator(
             ),
             False,
         ),
-        # Pretokenized data with packing to True
-        (
-            configs.DataArguments(
-                training_data_path=TWITTER_COMPLAINTS_TOKENIZED_JSONL,
-            ),
-            True,
-        ),
     ],
 )
 def test_process_data_args_throws_error_where_needed(data_args, packing):
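The removed parametrize case covered packing=True with a pretokenized dataset, which previously had to raise; with the guard removed in tuning/data/setup_dataprocessor.py below, that combination is no longer an error, so it is dropped from the error-path test. A simplified sketch of the error-path pattern the remaining cases still exercise (the validator and case values here are stand-ins, not the repo's code or fixtures):

# Sketch of the error-path test pattern; validator and case values are stand-ins only.
import pytest


def validate_data_args(data_args: dict, packing: bool) -> None:
    # Stand-in check: formatting flags are not allowed with pretokenized train data.
    if "tokenized" in data_args["training_data_path"] and data_args.get("response_template"):
        raise ValueError("response_template is not applicable for pretokenized data")


@pytest.mark.parametrize(
    "data_args, packing",
    [
        # e.g. a response template supplied alongside pretokenized train data
        ({"training_data_path": "tokenized.jsonl", "response_template": "### Label:"}, False),
    ],
)
def test_invalid_data_args_raise(data_args, packing):
    with pytest.raises(ValueError):
        validate_data_args(data_args, packing)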
11 changes: 9 additions & 2 deletions tests/test_sft_trainer.py
@@ -22,6 +22,7 @@
 import copy
 import json
 import os
+import re
 import tempfile
 
 # Third Party
@@ -88,7 +89,7 @@
     num_train_epochs=5,
     per_device_train_batch_size=4,
     per_device_eval_batch_size=4,
-    gradient_accumulation_steps=4,
+    gradient_accumulation_steps=1,
     learning_rate=0.00001,
     weight_decay=0,
     warmup_ratio=0.03,
@@ -1147,7 +1148,13 @@ def _validate_hf_resource_scanner_file(tempdir):
 
 
 def _get_checkpoint_path(dir_path):
-    return os.path.join(dir_path, "checkpoint-5")
+    checkpoint_dirs = [
+        d
+        for d in os.listdir(dir_path)
+        if os.path.isdir(os.path.join(dir_path, d)) and re.match(r"^checkpoint-\d+$", d)
+    ]
+    checkpoint_dirs.sort(key=lambda name: int(name.split("-")[-1]))
+    return os.path.join(dir_path, checkpoint_dirs[-1])
 
 
 def _get_adapter_config(dir_path):
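The rewritten _get_checkpoint_path no longer hard-codes checkpoint-5: it collects checkpoint-<N> directories and returns the numerically largest one. A standalone sketch of the same selection logic, showing why the numeric sort matters (lexicographically, checkpoint-10 would sort before checkpoint-2):

# Sketch: reproduce the latest-checkpoint selection from the diff in a throwaway directory.
import os
import re
import tempfile

with tempfile.TemporaryDirectory() as tempdir:
    for step in (2, 5, 10):
        os.makedirs(os.path.join(tempdir, f"checkpoint-{step}"))
    os.makedirs(os.path.join(tempdir, "logs"))  # non-checkpoint entries are ignored

    checkpoint_dirs = [
        d
        for d in os.listdir(tempdir)
        if os.path.isdir(os.path.join(tempdir, d)) and re.match(r"^checkpoint-\d+$", d)
    ]
    checkpoint_dirs.sort(key=lambda name: int(name.split("-")[-1]))
    print(os.path.join(tempdir, checkpoint_dirs[-1]))  # ends with checkpoint-10, not checkpoint-5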
10 changes: 2 additions & 8 deletions tuning/data/setup_dataprocessor.py
@@ -74,7 +74,7 @@ def _process_dataconfig_file(
 
 
 # Data Format 1: Pretokenized Data
-def _get_pretokenized_dataset_handlers(data_args, packing, is_eval_tokenized):
+def _get_pretokenized_dataset_handlers(data_args, is_eval_tokenized):
 
     # if the provided train dataset is pretokenized
     # however user provides formatting flags, error out
@@ -96,12 +96,6 @@ def _get_pretokenized_dataset_handlers(data_args, packing, is_eval_tokenized):
             along with pretokenized train data"
         )
 
-    # Support for packing pretokenized datasets has been merged in trl library
-    # see: https://github.com/huggingface/trl/pull/2011
-    # but we wait till a new transformers version is released to remove this check.
-    if packing:
-        raise ValueError("packing will not be used when datasets are pretokenized")
-
     # We do not need a handler here as this is tokenized dataset
     return [], None
 
@@ -264,7 +258,7 @@ def _process_raw_data_args(
     if is_traindata_tokenized:
         # Data Format 1: Pretokenized Data
         handlers, dataset_text_field = _get_pretokenized_dataset_handlers(
-            data_args, packing, (is_eval_dataset_present and not is_evaldata_tokenized)
+            data_args, (is_eval_dataset_present and not is_evaldata_tokenized)
         )
     elif data_args.instruction_template and data_args.response_template:
         # Data Format 2: Chat dataset with instruction and response template
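The guard removed above predates trl support for packing pretokenized datasets (huggingface/trl#2011); with the trl>=0.13 pin in pyproject.toml, packing no longer needs to be rejected here, and the function drops its packing parameter. For reference, a minimal sketch of what "pretokenized" means to this branch, assuming detection keys off the presence of token-id columns (the helper name is illustrative, not the repo's API):

# Sketch: a pretokenized record already carries token ids, so no text field,
# template, or tokenization handler is needed; helper name is illustrative only.
def looks_pretokenized(example: dict) -> bool:
    return "input_ids" in example


print(looks_pretokenized({"input_ids": [101, 2023, 102], "labels": [101, 2023, 102]}))  # True
print(looks_pretokenized({"Tweet text": "@airline my flight is late", "output": "complaint"}))  # False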
