diff --git a/.gitignore b/.gitignore
index d4de8013..1445d93c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -159,5 +159,6 @@ cython_debug/
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 .idea/
 
-# Temp checkpoint folder
-data/
\ No newline at end of file
+# Temp folders
+data/
+wandb/
\ No newline at end of file
diff --git a/README.md b/README.md
index 6f7c61e8..afad90d7 100644
--- a/README.md
+++ b/README.md
@@ -32,8 +32,8 @@ To run the code in this project, first create a Python virtual environment using e.g. Conda:
 conda create -n handbook python=3.10 && conda activate handbook
 ```
 
-Next, install PyTorch `v2.0.1` - the precise version is important for reproducibility! Since this hardware-dependent, we
-direct you to the [PyTorch Installation Page](https://pytorch.org/get-started/previous-versions/#v201).
+Next, install PyTorch `v2.1.0` - the precise version is important for reproducibility! Since this is hardware-dependent, we
+direct you to the [PyTorch Installation Page](https://pytorch.org/get-started/locally/).
 
 You can then install the remaining package dependencies as follows:
 
@@ -44,7 +44,7 @@ python -m pip install .
 You will also need Flash Attention 2 installed, which can be done by running:
 
 ```shell
-python -m pip install flash-attn==2.3.0 --no-build-isolation
+python -m pip install flash-attn --no-build-isolation
 ```
 
 Next, log into your Hugging Face account as follows:
diff --git a/recipes/launch.slurm b/recipes/launch.slurm
index 17f1afc5..da0b176e 100644
--- a/recipes/launch.slurm
+++ b/recipes/launch.slurm
@@ -3,8 +3,8 @@
 #SBATCH --exclusive
 #SBATCH --gres=gpu:8
 #SBATCH --partition=production-cluster
-#SBATCH --output=/fsx/h4/logs/%x-%j.out
-#SBATCH --err=/fsx/h4/logs/%x-%j.err
+#SBATCH --output=/fsx/h4/logs/%x-%j.out  # Adjust this to your cluster
+#SBATCH --err=/fsx/h4/logs/%x-%j.err  # Adjust this to your cluster
 
 set -x -e
 
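The README changes above pin PyTorch `v2.1.0` but leave `flash-attn` unpinned, so a quick sanity check after installation is worthwhile. A minimal sketch, assuming both packages installed cleanly (exact version strings vary by platform and CUDA build):

```shell
# Confirm the PyTorch pin and that Flash Attention 2 imports cleanly
python -c "import torch; print(torch.__version__)"            # expect something like 2.1.0+cu121
python -c "import flash_attn; print(flash_attn.__version__)"
```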
diff --git a/recipes/zephyr-7b/README.md b/recipes/zephyr-7b/README.md
index f1df22cc..fcafde9d 100644
--- a/recipes/zephyr-7b/README.md
+++ b/recipes/zephyr-7b/README.md
@@ -1,5 +1,6 @@
 # Instructions
+
 In the handbook, for each training step we provide two sets of recipes:
 
 - Full training on a multi-GPU machine (tested on an 8xA100 node), using Slurm to queue jobs.
 - LoRA training on a single consumer 24GB GPU (tested on an RTX 4090).
@@ -21,6 +22,7 @@ sbatch --job-name=handbook_sft --nodes=1 recipes/launch.slurm zephyr-7b sft full
 ```
 
 ## LoRA training examples
+
 ### SFT
 ```shell
 # locally on 1 gpu
@@ -33,6 +35,7 @@ sbatch --job-name=handbook_sft_lora --nodes=1 recipes/launch.slurm zephyr-7b sft
 ```
 
 ### DPO
+
 ```shell
 # locally on 1 gpu
 accelerate launch scripts/run_dpo.py recipes/zephyr-7b/dpo/config_lora.yaml
diff --git a/recipes/zephyr-7b/sft/config_lora.yaml b/recipes/zephyr-7b/sft/config_lora.yaml
index 6bf806b9..2d488d80 100644
--- a/recipes/zephyr-7b/sft/config_lora.yaml
+++ b/recipes/zephyr-7b/sft/config_lora.yaml
@@ -1,6 +1,5 @@
 # Model arguments
 model_name_or_path: mistralai/Mistral-7B-v0.1
-model_revision: main
 torch_dtype: auto
 use_flash_attention_2: true
 
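Removing `model_revision: main` from `config_lora.yaml` only drops a restatement of the default. If you need to pin a non-default revision, the CLI override mechanism documented in `scripts/README.md` below should cover it; a hedged sketch, assuming the model arguments dataclass still exposes a `model_revision` field:

```shell
# Pin an explicit Hub revision at launch time instead of in the YAML
ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_sft.py recipes/zephyr-7b/sft/config_lora.yaml --model_revision=main
```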
diff --git a/scripts/README.md b/scripts/README.md
index 502f5662..a388ff98 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -1,38 +1,58 @@
-## Supervised Fine-Tuning (SFT)
+## Scripts to Train and Evaluate Chat Models
 
-We provide 3 main ways to train SFT models:
+### Fine-tuning
 
-* Distributed fine-tuning of all model weights with ZeRO-3
-* Fine-tuning with LoRA adapters and ZeRO-3
-* Fine-tuning with QLoRA adapters and DDP
+In the handbook, we provide two main ways to align LLMs for chat:
+
+- Full fine-tuning on a multi-GPU machine (tested on an 8 x A100 (80GB) node).
+- LoRA fine-tuning on a single consumer 24GB GPU (tested on an RTX 4090).
+
+In practice, we find comparable performance for both full and LoRA fine-tuning, with the latter having the advantage of producing small adapter weights that are fast to upload and download from the Hugging Face Hub. Here are the two general commands to fine-tune your models:
 
 ```shell
-# Full training with ZeRO-3
-ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/{model_name}/sft/config_full.yaml
+# Full training with ZeRO-3 on 8 GPUs
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_{task}.py recipes/{model_name}/{task}/config_full.yaml
+
+# LoRA training on single GPU
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_{task}.py recipes/{model_name}/{task}/config_lora.yaml
+```
 
-# LoRA training with ZeRO-3
-ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/{model_name}/sft/config_16bit.yaml
+Here `{task}` refers to the type of training you wish to run (SFT, DPO, etc.), while `{model_name}` refers to the choice of recipe in the `recipes/` directory. For example, to replicate Zephyr 7B you can run:
 
-# QLoRA training with DDP
-ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml scripts/run_sft.py recipes/{model_name}/sft/config_8bit.yaml
+```shell
+# Step 1 - train SFT policy
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/zephyr-7b/sft/config_full.yaml
+
+# Step 2 - align with DPO
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_dpo.py recipes/zephyr-7b/dpo/config_full.yaml
 ```
 
 You can override the parameters in each YAML config by appending them to the command as follows:
 
 ```shell
-ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/{model_name}/sft/config_full.yaml --per_device_train_batch_size=2 --num_train_epochs=3
+# Change batch size, number of epochs, etc.
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_{task}.py recipes/{model_name}/{task}/config_full.yaml --per_device_train_batch_size=42 --num_train_epochs=5
 ```
 
-## Direct Preference Optimisation (DPO)
+By default, all training metrics are logged with TensorBoard. If you have a [Weights and Biases](https://wandb.ai/site) account and are logged in, you can view the training metrics by appending `--report_to=wandb`, e.g.
 
 ```shell
-# Full training with ZeRO-3
-ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_dpo.py recipes/{model_name}/dpo/config_full.yaml
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_{task}.py recipes/{model_name}/{task}/config_full.yaml --report_to=wandb
+```
+
+#### Launching jobs on a Slurm cluster
 
-# LoRA training with ZeRO-3
-ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_dpo.py recipes/{model_name}/dpo/config_16bit.yaml
+If you have access to a Slurm cluster, we provide a `recipes/launch.slurm` script that will automatically queue training jobs for you. Here's how you can use it:
+
+```shell
+sbatch --job-name=handbook_{task} --nodes=1 recipes/launch.slurm {model_name} {task} {precision} {accelerator}
+```
+
+Here `{model_name}` and `{task}` are defined as above, while `{precision}` refers to the type of training (full vs LoRA) and `{accelerator}` refers to the choice of 🤗 Accelerate config in `recipes/accelerate_configs`. Here's a concrete example to run SFT on 1 node of 8 GPUs:
+
+```shell
+sbatch --job-name=handbook_sft --nodes=1 recipes/launch.slurm zephyr-7b sft full deepspeed_zero3
+```
 
-# QLoRA training with DDP
-ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml scripts/run_dpo.py recipes/{model_name}/dpo/config_8bit.yaml
-```
\ No newline at end of file
+**Note:** the configuration in `recipes/launch.slurm` is optimised for the Hugging Face Compute Cluster and may require tweaking to be adapted to your own compute nodes.
\ No newline at end of file
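The Slurm launch pattern documented above should extend to the other task and precision combinations. For instance, a sketch of queuing the LoRA DPO step, assuming `lora` is the accepted precision token and `multi_gpu` the matching accelerator config, mirroring the `full deepspeed_zero3` example:

```shell
# Queue LoRA DPO on one node; argument order is {model_name} {task} {precision} {accelerator}
sbatch --job-name=handbook_dpo_lora --nodes=1 recipes/launch.slurm zephyr-7b dpo lora multi_gpu
```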
diff --git a/scripts/run_dpo.py b/scripts/run_dpo.py
index 542de20a..704ce1f2 100644
--- a/scripts/run_dpo.py
+++ b/scripts/run_dpo.py
@@ -175,6 +175,7 @@ def main():
     kwargs = {
         "finetuned_from": model_args.model_name_or_path,
         "dataset": list(data_args.dataset_mixer.keys()),
+        "dataset_tags": list(data_args.dataset_mixer.keys()),
         "tags": ["alignment-handbook"],
     }
     dpo_trainer.create_model_card(**kwargs)
diff --git a/scripts/run_sft.py b/scripts/run_sft.py
index 614a14a0..1ed8e335 100644
--- a/scripts/run_sft.py
+++ b/scripts/run_sft.py
@@ -82,13 +82,9 @@ def main():
     # Load datasets
     ###############
     raw_datasets = get_datasets(data_args, splits=data_args.dataset_splits)
-
     logger.info(
         f"Training on the following datasets and their proportions: {[split + ' : ' + str(dset.num_rows) for split, dset in raw_datasets.items()]}"
     )
-    with training_args.main_process_first(desc="Log a few random samples from the raw training set"):
-        for index in random.sample(range(len(raw_datasets["train"])), 3):
-            logger.info(f"Sample {index} of the raw training set:\n\n{raw_datasets['train'][index]['messages']}")
 
     ################
     # Load tokenizer
@@ -176,6 +172,7 @@ def main():
     kwargs = {
         "finetuned_from": model_args.model_name_or_path,
         "dataset": list(data_args.dataset_mixer.keys()),
+        "dataset_tags": list(data_args.dataset_mixer.keys()),
         "tags": ["alignment-handbook"],
     }
     trainer.create_model_card(**kwargs)
diff --git a/setup.py b/setup.py
index c6d4f218..d71b591b 100644
--- a/setup.py
+++ b/setup.py
@@ -63,7 +63,7 @@
     "scipy",
     "tensorboard",
     "transformers==4.35.0",
-    "trl==0.7.4",  # TODO bump to next release, added for NEFTune
+    "trl==0.7.4",
     "tqdm>=4.64.1",
 ]
 
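With `transformers` and `trl` both pinned in `setup.py`, a one-liner can confirm the installed versions match the pins before training:

```shell
# Verify the installed versions match the pins in setup.py
python -c "import transformers, trl; print(transformers.__version__, trl.__version__)"  # expect: 4.35.0 0.7.4
```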