
Commit

Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleNLP into develop
gongel committed Apr 2, 2024
2 parents e85be50 + 2273ee7 commit 7f0717d
Showing 65 changed files with 7,250 additions and 1,043 deletions.
6 changes: 6 additions & 0 deletions llm/argument.py
@@ -126,6 +126,12 @@ class ModelArgument:
lora: bool = field(default=False, metadata={"help": "Whether to use LoRA technique"})
lora_path: str = field(default=None, metadata={"help": "Initialize lora state dict."})
lora_rank: int = field(default=8, metadata={"help": "Lora attention dimension"})
use_quick_lora: bool = field(
default=False,
metadata={
"help": "Whether to use Quick LoRA. Quick LoRA only takes effect when lora_dropout is set to 0."
},
)
rslora: bool = field(default=False, metadata={"help": "Whether to use RsLoRA"})
lora_plus_scale: float = field(default=1.0, metadata={"help": "Lora B scale in LoRA+ technique"})
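For readers unfamiliar with this argument style: each `field(..., metadata={"help": ...})` entry becomes a command-line option, with the help text taken from the metadata. A minimal sketch of that pattern, using plain argparse rather than PaddleNLP's own parser, so the parser and flag handling here are illustrative only:

import argparse
from dataclasses import dataclass, field, fields

@dataclass
class ModelArgument:
    lora: bool = field(default=False, metadata={"help": "Whether to use LoRA technique"})
    use_quick_lora: bool = field(
        default=False,
        metadata={"help": "Whether to use Quick LoRA; only takes effect when lora_dropout is 0."},
    )

parser = argparse.ArgumentParser()
for f in fields(ModelArgument):
    # Boolean fields become --flag switches; the help text comes from the field metadata.
    parser.add_argument(f"--{f.name}", action="store_true", help=f.metadata.get("help", ""))

args = parser.parse_args(["--use_quick_lora"])
print(args.lora, args.use_quick_lora)  # False True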

3 changes: 3 additions & 0 deletions llm/finetune_generation.py
@@ -112,6 +112,7 @@ def main():
weight_double_quant=model_args.weight_double_quant,
weight_double_quant_block_size=model_args.weight_double_quant_block_size,
)

if training_args.pipeline_parallel_degree > 1:
if data_args.eval_with_do_generation and training_args.do_eval:
raise ValueError("Please set eval_with_do_generation to false in pipeline parallel mode.")
@@ -426,10 +427,12 @@ def neft_post_hook(module, input, output):
dtype=dtype,
do_qat=quant_args.do_qat,
base_model_name_or_path=model_args.model_name_or_path,
use_quick_lora=model_args.use_quick_lora,
)
model = LoRAModel(model, lora_config)
else:
model = LoRAModel.from_pretrained(model=model, lora_path=model_args.lora_path)

model.print_trainable_parameters()

def compute_metrics_do_generation(eval_preds):
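Note the constraint carried by the new argument's help text: Quick LoRA only takes effect when lora_dropout is 0. A hedged sketch of an explicit check a caller could add before building the config (the lora_dropout argument itself is not shown in this diff, so its name here is an assumption):

import warnings

def resolve_quick_lora(use_quick_lora: bool, lora_dropout: float) -> bool:
    # Quick LoRA is documented above to take effect only when lora_dropout == 0;
    # fall back to the regular LoRA path (and warn) otherwise.
    if use_quick_lora and lora_dropout != 0.0:
        warnings.warn("use_quick_lora is ignored because lora_dropout != 0; set lora_dropout=0 to enable Quick LoRA.")
        return False
    return use_quick_lora

print(resolve_quick_lora(True, 0.0))  # True
print(resolve_quick_lora(True, 0.1))  # False, with a warning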
8 changes: 4 additions & 4 deletions llm/llama/auto_parallel/run_pretrain_auto.py
@@ -275,14 +275,14 @@ def create_pretrained_dataset(

train_val_test_num_samples = [
training_args.per_device_train_batch_size
* training_args.data_parallel_degree
* training_args.dataset_world_size
* training_args.max_steps
* training_args.gradient_accumulation_steps,
training_args.per_device_eval_batch_size
* training_args.data_parallel_degree
* training_args.dataset_world_size
* training_args.eval_iters
* (training_args.max_steps // training_args.eval_steps + 1),
training_args.per_device_eval_batch_size * training_args.data_parallel_degree * training_args.test_iters,
training_args.per_device_eval_batch_size * training_args.dataset_world_size * training_args.test_iters,
]

print_rank_0(" > datasets target sizes (minimum size):")
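The three target sizes follow directly from the expressions above; with made-up values for the training arguments, the arithmetic works out as follows:

# Worked example of the train/val/test target-size formulas above (values are illustrative).
per_device_train_batch_size = 2
per_device_eval_batch_size = 2
dataset_world_size = 8
max_steps = 10_000
gradient_accumulation_steps = 4
eval_iters = 10
eval_steps = 500
test_iters = 10

train_samples = per_device_train_batch_size * dataset_world_size * max_steps * gradient_accumulation_steps
val_samples = per_device_eval_batch_size * dataset_world_size * eval_iters * (max_steps // eval_steps + 1)
test_samples = per_device_eval_batch_size * dataset_world_size * test_iters
print(train_samples, val_samples, test_samples)  # 640000 3360 160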
@@ -411,7 +411,7 @@ def init_seed(seed: int = 1234, args=None):
topo = Topology(
dist.get_rank(),
dist.get_world_size(),
dp_degree=args.data_parallel_degree,
dp_degree=args.dataset_world_size,
pp_degree=args.pipeline_parallel_degree,
mp_degree=args.tensor_parallel_degree,
sharding_degree=1, # auto_parallel's sharding is not orthogonal with dp, mp and pp
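Replacing `data_parallel_degree` with `dataset_world_size` here and in the sample-size math above is consistent with the comment that auto-parallel sharding is not orthogonal to dp: sharded ranks also consume distinct data. A sketch of that reading, which is an assumption about PaddleNLP's `TrainingArguments.dataset_world_size` and worth verifying against the trainer source:

# Assumed relationship, not taken from this diff: the number of ranks that read
# distinct batches is the product of the data-parallel and sharding degrees.
def dataset_world_size(data_parallel_degree: int, sharding_parallel_degree: int) -> int:
    return max(data_parallel_degree, 1) * max(sharding_parallel_degree, 1)

print(dataset_world_size(4, 2))  # 8 ranks, each consuming a different data shard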
15 changes: 7 additions & 8 deletions llm/llama/auto_parallel/run_pretrain_auto_static.py
@@ -274,14 +274,14 @@ def create_pretrained_dataset(

train_val_test_num_samples = [
training_args.per_device_train_batch_size
* training_args.data_parallel_degree
* training_args.dataset_world_size
* training_args.max_steps
* training_args.gradient_accumulation_steps,
training_args.per_device_eval_batch_size
* training_args.data_parallel_degree
* training_args.dataset_world_size
* training_args.eval_iters
* (training_args.max_steps // training_args.eval_steps + 1),
training_args.per_device_eval_batch_size * training_args.data_parallel_degree * training_args.test_iters,
training_args.per_device_eval_batch_size * training_args.dataset_world_size * training_args.test_iters,
]

print_rank_0(" > datasets target sizes (minimum size):")
@@ -421,7 +421,7 @@ def init_seed(seed: int = 1234, args=None):
topo = Topology(
dist.get_rank(),
dist.get_world_size(),
dp_degree=args.data_parallel_degree,
dp_degree=args.dataset_world_size,
pp_degree=args.pipeline_parallel_degree,
mp_degree=args.tensor_parallel_degree,
sharding_degree=1, # auto_parallel's sharding is not orthogonal with dp, mp and pp
@@ -552,7 +552,8 @@ def main():
# if training_args.bf16:
# dtype = "bfloat16"

model = model_class._from_config(config)
# The `amp` of static graph model can't accept a model initialized with `dtype float16 or bfloat16`
model = model_class._from_config(config, dtype="float32")

if training_args.recompute:

@@ -599,9 +600,7 @@ def fn(layer):
def loss_func(loss, outputs):
return loss

total_train_batch_size_per_acc_step = (
training_args.per_device_train_batch_size * training_args.data_parallel_degree
)
total_train_batch_size_per_acc_step = training_args.per_device_train_batch_size * training_args.dataset_world_size
total_train_batch_size = total_train_batch_size_per_acc_step * training_args.gradient_accumulation_steps

print_config(training_args)
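As with the dataset target sizes, the global batch arithmetic now uses `dataset_world_size`; with illustrative values:

# Worked example of the batch-size formulas above (values are illustrative).
per_device_train_batch_size = 2
dataset_world_size = 8
gradient_accumulation_steps = 4

total_train_batch_size_per_acc_step = per_device_train_batch_size * dataset_world_size      # 16
total_train_batch_size = total_train_batch_size_per_acc_step * gradient_accumulation_steps  # 64
print(total_train_batch_size_per_acc_step, total_train_batch_size)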
8 changes: 5 additions & 3 deletions llm/llama/fused_layers.py
@@ -58,16 +58,18 @@ def backward(ctx, y_grad):

if hasattr(weight, "main_grad") and hasattr(bias, "main_grad"):
weight.main_grad, bias.main_grad = _C_ops.fused_linear_param_grad_add(
x, y_grad, weight.main_grad, bias.main_grad, True
x, y_grad, weight.main_grad, bias.main_grad, True, True
)
return x_grad, None, None
else:
if weight.grad is not None:
assert bias.grad is not None
weight.grad, bias.grad = _C_ops.fused_linear_param_grad_add(x, y_grad, weight.grad, bias.grad, False)
weight.grad, bias.grad = _C_ops.fused_linear_param_grad_add(
x, y_grad, weight.grad, bias.grad, False, True
)
return x_grad, None, None
else:
weight_grad, bias_grad = _C_ops.fused_linear_param_grad_add(x, y_grad, None, None, False)
weight_grad, bias_grad = _C_ops.fused_linear_param_grad_add(x, y_grad, None, None, False, True)
return x_grad, weight_grad, bias_grad
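For reference, the gradients this fused op accumulates are the standard ones for a linear layer y = x @ W + b; the NumPy sketch below shows only that math. The extra trailing boolean added in this hunk is a Paddle-internal argument whose meaning is not documented in the diff, so it is not modeled here.

import numpy as np

# Reference math (not the fused kernel): accumulate dW and db in place, as
# fused_linear_param_grad_add does with the grad / main_grad buffers above.
x = np.random.randn(4, 8)         # input activations
W = np.random.randn(8, 16)        # linear weight
y_grad = np.random.randn(4, 16)   # upstream gradient of y = x @ W + b

W_grad_acc = np.zeros_like(W)     # existing gradient buffer
b_grad_acc = np.zeros(16)

W_grad_acc += x.T @ y_grad        # dL/dW, added into the buffer ("grad add")
b_grad_acc += y_grad.sum(axis=0)  # dL/db, summed over the batch dimension
x_grad = y_grad @ W.T             # dL/dx, returned to the previous layer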


@@ -48,7 +48,7 @@
MinLengthLogitsProcessor,
RepetitionPenaltyLogitsProcessor,
)
from .sequence_parallel_utils import (
from paddle.distributed.fleet.utils.sequence_parallel_utils import (
ColumnSequenceParallelLinear,
GatherOp,
RowSequenceParallelLinear,
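This hunk switches the sequence-parallel utilities from a local module to the copy that ships inside Paddle (`paddle.distributed.fleet.utils.sequence_parallel_utils`). A hedged sketch of a compatibility shim for codebases that must also run on older Paddle builds; the fallback path reuses the old local module shown above and is meant to live inside that same package:

try:
    # Preferred: utilities shipped with recent Paddle releases.
    from paddle.distributed.fleet.utils.sequence_parallel_utils import (
        ColumnSequenceParallelLinear,
        GatherOp,
        RowSequenceParallelLinear,
    )
except ImportError:
    # Fallback for older Paddle builds that predate the in-tree module.
    from .sequence_parallel_utils import (
        ColumnSequenceParallelLinear,
        GatherOp,
        RowSequenceParallelLinear,
    )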