
Commit

Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleNLP into develop
gongel committed Mar 20, 2024
2 parents 87e3571 + ac57ad7 commit e85be50
Showing 112 changed files with 4,418 additions and 1,202 deletions.
1 change: 1 addition & 0 deletions Makefile
@@ -45,6 +45,7 @@ unit-test:

.PHONY: install
install:
pip install paddlepaddle==0.0.0 -f https://www.paddlepaddle.org.cn/whl/linux/cpu-mkl/develop.html
pip install -r requirements-dev.txt
pip install -r requirements.txt
pip install -r paddlenlp/experimental/autonlp/requirements.txt
14 changes: 12 additions & 2 deletions docs/trainer.md
@@ -321,9 +321,9 @@ Trainer is a simple but feature-complete Paddle training and evaluation module, and
--num_train_epochs
Total number of training epochs to perform (if not an integer, the decimal
part of the last epoch will be performed before stopping training).
(`float`, optional, defaults to 3.0):
(`float`, optional, defaults to 1.0):
Total number of training epochs to perform. (default:3.0)
Total number of training epochs to perform. (default:1.0)
--max_steps
If set to a positive number, the total number of training steps to perform.
@@ -552,6 +552,16 @@ Trainer is a simple but feature-complete Paddle training and evaluation module, and
enable_delay_scale_loss, accumulate gradients until the optimizer step and divide all gradients by the inner pipeline accumulation step, instead of dividing the loss by the accumulation step directly.
enable_dp_comm_overlap, fuse data parallel gradient communication.
--data_parallel_config
For data parallelism, some options affect training performance; they are collected here and passed in as a single string (see the sketch after this entry).
The following options are supported:
enable_allreduce_avg_in_gradinent_scale: replaces the `allreduce_sum + scale` pattern with `allreduce_avg` when scaling gradients in data parallel, which improves performance. ONLY supported in auto mode for now.
gradient_sync_after_accumulate: when gradient accumulation is enabled, moves gradient synchronization from the backward pass into the optimizer step, which reduces the number of synchronizations and improves performance but increases memory usage. ONLY supported in auto mode for now.
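A minimal sketch of how these string-valued configs can be passed in Python, assuming `paddlenlp.trainer.TrainingArguments` exposes the `pipeline_parallel_config` and `data_parallel_config` fields described above; `output_dir` and the degree values are illustrative:

# Hedged sketch: multiple options go space-separated in a single string;
# the data_parallel_config options are documented as auto-mode only.
from paddlenlp.trainer import TrainingArguments

training_args = TrainingArguments(
    output_dir="./checkpoints",  # illustrative path
    pipeline_parallel_degree=4,
    pipeline_parallel_config="enable_delay_scale_loss enable_dp_comm_overlap",
    data_parallel_config="enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate",
)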
--recompute
Whether to use recompute during training. It can save GPU memory.
2 changes: 2 additions & 0 deletions llm/argument.py
@@ -126,6 +126,8 @@ class ModelArgument:
lora: bool = field(default=False, metadata={"help": "Whether to use LoRA technique"})
lora_path: str = field(default=None, metadata={"help": "Initialize lora state dict."})
lora_rank: int = field(default=8, metadata={"help": "Lora attention dimension"})
rslora: bool = field(default=False, metadata={"help": "Whether to use RsLoRA"})
lora_plus_scale: float = field(default=1.0, metadata={"help": "Lora B scale in LoRA+ technique"})

# prefix tuning related parameters
prefix_tuning: bool = field(default=False, metadata={"help": "Whether to use Prefix technique"})
4 changes: 3 additions & 1 deletion llm/finetune_generation.py
@@ -418,7 +418,9 @@ def neft_post_hook(module, input, output):
lora_config = LoRAConfig(
target_modules=target_modules,
r=model_args.lora_rank,
lora_alpha=2 * model_args.lora_rank,
lora_alpha=2 * model_args.lora_rank if not model_args.rslora else 4,
rslora=model_args.rslora,
lora_plus_scale=model_args.lora_plus_scale,
merge_weights=False,
tensor_parallel_degree=training_args.tensor_parallel_degree,
dtype=dtype,
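For context on the two new LoRAConfig knobs above: rsLoRA replaces the usual alpha / r scaling of the low-rank update with alpha / sqrt(r), keeping the update magnitude stable as the rank grows, while lora_plus_scale scales the LoRA B matrix as in LoRA+ (per the help string in llm/argument.py). A small framework-agnostic sketch of the scaling arithmetic, not the PaddleNLP implementation:

# Hedged sketch of the rsLoRA scaling rule; the example values mirror the
# lora_rank=8 defaults above (lora_alpha = 16 for vanilla LoRA, 4 for rsLoRA).
import math

def lora_scaling(lora_alpha: float, r: int, rslora: bool) -> float:
    # vanilla LoRA: alpha / r ; rsLoRA: alpha / sqrt(r)
    return lora_alpha / math.sqrt(r) if rslora else lora_alpha / r

print(lora_scaling(lora_alpha=16, r=8, rslora=False))  # 2.0
print(lora_scaling(lora_alpha=4, r=8, rslora=True))    # about 1.41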
5 changes: 5 additions & 0 deletions llm/llama/auto_parallel/run_pretrain_auto.py
@@ -404,13 +404,18 @@ def init_seed(seed: int = 1234, args=None):
else:
assert not args.use_hybrid_parallel and args.enable_auto_parallel
if dist.get_world_size() > 1:
if args.hybrid_parallel_topo_order is None or args.hybrid_parallel_topo_order == "pp_first":
order = ["pp", "dp", "sharding", "mp", "sep"]
elif args.hybrid_parallel_topo_order == "sharding_first":
order = ["dp", "sharding", "pp", "mp", "sep"]
topo = Topology(
dist.get_rank(),
dist.get_world_size(),
dp_degree=args.data_parallel_degree,
pp_degree=args.pipeline_parallel_degree,
mp_degree=args.tensor_parallel_degree,
sharding_degree=1, # auto_parallel's sharding is not orthogonal with dp, mp and pp
order=order,
)

global_seed, local_seed, random_seed = _get_distributed_seeds(args.seed, topo)
5 changes: 5 additions & 0 deletions llm/llama/auto_parallel/run_pretrain_auto_static.py
@@ -414,13 +414,18 @@ def init_seed(seed: int = 1234, args=None):
else:
assert not args.use_hybrid_parallel and args.enable_auto_parallel
if dist.get_world_size() > 1:
if args.hybrid_parallel_topo_order is None or args.hybrid_parallel_topo_order == "pp_first":
order = ["pp", "dp", "sharding", "mp", "sep"]
elif args.hybrid_parallel_topo_order == "sharding_first":
order = ["dp", "sharding", "pp", "mp", "sep"]
topo = Topology(
dist.get_rank(),
dist.get_world_size(),
dp_degree=args.data_parallel_degree,
pp_degree=args.pipeline_parallel_degree,
mp_degree=args.tensor_parallel_degree,
sharding_degree=1, # auto_parallel's sharding is not orthogonal with dp, mp and pp
order=order,
)

global_seed, local_seed, random_seed = _get_distributed_seeds(args.seed, topo)
5 changes: 5 additions & 0 deletions llm/predictor.py
@@ -26,6 +26,7 @@
import paddle
import paddle.distributed.fleet.base.topology as tp
import paddle.incubate.multiprocessing as mp
from paddle.base.framework import in_cinn_mode, in_pir_executor_mode
from paddle.distributed import fleet
from utils import (
dybatch_preprocess,
@@ -360,6 +361,10 @@ def __init__(self, config: PredictorArgument, tokenizer: PretrainedTokenizer = N
inference_config.disable_gpu()
inference_config.disable_glog_info()
inference_config.enable_new_executor()
if in_pir_executor_mode():
inference_config.enable_new_ir()
if in_cinn_mode():
inference_config.enable_cinn()

with static_mode_guard():
self.predictor = paddle.inference.create_predictor(inference_config)
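A standalone sketch of the pattern the predictor change above follows: probe Paddle's executor/compiler modes and conditionally enable the PIR executor and CINN on an inference config. The model paths are placeholders and the exact behavior of these toggles may vary by Paddle version:

# Hedged sketch mirroring the predictor.py change; not the full PaddleNLP predictor.
import paddle
from paddle.base.framework import in_cinn_mode, in_pir_executor_mode

config = paddle.inference.Config("model_dir/model.pdmodel", "model_dir/model.pdiparams")  # placeholder paths
config.enable_new_executor()
if in_pir_executor_mode():  # PIR executor requested for this run
    config.enable_new_ir()
if in_cinn_mode():          # CINN compiler requested for this run
    config.enable_cinn()
predictor = paddle.inference.create_predictor(config)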
@@ -0,0 +1,21 @@
model_item=gpt_auto_pir
dp_degree=2
mp_degree=1
pp_degree=4
bs_item=16 # micro * dp * pp
fp_item=fp16O1
run_mode=DP2-MP1-PP4-SD2-stage1
device_num=N1C8
sharding_degree=2 # sharding_degree = dp_degree
sharding_stage=1
level=o1
local_batch_size=8

model=gpt
micro_bs=2 # local_batch_size / pp_degree

cd ./benchmarks
bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/prepare.sh
# run
bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \
${sharding_degree} ${sharding_stage} ${level} 2>&1;
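The batch-size bookkeeping in the comments above (and in the variant scripts that follow) can be sanity-checked with a few lines of arithmetic; a small sketch using this script's values:

# Hedged check of the relationships stated in the comments:
#   micro_bs = local_batch_size / pp_degree and bs_item = micro_bs * dp_degree * pp_degree.
dp_degree, mp_degree, pp_degree = 2, 1, 4
local_batch_size = 8
micro_bs = local_batch_size // pp_degree        # 8 / 4 = 2, matches micro_bs=2
bs_item = micro_bs * dp_degree * pp_degree      # 2 * 2 * 4 = 16, matches bs_item=16
sharding_degree = dp_degree                     # these scripts keep sharding_degree = dp_degree
assert (micro_bs, bs_item, sharding_degree) == (2, 16, 2)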
@@ -0,0 +1,21 @@
model_item=gpt_auto_pir
dp_degree=2
mp_degree=1
pp_degree=4
bs_item=16 # micro * dp * pp
fp_item=fp16O1
run_mode=DP2-MP1-PP4-SD2-stage2
device_num=N1C8
sharding_degree=2 # sharding_degree = dp_degree
sharding_stage=2
level=o1
local_batch_size=8

model=gpt
micro_bs=2 # local_batch_size / pp_degree

cd ./benchmarks
bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/prepare.sh
# run
bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \
${sharding_degree} ${sharding_stage} ${level} 2>&1;
@@ -0,0 +1,21 @@
model_item=gpt_auto_pir
dp_degree=2
mp_degree=2
pp_degree=2
bs_item=16 # micro * dp * pp
fp_item=fp16O1
run_mode=DP2-MP2-PP2-SD2-stage1
device_num=N1C8
sharding_degree=2 # sharding_degree = dp_degree
sharding_stage=1
level=o1
local_batch_size=8

model=gpt
micro_bs=4 # local_batch_size / pp_degree

cd ./benchmarks
bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/prepare.sh
# run
bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \
${sharding_degree} ${sharding_stage} ${level} 2>&1;
@@ -0,0 +1,21 @@
model_item=gpt_auto_pir
dp_degree=2
mp_degree=2
pp_degree=2
bs_item=16 # micro * dp * pp
fp_item=fp16O1
run_mode=DP2-MP2-PP2-SD2-stage2
device_num=N1C8
sharding_degree=2 # sharding_degree = dp_degree
sharding_stage=2
level=o1
local_batch_size=8

model=gpt
micro_bs=4 # local_batch_size / pp_degree

cd ./benchmarks
bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/prepare.sh
# run
bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \
${sharding_degree} ${sharding_stage} ${level} 2>&1;
@@ -0,0 +1,21 @@
model_item=gpt_auto_pir
dp_degree=2
mp_degree=1
pp_degree=4
bs_item=16 # micro * dp * pp
fp_item=fp16O2
run_mode=DP2-MP1-PP4-SD2-stage1
device_num=N1C8
sharding_degree=2 # sharding_degree = dp_degree
sharding_stage=1
level=o2
local_batch_size=8

model=gpt
micro_bs=2 # local_batch_size / pp_degree

cd ./benchmarks
bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/prepare.sh
# run
bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \
${sharding_degree} ${sharding_stage} ${level} 2>&1;
@@ -0,0 +1,21 @@
model_item=gpt_auto_pir
dp_degree=2
mp_degree=1
pp_degree=4
bs_item=16 # micro * dp * pp
fp_item=fp16O2
run_mode=DP2-MP1-PP4-SD2-stage2
device_num=N1C8
sharding_degree=2 # sharding_degree = dp_degree
sharding_stage=2
level=o2
local_batch_size=8

model=gpt
micro_bs=2 # local_batch_size / pp_degree

cd ./benchmarks
bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/prepare.sh
# run
bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \
${sharding_degree} ${sharding_stage} ${level} 2>&1;
@@ -0,0 +1,21 @@
model_item=gpt_auto_pir
dp_degree=2
mp_degree=2
pp_degree=2
bs_item=16 # micro * dp * pp
fp_item=fp16O2
run_mode=DP2-MP2-PP2-SD2-stage1
device_num=N1C8
sharding_degree=2 # sharding_degree = dp_degree
sharding_stage=1
level=o2
local_batch_size=8

model=gpt
micro_bs=4 # local_batch_size / pp_degree

cd ./benchmarks
bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/prepare.sh
# run
bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \
${sharding_degree} ${sharding_stage} ${level} 2>&1;
@@ -0,0 +1,21 @@
model_item=gpt_auto_pir
dp_degree=2
mp_degree=2
pp_degree=2
bs_item=16 # micro * dp * pp
fp_item=fp16O2
run_mode=DP2-MP2-PP2-SD2-stage2
device_num=N1C8
sharding_degree=2 # sharding_degree = dp_degree
sharding_stage=2
level=o2
local_batch_size=8

model=gpt
micro_bs=4 # local_batch_size / pp_degree

cd ./benchmarks
bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/prepare.sh
# run
bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \
${sharding_degree} ${sharding_stage} ${level} 2>&1;
@@ -0,0 +1,21 @@
model_item=gpt_auto_pir
dp_degree=2
mp_degree=1
pp_degree=4
bs_item=16 # micro * dp * pp
fp_item=fp16O3
run_mode=DP2-MP1-PP4-SD2-stage1
device_num=N1C8
sharding_degree=2 # sharding_degree = dp_degree
sharding_stage=1
level=o3
local_batch_size=8

model=gpt
micro_bs=2 # local_batch_size / pp_degree

cd ./benchmarks
bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/prepare.sh
# run
bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \
${sharding_degree} ${sharding_stage} ${level} 2>&1;
@@ -0,0 +1,21 @@
model_item=gpt_auto_pir
dp_degree=2
mp_degree=1
pp_degree=4
bs_item=16 # micro * dp * pp
fp_item=fp16O3
run_mode=DP2-MP1-PP4-SD2-stage2
device_num=N1C8
sharding_degree=2 # sharding_degree = dp_degree
sharding_stage=2
level=o3
local_batch_size=8

model=gpt
micro_bs=2 # local_batch_size / pp_degree

cd ./benchmarks
bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/prepare.sh
# run
bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \
${sharding_degree} ${sharding_stage} ${level} 2>&1;
@@ -0,0 +1,21 @@
model_item=gpt_auto_pir
dp_degree=2
mp_degree=2
pp_degree=2
bs_item=16 # micro * dp * pp
fp_item=fp16O3
run_mode=DP2-MP2-PP2-SD2-stage1
device_num=N1C8
sharding_degree=2 # sharding_degree = dp_degree
sharding_stage=1
level=o3
local_batch_size=8

model=gpt
micro_bs=4 # local_batch_size / pp_degree

cd ./benchmarks
bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/prepare.sh
# run
bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \
${sharding_degree} ${sharding_stage} ${level} 2>&1;
@@ -0,0 +1,21 @@
model_item=gpt_auto_pir
dp_degree=2
mp_degree=2
pp_degree=2
bs_item=16 # micro * dp * pp
fp_item=fp16O3
run_mode=DP2-MP2-PP2-SD2-stage2
device_num=N1C8
sharding_degree=2 # sharding_degree = dp_degree
sharding_stage=2
level=o3
local_batch_size=8

model=gpt
micro_bs=4 # local_batch_size / pp_degree

cd ./benchmarks
bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/prepare.sh
# run
bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \
${sharding_degree} ${sharding_stage} ${level} 2>&1;
@@ -0,0 +1,21 @@
model_item=gpt_auto_pir
dp_degree=1
mp_degree=1
pp_degree=8
bs_item=8 # micro * dp * pp
fp_item=fp16O1
run_mode=DP1-MP1-PP8-SD1-stage1
device_num=N1C8
sharding_degree=1
sharding_stage=1
level=o1
local_batch_size=8

model=gpt
micro_bs=1 # local_batch_size / pp_degree

cd ./benchmarks
bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/prepare.sh
# run
bash ./test_tipc/gpt/static/new_exec_pp_pir/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${mp_degree} ${pp_degree} ${micro_bs} ${bs_item} ${run_mode} ${device_num} \
${sharding_degree} ${sharding_stage} ${level} 2>&1;
