From 4a20dbbf741cd4df5391560c38dc1e50ec972c8f Mon Sep 17 00:00:00 2001
From: gongel
Date: Tue, 2 Apr 2024 11:14:51 +0000
Subject: [PATCH] add checkpoint_done to last model

---
 paddlenlp/trainer/trainer.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py
index 67536b9c55ae..88c0bed7b5e8 100644
--- a/paddlenlp/trainer/trainer.py
+++ b/paddlenlp/trainer/trainer.py
@@ -2092,6 +2092,11 @@ def save_model(self, output_dir: Optional[str] = None, merge_tensor_parallel: Op
         # recover unified_checkpoint_config for not trine stage
         if not self.is_in_train:
             self.args.unified_checkpoint_config = unified_checkpoint_config_backup
+        if strtobool(os.getenv("FLAG_LLM_PDC", "False")):
+            # save checkpoint_done file to ensure checkpoint is complete
+            if self.args.should_save_model_state and self.args.should_save:
+                # For ckpt integrity
+                paddle.save(self.state.global_step, os.path.join(output_dir, ".checkpoint_done"))
 
     def _save_checkpoint(self, model, metrics=None):
         # assert unwrap_model(model) is self.model, "internal model should be a reference to self.model"