From 7c97b2a498fe5c7d508f159ddc6c4db46bc4befd Mon Sep 17 00:00:00 2001 From: gongel Date: Wed, 10 Apr 2024 06:22:42 +0000 Subject: [PATCH] add checkpoint_done to last model --- paddlenlp/trainer/trainer.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py index 5befe333b58a..7e18f5ba74e7 100644 --- a/paddlenlp/trainer/trainer.py +++ b/paddlenlp/trainer/trainer.py @@ -2047,6 +2047,11 @@ def save_model(self, output_dir: Optional[str] = None, merge_tensor_parallel: Op # recover unified_checkpoint_config for not trine stage if not self.is_in_train: self.args.unified_checkpoint_config = unified_checkpoint_config_backup + if strtobool(os.getenv("FLAG_LLM_PDC", "False")): + # save checkpoint_done file to ensure checkpoint is complete + if self.args.should_save_model_state and self.args.should_save: + # For ckpt integrity + paddle.save(self.state.global_step, os.path.join(output_dir, ".checkpoint_done")) def _save_checkpoint(self, model, metrics=None): # assert unwrap_model(model) is self.model, "internal model should be a reference to self.model"