diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py index 5befe333b58a..7e18f5ba74e7 100644 --- a/paddlenlp/trainer/trainer.py +++ b/paddlenlp/trainer/trainer.py @@ -2047,6 +2047,11 @@ def save_model(self, output_dir: Optional[str] = None, merge_tensor_parallel: Op # recover unified_checkpoint_config for not trine stage if not self.is_in_train: self.args.unified_checkpoint_config = unified_checkpoint_config_backup + if strtobool(os.getenv("FLAG_LLM_PDC", "False")): + # save checkpoint_done file to ensure checkpoint is complete + if self.args.should_save_model_state and self.args.should_save: + # For ckpt integrity + paddle.save(self.state.global_step, os.path.join(output_dir, ".checkpoint_done")) def _save_checkpoint(self, model, metrics=None): # assert unwrap_model(model) is self.model, "internal model should be a reference to self.model"