From 9be2e9a859233cbee13f9346454e78f398a0a9b8 Mon Sep 17 00:00:00 2001
From: lewtun
Date: Sun, 9 Feb 2025 09:44:35 +0100
Subject: [PATCH] Add retry mechanism for pushing eval results (#252)

The Hub throws 403 errors if there are too many concurrent pushes to the
same repo, so we need a retry mechanism when that happens.

If every attempt fails, exit non-zero before the cleanup step so the
results in $OUTPUT_DIR are not deleted without having been uploaded.
---
Note: the first hunk was edited by hand after export; the post-image blob
id on the index line below was not regenerated.

 slurm/evaluate.slurm | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/slurm/evaluate.slurm b/slurm/evaluate.slurm
index 4883930b..c659c0b3 100644
--- a/slurm/evaluate.slurm
+++ b/slurm/evaluate.slurm
@@ -66,7 +66,24 @@ OUTPUT_FILEPATHS=$(find $OUTPUT_DIR/results/ -type f \( -name "*.json" \))
 for filepath in $OUTPUT_FILEPATHS; do
     echo "Uploading $filepath to Hugging Face Hub..."
     filename=$(basename -- "$filepath")
-    huggingface-cli upload --repo-type space --private $LM_EVAL_REPO_ID $filepath $OUTPUT_DIR/$filename
+    # The Hub answers 403 when too many jobs push to the same repo at once, so retry a bounded number of times.
+    max_attempts=20
+    uploaded=false
+    for ((attempt = 1; attempt <= max_attempts; attempt++)); do
+        if huggingface-cli upload --repo-type space --private "$LM_EVAL_REPO_ID" "$filepath" "$OUTPUT_DIR/$filename"; then
+            echo "Upload succeeded for $filepath"
+            uploaded=true
+            break
+        fi
+        if ((attempt < max_attempts)); then
+            echo "Upload failed for $filepath. Attempt $attempt of $max_attempts. Retrying in 5 seconds..."
+            sleep 5
+        fi
+    done
+    if [[ "$uploaded" != true ]]; then
+        echo "Upload failed for $filepath after $max_attempts attempts; keeping $OUTPUT_DIR for manual upload." >&2
+        exit 1
+    fi
 done
 
 echo "Uploading details to Hugging Face Hub..."
@@ -78,4 +95,4 @@ python src/open_r1/utils/upload_details.py --data_files $DETAILS_FILEPATHS --hub
 echo "Cleaning up ..."
 rm -rf $OUTPUT_DIR
 
-echo "Done!"
\ No newline at end of file
+echo "Done!"