Skip to content

Commit

Permalink
Try setting wandb step manually
Browse files Browse the repository at this point in the history
  • Loading branch information
f-dangel committed Sep 14, 2024
1 parent e28ffd2 commit 05152ea
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 9 deletions.
6 changes: 3 additions & 3 deletions example/launch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
#SBATCH --qos=m5
#SBATCH --open-mode=append
#SBATCH --time=00:04:00
#SBATCH --array=0-9
#SBATCH --signal=B:SIGUSR1@120 # Send signal SIGUSR1 120 seconds before the job hits the time limit
#SBATCH --array=0
#SBATCH --signal=B:SIGUSR1@150 # Send signal SIGUSR1 150 seconds before the job hits the time limit

echo "Job $SLURM_JOB_NAME ($SLURM_JOB_ID) begins on $(hostname), submitted from $SLURM_SUBMIT_HOST ($SLURM_CLUSTER_NAME)"
echo ""
Expand All @@ -20,7 +20,7 @@ if [ "$SLURM_ARRAY_TASK_COUNT" != "" ]; then
fi

# NOTE that we need to use srun here, otherwise the Python process won't receive the SIGUSR1 signal
srun wandb agent --count=1 f-dangel-team/example-preemptable-sweep/4m89qo6r &
srun --unbuffered wandb agent --count=1 f-dangel-team/example-preemptable-sweep/4m89qo6r &
child="$!"

# Set up a handler to pass the SIGUSR1 to the python session launched by the agent
Expand Down
20 changes: 14 additions & 6 deletions example/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,14 +85,22 @@ def main(args):
# Select the remaining epochs to train
start_epoch = 0 if checkpoint_index is None else checkpoint_index + 1

wandb_resume_step = extra_info.get("wandb_step", None)
resume_from = (
None if wandb_resume_step is None else f"{run_id}?_step={wandb_resume_step}"
)
print("Resume string:", resume_from)
# NOTE: Forking (the `resume_from` approach commented out below) must be enabled by the wandb team for your project.
# wandb_resume_step = extra_info.get("wandb_step", None)
# resume_from = (
# None if wandb_resume_step is None else f"{run_id}?_step={wandb_resume_step}"
# )
# resume = "allow" if resume_from is None else None
# print("resume_from:", resume_from)
# print("resume:", resume)
# wandb.init(resume=resume, resume_from=resume_from)

# NOTE: Allow runs to resume by passing 'allow' to wandb
wandb.init(resume="allow", resume_from=resume_from)
wandb.init(resume="allow")
print("Wandb step before manually setting it:", wandb.run.step)
# NOTE: Currently getting an error from setattr here
wandb.run.step = extra_info.get("wandb_step", 0)
print("Wandb step after manually setting it:", wandb.run.step)

# training
for epoch in range(start_epoch, args.epochs):
Expand Down

0 comments on commit 05152ea

Please sign in to comment.