From 4aac00088365287c667b9a8b1582d395307ac998 Mon Sep 17 00:00:00 2001 From: loreloc Date: Thu, 18 Jul 2024 13:16:51 +0100 Subject: [PATCH] updated slurm utility scripts --- slurm/launch.sh | 28 +++++++++++++++++----------- slurm/run.sh | 11 +++++++++++ 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/slurm/launch.sh b/slurm/launch.sh index 325d69e..a2af210 100644 --- a/slurm/launch.sh +++ b/slurm/launch.sh @@ -4,36 +4,42 @@ export PROJECT_NAME="sos-npcs" export PYTHONPATH=${PYTHONPATH:-src} # These flags need to be updated accordingly: -# SCRATCH_DIR: a directory within the local filesystem of a node # EXPS_ID: some identifier for the experiments # VENV_PATH: the path containing the pip virtual environment -export SCRATCH_DIR=${SCRATCH_DIR:-/disk/scratch_big/$USER} export EXPS_ID=${EXPS_ID:-exps} export VENV_PATH=${VENV_PATH:-venv} # The Slurm partition to use, e.g., #PARTITION=PGR-Standard -PARTITION=${PARTITION:-} +PARTITION=${PARTITION:-PGR-Standard} # An optional list of Slurm node to exclude, e.g., #EXCL_NODES=${EXCL_NODES:-busynode[01-07]} -EXCL_NODES=${EXCL_NODES:-} -# An optional list of Slurm node to allow -LIST_NODES=${LIST_NODES:-} +EXCL_NODES=${EXCL_NODES:-crannog[01-07],damnii[05-08]} # The maximum number of parallel jobs to dispatch MAX_PARALLEL_JOBS=12 # Resources and maximum execution time NUM_CPUS=2 NUM_GPUS=1 -TIME=120:00:00 +TIME=167:00:00 JOB_NAME="$PROJECT_NAME-$EXPS_ID" -OUTPUT="slurm/logs/$JOB_NAME-%j.out" +LOG_DIRECTORY="slurm/logs/$PROJECT_NAME/$EXPS_ID" +LOG_OUTPUT="$LOG_DIRECTORY/%j.out" EXPS_FILE="$1" NUM_EXPS=`cat ${EXPS_FILE} | wc -l` -sbatch --job-name $JOB_NAME --output "$OUTPUT" --partition "$PARTITION" \ - --nodes 1 --ntasks 1 --cpus-per-task $NUM_CPUS --gres=gpu:$NUM_GPUS \ - --time $TIME --exclude "$EXCL_NODES" \ +echo "Creating slurm logging directory $LOG_DIRECTORY" +mkdir -p "$LOG_DIRECTORY" + +echo "Slurm job settings" +echo "Partition: $PARTITION" +echo "Excl nodes: $EXCL_NODES" + +sbatch --job-name $JOB_NAME --output "$LOG_OUTPUT" --error "$LOG_OUTPUT" \ + --partition "$PARTITION" --nodes 1 --ntasks 1 \ + --cpus-per-task $NUM_CPUS --gres=gpu:$NUM_GPUS \ + --time $TIME --exclude="$EXCL_NODES" \ --array=1-${NUM_EXPS}%${MAX_PARALLEL_JOBS} \ slurm/run.sh "$EXPS_FILE" + diff --git a/slurm/run.sh b/slurm/run.sh index 6861726..1e41a2c 100644 --- a/slurm/run.sh +++ b/slurm/run.sh @@ -1,5 +1,16 @@ #!/bin/bash +# Find a suitable scratch directory +SCRATCH_DIR="/disk/scratch_big" +if [ ! -w "$SCRATCH_DIR" ] +then + SCRATCH_DIR="/disk/scratch" +fi + +echo "Running job on the partition $SLURM_JOB_PARTITION" +echo " and on the node $SLURMD_NODENAME" +echo "Using scratch directory $SCRATCH_DIR" + RESULTS_PATH="$SCRATCH_DIR/$SLURM_JOB_ID" DESTINATION_PATH="$HOME/$PROJECT_NAME" TBOARD_DIR="$RESULTS_PATH/tboard-runs/$EXPS_ID"