Skip to content

Commit

Permalink
Merge branch 'main' of github.com:loreloc/sos-npcs
Browse files Browse the repository at this point in the history
  • Loading branch information
loreloc committed Jul 18, 2024
2 parents 1324738 + 4aac000 commit 357b8b9
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 11 deletions.
28 changes: 17 additions & 11 deletions slurm/launch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,36 +4,42 @@ export PROJECT_NAME="sos-npcs"
# Use "src" as the Python module search path unless PYTHONPATH is already set and non-empty
export PYTHONPATH=${PYTHONPATH:-src}

# These settings need to be adjusted to your environment. Each one keeps the
# value found in the caller's environment and only falls back to the default
# shown here when it is unset or empty:
# SCRATCH_DIR: a directory within the local filesystem of a node
# EXPS_ID: some identifier for the experiments
# VENV_PATH: the path containing the pip virtual environment
export SCRATCH_DIR=${SCRATCH_DIR:-/disk/scratch_big/$USER}
export EXPS_ID=${EXPS_ID:-exps}
export VENV_PATH=${VENV_PATH:-venv}

# The Slurm partition to use, e.g.,
#PARTITION=PGR-Standard
# NOTE(review): PARTITION is assigned twice in a row. The first assignment
# leaves an unset value empty, so the second one supplies PGR-Standard as the
# effective default. This looks like the old and new lines of a diff shown
# together -- confirm which one belongs in the script.
PARTITION=${PARTITION:-}
PARTITION=${PARTITION:-PGR-Standard}
# An optional list of Slurm nodes to exclude, e.g.,
#EXCL_NODES=${EXCL_NODES:-busynode[01-07]}
EXCL_NODES=${EXCL_NODES:-}
# An optional list of Slurm nodes to allow
# NOTE(review): LIST_NODES is not passed to any sbatch call visible here -- confirm it is still needed.
LIST_NODES=${LIST_NODES:-}
# NOTE(review): second assignment of EXCL_NODES; as written, this default wins
# whenever the variable is unset or empty -- confirm which line is current.
EXCL_NODES=${EXCL_NODES:-crannog[01-07],damnii[05-08]}
# The maximum number of parallel jobs to dispatch
# (used as the "%" limit of the sbatch --array range)
MAX_PARALLEL_JOBS=12

# Resources and maximum execution time
# NUM_CPUS is passed to --cpus-per-task and NUM_GPUS to --gres=gpu:
NUM_CPUS=2
NUM_GPUS=1
# TIME is passed to --time.
# NOTE(review): TIME is assigned twice, so the later value (167:00:00) is the
# one in effect -- this looks like old and new diff lines shown together.
TIME=120:00:00
TIME=167:00:00

# Job name shown by Slurm: "<project>-<experiments id>"
JOB_NAME="$PROJECT_NAME-$EXPS_ID"
# Flat log file path; "%j" is left literal here for sbatch to interpret.
# NOTE(review): only the first sbatch call uses OUTPUT; the later one uses
# LOG_OUTPUT -- presumably OUTPUT is the superseded variant, confirm.
OUTPUT="slurm/logs/$JOB_NAME-%j.out"
# Per-project, per-experiment log directory and the log file pattern inside it
LOG_DIRECTORY="slurm/logs/$PROJECT_NAME/$EXPS_ID"
LOG_OUTPUT="$LOG_DIRECTORY/%j.out"
# The experiments file is the first positional argument of the script
EXPS_FILE="$1"
# Number of lines in the experiments file; it sets the upper bound of the job
# array. The path is quoted and read via redirection so that paths containing
# spaces work and a missing argument fails instead of counting lines on stdin.
NUM_EXPS=$(wc -l < "$EXPS_FILE")

# NOTE(review): the three-line sbatch invocation below ends with a line
# continuation, so as written the following "echo" line becomes part of its
# argument list. A second, fuller sbatch call follows further down; this first
# one is presumably the superseded version left over from a diff view --
# confirm and remove it before running the script.
sbatch --job-name $JOB_NAME --output "$OUTPUT" --partition "$PARTITION" \
--nodes 1 --ntasks 1 --cpus-per-task $NUM_CPUS --gres=gpu:$NUM_GPUS \
--time $TIME --exclude "$EXCL_NODES" \
echo "Creating slurm logging directory $LOG_DIRECTORY"
# Create the log directory (and any missing parents) before submitting the job
mkdir -p "$LOG_DIRECTORY"

# Print the partition and the excluded nodes that will be used for the submission
echo "Slurm job settings"
echo "Partition: $PARTITION"
echo "Excl nodes: $EXCL_NODES"

# Submit slurm/run.sh as a job array with one task per line of the experiments
# file, capped at MAX_PARALLEL_JOBS via the "%" limit of --array. Both --output
# and --error point at the same log file pattern.
sbatch --job-name $JOB_NAME --output "$LOG_OUTPUT" --error "$LOG_OUTPUT" \
--partition "$PARTITION" --nodes 1 --ntasks 1 \
--cpus-per-task $NUM_CPUS --gres=gpu:$NUM_GPUS \
--time $TIME --exclude="$EXCL_NODES" \
--array=1-${NUM_EXPS}%${MAX_PARALLEL_JOBS} \
slurm/run.sh "$EXPS_FILE"

11 changes: 11 additions & 0 deletions slurm/run.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,16 @@
#!/bin/bash

# Choose the scratch directory: start from the standard scratch path and
# switch to the large scratch disk whenever that one is writable
SCRATCH_DIR="/disk/scratch"
if [[ -w "/disk/scratch_big" ]]; then
  SCRATCH_DIR="/disk/scratch_big"
fi

# Report the partition, the node and the scratch directory of this job
printf '%s\n' \
  "Running job on the partition $SLURM_JOB_PARTITION" \
  " and on the node $SLURMD_NODENAME" \
  "Using scratch directory $SCRATCH_DIR"

# Path under the user's home directory, named after the project
DESTINATION_PATH="${HOME}/${PROJECT_NAME}"
# Per-job path inside the scratch directory, keyed by the Slurm job ID
RESULTS_PATH="${SCRATCH_DIR}/${SLURM_JOB_ID}"
# TensorBoard runs of this experiment set, nested inside the per-job path
TBOARD_DIR="${RESULTS_PATH}/tboard-runs/${EXPS_ID}"
Expand Down

0 comments on commit 357b8b9

Please sign in to comment.