diff --git a/CHANGELOG.md b/CHANGELOG.md index 9db50369a..61eda832a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Pytest fixtures in the `conftest.py` file of the integration test suite - NOTE: an export command `export LC_ALL='C'` had to be added to fix a bug in the WEAVE CI. This can be removed when we resolve this issue for the `merlin server` command - Tests for the `celeryadapter.py` module +- New example for stopping an ensemble workflow in `merlin/examples/workflows/stop_ensemble/`. ## [1.11.1] ### Fixed diff --git a/merlin/examples/workflows/stop_ensemble/README b/merlin/examples/workflows/stop_ensemble/README new file mode 100644 index 000000000..30cddb6ac --- /dev/null +++ b/merlin/examples/workflows/stop_ensemble/README @@ -0,0 +1,70 @@ ++----------------------------------------------------+ +| | +| Stop Ensemble Example | +| | +| Description: Hello World example including a quit | +| script to stop running and dependent | +| jobs running the Merlin workflow. | +| | +| Important Scripts: -workers.sbatch | +| -hello_samples.yaml | +| -quitScript.sh | +| | ++----------------------------------------------------+ + + +========================================================== + Running the Example +========================================================== + +1) merlin run +2) sbatch workers.sbatch + +Notes: - is your yaml file including + the extension ".yaml". + -The second input parameter to workers.sbatch tells + slurm to either run a diasy chain of dependency + jobs ("true") or no dependency jobs ("false" or + no second input parameter at all). + + +========================================================== + What is Going On +========================================================== + +When the job is allocated, the typical hello_samples will +execute via step_1 and step_2 within the yaml file. After, +a stopWorkers step is executed that will copy the +quitScript.sh bash file from $(SPECROOT) and obtain the +specfile and the specRoot. + +These two variables are passed into quitSript.sh +which will find all job IDs of both running and awaiting +job allocations, loop through them, and cancel every +single one that deals with the +ran from $(SPECROOT) defined in steps 1 and 2 in the +"Running the Example" section above. + + +========================================================== + Important Notes +========================================================== + +1) quitScript.sh is submitted via sbatch instead of a + straight source or bash -c call because otherwise the + script will stop the workers immediately and the YAML + file then stops, but the task would never obtain a + successfull exit. Therefore, it would have to be purged + each time before starting a new Merlin run. By + submitting it via sbatch, once it is submitted the task + can exit successfully before the workers are stopped. + +2) Make sure the stopWorkers task is the very last task + in your workflow as it will stop any and all jobs + that have been allocated for the workflow. + +3) The virtual environment name and path are defined + in workers.sbatch. Change these to your specific needs + prior to running. + + diff --git a/merlin/examples/workflows/stop_ensemble/hello_samples.yaml b/merlin/examples/workflows/stop_ensemble/hello_samples.yaml new file mode 100644 index 000000000..e801df775 --- /dev/null +++ b/merlin/examples/workflows/stop_ensemble/hello_samples.yaml @@ -0,0 +1,51 @@ +description: + name: hello_samples + description: An example to stop ensembles automatically using hello world + +env: + variables: + N_SAMPLES: 10 + +global.parameters: + GREET: + values : ["hello","hola"] + label : GREET.%% + +study: + - name: step_1 + description: say hello + run: + cmd: echo "$(GREET), $(WORLD)!" + + - name: step_2 + description: print a success message + run: + cmd: print("Hurrah, we did it!") + depends: [step_1_*] + shell: /usr/bin/env python3 + + - name: stopEnsemble + description: Stop any and all workers attached to this specification file + run: + cmd: | + cp $(SPECROOT)/quitScript.sh ./ + + specFile=$(basename $(MERLIN_SPEC_ORIGINAL_TEMPLATE)) + specFile=${specFile/.orig.yaml/.yaml} + + echo $(SPECROOT) + echo $specFile + + sbatch ./quitScript.sh $(SPECROOT) $specFile + + #Give successful exit since batch job was canceled + #and can't do it + exit $(MERLIN_SUCCESS) + depends: [step_2] + +merlin: + samples: + generate: + cmd: python3 $(SPECROOT)/make_samples.py --filepath=$(MERLIN_INFO)/samples.csv --number=$(N_SAMPLES) + file: $(MERLIN_INFO)/samples.csv + column_labels: [WORLD] diff --git a/merlin/examples/workflows/stop_ensemble/make_samples.py b/merlin/examples/workflows/stop_ensemble/make_samples.py new file mode 100644 index 000000000..3b7b5b398 --- /dev/null +++ b/merlin/examples/workflows/stop_ensemble/make_samples.py @@ -0,0 +1,22 @@ +import argparse + +import names +import numpy as np + + +# argument parsing +parser = argparse.ArgumentParser(description="Make some samples (names of people).") +parser.add_argument("--number", type=int, action="store", help="the number of samples you want to make") +parser.add_argument("--filepath", type=str, help="output file") +args = parser.parse_args() + +# sample making +all_names = np.loadtxt(names.FILES["first:female"], dtype=str, usecols=0) +selected_names = np.random.choice(all_names, size=args.number) + +result = "" +name_list = list(selected_names) +result = "\n".join(name_list) + +with open(args.filepath, "w") as f: + f.write(result) diff --git a/merlin/examples/workflows/stop_ensemble/quitScript.sh b/merlin/examples/workflows/stop_ensemble/quitScript.sh new file mode 100644 index 000000000..a94a5d3f6 --- /dev/null +++ b/merlin/examples/workflows/stop_ensemble/quitScript.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +#SBATCH -N 1 +#SBATCH --ntasks-per-node=1 +#SBATCH -J stopWorkers +#SBATCH -t 00:01:00 +#SBATCH -o merlin_StopWorker_%j.out + +# Turn off core files to work around flux exec issue. +ulimit -c 0 + + +specRoot=$1 +targetSpecFile=$2 + +for JOB2CHECK in $(squeue --me --format="%F" | grep -v "ARRAY") +do + if [[ $JOB2CHECK != $SLURM_JOB_ID ]] + then + if [[ ! -z $(scontrol show job $JOB2CHECK | \ + grep -w "WorkDir=$specRoot") ]] + then + outFile=$(scontrol show job $JOB2CHECK | \ + grep "StdOut=") + outFile=${outFile##*=} + + foundSpec=$(grep 'Specification File: ' $outFile | sed 's/^.*: //') + + if [[ $targetSpecFile == $foundSpec ]] + then + cancelJOB_NAME=$(squeue --me --job=$JOB2CHECK --format=%j | sed -n '2p') + echo $cancelJOB_NAME + echo "Job to Cancel::::Job Name: $cancelJOB_NAME Job ID: $JOB2CHECK" + echo "canceling job" + scancel $JOB2CHECK + fi + fi + fi +done diff --git a/merlin/examples/workflows/stop_ensemble/requirements.txt b/merlin/examples/workflows/stop_ensemble/requirements.txt new file mode 100644 index 000000000..684db4dc8 --- /dev/null +++ b/merlin/examples/workflows/stop_ensemble/requirements.txt @@ -0,0 +1,2 @@ +names +numpy diff --git a/merlin/examples/workflows/stop_ensemble/workers.sbatch b/merlin/examples/workflows/stop_ensemble/workers.sbatch new file mode 100644 index 000000000..7ceb31994 --- /dev/null +++ b/merlin/examples/workflows/stop_ensemble/workers.sbatch @@ -0,0 +1,72 @@ +#!/bin/bash + +#SBATCH -N 1 +#SBATCH --ntasks-per-node=36 +#SBATCH -J stopEnsembleExample +#SBATCH -t 00:10:00 +#SBATCH -o merlin_workers_%j.out +#SBATCH -p pdebug + +# Turn off core files to work around flux exec issue. +ulimit -c 0 + +doDaisyChain=false +YAML=default + +# Get this filename +WORKER_SCRIPT=$(scontrol show job $SLURM_JOB_ID | grep -hi "Command=" | xargs -0 basename) + +if [[ $# -gt 0 ]] +then + YAML=$1 + + if [[ $# -gt 1 ]] + then + if [[ $2 == 1 || $2 == true || $2 == True || $2 == yes || $2 == Yes ]] + then + doDaisyChain=true + fi + fi +fi + + +echo "Spec Filename: $YAML" + +MERLIN_PATH= +VENV_NAME= + +# Activate the virtual environment +source ${MERLIN_PATH}/${VENV_NAME}/bin/activate + +# Show the workers command +merlin run-workers ${YAML} --echo + +# Start workers to run the tasks in the broker +merlin run-workers ${YAML} + +# Check that the YAML file was correct. Otherwise cancel job +outFile=$(scontrol show job $SLURM_JOB_ID | grep -i 'stdout=') +outFile=${outFile#*=} +isBadYAML=$(head -n 1 $outFile | grep -i "is not a valid filepath" | wc -l) +if [[ isBadYAML -eq 1 ]] +then + echo "Incorrect YAML File: $YAML" + echo "Canceling Job" + scancel $SLURM_JOB_ID +fi + + +# Start dependency job if this one times out +# The new job name will contain the job id of the job that +# created the depenendency +if [[ $doDaisyChain = true ]] +then + JOB_NAME_BASE=${SLURM_JOB_NAME%%-d-*} + JOB_NAME="$JOB_NAME_BASE-d-${SLURM_JOB_ID}" + sbatch -J $JOB_NAME --depend=afterany:${SLURM_JOB_ID} ${WORKER_SCRIPT} $YAML $doDaisyChain +fi + +# Keep the allocation alive +sleep inf + +