Skip to content

Commit

Permalink
eks: use kueue
Browse files Browse the repository at this point in the history
This avoids deadlocks by providing basic gang scheduling. Also, the
cluster now has a few cores worth of non-GPU node capacity, so we no
longer need to run the post-processing test on the large P5 nodes.
  • Loading branch information
olupton committed Dec 13, 2024
1 parent d7cac31 commit c98f25e
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 39 deletions.
2 changes: 2 additions & 0 deletions .github/eks-workflow-files/job.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ apiVersion: batch/v1
kind: Job
metadata:
name: PLACEHOLDER
labels:
kueue.x-k8s.io/queue-name: p5-queue
spec:
completions: 2 # number of nodes
parallelism: 2 # number of nodes
Expand Down
50 changes: 26 additions & 24 deletions .github/eks-workflow-files/mpi-nccl-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,15 @@ apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
name: PLACEHOLDER
labels:
kueue.x-k8s.io/queue-name: p5-queue
spec:
# Without this then the first few attempts to run the launcher will result in errors
# due to failed DNS resolution of the worker names. It works eventually, given a big
# enough backoffLimit, but it makes it harder to handle log-streaming and identifying
# the "real" exit code of the job.
launcherCreationPolicy: WaitForWorkersReady
runPolicy:
cleanPodPolicy: Running
# surface errors direct to GitHub Actions without internal retries
# surface errors direct to GitHub Actions without Kubernetes-internal retries
backoffLimit: 0
# start suspended, let kueue unblock
suspend: true
# 1 MPI rank per GPU
slotsPerWorker: 8
mpiReplicaSpecs:
Expand All @@ -27,25 +26,28 @@ spec:
imagePullPolicy: IfNotPresent
name: PLACEHOLDER
command:
- mpirun
- --allow-run-as-root
- -np
- "16"
- -N
- "8"
- PLACEHOLDER
- -b
- "8"
- -e
- "16G"
- -f
- "2"
- -g
- "1"
- bash
- -c
- "1"
- -n
- "100"
- |
# kueue breaks the WaitForWorkersReady policy that mpi-operator
# nominally supports, so manually wait a while for a basic mpirun to
# start working (i.e. for the workers to be ready) before doing
# anything interesting, instead of relying on mpi-operator not to
# start the launcher before it is expected to succeed. This issue
# seems related: https://github.com/kubeflow/mpi-operator/pull/617
limit=5m
if ! timeout ${limit} sh -c "while ! mpirun --allow-run-as-root -N 1 hostname; do sleep 5; done"; then
echo "Workers were still not reachable after ${limit}, exiting"
exit 1
fi
mpirun --allow-run-as-root -np 16 -N 8 $0 \
-b 8 \
-e 16G \
-f 2 \
-g 1 \
-c 1 \
-n 100
- PLACEHOLDER
imagePullSecrets:
- name: PLACEHOLDER
Worker:
Expand Down
4 changes: 0 additions & 4 deletions .github/eks-workflow-files/post-process-job.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,6 @@ spec:
- pipefail
- -c
- nsys-jax-combine -o /opt/output/combined.zip /opt/output/*.zip --analysis communication
# FIXME: GPU not actually needed, but the test cluster doesn't have appropriate non-GPU nodes
resources:
limits:
nvidia.com/gpu: 1
volumeMounts:
- mountPath: /opt/output
name: output
Expand Down
24 changes: 13 additions & 11 deletions .github/workflows/nccl-k8s.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ jobs:
| .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].image = strenv(BASE_IMAGE)
| .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].name = strenv(LAUNCHER_NAME)
| .spec.mpiReplicaSpecs.Launcher.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
| .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[6] = strenv(TEST_NAME)
| .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[3] = strenv(TEST_NAME)
| .spec.mpiReplicaSpecs.Worker.template.spec.containers[].image = strenv(BASE_IMAGE)
| .spec.mpiReplicaSpecs.Worker.template.spec.containers[].name = strenv(WORKER_NAME)
| .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \
Expand All @@ -97,19 +97,21 @@ jobs:
- name: Wait for Kubernetes job to start
# Note that this is *not* using JOB_NAME
run: |
# Launcher job is only created once the workers are ready; wait for its
# creation. This is where we block if the cluster is busy executing other jobs,
# but it might be better to impose more of a parallelism limit at the GitHub
# Actions level to keep the Kubernetes queue length modest
kubectl wait --for=create job/${LAUNCHER_NAME} --timeout=3600s
# Launcher job is created eagerly, but suspended. Kueue un-suspends it when
# resources are available, but that is where there can be a long wait if the
# cluster is busy executing other jobs.
kubectl wait --for=create job/${LAUNCHER_NAME}
kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${LAUNCHER_NAME} --timeout=3600s
- name: Stream Kubernetes job output
# Note that this is *not* JOB_NAME
run: |
# Streaming logs will fail if the container/pod is still pending
while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
sleep 1
done
- name: Stream Kubernetes job output
# Note that this is *not* JOB_NAME
# TODO: --all-containers=true --all-pods=true could make sense here
run: kubectl logs --follow job/${LAUNCHER_NAME}
# TODO: --all-containers=true --all-pods=true could make sense here, but it
# prefixes lines with a rather verbose tag
kubectl logs --follow job/${LAUNCHER_NAME}
- name: Retrieve Kubernetes job status
shell: bash -exo pipefail {0}
run: |
Expand All @@ -135,7 +137,7 @@ jobs:
run: |
# Provide better debug in case of launch failures that will not produce log output
pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name)
        if [[ -n "${pods}" ]]; then
if [[ -n "${pods}" ]]; then
kubectl describe ${pods}
fi
# Clean up in case of errors as well as success
Expand Down

0 comments on commit c98f25e

Please sign in to comment.