Skip to content

Commit

Permalink
Add sitrep to pax mgmn
Browse files Browse the repository at this point in the history
  • Loading branch information
hemildesai committed Nov 29, 2023
1 parent 9c7b41a commit 5cd10b8
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 107 deletions.
87 changes: 25 additions & 62 deletions .github/workflows/_test_pax.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,20 @@ on:
description: Extra command line args to pass to test-pax.sh
default: ""
required: false
BADGE_FILENAME:
type: string
description: 'Name of the endpoint JSON file for shields.io badge'
required: false
default: 'badge-pax-mgmn-test'
ARTIFACT_NAME:
type: string
description: 'Name of the artifact zip file'
required: false
default: 'artifact-pax-mgmn-test'
outputs:
TEST_STATUS:
description: 'Summary pass/fail value indicating if results from tests are acceptable'
value: ${{ jobs.publish-test.outputs.STATUS }}
value: ${{ jobs.sitrep.outputs.STATUS }}

jobs:

Expand Down Expand Up @@ -69,7 +79,7 @@ jobs:
shell: bash -O expand_aliases -x -e {0}
run: |
alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
sshx "date && hostname && sinfo"
sshx "date && hostname && sinfo"
sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
JOB=$(sshx sbatch --parsable << EOF
#!/bin/bash
Expand Down Expand Up @@ -134,7 +144,7 @@ jobs:
dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"}
json.dump(dump, f)
EOF
- name: Upload training logs as artifacts
uses: actions/upload-artifact@v3
with:
Expand Down Expand Up @@ -198,7 +208,7 @@ jobs:
shell: bash -O expand_aliases -x -e {0}
run: |
alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
sshx "date && hostname && sinfo"
sshx "date && hostname && sinfo"
sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
JOB=$(sshx sbatch --parsable << EOF
#!/bin/bash
Expand Down Expand Up @@ -271,7 +281,7 @@ jobs:
dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"}
json.dump(dump, f)
EOF
- name: Upload training logs as artifacts
uses: actions/upload-artifact@v3
with:
Expand Down Expand Up @@ -328,7 +338,7 @@ jobs:
shell: bash -O expand_aliases -x -e {0}
run: |
alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
sshx "date && hostname && sinfo"
sshx "date && hostname && sinfo"
sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
JOB=$(sshx sbatch --parsable << EOF
#!/bin/bash
Expand Down Expand Up @@ -402,7 +412,7 @@ jobs:
dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"}
json.dump(dump, f)
EOF
- name: Upload training logs as artifacts
uses: actions/upload-artifact@v3
with:
Expand Down Expand Up @@ -446,60 +456,13 @@ jobs:
name: metrics-test-log
path: report.jsonl


publish-test:
sitrep:
needs: [single-process-multi-device, pax-multi-node, single-process-evaluation, metrics]
uses: ./.github/workflows/_publish_badge.yaml
if: ( always() )
secrets: inherit
if: success() || failure()
uses: ./.github/workflows/_sitrep_mgmn.yaml
with:
ENDPOINT_FILENAME: 'pax-test-status.json'
PUBLISH: false
SCRIPT: |
EXIT_STATUSES="${GITHUB_RUN_ID}-*DP*FSDP*TP*PP*/*-status.json"
PASSED_TESTS=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
FAILED_TESTS=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
TOTAL_TESTS=$(ls $EXIT_STATUSES | wc -l)
echo '## PAX MGMN Test Status' >> $GITHUB_STEP_SUMMARY
for i in $EXIT_STATUSES; do
echo $i | cut -d'.' -f1
echo '```json'
jq . $i
echo '```'
done | tee -a $GITHUB_STEP_SUMMARY
echo "Test statuses:"
jq -rc 'input_filename,.' $EXIT_STATUSES
PYTEST_LOG=metrics-test-log/report.jsonl
all_outcomes() {
cat $PYTEST_LOG | jq -r '. | select((.["$report_type"] == "TestReport") and (.when == "call")) | .outcome'
}
cnt_type() {
cat $PYTEST_LOG | jq '. | select((.["$report_type"] == "TestReport") and (.when == "call") and (.outcome | contains("'${1}'"))) | .outcome' | wc -l
}
PYTEST_FAILED_TESTS=$(cnt_type failed)
PYTEST_PASSED_TESTS=$(cnt_type passed)
PYTEST_TOTAL_TESTS=$(all_outcomes | wc -l)
if ([[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]] && \
[[ $PYTEST_FAILED_TESTS -eq 0 ]] && [[ $PYTEST_TOTAL_TESTS -gt 0 ]]) || \
([[ $PASSED_TESTS -eq $TOTAL_TESTS ]] && [[ $PYTEST_PASSED_TESTS -eq $PYTEST_TOTAL_TESTS ]]); then
STATUS=success
BADGE_COLOR=brightgreen
elif [[ $PASSED_TESTS -eq 0 ]] || [[ $PYTEST_PASSED_TESTS -eq 0 ]]; then
STATUS=failure
BADGE_COLOR=red
else
STATUS=failure
BADGE_COLOR=yellow
fi
echo "STATUS='${STATUS}'" >> ${GITHUB_OUTPUT}
echo "LABEL='Completion'" >> $GITHUB_OUTPUT
echo "MESSAGE='${PASSED_TESTS}/${TOTAL_TESTS} ran ${PYTEST_PASSED_TESTS}/${PYTEST_TOTAL_TESTS} pass loss+perf'" >> $GITHUB_OUTPUT
echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT
BADGE_FILENAME: ${{ inputs.BADGE_FILENAME }}
ARTIFACT_NAME: ${{ inputs.ARTIFACT_NAME }}

summary:
runs-on: ubuntu-22.04
Expand All @@ -518,12 +481,12 @@ jobs:
) | tee $GITHUB_STEP_SUMMARY
outcome:
needs: publish-test
needs: sitrep
runs-on: ubuntu-22.04
if: ( always() )
steps:
- name: Sets workflow status based on test outputs
- name: Sets workflow status based on test outputs
run: |
if [[ ${{ needs.publish-test.outputs.STATUS }} != success ]]; then
if [[ ${{ needs.sitrep.outputs.STATUS }} != 'success' ]]; then
exit 1
fi
11 changes: 6 additions & 5 deletions .github/workflows/_test_t5x.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ jobs:
shell: bash -O expand_aliases -x -e {0}
run: |
alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
sshx "date && hostname && sinfo"
sshx "date && hostname && sinfo"
sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
JOB=$(sshx sbatch --parsable << EOF
#!/bin/bash
Expand Down Expand Up @@ -147,7 +147,7 @@ jobs:
dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"}
json.dump(dump, f)
EOF
- name: Upload training logs as artifacts
uses: actions/upload-artifact@v3
with:
Expand Down Expand Up @@ -202,7 +202,7 @@ jobs:
shell: bash -O expand_aliases -x -e {0}
run: |
alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
sshx "date && hostname && sinfo"
sshx "date && hostname && sinfo"
sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
JOB=$(sshx sbatch --parsable << EOF
#!/bin/bash
Expand Down Expand Up @@ -271,7 +271,7 @@ jobs:
dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"}
json.dump(dump, f)
EOF
- name: Upload training logs as artifacts
uses: actions/upload-artifact@v3
with:
Expand Down Expand Up @@ -314,6 +314,7 @@ jobs:
with:
name: metrics-test-log
path: report.jsonl

sitrep:
needs: [t5x-multi-node, t5x-multi-gpu, metrics]
if: success() || failure()
Expand Down Expand Up @@ -343,7 +344,7 @@ jobs:
runs-on: ubuntu-22.04
if: ( always() )
steps:
- name: Sets workflow status based on test outputs
- name: Sets workflow status based on test outputs
run: |
if [[ ${{ needs.sitrep.outputs.STATUS }} != 'success' ]]; then
exit 1
Expand Down
52 changes: 12 additions & 40 deletions .github/workflows/nightly-pax-test-mgmn.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -104,45 +104,9 @@ jobs:
EOF
) | tee $GITHUB_STEP_SUMMARY
publish-completion:
needs: [metadata, run-jobs]
uses: ./.github/workflows/_publish_badge.yaml
if: success() || failure()
secrets: inherit
with:
ENDPOINT_FILENAME: 'pax-test-completion-status.json'
PUBLISH: ${{ github.event_name == 'workflow_run' || needs.metadata.outputs.PUBLISH == 'true' }}
SCRIPT: |
STATUS=failure
if [[ ${{ needs.run-jobs.result }} == "success" ]]; then
EXIT_STATUSES="${GITHUB_RUN_ID}-*DP*TP*PP/*-status.json"
PASSED_TESTS=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
FAILED_TESTS=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
TOTAL_TESTS=$(ls $EXIT_STATUSES | wc -l)
echo "Test statuses:"
jq -rc 'input_filename,.' $EXIT_STATUSES
if [[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]] || [[ $PASSED_TESTS -eq $TOTAL_TESTS ]]; then
BADGE_COLOR=brightgreen
STATUS=success
elif [[ $PASSED_TESTS -eq 0 ]]; then
BADGE_COLOR=red
else
BADGE_COLOR=yellow
fi
echo "MESSAGE='${PASSED_TESTS}/${TOTAL_TESTS} passed'" >> $GITHUB_OUTPUT
echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT
else
echo "MESSAGE='n/a'" >> $GITHUB_OUTPUT
echo "COLOR='red'" >> $GITHUB_OUTPUT
fi
echo "LABEL='Completion'" >> $GITHUB_OUTPUT
echo "STATUS='$STATUS'" >> $GITHUB_OUTPUT
publish-verified:
if: needs.publish-completion.outputs.STATUS == 'success' && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH))
needs: [metadata, publish-completion]
if: needs.run-jobs.outputs.TEST_STATUS == 'success' && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH))
needs: [metadata, run-jobs]
uses: ./.github/workflows/_publish_container.yaml
secrets: inherit
with:
Expand All @@ -152,9 +116,9 @@ jobs:
type=raw,value=latest-verified,priority=1000
triage:
needs: [metadata, publish-completion]
needs: [metadata, run-jobs]
uses: ./.github/workflows/_triage.yaml
if: needs.publish-completion.outputs.STATUS != 'success' && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch')
if: needs.run-jobs.outputs.TEST_STATUS != 'success' && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch')
secrets: inherit
with:
BROKEN_IMAGE: ${{ needs.metadata.outputs.PAX_IMAGE }}
Expand All @@ -167,3 +131,11 @@ jobs:
if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'failure') && github.event_name != 'workflow_dispatch'
steps:
- run: echo 'Upstream workflow failed, aborting run' && exit 1

finalize:
if: always()
needs: [metadata, run-jobs]
uses: ./.github/workflows/_finalize.yaml
with:
PUBLISH_BADGE: ${{ github.event_name == 'workflow_run' || needs.metadata.outputs.PUBLISH == 'true' }}
secrets: inherit

0 comments on commit 5cd10b8

Please sign in to comment.