From 5cd10b88fc557e17f188ab12197de3c3d02a89f5 Mon Sep 17 00:00:00 2001
From: Hemil Desai
Date: Wed, 29 Nov 2023 13:42:30 -0800
Subject: [PATCH] Add sitrep to pax mgmn

---
Review notes (this area is ignored by `git am`; not part of the commit message):

 * _test_pax.yaml: the inline `publish-test` badge script is replaced by a
   `sitrep` job that calls _sitrep_mgmn.yaml. Two optional inputs,
   BADGE_FILENAME and ARTIFACT_NAME, are added and forwarded to it. The
   TEST_STATUS output and the `outcome` job now read `sitrep.outputs.STATUS`.
 * _test_t5x.yaml: whitespace-only changes plus one added blank line.
 * nightly-pax-test-mgmn.yaml: `publish-completion` is removed;
   `publish-verified` and `triage` now need `run-jobs` and read its
   TEST_STATUS output; a `finalize` job calling _finalize.yaml is added.
 * triage `if:` now begins with `(success() || failure())`. Without a status
   check function GitHub Actions applies an implicit success(), so a job that
   needs a failed job is skipped. The `needs.metadata.result` guard keeps
   triage from running with an empty BROKEN_IMAGE when metadata itself failed.
   NOTE(review): this assumes `run-jobs` calls _test_pax.yaml, whose `outcome`
   job exits 1 whenever STATUS is not 'success' - confirm against the full
   nightly workflow.
 * Several -/+ pairs below show identical visible text; presumably they are
   trailing-whitespace cleanups. Indentation in this copy was reconstructed,
   and the nightly `index` post-image hash predates the triage `if:` change.

 .github/workflows/_test_pax.yaml             | 87 ++++++--------------
 .github/workflows/_test_t5x.yaml             | 11 +--
 .github/workflows/nightly-pax-test-mgmn.yaml | 52 +++---------
 3 files changed, 43 insertions(+), 107 deletions(-)

diff --git a/.github/workflows/_test_pax.yaml b/.github/workflows/_test_pax.yaml
index f35dee0d2..cbdbf8084 100644
--- a/.github/workflows/_test_pax.yaml
+++ b/.github/workflows/_test_pax.yaml
@@ -13,10 +13,20 @@ on:
         description: Extra command line args to pass to test-pax.sh
         default: ""
         required: false
+      BADGE_FILENAME:
+        type: string
+        description: 'Name of the endpoint JSON file for shields.io badge'
+        required: false
+        default: 'badge-pax-mgmn-test'
+      ARTIFACT_NAME:
+        type: string
+        description: 'Name of the artifact zip file'
+        required: false
+        default: 'artifact-pax-mgmn-test'
     outputs:
       TEST_STATUS:
         description: 'Summary pass/fail value indicating if results from tests are acceptable'
-        value: ${{ jobs.publish-test.outputs.STATUS }}
+        value: ${{ jobs.sitrep.outputs.STATUS }}
 
 jobs:
 
@@ -69,7 +79,7 @@ jobs:
         shell: bash -O expand_aliases -x -e {0}
         run: |
           alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
-          sshx "date && hostname && sinfo"
+          sshx "date && hostname && sinfo"
           sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
           JOB=$(sshx sbatch --parsable << EOF
           #!/bin/bash
@@ -134,7 +144,7 @@ jobs:
           dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"}
           json.dump(dump, f)
           EOF
-
+
       - name: Upload training logs as artifacts
         uses: actions/upload-artifact@v3
         with:
@@ -198,7 +208,7 @@ jobs:
         shell: bash -O expand_aliases -x -e {0}
         run: |
           alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
-          sshx "date && hostname && sinfo"
+          sshx "date && hostname && sinfo"
           sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
           JOB=$(sshx sbatch --parsable << EOF
           #!/bin/bash
@@ -271,7 +281,7 @@ jobs:
           dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"}
           json.dump(dump, f)
           EOF
-
+
       - name: Upload training logs as artifacts
         uses: actions/upload-artifact@v3
         with:
@@ -328,7 +338,7 @@ jobs:
         shell: bash -O expand_aliases -x -e {0}
         run: |
           alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
-          sshx "date && hostname && sinfo"
+          sshx "date && hostname && sinfo"
           sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
           JOB=$(sshx sbatch --parsable << EOF
           #!/bin/bash
@@ -402,7 +412,7 @@ jobs:
           dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"}
           json.dump(dump, f)
           EOF
-
+
       - name: Upload training logs as artifacts
         uses: actions/upload-artifact@v3
         with:
@@ -446,60 +456,13 @@ jobs:
           name: metrics-test-log
           path: report.jsonl
 
-
-  publish-test:
+  sitrep:
     needs: [single-process-multi-device, pax-multi-node, single-process-evaluation, metrics]
-    uses: ./.github/workflows/_publish_badge.yaml
-    if: ( always() )
-    secrets: inherit
+    if: success() || failure()
+    uses: ./.github/workflows/_sitrep_mgmn.yaml
     with:
-      ENDPOINT_FILENAME: 'pax-test-status.json'
-      PUBLISH: false
-      SCRIPT: |
-        EXIT_STATUSES="${GITHUB_RUN_ID}-*DP*FSDP*TP*PP*/*-status.json"
-        PASSED_TESTS=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
-        FAILED_TESTS=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
-        TOTAL_TESTS=$(ls $EXIT_STATUSES | wc -l)
-
-        echo '## PAX MGMN Test Status' >> $GITHUB_STEP_SUMMARY
-        for i in $EXIT_STATUSES; do
-          echo $i | cut -d'.' -f1
-          echo '```json'
-          jq . $i
-          echo '```'
-        done | tee -a $GITHUB_STEP_SUMMARY
-
-        echo "Test statuses:"
-        jq -rc 'input_filename,.' $EXIT_STATUSES
-
-        PYTEST_LOG=metrics-test-log/report.jsonl
-        all_outcomes() {
-          cat $PYTEST_LOG | jq -r '. | select((.["$report_type"] == "TestReport") and (.when == "call")) | .outcome'
-        }
-        cnt_type() {
-          cat $PYTEST_LOG | jq '. | select((.["$report_type"] == "TestReport") and (.when == "call") and (.outcome | contains("'${1}'"))) | .outcome' | wc -l
-        }
-        PYTEST_FAILED_TESTS=$(cnt_type failed)
-        PYTEST_PASSED_TESTS=$(cnt_type passed)
-        PYTEST_TOTAL_TESTS=$(all_outcomes | wc -l)
-
-        if ([[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]] && \
-            [[ $PYTEST_FAILED_TESTS -eq 0 ]] && [[ $PYTEST_TOTAL_TESTS -gt 0 ]]) || \
-            ([[ $PASSED_TESTS -eq $TOTAL_TESTS ]] && [[ $PYTEST_PASSED_TESTS -eq $PYTEST_TOTAL_TESTS ]]); then
-          STATUS=success
-          BADGE_COLOR=brightgreen
-        elif [[ $PASSED_TESTS -eq 0 ]] || [[ $PYTEST_PASSED_TESTS -eq 0 ]]; then
-          STATUS=failure
-          BADGE_COLOR=red
-        else
-          STATUS=failure
-          BADGE_COLOR=yellow
-        fi
-        echo "STATUS='${STATUS}'" >> ${GITHUB_OUTPUT}
-        echo "LABEL='Completion'" >> $GITHUB_OUTPUT
-        echo "MESSAGE='${PASSED_TESTS}/${TOTAL_TESTS} ran ${PYTEST_PASSED_TESTS}/${PYTEST_TOTAL_TESTS} pass loss+perf'" >> $GITHUB_OUTPUT
-        echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT
-
+      BADGE_FILENAME: ${{ inputs.BADGE_FILENAME }}
+      ARTIFACT_NAME: ${{ inputs.ARTIFACT_NAME }}
 
   summary:
     runs-on: ubuntu-22.04
@@ -518,12 +481,12 @@ jobs:
           ) | tee $GITHUB_STEP_SUMMARY
 
   outcome:
-    needs: publish-test
+    needs: sitrep
     runs-on: ubuntu-22.04
     if: ( always() )
     steps:
-      - name: Sets workflow status based on test outputs
+      - name: Sets workflow status based on test outputs
         run: |
-          if [[ ${{ needs.publish-test.outputs.STATUS }} != success ]]; then
+          if [[ ${{ needs.sitrep.outputs.STATUS }} != 'success' ]]; then
             exit 1
           fi
diff --git a/.github/workflows/_test_t5x.yaml b/.github/workflows/_test_t5x.yaml
index 4a57ab0db..e5d0a4a15 100644
--- a/.github/workflows/_test_t5x.yaml
+++ b/.github/workflows/_test_t5x.yaml
@@ -81,7 +81,7 @@ jobs:
         shell: bash -O expand_aliases -x -e {0}
         run: |
           alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
-          sshx "date && hostname && sinfo"
+          sshx "date && hostname && sinfo"
           sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
           JOB=$(sshx sbatch --parsable << EOF
           #!/bin/bash
@@ -147,7 +147,7 @@ jobs:
           dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"}
           json.dump(dump, f)
           EOF
-
+
       - name: Upload training logs as artifacts
         uses: actions/upload-artifact@v3
         with:
@@ -202,7 +202,7 @@ jobs:
         shell: bash -O expand_aliases -x -e {0}
         run: |
           alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
-          sshx "date && hostname && sinfo"
+          sshx "date && hostname && sinfo"
           sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
           JOB=$(sshx sbatch --parsable << EOF
           #!/bin/bash
@@ -271,7 +271,7 @@ jobs:
           dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"}
           json.dump(dump, f)
           EOF
-
+
       - name: Upload training logs as artifacts
         uses: actions/upload-artifact@v3
         with:
@@ -314,6 +314,7 @@ jobs:
         with:
           name: metrics-test-log
           path: report.jsonl
+
   sitrep:
     needs: [t5x-multi-node, t5x-multi-gpu, metrics]
     if: success() || failure()
@@ -343,7 +344,7 @@ jobs:
     runs-on: ubuntu-22.04
     if: ( always() )
     steps:
-      - name: Sets workflow status based on test outputs
+      - name: Sets workflow status based on test outputs
         run: |
           if [[ ${{ needs.sitrep.outputs.STATUS }} != 'success' ]]; then
             exit 1
diff --git a/.github/workflows/nightly-pax-test-mgmn.yaml b/.github/workflows/nightly-pax-test-mgmn.yaml
index db041cd77..ff1137050 100644
--- a/.github/workflows/nightly-pax-test-mgmn.yaml
+++ b/.github/workflows/nightly-pax-test-mgmn.yaml
@@ -104,45 +104,9 @@ jobs:
           EOF
           ) | tee $GITHUB_STEP_SUMMARY
 
-  publish-completion:
-    needs: [metadata, run-jobs]
-    uses: ./.github/workflows/_publish_badge.yaml
-    if: success() || failure()
-    secrets: inherit
-    with:
-      ENDPOINT_FILENAME: 'pax-test-completion-status.json'
-      PUBLISH: ${{ github.event_name == 'workflow_run' || needs.metadata.outputs.PUBLISH == 'true' }}
-      SCRIPT: |
-        STATUS=failure
-        if [[ ${{ needs.run-jobs.result }} == "success" ]]; then
-          EXIT_STATUSES="${GITHUB_RUN_ID}-*DP*TP*PP/*-status.json"
-          PASSED_TESTS=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
-          FAILED_TESTS=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
-          TOTAL_TESTS=$(ls $EXIT_STATUSES | wc -l)
-
-          echo "Test statuses:"
-          jq -rc 'input_filename,.' $EXIT_STATUSES
-
-          if [[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]] || [[ $PASSED_TESTS -eq $TOTAL_TESTS ]]; then
-            BADGE_COLOR=brightgreen
-            STATUS=success
-          elif [[ $PASSED_TESTS -eq 0 ]]; then
-            BADGE_COLOR=red
-          else
-            BADGE_COLOR=yellow
-          fi
-          echo "MESSAGE='${PASSED_TESTS}/${TOTAL_TESTS} passed'" >> $GITHUB_OUTPUT
-          echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT
-        else
-          echo "MESSAGE='n/a'" >> $GITHUB_OUTPUT
-          echo "COLOR='red'" >> $GITHUB_OUTPUT
-        fi
-        echo "LABEL='Completion'" >> $GITHUB_OUTPUT
-        echo "STATUS='$STATUS'" >> $GITHUB_OUTPUT
-
   publish-verified:
-    if: needs.publish-completion.outputs.STATUS == 'success' && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH))
-    needs: [metadata, publish-completion]
+    if: needs.run-jobs.outputs.TEST_STATUS == 'success' && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH))
+    needs: [metadata, run-jobs]
     uses: ./.github/workflows/_publish_container.yaml
     secrets: inherit
     with:
@@ -152,9 +116,9 @@ jobs:
         type=raw,value=latest-verified,priority=1000
 
   triage:
-    needs: [metadata, publish-completion]
+    needs: [metadata, run-jobs]
     uses: ./.github/workflows/_triage.yaml
-    if: needs.publish-completion.outputs.STATUS != 'success' && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch')
+    if: (success() || failure()) && needs.metadata.result == 'success' && needs.run-jobs.outputs.TEST_STATUS != 'success' && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch')
     secrets: inherit
     with:
       BROKEN_IMAGE: ${{ needs.metadata.outputs.PAX_IMAGE }}
@@ -167,3 +131,11 @@ jobs:
     if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'failure') && github.event_name != 'workflow_dispatch'
     steps:
       - run: echo 'Upstream workflow failed, aborting run' && exit 1
+
+  finalize:
+    if: always()
+    needs: [metadata, run-jobs]
+    uses: ./.github/workflows/_finalize.yaml
+    with:
+      PUBLISH_BADGE: ${{ github.event_name == 'workflow_run' || needs.metadata.outputs.PUBLISH == 'true' }}
+    secrets: inherit