diff --git a/.github/workflows/_publish_t5x_pax_results.yaml b/.github/workflows/_publish_t5x_pax_results.yaml index f79298002..5306a5f07 100644 --- a/.github/workflows/_publish_t5x_pax_results.yaml +++ b/.github/workflows/_publish_t5x_pax_results.yaml @@ -63,4 +63,4 @@ jobs: [view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars&regexInput=$(jq -nr --arg url "${FOLDER}" '$url|@uri')&_smoothingWeight=0&tagFilter=seqs_per) EOF - ) | tee $GITHUB_STEP_SUMMARY \ No newline at end of file + ) | tee $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/_sitrep_mgmn.yaml b/.github/workflows/_sitrep_mgmn.yaml new file mode 100644 index 000000000..7fd0cbd0e --- /dev/null +++ b/.github/workflows/_sitrep_mgmn.yaml @@ -0,0 +1,134 @@ +name: ~Generate sitrep for Multi-Node Multi-GPU tests + +on: + workflow_call: + inputs: + BADGE_FILENAME: + type: string + description: 'Name of the endpoint JSON file for shields.io badge' + required: true + ARTIFACT_NAME: + type: string + description: 'Name of the artifact zip file' + required: true + FW_NAME: + type: string + description: 'Name of the framework being used' + required: true + outputs: + STATUS: + description: 'Summary of all tests run for the workflow. Set to "success" when all metrics per job and all jobs pass, whereas a single metric failure or job error sets the status to "failure"' + value: ${{ jobs.sitrep.outputs.STATUS }} + +jobs: + sitrep: + runs-on: ubuntu-22.04 + outputs: + STATUS: ${{ steps.gen-sitrep.outputs.STATUS }} + steps: + - name: Check out repository + uses: actions/checkout@v3 + + - name: Download all artifacts from the previous jobs + uses: actions/download-artifact@v3 + + - name: Write exit status summary + id: exit-status + shell: bash -x -e {0} + run: | + EXIT_STATUSES="${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}-*/*-status.json" + EXIT_STATUS_SUMMARY_FILE="exit_status_summary.json" + echo -e "\n\n## ${{ inputs.FW_NAME }} MGMN+SPMD Test Status" >> $EXIT_STATUS_SUMMARY_FILE + cat <<EOF >>$EXIT_STATUS_SUMMARY_FILE + | Test Case | State | Exit Code | + | --- | --- | --- | + EOF + + for i in $EXIT_STATUSES; do + # Files are named <FW_NAME>-<GITHUB_RUN_ID>-<TEST_CASE_NAME>/<TEST_CASE_NAME>-status.json + echo "| $(echo $i | cut -d/ -f1 | awk -F- '{print $NF}') | $(jq -r .state $i) | $(jq -r .exitcode $i)" + done | tee -a $EXIT_STATUS_SUMMARY_FILE + + echo "Test statuses:" + jq -rc 'input_filename,.' $EXIT_STATUSES + + echo "EXIT_STATUS_SUMMARY_FILE=$EXIT_STATUS_SUMMARY_FILE" >> ${GITHUB_OUTPUT} + + - name: Write metrics summary + id: metrics + shell: bash -x -e {0} + run: | + METRICS_SUMMARY_FILE="metrics_summary.json" + echo -e "\n\n## ${{ inputs.FW_NAME }} MGMN Test Metrics" >> $METRICS_SUMMARY_FILE + for i in metrics-test-log/*_metrics.json; do + echo $i | cut -d'.' -f1 + echo '```json' + jq . $i + echo '```' + done | tee -a $METRICS_SUMMARY_FILE + + echo "METRICS_SUMMARY_FILE=$METRICS_SUMMARY_FILE" >> ${GITHUB_OUTPUT} + + - name: Generate sitrep + id: gen-sitrep + shell: bash -x -e {0} + run: | + source .github/workflows/scripts/to_json.sh + + EXIT_STATUSES="${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}-*/*-status.json" + + passed_tests=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l) + failed_tests=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l) + total_tests=$(ls $EXIT_STATUSES | wc -l) + + METRICS_LOG=metrics-test-log/report.jsonl + all_outcomes() { + cat $METRICS_LOG | jq -r '.
| select((.["$report_type"] == "TestReport") and (.when == "call")) | .outcome' + } + cnt_type() { + cat $METRICS_LOG | jq '. | select((.["$report_type"] == "TestReport") and (.when == "call") and (.outcome | contains("'${1}'"))) | .outcome' | wc -l + } + pytest_failed_tests=$(cnt_type failed) + pytest_passed_tests=$(cnt_type passed) + pytest_total_tests=$(all_outcomes | wc -l) + + if ([[ $failed_tests -eq 0 ]] && [[ $total_tests -gt 0 ]] && \ + [[ $pytest_failed_tests -eq 0 ]] && [[ $pytest_total_tests -gt 0 ]]); then + status=success + badge_color=brightgreen + elif [[ $passed_tests -eq 0 ]] || [[ $pytest_passed_tests -eq 0 ]]; then + status=failure + badge_color=red + else + status=failure + badge_color=yellow + fi + badge_message="${passed_tests}/${total_tests} jobs | ${pytest_passed_tests}/${pytest_total_tests} metrics" + + badge_label='Upstream Tests' + summary="# ${{ inputs.FW_NAME }} MGMN Test: $badge_message" + summary+=`cat ${{ steps.exit-status.outputs.EXIT_STATUS_SUMMARY_FILE }}` + summary+=`cat ${{ steps.metrics.outputs.METRICS_SUMMARY_FILE }}` + + to_json \ + summary \ + total_tests passed_tests failed_tests \ + badge_label badge_color badge_message \ + > sitrep.json + + schemaVersion=1 \ + label="${badge_label}" \ + message="${badge_message}" \ + color="${badge_color}" \ + to_json schemaVersion label message color \ + > ${{ inputs.BADGE_FILENAME }} + + echo "STATUS='${status}'" >> ${GITHUB_OUTPUT} + + - name: Upload artifacts + uses: actions/upload-artifact@v3 + with: + name: ${{ inputs.ARTIFACT_NAME }} + path: | + sitrep.json + ${{ inputs.BADGE_FILENAME }} diff --git a/.github/workflows/_test_pax.yaml b/.github/workflows/_test_pax.yaml index e0e4d1435..052c9dbe3 100644 --- a/.github/workflows/_test_pax.yaml +++ b/.github/workflows/_test_pax.yaml @@ -13,15 +13,25 @@ on: description: Extra command line args to pass to test-pax.sh default: "" required: false + BADGE_FILENAME: + type: string + description: 'Name of the endpoint JSON file for shields.io badge' + required: false + default: 'badge-pax-mgmn-test.json' ARTIFACT_NAME: type: string - description: If provided, will prepend a prefix to the artifact name. 
Helpful if re-running this reusable workflow to prevent clobbering of artifacts - default: "" + description: 'Name of the artifact zip file' required: false + default: 'artifact-pax-mgmn-test' + FW_NAME: + type: string + description: 'Name of the framework being used' + required: false + default: 'pax' outputs: TEST_STATUS: description: 'Summary pass/fail value indicating if results from tests are acceptable' - value: ${{ jobs.publish-test.outputs.STATUS }} + value: ${{ jobs.sitrep.outputs.STATUS }} jobs: @@ -63,7 +73,7 @@ jobs: MAX_GPUS_PER_NODE=8 NODES=1 GPUS_PER_NODE=8 - JOB_NAME=${{ inputs.ARTIFACT_NAME }}pax-${GITHUB_RUN_ID}-${TEST_CASE_NAME} + JOB_NAME=${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}-${TEST_CASE_NAME} LOG_FILE=/nfs/cluster/${JOB_NAME}.log MODEL_PATH=/nfs/cluster/${JOB_NAME} for var in IMAGE TEST_CASE_NAME TOTAL_TASKS NODES GPUS_PER_NODE JOB_NAME LOG_FILE MODEL_PATH; do @@ -74,7 +84,7 @@ jobs: shell: bash -O expand_aliases -x -e {0} run: | alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}' - sshx "date && hostname && sinfo" + sshx "date && hostname && sinfo" sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }} JOB=$(sshx sbatch --parsable << EOF #!/bin/bash @@ -129,7 +139,7 @@ jobs: output/ || true rsync -rtz --progress \ output/ \ - ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.ARTIFACT_NAME }}pax-${GITHUB_RUN_ID}/ || true + ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}/ || true - name: Write SLURM job status to file shell: bash -x -e {0} run: | @@ -139,7 +149,7 @@ jobs: dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"} json.dump(dump, f) EOF - + - name: Upload training logs as artifacts uses: actions/upload-artifact@v3 with: @@ -191,7 +201,7 @@ jobs: NODES=$(((TOTAL_TASKS+MAX_GPUS_PER_NODE-1)/MAX_GPUS_PER_NODE)) GPUS_PER_NODE=$((TOTAL_TASKS/NODES)) - JOB_NAME=${{ inputs.ARTIFACT_NAME }}pax-${GITHUB_RUN_ID}-${TEST_CASE_NAME} + JOB_NAME=${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}-${TEST_CASE_NAME} LOG_FILE=/nfs/cluster/${JOB_NAME}.log MODEL_PATH=/nfs/cluster/${JOB_NAME} for var in IMAGE TEST_CASE_NAME TOTAL_TASKS NODES GPUS_PER_NODE JOB_NAME LOG_FILE MODEL_PATH; do @@ -203,7 +213,7 @@ jobs: shell: bash -O expand_aliases -x -e {0} run: | alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}' - sshx "date && hostname && sinfo" + sshx "date && hostname && sinfo" sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }} JOB=$(sshx sbatch --parsable << EOF #!/bin/bash @@ -265,7 +275,7 @@ jobs: output/ || true rsync -rtz --progress \ output/ \ - ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.ARTIFACT_NAME }}pax-${GITHUB_RUN_ID}/ || true + ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}/ || true - name: Write SLURM job status to file shell: bash -x -e {0} @@ -276,7 +286,7 @@ jobs: dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"} json.dump(dump, f) EOF - + - name: Upload training logs as artifacts uses: actions/upload-artifact@v3 with: @@ -321,7 +331,7 @@ jobs: NODES=1 GPUS_PER_NODE=8 - JOB_NAME=${{ inputs.ARTIFACT_NAME }}pax-${GITHUB_RUN_ID}-${TEST_CASE_NAME} + JOB_NAME=${{ 
inputs.FW_NAME }}-${GITHUB_RUN_ID}-${TEST_CASE_NAME} LOG_FILE=/nfs/cluster/${JOB_NAME}.log MODEL_PATH=/nfs/cluster/${JOB_NAME} for var in IMAGE TEST_CASE_NAME TOTAL_TASKS NODES GPUS_PER_NODE JOB_NAME LOG_FILE MODEL_PATH; do @@ -333,7 +343,7 @@ jobs: shell: bash -O expand_aliases -x -e {0} run: | alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}' - sshx "date && hostname && sinfo" + sshx "date && hostname && sinfo" sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }} JOB=$(sshx sbatch --parsable << EOF #!/bin/bash @@ -396,7 +406,7 @@ jobs: output/ || true rsync -rtz --progress \ output/ \ - ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.ARTIFACT_NAME }}pax-${GITHUB_RUN_ID}/ || true + ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}/ || true - name: Write SLURM job status to file shell: bash -x -e {0} run: | dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"} json.dump(dump, f) EOF - + - name: Upload training logs as artifacts uses: actions/upload-artifact@v3 with: @@ -429,83 +439,30 @@ jobs: shell: bash -x {0} run: | pip install pytest pytest-reportlog tensorboard - for i in ${{ inputs.ARTIFACT_NAME }}pax-${GITHUB_RUN_ID}-*; do + for i in ${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}-*; do SUBDIR=$(echo $i | cut -d'-' -f3) mv $i/$SUBDIR* . python3 .github/workflows/baselines/summarize_metrics.py $SUBDIR # create result json in baseline format done - echo '## PAX MGMN Test Metrics' >> $GITHUB_STEP_SUMMARY - for i in *_metrics.json; do - echo $i | cut -d'.' -f1 - echo '```json' - jq . $i - echo '```' - done | tee -a $GITHUB_STEP_SUMMARY - RESULTS_DIR=$PWD BASELINES_DIR=PAX_MGMN/upstream pytest --report-log=report.jsonl .github/workflows/baselines/test_pax_mgmn_metrics.py || true - name: Upload metrics test json logs uses: actions/upload-artifact@v3 with: name: metrics-test-log - path: report.jsonl + path: | + report.jsonl + *_metrics.json - - publish-test: + sitrep: needs: [single-process-multi-device, pax-multi-node, single-process-evaluation, metrics] - uses: ./.github/workflows/_publish_badge.yaml - if: ( always() ) - secrets: inherit + if: success() || failure() + uses: ./.github/workflows/_sitrep_mgmn.yaml with: - ENDPOINT_FILENAME: '${{ inputs.ARTIFACT_NAME }}pax-test-status.json' - PUBLISH: false - SCRIPT: | - EXIT_STATUSES="${{ inputs.ARTIFACT_NAME }}pax-${GITHUB_RUN_ID}-*DP*FSDP*TP*PP*/*-status.json" - PASSED_TESTS=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l) - FAILED_TESTS=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l) - TOTAL_TESTS=$(ls $EXIT_STATUSES | wc -l) - - cat <<EOF >>$GITHUB_STEP_SUMMARY - ## Pax MGMN+SPMD Test Status - | Test Case | State | Exit Code | - | --- | --- | --- | - EOF - for i in $EXIT_STATUSES; do - # Files are named pax-<GITHUB_RUN_ID>-<TEST_CASE_NAME>/<TEST_CASE_NAME>-status.json - echo "| $(echo $i | cut -d/ -f1 | cut -d- -f3) | $(jq -r .state $i) | $(jq -r .exitcode $i)" - done | tee -a $GITHUB_STEP_SUMMARY - - echo "Test statuses:" - jq -rc 'input_filename,.' $EXIT_STATUSES - - METRICS_LOG=metrics-test-log/report.jsonl - all_outcomes() { - cat $METRICS_LOG | jq -r '. | select((.["$report_type"] == "TestReport") and (.when == "call")) | .outcome' - } - cnt_type() { - cat $METRICS_LOG | jq '.
| select((.["$report_type"] == "TestReport") and (.when == "call") and (.outcome | contains("'${1}'"))) | .outcome' | wc -l - } - PYTEST_FAILED_TESTS=$(cnt_type failed) - PYTEST_PASSED_TESTS=$(cnt_type passed) - PYTEST_TOTAL_TESTS=$(all_outcomes | wc -l) - - if ([[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]] && \ - [[ $PYTEST_FAILED_TESTS -eq 0 ]] && [[ $PYTEST_TOTAL_TESTS -gt 0 ]]); then - STATUS=success - BADGE_COLOR=brightgreen - elif [[ $PASSED_TESTS -eq 0 ]] || [[ $PYTEST_PASSED_TESTS -eq 0 ]]; then - STATUS=failure - BADGE_COLOR=red - else - STATUS=failure - BADGE_COLOR=yellow - fi - echo "STATUS='${STATUS}'" >> ${GITHUB_OUTPUT} - echo "LABEL='Completion'" >> $GITHUB_OUTPUT - echo "MESSAGE='${PASSED_TESTS}/${TOTAL_TESTS} ran ${PYTEST_PASSED_TESTS}/${PYTEST_TOTAL_TESTS} pass loss+perf'" >> $GITHUB_OUTPUT - echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT - + BADGE_FILENAME: ${{ inputs.BADGE_FILENAME }} + ARTIFACT_NAME: ${{ inputs.ARTIFACT_NAME }} + FW_NAME: ${{ inputs.FW_NAME }} summary: runs-on: ubuntu-22.04 @@ -518,18 +475,18 @@ jobs: ## PAX MGMN training - [view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars®exInput=${{ inputs.ARTIFACT_NAME }}pax-${GITHUB_RUN_ID}&_smoothingWeight=0&tagFilter=seqs_per) + [view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars®exInput=${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}&_smoothingWeight=0&tagFilter=seqs_per) EOF ) | tee $GITHUB_STEP_SUMMARY outcome: - needs: publish-test + needs: sitrep runs-on: ubuntu-22.04 if: ( always() ) steps: - - name: Sets workflow status based on test outputs + - name: Sets workflow status based on test outputs run: | - if [[ ${{ needs.publish-test.outputs.STATUS }} != success ]]; then + if [[ ${{ needs.sitrep.outputs.STATUS }} != 'success' ]]; then exit 1 fi diff --git a/.github/workflows/_test_t5x.yaml b/.github/workflows/_test_t5x.yaml index 3de559b41..23bacd815 100644 --- a/.github/workflows/_test_t5x.yaml +++ b/.github/workflows/_test_t5x.yaml @@ -18,15 +18,25 @@ on: description: Extra gin args to pass to test-t5x.sh default: "" required: false + BADGE_FILENAME: + type: string + description: 'Name of the endpoint JSON file for shields.io badge' + required: false + default: 'badge-t5x-mgmn-test.json' ARTIFACT_NAME: type: string - description: If provided, will prepend a prefix to the artifact name. 
Helpful if re-running this reusable workflow to prevent clobbering of artifacts - default: "" + description: 'Name of the artifact zip file' + required: false + default: 'artifact-t5x-mgmn-test' + FW_NAME: + type: string + description: 'Name of the framework being used' required: false + default: 'T5X' outputs: TEST_STATUS: description: 'Summary pass/fail value indicating if results from tests are acceptable' - value: ${{ jobs.publish-test.outputs.STATUS }} + value: ${{ jobs.sitrep.outputs.STATUS }} jobs: @@ -63,7 +73,7 @@ jobs: run: | IMAGE="$(echo ${{inputs.T5X_IMAGE}} | sed 's/\//#/')" TEST_CASE_NAME=1P${{ matrix.N_GPU }}G - JOB_NAME=${{ inputs.ARTIFACT_NAME }}T5X-${GITHUB_RUN_ID}-${TEST_CASE_NAME} + JOB_NAME=${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}-${TEST_CASE_NAME} LOG_FILE=/nfs/cluster/${JOB_NAME}.log MODEL_PATH=/nfs/cluster/${JOB_NAME} BATCH_SIZE=$((${{ inputs.BATCH_SIZE_PER_GPU }} * ${{ matrix.N_GPU }})) @@ -76,7 +86,7 @@ jobs: shell: bash -O expand_aliases -x -e {0} run: | alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}' - sshx "date && hostname && sinfo" + sshx "date && hostname && sinfo" sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }} JOB=$(sshx sbatch --parsable << EOF #!/bin/bash @@ -131,7 +141,7 @@ jobs: output/ || true rsync -rtz --progress \ output/ \ - ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.ARTIFACT_NAME }}T5X-${GITHUB_RUN_ID}/ || true + ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}/ || true - name: Write SLURM job status to file shell: bash -x -e {0} @@ -142,7 +152,7 @@ jobs: dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"} json.dump(dump, f) EOF - + - name: Upload training logs as artifacts uses: actions/upload-artifact@v3 with: @@ -184,7 +194,7 @@ jobs: IMAGE="$(echo ${{inputs.T5X_IMAGE}} | sed 's/\//#/')" TEST_CASE_NAME=${{ matrix.N_GPU }}G${{ matrix.N_NODE }}N TOTAL_TASKS=$((${{ matrix.N_GPU }} * ${{ matrix.N_NODE }})) - JOB_NAME=${{ inputs.ARTIFACT_NAME }}T5X-${GITHUB_RUN_ID}-${TEST_CASE_NAME} + JOB_NAME=${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}-${TEST_CASE_NAME} LOG_FILE=/nfs/cluster/${JOB_NAME}.log MODEL_PATH=/nfs/cluster/${JOB_NAME} BATCH_SIZE=$((${{ inputs.BATCH_SIZE_PER_GPU }} * ${{ matrix.N_GPU }} * ${{ matrix.N_NODE }})) @@ -197,7 +207,7 @@ jobs: shell: bash -O expand_aliases -x -e {0} run: | alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}' - sshx "date && hostname && sinfo" + sshx "date && hostname && sinfo" sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }} JOB=$(sshx sbatch --parsable << EOF #!/bin/bash @@ -255,7 +265,7 @@ jobs: output/ || true rsync -rtz --progress \ output/ \ - ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.ARTIFACT_NAME }}T5X-${GITHUB_RUN_ID}/ || true + ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}/ || true - name: Write SLURM job status to file shell: bash -x -e {0} @@ -266,7 +276,7 @@ jobs: dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"} json.dump(dump, f) EOF - + - name: Upload training logs as artifacts uses: actions/upload-artifact@v3 with: @@ -288,82 +298,31 @@ jobs: shell: bash -x {0} run: | pip 
install pytest pytest-reportlog tensorboard - for i in ${{ inputs.ARTIFACT_NAME }}T5X-${GITHUB_RUN_ID}-*; do + for i in ${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}-*; do SUBDIR=$(echo $i | cut -d'-' -f3) mv $i/$SUBDIR* . python3 .github/workflows/baselines/summarize_metrics.py $SUBDIR --perf_summary_name "timing/steps_per_second" # create result json in baseline format done - echo '## T5X MGMN Test Metrics' >> $GITHUB_STEP_SUMMARY - for i in *_metrics.json; do - echo $i | cut -d'.' -f1 - echo '```json' - jq . $i - echo '```' - done | tee -a $GITHUB_STEP_SUMMARY - RESULTS_DIR=$PWD pytest --report-log=report.jsonl .github/workflows/baselines/test_t5x_mgmn_metrics.py || true - name: Upload metrics test json logs uses: actions/upload-artifact@v3 with: name: metrics-test-log - path: report.jsonl + path: | + report.jsonl + *_metrics.json - publish-test: + sitrep: needs: [t5x-multi-node, t5x-multi-gpu, metrics] - uses: ./.github/workflows/_publish_badge.yaml - if: ( always() ) - secrets: inherit + if: success() || failure() + uses: ./.github/workflows/_sitrep_mgmn.yaml with: - ENDPOINT_FILENAME: '${{ inputs.ARTIFACT_NAME }}t5x-test-completion-status.json' - PUBLISH: false - SCRIPT: | - EXIT_STATUSES="${{ inputs.ARTIFACT_NAME }}T5X-${GITHUB_RUN_ID}-*/*-status.json" - PASSED_TESTS=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l) - FAILED_TESTS=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l) - TOTAL_TESTS=$(ls $EXIT_STATUSES | wc -l) - - cat <<EOF >>$GITHUB_STEP_SUMMARY - ## T5x MGMN+SPMD Test Status - | Test Case | State | Exit Code | - | --- | --- | --- | - EOF - for i in $EXIT_STATUSES; do - # Files are named T5X-<GITHUB_RUN_ID>-<TEST_CASE_NAME>/<TEST_CASE_NAME>-status.json - echo "| $(echo $i | cut -d/ -f1 | cut -d- -f3) | $(jq -r .state $i) | $(jq -r .exitcode $i)" - done | tee -a $GITHUB_STEP_SUMMARY - - echo "Test statuses:" - jq -rc 'input_filename,.' $EXIT_STATUSES - - METRICS_LOG=metrics-test-log/report.jsonl - all_outcomes() { - cat $METRICS_LOG | jq -r '. | select((.["$report_type"] == "TestReport") and (.when == "call")) | .outcome' - } - cnt_type() { - cat $METRICS_LOG | jq '.
| select((.["$report_type"] == "TestReport") and (.when == "call") and (.outcome | contains("'${1}'"))) | .outcome' | wc -l - } - PYTEST_FAILED_TESTS=$(cnt_type failed) - PYTEST_PASSED_TESTS=$(cnt_type passed) - PYTEST_TOTAL_TESTS=$(all_outcomes | wc -l) - - if ([[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]] && \ - [[ $PYTEST_FAILED_TESTS -eq 0 ]] && [[ $PYTEST_TOTAL_TESTS -gt 0 ]]); then - STATUS=success - BADGE_COLOR=brightgreen - elif [[ $PASSED_TESTS -eq 0 ]] || [[ $PYTEST_PASSED_TESTS -eq 0 ]]; then - STATUS=failure - BADGE_COLOR=red - else - STATUS=failure - BADGE_COLOR=yellow - fi - echo "STATUS='${STATUS}'" >> ${GITHUB_OUTPUT} - echo "LABEL='Completion'" >> $GITHUB_OUTPUT - echo "MESSAGE='${PASSED_TESTS}/${TOTAL_TESTS} ran ${PYTEST_PASSED_TESTS}/${PYTEST_TOTAL_TESTS} pass loss+perf'" >> $GITHUB_OUTPUT - echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT + BADGE_FILENAME: ${{ inputs.BADGE_FILENAME }} + ARTIFACT_NAME: ${{ inputs.ARTIFACT_NAME }} + FW_NAME: ${{ inputs.FW_NAME }} summary: runs-on: ubuntu-22.04 @@ -376,18 +335,18 @@ jobs: ## T5X MGMN training - [view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars®exInput=${{ inputs.ARTIFACT_NAME }}T5X-${GITHUB_RUN_ID}&_smoothingWeight=0&tagFilter=seqs_per) + [view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars®exInput=${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}&_smoothingWeight=0&tagFilter=seqs_per) EOF ) | tee $GITHUB_STEP_SUMMARY outcome: - needs: publish-test + needs: sitrep runs-on: ubuntu-22.04 if: ( always() ) steps: - - name: Sets workflow status based on test outputs + - name: Sets workflow status based on test outputs run: | - if [[ ${{ needs.publish-test.outputs.STATUS }} != success ]]; then + if [[ ${{ needs.sitrep.outputs.STATUS }} != 'success' ]]; then exit 1 fi diff --git a/.github/workflows/nightly-pax-test-mgmn.yaml b/.github/workflows/nightly-pax-test-mgmn.yaml index 3ae570c3e..edb25939d 100644 --- a/.github/workflows/nightly-pax-test-mgmn.yaml +++ b/.github/workflows/nightly-pax-test-mgmn.yaml @@ -34,30 +34,53 @@ jobs: runs-on: ubuntu-22.04 outputs: BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }} - PAX_IMAGE: ${{ steps.date.outputs.PAX_IMAGE }} - PUBLISH: ${{ steps.date.outputs.PUBLISH }} + PAX_IMAGE: ${{ steps.image.outputs.PAX_IMAGE }} + PUBLISH: ${{ steps.publish.outputs.PUBLISH }} steps: - - name: Set metadata + - name: Check if the triggering workflow failed + id: if-upstream-failed + shell: bash -x -e {0} + run: | + echo "UPSTREAM_FAILED=${{ github.event_name == 'workflow_run' && github.event.workflow_run.conclusion != 'success' }}" >> $GITHUB_OUTPUT + + - name: Cancel workflow if upstream workflow did not success + if: ${{ steps.if-upstream-failed.outputs.UPSTREAM_FAILED == 'true' }} + run: | + echo "Upstream workflow failed, cancelling this workflow" + curl -X POST -H "Authorization: token ${{ github.token }}" \ + -H "Accept: application/vnd.github.v3+json" \ + "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/cancel" + cat # blocks execution in case workflow cancellation takes time + + - name: Set build date id: date shell: bash -x -e {0} run: | BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d') echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT + - name: Set docker image + id: image + shell: bash -x -e {0} + run: | PAX_IMAGE=${{ inputs.PAX_IMAGE }} PAX_IMAGE=${PAX_IMAGE:-${{ env.DEFAULT_PAX_IMAGE }}} echo "PAX_IMAGE=${PAX_IMAGE}" >> $GITHUB_OUTPUT - echo "PUBLISH=${{ inputs.PUBLISH }}" >> $GITHUB_OUTPUT + + - name: Set result publishing flags + id: 
publish + shell: bash -x -e {0} + run: | + echo "PUBLISH=${{ github.event_name == 'workflow_run' || inputs.PUBLISH == 'true' }}" >> $GITHUB_OUTPUT run-jobs: needs: metadata uses: ./.github/workflows/_test_pax.yaml - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' with: PAX_IMAGE: ${{ needs.metadata.outputs.PAX_IMAGE }} secrets: inherit - publish: + tensorboard-upload: needs: [metadata, run-jobs] uses: ./.github/workflows/_publish_t5x_pax_results.yaml if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' @@ -66,45 +89,9 @@ jobs: EXPERIMENT_SUBDIR: PAX secrets: inherit - publish-completion: + publish-verified-image: needs: [metadata, run-jobs] - uses: ./.github/workflows/_publish_badge.yaml - if: success() || failure() - secrets: inherit - with: - ENDPOINT_FILENAME: 'pax-test-completion-status.json' - PUBLISH: ${{ github.event_name == 'workflow_run' || needs.metadata.outputs.PUBLISH == 'true' }} - SCRIPT: | - STATUS=failure - if [[ ${{ needs.run-jobs.result }} == "success" ]]; then - EXIT_STATUSES="${GITHUB_RUN_ID}-*DP*TP*PP/*-status.json" - PASSED_TESTS=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l) - FAILED_TESTS=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l) - TOTAL_TESTS=$(ls $EXIT_STATUSES | wc -l) - - echo "Test statuses:" - jq -rc 'input_filename,.' $EXIT_STATUSES - - if [[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]] || [[ $PASSED_TESTS -eq $TOTAL_TESTS ]]; then - BADGE_COLOR=brightgreen - STATUS=success - elif [[ $PASSED_TESTS -eq 0 ]]; then - BADGE_COLOR=red - else - BADGE_COLOR=yellow - fi - echo "MESSAGE='${PASSED_TESTS}/${TOTAL_TESTS} passed'" >> $GITHUB_OUTPUT - echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT - else - echo "MESSAGE='n/a'" >> $GITHUB_OUTPUT - echo "COLOR='red'" >> $GITHUB_OUTPUT - fi - echo "LABEL='Completion'" >> $GITHUB_OUTPUT - echo "STATUS='$STATUS'" >> $GITHUB_OUTPUT - - publish-verified: - if: needs.publish-completion.outputs.STATUS == 'success' && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH)) - needs: [metadata, publish-completion] + if: needs.metadata.outputs.PUBLISH == 'true' && needs.run-jobs.outputs.TEST_STATUS == 'success' uses: ./.github/workflows/_publish_container.yaml secrets: inherit with: @@ -113,10 +100,10 @@ jobs: TARGET_TAGS: | type=raw,value=latest-verified,priority=1000 - triage: - needs: [metadata, publish-completion] + failure-triage: + needs: [metadata, run-jobs] + if: needs.run-jobs.outputs.TEST_STATUS != 'success' uses: ./.github/workflows/_triage.yaml - if: needs.publish-completion.outputs.STATUS != 'success' && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch') secrets: inherit with: BROKEN_IMAGE: ${{ needs.metadata.outputs.PAX_IMAGE }} @@ -124,8 +111,10 @@ jobs: REPO_DIRS: "/opt/paxml /opt/praxis" FILE_ISSUE: true - if-upstream-failed: - runs-on: ubuntu-latest - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'failure') && github.event_name != 'workflow_dispatch' - steps: - - run: echo 'Upstream workflow failed, aborting run' && exit 1 + finalize: + if: "!cancelled()" + needs: [metadata, run-jobs] + uses: 
./.github/workflows/_finalize.yaml + with: + PUBLISH_BADGE: ${{ needs.metadata.outputs.PUBLISH }} + secrets: inherit diff --git a/.github/workflows/nightly-t5x-test-mgmn.yaml b/.github/workflows/nightly-t5x-test-mgmn.yaml index 866711940..88fc15250 100644 --- a/.github/workflows/nightly-t5x-test-mgmn.yaml +++ b/.github/workflows/nightly-t5x-test-mgmn.yaml @@ -34,77 +34,63 @@ jobs: runs-on: ubuntu-22.04 outputs: BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }} - T5X_IMAGE: ${{ steps.date.outputs.T5X_IMAGE }} - PUBLISH: ${{ steps.date.outputs.PUBLISH }} + T5X_IMAGE: ${{ steps.image.outputs.T5X_IMAGE }} + PUBLISH: ${{ steps.publish.outputs.PUBLISH }} steps: - - name: Set metadata + - name: Check if the triggering workflow failed + id: if-upstream-failed + shell: bash -x -e {0} + run: | + echo "UPSTREAM_FAILED=${{ github.event_name == 'workflow_run' && github.event.workflow_run.conclusion != 'success' }}" >> $GITHUB_OUTPUT + + - name: Cancel workflow if upstream workflow did not succeed + if: ${{ steps.if-upstream-failed.outputs.UPSTREAM_FAILED == 'true' }} + run: | + echo "Upstream workflow failed, cancelling this workflow" + curl -X POST -H "Authorization: token ${{ github.token }}" \ + -H "Accept: application/vnd.github.v3+json" \ + "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/cancel" + cat # blocks execution in case workflow cancellation takes time + + - name: Set build date id: date shell: bash -x -e {0} run: | BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d') echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT + - name: Set docker image + id: image + shell: bash -x -e {0} + run: | T5X_IMAGE=${{ inputs.T5X_IMAGE }} T5X_IMAGE=${T5X_IMAGE:-${{ env.DEFAULT_T5X_IMAGE }}} echo "T5X_IMAGE=${T5X_IMAGE}" >> $GITHUB_OUTPUT - echo "PUBLISH=${{ inputs.PUBLISH }}" >> $GITHUB_OUTPUT + + - name: Set result publishing flags + id: publish + shell: bash -x -e {0} + run: | + echo "PUBLISH=${{ github.event_name == 'workflow_run' || inputs.PUBLISH == 'true' }}" >> $GITHUB_OUTPUT run-jobs: needs: metadata uses: ./.github/workflows/_test_t5x.yaml - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' with: T5X_IMAGE: ${{ needs.metadata.outputs.T5X_IMAGE }} secrets: inherit - publish: + tensorboard-upload: needs: [metadata, run-jobs] uses: ./.github/workflows/_publish_t5x_pax_results.yaml - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' with: BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} EXPERIMENT_SUBDIR: T5X secrets: inherit - publish-completion: + publish-verified-image: needs: [metadata, run-jobs] - uses: ./.github/workflows/_publish_badge.yaml - if: success() || failure() - secrets: inherit - with: - ENDPOINT_FILENAME: 't5x-test-overall-status.json' - PUBLISH: ${{ github.event_name == 'workflow_run' || needs.metadata.outputs.PUBLISH == 'true' }} - SCRIPT: | - STATUS=failure - if [[ ${{ needs.run-jobs.result }} == "success" ]]; then - EXIT_STATUSES="${GITHUB_RUN_ID}-*/*-status.json" - PASSED_TESTS=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l) - FAILED_TESTS=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l) - TOTAL_TESTS=$(ls $EXIT_STATUSES | wc -l) - - echo "Test statuses:" - jq -rc 'input_filename,.'
$EXIT_STATUSES - - if [[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]] || [[ $PASSED_TESTS -eq $TOTAL_TESTS ]]; then - BADGE_COLOR=brightgreen - STATUS=success - elif [[ $PASSED_TESTS -eq 0 ]]; then - BADGE_COLOR=red - else - BADGE_COLOR=yellow - fi - echo "MESSAGE='${PASSED_TESTS}/${TOTAL_TESTS} passed'" >> $GITHUB_OUTPUT - echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT - else - echo "MESSAGE='n/a'" >> $GITHUB_OUTPUT - echo "COLOR='red'" >> $GITHUB_OUTPUT - fi - echo "LABEL='Completion'" >> $GITHUB_OUTPUT - echo "STATUS='$STATUS'" >> $GITHUB_OUTPUT - - publish-verified: - if: needs.publish-completion.outputs.STATUS == 'success' && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH)) - needs: [metadata, publish-completion] + if: needs.metadata.outputs.PUBLISH == 'true' && needs.run-jobs.outputs.TEST_STATUS == 'success' uses: ./.github/workflows/_publish_container.yaml secrets: inherit with: @@ -113,10 +99,10 @@ jobs: TARGET_TAGS: | type=raw,value=latest-verified,priority=1000 - triage: - needs: [metadata, publish-completion] + failure-triage: + needs: [metadata, run-jobs] + if: needs.run-jobs.outputs.TEST_STATUS != 'success' uses: ./.github/workflows/_triage.yaml - if: needs.publish-completion.outputs.STATUS != 'success' && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch') secrets: inherit with: BROKEN_IMAGE: ${{ needs.metadata.outputs.T5X_IMAGE }} @@ -124,8 +110,10 @@ jobs: REPO_DIRS: "/opt/t5x /opt/flax" FILE_ISSUE: true - if-upstream-failed: - runs-on: ubuntu-latest - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'failure') && github.event_name != 'workflow_dispatch' - steps: - - run: echo 'Upstream workflow failed, aborting run' && exit 1 + finalize: + if: "!cancelled()" + needs: [metadata, run-jobs] + uses: ./.github/workflows/_finalize.yaml + with: + PUBLISH_BADGE: ${{ needs.metadata.outputs.PUBLISH }} + secrets: inherit diff --git a/.github/workflows/scripts/to_json.sh b/.github/workflows/scripts/to_json.sh index 548f411ee..bea2ce50e 100644 --- a/.github/workflows/scripts/to_json.sh +++ b/.github/workflows/scripts/to_json.sh @@ -2,8 +2,15 @@ # convert a list of variables to a json dictionary function to_json() { - eval $(echo jq -n \ - $(for var in "$@"; do echo $([[ "${!var}" =~ ^[0-9]+$ ]] && echo --argjson || echo --arg) _$var "'"${!var}"'"; done) \ - \'"{$(for var in "$@"; do echo -n "\"$var\": \$_$var, "; done)}"\' - ) + CMD="jq -n " + CMD+=$(for var in "$@"; do + echo "$([[ "${!var}" =~ ^[0-9]+$ ]] && echo --argjson || echo --arg) _$var \"\$$var\" " + done) + + JSON=$(for var in "$@"; do + echo "$var: \$_$var, " + done) + CMD+=\'{$JSON}\' + + eval $CMD }
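+
+# Illustrative usage sketch (not part of the workflow scripts; "label" and
+# "count" are hypothetical caller variables). Purely numeric values are bound
+# with --argjson, everything else with --arg:
+#   label="Upstream Tests" count=3
+#   to_json label count
+# builds and evals a command equivalent to
+#   jq -n --arg _label "$label" --argjson _count "$count" '{label: $_label, count: $_count, }'
+# and should emit {"label": "Upstream Tests", "count": 3}.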