Skip to content

Commit

Permalink
Migrate to sitrep mechanism for T5X and PAXML MGMN tests (#401)
Browse files Browse the repository at this point in the history
Addresses #399,
#235 and
#236

Example badge:
![](https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fbadge-t5x-mgmn-test.json&logo=nvidia)

Completed run:
https://github.com/NVIDIA/JAX-Toolbox/actions/runs/7039302468

## Changes
- Create a reusable job for publishing sitrep for MGMN workflows
- Integrate it in T5X MGMN
- Integrate it in PAX MGMN

## TODO
- [ ]  Change badge endpoint in README for T5X
- [ ]  Change badge endpoint in README for PAX

Closes #399 
Closes #235 
Closes #236

---------

Co-authored-by: Yu-Hang Maxin Tang <Tang.Maxin@gmail.com>
  • Loading branch information
hemildesai and yhtang authored Dec 8, 2023
1 parent 5b2c1b4 commit fe2b422
Show file tree
Hide file tree
Showing 7 changed files with 302 additions and 268 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/_publish_t5x_pax_results.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,4 +63,4 @@ jobs:
[view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars&regexInput=$(jq -nr --arg url "${FOLDER}" '$url|@uri')&_smoothingWeight=0&tagFilter=seqs_per)
EOF
) | tee $GITHUB_STEP_SUMMARY
) | tee $GITHUB_STEP_SUMMARY
134 changes: 134 additions & 0 deletions .github/workflows/_sitrep_mgmn.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
# Reusable (workflow_call) job that aggregates per-test exit statuses and
# pytest metrics from earlier jobs into a "sitrep" JSON + a shields.io
# endpoint-badge JSON, and exposes an overall STATUS output.
name: ~Generate sitrep for Multi-Node Multi-GPU tests

on:
  workflow_call:
    inputs:
      BADGE_FILENAME:
        type: string
        description: 'Name of the endpoint JSON file for shields.io badge'
        required: true
      ARTIFACT_NAME:
        type: string
        description: 'Name of the artifact zip file'
        required: true
      FW_NAME:
        type: string
        description: 'Name of the framework being used'
        required: true
    outputs:
      STATUS:
        description: 'Summary of all tests run for the workflow. Set to "success" when all metrics per job and all jobs pass, whereas a single metric failure or job error sets the status to "failure"'
        value: ${{ jobs.sitrep.outputs.STATUS }}

jobs:
  sitrep:
    runs-on: ubuntu-22.04
    outputs:
      STATUS: ${{ steps.gen-sitrep.outputs.STATUS }}
    steps:
      - name: Check out repository
        uses: actions/checkout@v3

      # With no `name:` filter, download-artifact fetches every artifact of
      # the run, each into a directory named after the artifact.
      - name: Download all artifacts from the previous jobs
        uses: actions/download-artifact@v3

      - name: Write exit status summary
        id: exit-status
        shell: bash -x -e {0}
        run: |
          # Unquoted use of this variable below relies on shell glob expansion
          # over the downloaded artifact directories.
          EXIT_STATUSES="${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}-*/*-status.json"
          # NOTE(review): file has a .json extension but accumulates a
          # Markdown table — presumably consumed as Markdown downstream.
          EXIT_STATUS_SUMMARY_FILE="exit_status_summary.json"
          echo -e "\n\n## ${{ inputs.FW_NAME }} MGMN+SPMD Test Status" >> $EXIT_STATUS_SUMMARY_FILE
          cat <<EOF >>$EXIT_STATUS_SUMMARY_FILE
          | Test Case | State | Exit Code |
          | --- | --- | --- |
          EOF
          for i in $EXIT_STATUSES; do
            # Files are named <FW_NAME>-<GHID>-<NAME>/<NAME>-status.json
            echo "| $(echo $i | cut -d/ -f1 | awk -F- '{print $NF}') | $(jq -r .state $i) | $(jq -r .exitcode $i)"
          done | tee -a $EXIT_STATUS_SUMMARY_FILE
          echo "Test statuses:"
          jq -rc 'input_filename,.' $EXIT_STATUSES
          echo "EXIT_STATUS_SUMMARY_FILE=$EXIT_STATUS_SUMMARY_FILE" >> ${GITHUB_OUTPUT}

      - name: Write metrics summary
        id: metrics
        shell: bash -x -e {0}
        run: |
          # NOTE(review): .json extension but Markdown content, as above.
          METRICS_SUMMARY_FILE="metrics_summary.json"
          echo -e "\n\n## ${{ inputs.FW_NAME }} MGMN Test Metrics" >> $METRICS_SUMMARY_FILE
          for i in metrics-test-log/*_metrics.json; do
            echo $i | cut -d'.' -f1
            echo '```json'
            jq . $i
            echo '```'
          done | tee -a $METRICS_SUMMARY_FILE
          echo "METRICS_SUMMARY_FILE=$METRICS_SUMMARY_FILE" >> ${GITHUB_OUTPUT}

      - name: Generate sitrep
        id: gen-sitrep
        shell: bash -x -e {0}
        run: |
          # to_json serializes the named shell variables into a JSON object.
          source .github/workflows/scripts/to_json.sh
          EXIT_STATUSES="${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}-*/*-status.json"
          # A job counts as passed only when SLURM reports COMPLETED with
          # exit code 0; anything else (FAILED, TIMEOUT, nonzero rc) fails.
          passed_tests=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
          failed_tests=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
          total_tests=$(ls $EXIT_STATUSES | wc -l)
          # pytest-reportlog output: one JSON object per line; count only the
          # "call" phase of TestReport entries (skips setup/teardown records).
          METRICS_LOG=metrics-test-log/report.jsonl
          all_outcomes() {
            cat $METRICS_LOG | jq -r '. | select((.["$report_type"] == "TestReport") and (.when == "call")) | .outcome'
          }
          cnt_type() {
            cat $METRICS_LOG | jq '. | select((.["$report_type"] == "TestReport") and (.when == "call") and (.outcome | contains("'${1}'"))) | .outcome' | wc -l
          }
          pytest_failed_tests=$(cnt_type failed)
          pytest_passed_tests=$(cnt_type passed)
          pytest_total_tests=$(all_outcomes | wc -l)
          # Badge color: green = everything passed (and something actually
          # ran); red = nothing passed at all; yellow = partial failure.
          if ([[ $failed_tests -eq 0 ]] && [[ $total_tests -gt 0 ]] && \
              [[ $pytest_failed_tests -eq 0 ]] && [[ $pytest_total_tests -gt 0 ]]); then
            status=success
            badge_color=brightgreen
          elif [[ $passed_tests -eq 0 ]] || [[ $pytest_passed_tests -eq 0 ]]; then
            status=failure
            badge_color=red
          else
            status=failure
            badge_color=yellow
          fi
          badge_message="${passed_tests}/${total_tests} jobs | ${pytest_passed_tests}/${pytest_total_tests} metrics"
          badge_label='Upstream Tests'
          summary="# ${{ inputs.FW_NAME }} MGMN Test: $badge_message"
          # NOTE(review): backtick command substitution collapses the files'
          # newlines into single spaces inside $summary — confirm the sitrep
          # consumer re-renders this, or the Markdown tables will flatten.
          summary+=`cat ${{ steps.exit-status.outputs.EXIT_STATUS_SUMMARY_FILE }}`
          summary+=`cat ${{ steps.metrics.outputs.METRICS_SUMMARY_FILE }}`
          to_json \
            summary \
            total_tests passed_tests failed_tests \
            badge_label badge_color badge_message \
          > sitrep.json
          # shields.io "endpoint" badge schema: schemaVersion/label/message/color.
          schemaVersion=1 \
          label="${badge_label}" \
          message="${badge_message}" \
          color="${badge_color}" \
          to_json schemaVersion label message color \
          > ${{ inputs.BADGE_FILENAME }}
          # Fix: write the bare value (no embedded single quotes). The
          # original `STATUS='${status}'` made the output literally 'success'
          # (quotes included), which breaks expression-context comparisons
          # like `${{ ... == 'success' }}`; bash-side comparisons of the form
          # [[ ${{ ... }} != 'success' ]] behave the same either way because
          # the shell strips the quotes during word splitting.
          echo "STATUS=${status}" >> ${GITHUB_OUTPUT}

      - name: Upload artifacts
        uses: actions/upload-artifact@v3
        with:
          name: ${{ inputs.ARTIFACT_NAME }}
          path: |
            sitrep.json
            ${{ inputs.BADGE_FILENAME }}
121 changes: 39 additions & 82 deletions .github/workflows/_test_pax.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,25 @@ on:
description: Extra command line args to pass to test-pax.sh
default: ""
required: false
BADGE_FILENAME:
type: string
description: 'Name of the endpoint JSON file for shields.io badge'
required: false
default: 'badge-pax-mgmn-test.json'
ARTIFACT_NAME:
type: string
description: If provided, will prepend a prefix to the artifact name. Helpful if re-running this reusable workflow to prevent clobbering of artifacts
default: ""
description: 'Name of the artifact zip file'
required: false
default: 'artifact-pax-mgmn-test'
FW_NAME:
type: string
description: 'Name of the framework being used'
required: false
default: 'pax'
outputs:
TEST_STATUS:
description: 'Summary pass/fail value indicating if results from tests are acceptable'
value: ${{ jobs.publish-test.outputs.STATUS }}
value: ${{ jobs.sitrep.outputs.STATUS }}

jobs:

Expand Down Expand Up @@ -63,7 +73,7 @@ jobs:
MAX_GPUS_PER_NODE=8
NODES=1
GPUS_PER_NODE=8
JOB_NAME=${{ inputs.ARTIFACT_NAME }}pax-${GITHUB_RUN_ID}-${TEST_CASE_NAME}
JOB_NAME=${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}-${TEST_CASE_NAME}
LOG_FILE=/nfs/cluster/${JOB_NAME}.log
MODEL_PATH=/nfs/cluster/${JOB_NAME}
for var in IMAGE TEST_CASE_NAME TOTAL_TASKS NODES GPUS_PER_NODE JOB_NAME LOG_FILE MODEL_PATH; do
Expand All @@ -74,7 +84,7 @@ jobs:
shell: bash -O expand_aliases -x -e {0}
run: |
alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
sshx "date && hostname && sinfo"
sshx "date && hostname && sinfo"
sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
JOB=$(sshx sbatch --parsable << EOF
#!/bin/bash
Expand Down Expand Up @@ -129,7 +139,7 @@ jobs:
output/ || true
rsync -rtz --progress \
output/ \
${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.ARTIFACT_NAME }}pax-${GITHUB_RUN_ID}/ || true
${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}/ || true
- name: Write SLURM job status to file
shell: bash -x -e {0}
run: |
Expand All @@ -139,7 +149,7 @@ jobs:
dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"}
json.dump(dump, f)
EOF
- name: Upload training logs as artifacts
uses: actions/upload-artifact@v3
with:
Expand Down Expand Up @@ -191,7 +201,7 @@ jobs:
NODES=$(((TOTAL_TASKS+MAX_GPUS_PER_NODE-1)/MAX_GPUS_PER_NODE))
GPUS_PER_NODE=$((TOTAL_TASKS/NODES))
JOB_NAME=${{ inputs.ARTIFACT_NAME }}pax-${GITHUB_RUN_ID}-${TEST_CASE_NAME}
JOB_NAME=${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}-${TEST_CASE_NAME}
LOG_FILE=/nfs/cluster/${JOB_NAME}.log
MODEL_PATH=/nfs/cluster/${JOB_NAME}
for var in IMAGE TEST_CASE_NAME TOTAL_TASKS NODES GPUS_PER_NODE JOB_NAME LOG_FILE MODEL_PATH; do
Expand All @@ -203,7 +213,7 @@ jobs:
shell: bash -O expand_aliases -x -e {0}
run: |
alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
sshx "date && hostname && sinfo"
sshx "date && hostname && sinfo"
sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
JOB=$(sshx sbatch --parsable << EOF
#!/bin/bash
Expand Down Expand Up @@ -265,7 +275,7 @@ jobs:
output/ || true
rsync -rtz --progress \
output/ \
${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.ARTIFACT_NAME }}pax-${GITHUB_RUN_ID}/ || true
${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}/ || true
- name: Write SLURM job status to file
shell: bash -x -e {0}
Expand All @@ -276,7 +286,7 @@ jobs:
dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"}
json.dump(dump, f)
EOF
- name: Upload training logs as artifacts
uses: actions/upload-artifact@v3
with:
Expand Down Expand Up @@ -321,7 +331,7 @@ jobs:
NODES=1
GPUS_PER_NODE=8
JOB_NAME=${{ inputs.ARTIFACT_NAME }}pax-${GITHUB_RUN_ID}-${TEST_CASE_NAME}
JOB_NAME=${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}-${TEST_CASE_NAME}
LOG_FILE=/nfs/cluster/${JOB_NAME}.log
MODEL_PATH=/nfs/cluster/${JOB_NAME}
for var in IMAGE TEST_CASE_NAME TOTAL_TASKS NODES GPUS_PER_NODE JOB_NAME LOG_FILE MODEL_PATH; do
Expand All @@ -333,7 +343,7 @@ jobs:
shell: bash -O expand_aliases -x -e {0}
run: |
alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
sshx "date && hostname && sinfo"
sshx "date && hostname && sinfo"
sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
JOB=$(sshx sbatch --parsable << EOF
#!/bin/bash
Expand Down Expand Up @@ -396,7 +406,7 @@ jobs:
output/ || true
rsync -rtz --progress \
output/ \
${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.ARTIFACT_NAME }}pax-${GITHUB_RUN_ID}/ || true
${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}/ || true
- name: Write SLURM job status to file
shell: bash -x -e {0}
Expand All @@ -407,7 +417,7 @@ jobs:
dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"}
json.dump(dump, f)
EOF
- name: Upload training logs as artifacts
uses: actions/upload-artifact@v3
with:
Expand All @@ -429,83 +439,30 @@ jobs:
shell: bash -x {0}
run: |
pip install pytest pytest-reportlog tensorboard
for i in ${{ inputs.ARTIFACT_NAME }}pax-${GITHUB_RUN_ID}-*; do
for i in ${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}-*; do
SUBDIR=$(echo $i | cut -d'-' -f3)
mv $i/$SUBDIR* .
python3 .github/workflows/baselines/summarize_metrics.py $SUBDIR # create result json in baseline format
done
echo '## PAX MGMN Test Metrics' >> $GITHUB_STEP_SUMMARY
for i in *_metrics.json; do
echo $i | cut -d'.' -f1
echo '```json'
jq . $i
echo '```'
done | tee -a $GITHUB_STEP_SUMMARY
RESULTS_DIR=$PWD BASELINES_DIR=PAX_MGMN/upstream pytest --report-log=report.jsonl .github/workflows/baselines/test_pax_mgmn_metrics.py || true
- name: Upload metrics test json logs
uses: actions/upload-artifact@v3
with:
name: metrics-test-log
path: report.jsonl
path: |
report.jsonl
*_metrics.json

publish-test:
sitrep:
needs: [single-process-multi-device, pax-multi-node, single-process-evaluation, metrics]
uses: ./.github/workflows/_publish_badge.yaml
if: ( always() )
secrets: inherit
if: success() || failure()
uses: ./.github/workflows/_sitrep_mgmn.yaml
with:
ENDPOINT_FILENAME: '${{ inputs.ARTIFACT_NAME }}pax-test-status.json'
PUBLISH: false
SCRIPT: |
EXIT_STATUSES="${{ inputs.ARTIFACT_NAME }}pax-${GITHUB_RUN_ID}-*DP*FSDP*TP*PP*/*-status.json"
PASSED_TESTS=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
FAILED_TESTS=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
TOTAL_TESTS=$(ls $EXIT_STATUSES | wc -l)
cat <<EOF >>$GITHUB_STEP_SUMMARY
## Pax MGMN+SPMD Test Status
| Test Case | State | Exit Code |
| --- | --- | --- |
EOF
for i in $EXIT_STATUSES; do
# Files are named pax-<GHID>-<NAME>/<NAME>-status.json
echo "| $(echo $i | cut -d/ -f1 | cut -d- -f3) | $(jq -r .state $i) | $(jq -r .exitcode $i)"
done | tee -a $GITHUB_STEP_SUMMARY
echo "Test statuses:"
jq -rc 'input_filename,.' $EXIT_STATUSES
METRICS_LOG=metrics-test-log/report.jsonl
all_outcomes() {
cat $METRICS_LOG | jq -r '. | select((.["$report_type"] == "TestReport") and (.when == "call")) | .outcome'
}
cnt_type() {
cat $METRICS_LOG | jq '. | select((.["$report_type"] == "TestReport") and (.when == "call") and (.outcome | contains("'${1}'"))) | .outcome' | wc -l
}
PYTEST_FAILED_TESTS=$(cnt_type failed)
PYTEST_PASSED_TESTS=$(cnt_type passed)
PYTEST_TOTAL_TESTS=$(all_outcomes | wc -l)
if ([[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]] && \
[[ $PYTEST_FAILED_TESTS -eq 0 ]] && [[ $PYTEST_TOTAL_TESTS -gt 0 ]]); then
STATUS=success
BADGE_COLOR=brightgreen
elif [[ $PASSED_TESTS -eq 0 ]] || [[ $PYTEST_PASSED_TESTS -eq 0 ]]; then
STATUS=failure
BADGE_COLOR=red
else
STATUS=failure
BADGE_COLOR=yellow
fi
echo "STATUS='${STATUS}'" >> ${GITHUB_OUTPUT}
echo "LABEL='Completion'" >> $GITHUB_OUTPUT
echo "MESSAGE='${PASSED_TESTS}/${TOTAL_TESTS} ran ${PYTEST_PASSED_TESTS}/${PYTEST_TOTAL_TESTS} pass loss+perf'" >> $GITHUB_OUTPUT
echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT
BADGE_FILENAME: ${{ inputs.BADGE_FILENAME }}
ARTIFACT_NAME: ${{ inputs.ARTIFACT_NAME }}
FW_NAME: ${{ inputs.FW_NAME }}

summary:
runs-on: ubuntu-22.04
Expand All @@ -518,18 +475,18 @@ jobs:
## PAX MGMN training
[view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars&regexInput=${{ inputs.ARTIFACT_NAME }}pax-${GITHUB_RUN_ID}&_smoothingWeight=0&tagFilter=seqs_per)
[view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars&regexInput=${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}&_smoothingWeight=0&tagFilter=seqs_per)
EOF
) | tee $GITHUB_STEP_SUMMARY
outcome:
needs: publish-test
needs: sitrep
runs-on: ubuntu-22.04
if: ( always() )
steps:
- name: Sets workflow status based on test outputs
- name: Sets workflow status based on test outputs
run: |
if [[ ${{ needs.publish-test.outputs.STATUS }} != success ]]; then
if [[ ${{ needs.sitrep.outputs.STATUS }} != 'success' ]]; then
exit 1
fi
Loading

0 comments on commit fe2b422

Please sign in to comment.