# Captured from workflow run:
# Nightly Pax MGMN performance test (workflow_dispatch) #366

# Workflow title, and a per-run title that shows either the nightly build
# timestamp (when triggered by workflow_run) or the triggering event name.
name: Nightly Pax MGMN performance test
run-name: >-
  Nightly Pax MGMN performance test
  (${{ github.event_name == 'workflow_run'
  && format('nightly {0}', github.event.workflow_run.created_at)
  || github.event_name }})
# Triggers: after the "Nightly Pax build" workflow completes on main, or
# manually via workflow_dispatch with an explicit image and publish flag.
on:
  workflow_run:
    workflows:
      - Nightly Pax build
    types:
      - completed
    branches:
      - main
  workflow_dispatch:
    inputs:
      PAX_IMAGE:
        description: Pax container
        type: string
        required: true
        default: "ghcr.io/nvidia/upstream-pax:latest"
      PUBLISH:
        description: Publish dated results to tensorboard server?
        type: boolean
        required: false
        default: false
# Token scopes granted to the jobs of this workflow.
permissions:
  # to cancel previous workflows
  actions: write
  # to fetch code
  contents: read
  # to create issues
  issues: write
  # to upload container
  packages: write
# Image used when no PAX_IMAGE input is supplied (e.g. workflow_run triggers).
env:
  DEFAULT_PAX_IMAGE: "ghcr.io/nvidia/upstream-pax:latest"
jobs:
  # Computes the values shared by all downstream jobs: build date, container
  # image under test, and whether results should be published. Also cancels
  # the run when the triggering nightly build did not succeed.
  metadata:
    runs-on: ubuntu-22.04
    outputs:
      BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }}
      PAX_IMAGE: ${{ steps.image.outputs.PAX_IMAGE }}
      PUBLISH: ${{ steps.publish.outputs.PUBLISH }}
    steps:
      - name: Check if the triggering workflow failed
        id: if-upstream-failed
        shell: bash -x -e {0}
        run: |
          echo "UPSTREAM_FAILED=${{ github.event_name == 'workflow_run' && github.event.workflow_run.conclusion != 'success' }}" >> $GITHUB_OUTPUT

      - name: Cancel workflow if upstream workflow did not success
        if: ${{ steps.if-upstream-failed.outputs.UPSTREAM_FAILED == 'true' }}
        run: |
          echo "Upstream workflow failed, cancelling this workflow"
          curl -X POST -H "Authorization: token ${{ github.token }}" \
            -H "Accept: application/vnd.github.v3+json" \
            "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/cancel"
          cat  # blocks execution in case workflow cancellation takes time

      - name: Set build date
        id: date
        shell: bash -x -e {0}
        run: |
          BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d')
          echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT

      - name: Set docker image
        id: image
        shell: bash -x -e {0}
        env:
          # The user-supplied input is handed to the script through the
          # environment instead of being expanded into the script text, so
          # its content cannot be interpreted as shell syntax.
          INPUT_PAX_IMAGE: ${{ inputs.PAX_IMAGE }}
        run: |
          # Empty input (e.g. workflow_run trigger) falls back to the
          # workflow-level DEFAULT_PAX_IMAGE environment variable.
          PAX_IMAGE="${INPUT_PAX_IMAGE:-${DEFAULT_PAX_IMAGE}}"
          echo "PAX_IMAGE=${PAX_IMAGE}" >> "$GITHUB_OUTPUT"

      - name: Set result publishing flags
        id: publish
        shell: bash -x -e {0}
        run: |
          # inputs.PUBLISH is a boolean in the inputs context, so it is
          # compared with the boolean literal; comparing with the string
          # 'true' never matches.
          echo "PUBLISH=${{ github.event_name == 'workflow_run' || inputs.PUBLISH == true }}" >> $GITHUB_OUTPUT

  # Runs the Pax tests against the selected image via the reusable workflow.
  run-jobs:
    needs: metadata
    uses: ./.github/workflows/_test_pax.yaml
    with:
      PAX_IMAGE: ${{ needs.metadata.outputs.PAX_IMAGE }}
    secrets: inherit

  # Copies this run's TensorBoard logs into a dated folder on the tensorboard
  # host and writes a link to the step summary. Only runs when publishing is
  # requested, matching the PUBLISH input ("Publish dated results to
  # tensorboard server?").
  tensorboard-upload:
    needs: [metadata, run-jobs]
    if: needs.metadata.outputs.PUBLISH == 'true'
    runs-on: ubuntu-22.04
    steps:
      - name: Setup SSH agent
        uses: webfactory/ssh-agent@v0.8.0
        with:
          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}

      - name: Setup SSH known hosts
        id: ssh-known-hosts
        run: |
          mkdir -p ~/.ssh
          cat >> ~/.ssh/known_hosts << EOF
          ${{ vars.SSH_KNOWN_HOSTS }}
          EOF
          chmod 600 ~/.ssh/known_hosts
          echo "FILE=$(realpath ~/.ssh/known_hosts)" >> $GITHUB_OUTPUT

      - name: Setup SSH config
        id: ssh-config
        run: |
          mkdir -p ~/.ssh
          cat >> ~/.ssh/config << EOF
          ${{ vars.SSH_CONFIG }}
          EOF
          chmod 600 ~/.ssh/config

      - name: Create dated folder and generate TensorBoard query URL
        id: mkdir
        shell: bash -x -e {0}
        run: |
          FOLDER="${{ needs.metadata.outputs.BUILD_DATE }}/PAX"
          # copy folder
          ssh -T tensorboard mkdir -p /tensorboard-logs/${FOLDER}
          ssh -T tensorboard rsync -rt /tensorboard-logs/${GITHUB_RUN_ID}/ /tensorboard-logs/${FOLDER}/
          # generate query URL
          (
          cat << EOF
          ## PAX MGMN nightly training: ${{ needs.metadata.outputs.BUILD_DATE }}
          [view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars&regexInput=$(jq -nr --arg url "${FOLDER}" '$url|@uri')&_smoothingWeight=0&tagFilter=seqs_per)
          EOF
          ) | tee $GITHUB_STEP_SUMMARY

  # Re-tags the tested image as latest-verified when publishing is enabled
  # and run-jobs reports TEST_STATUS == 'success'.
  publish-verified-image:
    needs: [metadata, run-jobs]
    if: needs.metadata.outputs.PUBLISH == 'true' && needs.run-jobs.outputs.TEST_STATUS == 'success'
    uses: ./.github/workflows/_publish_container.yaml
    secrets: inherit
    with:
      SOURCE_IMAGE: ${{ needs.metadata.outputs.PAX_IMAGE }}
      TARGET_IMAGE: upstream-pax
      TARGET_TAGS: |
        type=raw,value=latest-verified,priority=1000

  # Hands the failing image and the last verified image to the triage
  # workflow when run-jobs does not report TEST_STATUS == 'success'.
  failure-triage:
    needs: [metadata, run-jobs]
    if: needs.run-jobs.outputs.TEST_STATUS != 'success'
    uses: ./.github/workflows/_triage.yaml
    secrets: inherit
    with:
      BROKEN_IMAGE: ${{ needs.metadata.outputs.PAX_IMAGE }}
      BASE_IMAGE: ghcr.io/nvidia/upstream-pax:latest-verified
      REPO_DIRS: "/opt/paxml /opt/praxis"
      FILE_ISSUE: true

  # Runs the finalize workflow unless the run was cancelled, passing the
  # publish flag through as PUBLISH_BADGE.
  finalize:
    if: "!cancelled()"
    needs: [metadata, run-jobs]
    uses: ./.github/workflows/_finalize.yaml
    with:
      PUBLISH_BADGE: ${{ needs.metadata.outputs.PUBLISH }}
    secrets: inherit