# Workflow run: Nightly T5X MGMN performance test (workflow_dispatch) #354

# Workflow title as listed in the Actions tab.
name: Nightly T5X MGMN performance test
# Per-run title: appends "nightly <creation time of the triggering run>" when
# started by workflow_run, otherwise the name of the triggering event.
run-name: >-
  Nightly T5X MGMN performance test
  (${{ github.event_name == 'workflow_run'
  && format('nightly {0}', github.event.workflow_run.created_at)
  || github.event_name }})
on:
  # Automatic trigger: fires when a "Nightly T5X build" run for the main
  # branch reaches the "completed" state (whatever its conclusion).
  workflow_run:
    workflows:
      - Nightly T5X build
    types:
      - completed
    branches:
      - main
  # Manual trigger with user-supplied parameters.
  workflow_dispatch:
    inputs:
      T5X_IMAGE:
        description: T5X container
        type: string
        required: true
        default: 'ghcr.io/nvidia/upstream-t5x:latest'
      PUBLISH:
        description: Publish dated results to tensorboard server?
        type: boolean
        required: false
        default: false
# Token scopes granted to this workflow (and, at most, to the reusable
# workflows it calls).
# NOTE(review): the publish-verified and triage jobs are currently commented
# out; confirm whether `packages: write` and `issues: write` are still needed.
permissions:
  actions: write   # cancel earlier workflow runs
  contents: read   # check out the repository
  issues: write    # open issues
  packages: write  # push container images
env:
  # Fallback image used by the metadata job when no T5X_IMAGE input is given.
  DEFAULT_T5X_IMAGE: 'ghcr.io/nvidia/upstream-t5x:latest'
jobs:
  # Computes the values shared by the other jobs and publishes them as job
  # outputs: today's date (US/Los_Angeles), the image under test, and the
  # PUBLISH flag.
  metadata:
    runs-on: ubuntu-22.04
    outputs:
      BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }}
      T5X_IMAGE: ${{ steps.date.outputs.T5X_IMAGE }}
      PUBLISH: ${{ steps.date.outputs.PUBLISH }}
    steps:
      - name: Set metadata
        id: date
        shell: bash -x -e {0}
        # Workflow inputs are passed through environment variables instead of
        # being expanded into the script text, so their content can never be
        # interpreted as shell syntax (script injection / word splitting).
        env:
          INPUT_T5X_IMAGE: ${{ inputs.T5X_IMAGE }}
          INPUT_PUBLISH: ${{ inputs.PUBLISH }}
        run: |
          BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d')
          echo "BUILD_DATE=${BUILD_DATE}" >> "$GITHUB_OUTPUT"
          # An empty input (presumably the case for workflow_run triggers,
          # which carry no inputs) falls back to the workflow-level
          # DEFAULT_T5X_IMAGE, which is exported to this step's environment.
          T5X_IMAGE="${INPUT_T5X_IMAGE:-${DEFAULT_T5X_IMAGE}}"
          echo "T5X_IMAGE=${T5X_IMAGE}" >> "$GITHUB_OUTPUT"
          echo "PUBLISH=${INPUT_PUBLISH}" >> "$GITHUB_OUTPUT"

  # Calls the reusable T5X test workflow on the selected image. Skipped when
  # the triggering upstream run did not conclude successfully.
  run-jobs:
    needs: metadata
    uses: ./.github/workflows/_test_t5x.yaml
    if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch'
    with:
      T5X_IMAGE: ${{ needs.metadata.outputs.T5X_IMAGE }}
    secrets: inherit

  # The publish, publish-verified and triage jobs below are currently disabled.
  # publish:
  #   needs: [metadata, run-jobs]
  #   uses: ./.github/workflows/_publish_t5x_results.yaml
  #   if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch'
  #   with:
  #     BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }}
  #     EXPERIMENT_SUBDIR: T5X
  #   secrets: inherit
  # publish-verified:
  #   if: needs.run-jobs.outputs.TEST_STATUS == 'success' && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH))
  #   needs: [metadata, run-jobs]
  #   uses: ./.github/workflows/_publish_container.yaml
  #   secrets: inherit
  #   with:
  #     SOURCE_IMAGE: ${{ needs.metadata.outputs.T5X_IMAGE }}
  #     TARGET_IMAGE: upstream-t5x
  #     TARGET_TAGS: |
  #       type=raw,value=latest-verified,priority=1000
  # triage:
  #   needs: [metadata, run-jobs]
  #   uses: ./.github/workflows/_triage.yaml
  #   if: needs.run-jobs.outputs.TEST_STATUS != 'success' && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch')
  #   secrets: inherit
  #   with:
  #     BROKEN_IMAGE: ${{ needs.metadata.outputs.T5X_IMAGE }}
  #     BASE_IMAGE: ghcr.io/nvidia/upstream-t5x:latest-verified
  #     REPO_DIRS: "/opt/t5x /opt/flax"
  #     FILE_ISSUE: true

  # Fails this run explicitly when the triggering upstream workflow run
  # concluded with 'failure'. The event-name check alone already excludes
  # manual (workflow_dispatch) runs.
  if-upstream-failed:
    runs-on: ubuntu-latest
    if: github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'failure'
    steps:
      - run: echo 'Upstream workflow failed, aborting run' && exit 1

  # Calls the reusable finalize workflow. `always()` makes it run even when
  # the jobs it needs failed or were skipped. PUBLISH_BADGE is true for
  # workflow_run triggers, or when the PUBLISH output is 'true'.
  finalize:
    if: always()
    needs: [metadata, run-jobs]
    uses: ./.github/workflows/_finalize.yaml
    with:
      PUBLISH_BADGE: ${{ github.event_name == 'workflow_run' || needs.metadata.outputs.PUBLISH == 'true' }}
    secrets: inherit