Skip to content

Commit

Permalink
simplify job dependency and conditional execution
Browse files Browse the repository at this point in the history
  • Loading branch information
yhtang committed Dec 7, 2023
1 parent 70bea03 commit 9f5d230
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 48 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/_publish_t5x_results.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: ~publish t5x integration test results
name: ~publish t5x integration test results on Tensorboard server

on:
workflow_call:
Expand Down
57 changes: 35 additions & 22 deletions .github/workflows/nightly-pax-test-mgmn.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,34 +34,53 @@ jobs:
runs-on: ubuntu-22.04
outputs:
BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }}
PAX_IMAGE: ${{ steps.date.outputs.PAX_IMAGE }}
PUBLISH: ${{ steps.date.outputs.PUBLISH }}
PUBLISH_CONTAINER: ${{ steps.date.outputs.PUBLISH_CONTAINER }}
SHOULD_TRIAGE: ${{ steps.date.outputs.SHOULD_TRIAGE }}
PAX_IMAGE: ${{ steps.image.outputs.PAX_IMAGE }}
PUBLISH: ${{ steps.publish.outputs.PUBLISH }}
steps:
- name: Set metadata
- name: Check if the triggering workflow failed
id: if-upstream-failed
shell: bash -x -e {0}
run: |
echo "UPSTREAM_FAILED=${{ github.event_name == 'workflow_run' && github.event.workflow_run.conclusion != 'success' }}" >> $GITHUB_OUTPUT
# Runs only when the preceding step reported UPSTREAM_FAILED == 'true'.
# Asks the GitHub REST API to cancel this very run (github.run_id).
- name: Cancel workflow if upstream workflow did not succeed
if: ${{ steps.if-upstream-failed.outputs.UPSTREAM_FAILED == 'true' }}
run: |
echo "Upstream workflow failed, cancelling this workflow"
curl -X POST -H "Authorization: token ${{ github.token }}" \
-H "Accept: application/vnd.github.v3+json" \
"https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/cancel"
cat # blocks execution in case workflow cancellation takes time
- name: Set build date
id: date
shell: bash -x -e {0}
run: |
BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d')
echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT
- name: Set docker image
id: image
shell: bash -x -e {0}
run: |
PAX_IMAGE=${{ inputs.PAX_IMAGE }}
PAX_IMAGE=${PAX_IMAGE:-${{ env.DEFAULT_PAX_IMAGE }}}
echo "PAX_IMAGE=${PAX_IMAGE}" >> $GITHUB_OUTPUT
echo "PUBLISH=${{ inputs.PUBLISH }}" >> $GITHUB_OUTPUT
echo "PUBLISH_CONTAINER=${{ ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH)) }} >> $GITHUB_OUTPUT"
echo "SHOULD_TRIAGE=${{ ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch') }} >> $GITHUB_OUTPUT"
# Emits PUBLISH='true' for every workflow_run trigger, or when the PUBLISH
# input is set on a manual dispatch; otherwise PUBLISH='false'.
# format() stringifies the input before comparing, so the check holds whether
# PUBLISH arrives as a boolean or as the string 'true'. A bare
# `inputs.PUBLISH == 'true'` is false for a boolean true, because the
# expression engine coerces both sides to numbers (1 vs NaN).
# NOTE(review): the declared type of the PUBLISH input is not visible here - confirm.
- name: Set result publishing flags
id: publish
shell: bash -x -e {0}
run: |
echo "PUBLISH=${{ github.event_name == 'workflow_run' || format('{0}', inputs.PUBLISH) == 'true' }}" >> $GITHUB_OUTPUT
run-jobs:
needs: metadata
uses: ./.github/workflows/_test_pax.yaml
if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch'
with:
PAX_IMAGE: ${{ needs.metadata.outputs.PAX_IMAGE }}
secrets: inherit

publish:
tensorboard-upload:
needs: [metadata, run-jobs]
runs-on: ubuntu-22.04
steps:
Expand Down Expand Up @@ -108,9 +127,9 @@ jobs:
EOF
) | tee $GITHUB_STEP_SUMMARY
publish-verified:
if: needs.run-jobs.outputs.TEST_STATUS == 'success' && needs.metadata.outputs.PUBLISH_CONTAINER == 'true'
publish-verified-image:
needs: [metadata, run-jobs]
if: needs.metadata.outputs.PUBLISH == 'true' && needs.run-jobs.outputs.TEST_STATUS == 'success'
uses: ./.github/workflows/_publish_container.yaml
secrets: inherit
with:
Expand All @@ -119,27 +138,21 @@ jobs:
TARGET_TAGS: |
type=raw,value=latest-verified,priority=1000
triage:
failure-triage:
needs: [metadata, run-jobs]
if: needs.run-jobs.outputs.TEST_STATUS != 'success'
uses: ./.github/workflows/_triage.yaml
if: needs.run-jobs.outputs.TEST_STATUS != 'success' && needs.metadata.outputs.SHOULD_TRIAGE == 'true'
secrets: inherit
with:
BROKEN_IMAGE: ${{ needs.metadata.outputs.PAX_IMAGE }}
BASE_IMAGE: ghcr.io/nvidia/upstream-pax:latest-verified
REPO_DIRS: "/opt/paxml /opt/praxis"
FILE_ISSUE: true

if-upstream-failed:
runs-on: ubuntu-latest
if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'failure') && github.event_name != 'workflow_dispatch'
steps:
- run: echo 'Upstream workflow failed, aborting run' && exit 1

finalize:
if: always()
if: "!cancelled()"
needs: [metadata, run-jobs]
uses: ./.github/workflows/_finalize.yaml
with:
PUBLISH_BADGE: ${{ github.event_name == 'workflow_run' || needs.metadata.outputs.PUBLISH == 'true' }}
# Expressions are only evaluated inside `${{ ... }}`; the previous `${ ... }}`
# was passed through as a literal string. Comparing the (string) job output
# against 'true' hands the reusable workflow a boolean, matching the type of
# the expression this line replaced.
PUBLISH_BADGE: ${{ needs.metadata.outputs.PUBLISH == 'true' }}
secrets: inherit
60 changes: 35 additions & 25 deletions .github/workflows/nightly-t5x-test-mgmn.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,47 +34,63 @@ jobs:
runs-on: ubuntu-22.04
outputs:
BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }}
T5X_IMAGE: ${{ steps.date.outputs.T5X_IMAGE }}
PUBLISH: ${{ steps.date.outputs.PUBLISH }}
PUBLISH_RESULTS: ${{ steps.date.outputs.PUBLISH_RESULTS }}
PUBLISH_CONTAINER: ${{ steps.date.outputs.PUBLISH_CONTAINER }}
SHOULD_TRIAGE: ${{ steps.date.outputs.SHOULD_TRIAGE }}
T5X_IMAGE: ${{ steps.image.outputs.T5X_IMAGE }}
PUBLISH: ${{ steps.publish.outputs.PUBLISH }}
steps:
- name: Set metadata
- name: Check if the triggering workflow failed
id: if-upstream-failed
shell: bash -x -e {0}
run: |
echo "UPSTREAM_FAILED=${{ github.event_name == 'workflow_run' && github.event.workflow_run.conclusion != 'success' }}" >> $GITHUB_OUTPUT
# Runs only when the preceding step reported UPSTREAM_FAILED == 'true'.
# Asks the GitHub REST API to cancel this very run (github.run_id).
- name: Cancel workflow if upstream workflow did not succeed
if: ${{ steps.if-upstream-failed.outputs.UPSTREAM_FAILED == 'true' }}
run: |
echo "Upstream workflow failed, cancelling this workflow"
curl -X POST -H "Authorization: token ${{ github.token }}" \
-H "Accept: application/vnd.github.v3+json" \
"https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/cancel"
cat # blocks execution in case workflow cancellation takes time
- name: Set build date
id: date
shell: bash -x -e {0}
run: |
BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d')
echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT
- name: Set docker image
id: image
shell: bash -x -e {0}
run: |
T5X_IMAGE=${{ inputs.T5X_IMAGE }}
T5X_IMAGE=${T5X_IMAGE:-${{ env.DEFAULT_T5X_IMAGE }}}
echo "T5X_IMAGE=${T5X_IMAGE}" >> $GITHUB_OUTPUT
echo "PUBLISH=${{ inputs.PUBLISH }}" >> $GITHUB_OUTPUT
echo "PUBLISH_RESULTS=${{ (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' }} >> $GITHUB_OUTPUT"
echo "PUBLISH_CONTAINER=${{ ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH)) }} >> $GITHUB_OUTPUT"
echo "SHOULD_TRIAGE=${{ ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch') }} >> $GITHUB_OUTPUT"
# Emits PUBLISH='true' for every workflow_run trigger, or when the PUBLISH
# input is set on a manual dispatch; otherwise PUBLISH='false'.
# format() stringifies the input before comparing, so the check holds whether
# PUBLISH arrives as a boolean or as the string 'true'. A bare
# `inputs.PUBLISH == 'true'` is false for a boolean true, because the
# expression engine coerces both sides to numbers (1 vs NaN).
# NOTE(review): the declared type of the PUBLISH input is not visible here - confirm.
- name: Set result publishing flags
id: publish
shell: bash -x -e {0}
run: |
echo "PUBLISH=${{ github.event_name == 'workflow_run' || format('{0}', inputs.PUBLISH) == 'true' }}" >> $GITHUB_OUTPUT
run-jobs:
needs: metadata
uses: ./.github/workflows/_test_t5x.yaml
if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch'
with:
T5X_IMAGE: ${{ needs.metadata.outputs.T5X_IMAGE }}
secrets: inherit

publish:
tensorboard-upload:
needs: [metadata, run-jobs]
uses: ./.github/workflows/_publish_t5x_results.yaml
if: needs.metadata.outputs.PUBLISH_RESULTS == 'true'
with:
BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }}
EXPERIMENT_SUBDIR: T5X
secrets: inherit

publish-verified:
if: needs.run-jobs.outputs.TEST_STATUS == 'success' && needs.metadata.outputs.PUBLISH_CONTAINER == 'true'
publish-verified-image:
needs: [metadata, run-jobs]
if: needs.metadata.outputs.PUBLISH == 'true' && needs.run-jobs.outputs.TEST_STATUS == 'success'
uses: ./.github/workflows/_publish_container.yaml
secrets: inherit
with:
Expand All @@ -83,27 +99,21 @@ jobs:
TARGET_TAGS: |
type=raw,value=latest-verified,priority=1000
triage:
failure-triage:
needs: [metadata, run-jobs]
if: needs.run-jobs.outputs.TEST_STATUS != 'success'
uses: ./.github/workflows/_triage.yaml
if: needs.run-jobs.outputs.TEST_STATUS != 'success' && needs.metadata.outputs.SHOULD_TRIAGE == 'true'
secrets: inherit
with:
BROKEN_IMAGE: ${{ needs.metadata.outputs.T5X_IMAGE }}
BASE_IMAGE: ghcr.io/nvidia/upstream-t5x:latest-verified
REPO_DIRS: "/opt/t5x /opt/flax"
FILE_ISSUE: true

if-upstream-failed:
runs-on: ubuntu-latest
if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'failure') && github.event_name != 'workflow_dispatch'
steps:
- run: echo 'Upstream workflow failed, aborting run' && exit 1

finalize:
if: always()
if: "!cancelled()"
needs: [metadata, run-jobs]
uses: ./.github/workflows/_finalize.yaml
with:
PUBLISH_BADGE: ${{ github.event_name == 'workflow_run' || needs.metadata.outputs.PUBLISH == 'true' }}
# Expressions are only evaluated inside `${{ ... }}`; the previous `${ ... }}`
# was passed through as a literal string. Comparing the (string) job output
# against 'true' hands the reusable workflow a boolean, matching the type of
# the expression this line replaced.
PUBLISH_BADGE: ${{ needs.metadata.outputs.PUBLISH == 'true' }}
secrets: inherit

0 comments on commit 9f5d230

Please sign in to comment.