From 9f5d2300cfe61e348ed2619cef1e3eef90fc42fe Mon Sep 17 00:00:00 2001
From: Yu-Hang Maxin Tang
Date: Thu, 7 Dec 2023 02:47:33 +0000
Subject: [PATCH] simplify job dependency and conditional execution

---
Review notes (free text in this section is ignored by `git am`):

* finalize (both nightly workflows): PUBLISH_BADGE was written as
  `${ needs.metadata.outputs.PUBLISH }}`. With the opening brace missing the
  value is a literal string, not an expression. It is now a well-formed
  `${{ ... == 'true' }}` expression, which evaluates to a boolean just like
  the expression it replaces.
* metadata (both nightly workflows): the PUBLISH flag now accepts
  `inputs.PUBLISH` as either the boolean `true` or the string 'true'.
  Expression equality is loose and coerces mismatched types to numbers, so a
  boolean input never equals the string 'true'. The input declaration is not
  part of this patch -- TODO confirm its declared type.
* Both fixes edit added lines in place; hunk line counts and the diffstat are
  unchanged. The `index` blob ids below predate these two fixes.
* Line breaks and indentation of this patch were reconstructed. Indentation of
  context lines is inferred from workflow conventions; if a hunk is rejected,
  retry with `git apply --ignore-whitespace`.

 .github/workflows/_publish_t5x_results.yaml  |  2 +-
 .github/workflows/nightly-pax-test-mgmn.yaml | 57 ++++++++++++-------
 .github/workflows/nightly-t5x-test-mgmn.yaml | 60 ++++++++++++--------
 3 files changed, 71 insertions(+), 48 deletions(-)

diff --git a/.github/workflows/_publish_t5x_results.yaml b/.github/workflows/_publish_t5x_results.yaml
index 6b9dc6ebb..e12242d88 100644
--- a/.github/workflows/_publish_t5x_results.yaml
+++ b/.github/workflows/_publish_t5x_results.yaml
@@ -1,4 +1,4 @@
-name: ~publish t5x integration test results
+name: ~publish t5x integration test results on Tensorboard server
 
 on:
   workflow_call:
diff --git a/.github/workflows/nightly-pax-test-mgmn.yaml b/.github/workflows/nightly-pax-test-mgmn.yaml
index 086fd5d9d..3e91eb678 100644
--- a/.github/workflows/nightly-pax-test-mgmn.yaml
+++ b/.github/workflows/nightly-pax-test-mgmn.yaml
@@ -34,34 +34,53 @@ jobs:
     runs-on: ubuntu-22.04
     outputs:
       BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }}
-      PAX_IMAGE: ${{ steps.date.outputs.PAX_IMAGE }}
-      PUBLISH: ${{ steps.date.outputs.PUBLISH }}
-      PUBLISH_CONTAINER: ${{ steps.date.outputs.PUBLISH_CONTAINER }}
-      SHOULD_TRIAGE: ${{ steps.date.outputs.SHOULD_TRIAGE }}
+      PAX_IMAGE: ${{ steps.image.outputs.PAX_IMAGE }}
+      PUBLISH: ${{ steps.publish.outputs.PUBLISH }}
     steps:
-      - name: Set metadata
+      - name: Check if the triggering workflow failed
+        id: if-upstream-failed
+        shell: bash -x -e {0}
+        run: |
+          echo "UPSTREAM_FAILED=${{ github.event_name == 'workflow_run' && github.event.workflow_run.conclusion != 'success' }}" >> $GITHUB_OUTPUT
+
+      - name: Cancel workflow if upstream workflow did not success
+        if: ${{ steps.if-upstream-failed.outputs.UPSTREAM_FAILED == 'true' }}
+        run: |
+          echo "Upstream workflow failed, cancelling this workflow"
+          curl -X POST -H "Authorization: token ${{ github.token }}" \
+               -H "Accept: application/vnd.github.v3+json" \
+               "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/cancel"
+          cat  # blocks execution in case workflow cancellation takes time
+
+      - name: Set build date
         id: date
         shell: bash -x -e {0}
         run: |
           BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d')
           echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT
 
+      - name: Set docker image
+        id: image
+        shell: bash -x -e {0}
+        run: |
           PAX_IMAGE=${{ inputs.PAX_IMAGE }}
           PAX_IMAGE=${PAX_IMAGE:-${{ env.DEFAULT_PAX_IMAGE }}}
           echo "PAX_IMAGE=${PAX_IMAGE}" >> $GITHUB_OUTPUT
-          echo "PUBLISH=${{ inputs.PUBLISH }}" >> $GITHUB_OUTPUT
-          echo "PUBLISH_CONTAINER=${{ ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH)) }} >> $GITHUB_OUTPUT"
-          echo "SHOULD_TRIAGE=${{ ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch') }} >> $GITHUB_OUTPUT"
+
+      - name: Set result publishing flags
+        id: publish
+        shell: bash -x -e {0}
+        run: |
+          echo "PUBLISH=${{ github.event_name == 'workflow_run' || inputs.PUBLISH == true || inputs.PUBLISH == 'true' }}" >> $GITHUB_OUTPUT
 
   run-jobs:
     needs: metadata
     uses: ./.github/workflows/_test_pax.yaml
-    if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch'
     with:
       PAX_IMAGE: ${{ needs.metadata.outputs.PAX_IMAGE }}
     secrets: inherit
 
-  publish:
+  tensorboard-upload:
     needs: [metadata, run-jobs]
     runs-on: ubuntu-22.04
     steps:
@@ -108,9 +127,9 @@ jobs:
           EOF
           ) | tee $GITHUB_STEP_SUMMARY
 
-  publish-verified:
-    if: needs.run-jobs.outputs.TEST_STATUS == 'success' && needs.metadata.outputs.PUBLISH_CONTAINER == 'true'
+  publish-verified-image:
     needs: [metadata, run-jobs]
+    if: needs.metadata.outputs.PUBLISH == 'true' && needs.run-jobs.outputs.TEST_STATUS == 'success'
     uses: ./.github/workflows/_publish_container.yaml
     secrets: inherit
     with:
@@ -119,10 +138,10 @@ jobs:
       TARGET_TAGS: |
         type=raw,value=latest-verified,priority=1000
 
-  triage:
+  failure-triage:
     needs: [metadata, run-jobs]
+    if: needs.run-jobs.outputs.TEST_STATUS != 'success'
     uses: ./.github/workflows/_triage.yaml
-    if: needs.run-jobs.outputs.TEST_STATUS != 'success' && needs.metadata.outputs.SHOULD_TRIAGE == 'true'
     secrets: inherit
     with:
       BROKEN_IMAGE: ${{ needs.metadata.outputs.PAX_IMAGE }}
@@ -130,16 +149,10 @@ jobs:
       REPO_DIRS: "/opt/paxml /opt/praxis"
       FILE_ISSUE: true
 
-  if-upstream-failed:
-    runs-on: ubuntu-latest
-    if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'failure') && github.event_name != 'workflow_dispatch'
-    steps:
-      - run: echo 'Upstream workflow failed, aborting run' && exit 1
-
   finalize:
-    if: always()
+    if: "!cancelled()"
     needs: [metadata, run-jobs]
     uses: ./.github/workflows/_finalize.yaml
     with:
-      PUBLISH_BADGE: ${{ github.event_name == 'workflow_run' || needs.metadata.outputs.PUBLISH == 'true' }}
+      PUBLISH_BADGE: ${{ needs.metadata.outputs.PUBLISH == 'true' }}
     secrets: inherit
diff --git a/.github/workflows/nightly-t5x-test-mgmn.yaml b/.github/workflows/nightly-t5x-test-mgmn.yaml
index 3007c107c..a02c0a783 100644
--- a/.github/workflows/nightly-t5x-test-mgmn.yaml
+++ b/.github/workflows/nightly-t5x-test-mgmn.yaml
@@ -34,47 +34,63 @@ jobs:
     runs-on: ubuntu-22.04
     outputs:
       BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }}
-      T5X_IMAGE: ${{ steps.date.outputs.T5X_IMAGE }}
-      PUBLISH: ${{ steps.date.outputs.PUBLISH }}
-      PUBLISH_RESULTS: ${{ steps.date.outputs.PUBLISH_RESULTS }}
-      PUBLISH_CONTAINER: ${{ steps.date.outputs.PUBLISH_CONTAINER }}
-      SHOULD_TRIAGE: ${{ steps.date.outputs.SHOULD_TRIAGE }}
+      T5X_IMAGE: ${{ steps.image.outputs.T5X_IMAGE }}
+      PUBLISH: ${{ steps.publish.outputs.PUBLISH }}
     steps:
-      - name: Set metadata
+      - name: Check if the triggering workflow failed
+        id: if-upstream-failed
+        shell: bash -x -e {0}
+        run: |
+          echo "UPSTREAM_FAILED=${{ github.event_name == 'workflow_run' && github.event.workflow_run.conclusion != 'success' }}" >> $GITHUB_OUTPUT
+
+      - name: Cancel workflow if upstream workflow did not success
+        if: ${{ steps.if-upstream-failed.outputs.UPSTREAM_FAILED == 'true' }}
+        run: |
+          echo "Upstream workflow failed, cancelling this workflow"
+          curl -X POST -H "Authorization: token ${{ github.token }}" \
+               -H "Accept: application/vnd.github.v3+json" \
+               "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/cancel"
+          cat  # blocks execution in case workflow cancellation takes time
+
+      - name: Set build date
         id: date
         shell: bash -x -e {0}
         run: |
           BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d')
           echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT
 
+      - name: Set docker image
+        id: image
+        shell: bash -x -e {0}
+        run: |
           T5X_IMAGE=${{ inputs.T5X_IMAGE }}
           T5X_IMAGE=${T5X_IMAGE:-${{ env.DEFAULT_T5X_IMAGE }}}
           echo "T5X_IMAGE=${T5X_IMAGE}" >> $GITHUB_OUTPUT
-          echo "PUBLISH=${{ inputs.PUBLISH }}" >> $GITHUB_OUTPUT
-          echo "PUBLISH_RESULTS=${{ (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' }} >> $GITHUB_OUTPUT"
-          echo "PUBLISH_CONTAINER=${{ ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH)) }} >> $GITHUB_OUTPUT"
-          echo "SHOULD_TRIAGE=${{ ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch') }} >> $GITHUB_OUTPUT"
+
+      - name: Set result publishing flags
+        id: publish
+        shell: bash -x -e {0}
+        run: |
+          echo "PUBLISH=${{ github.event_name == 'workflow_run' || inputs.PUBLISH == true || inputs.PUBLISH == 'true' }}" >> $GITHUB_OUTPUT
 
   run-jobs:
     needs: metadata
     uses: ./.github/workflows/_test_t5x.yaml
-    if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch'
     with:
       T5X_IMAGE: ${{ needs.metadata.outputs.T5X_IMAGE }}
     secrets: inherit
 
-  publish:
+  tensorboard-upload:
     needs: [metadata, run-jobs]
     uses: ./.github/workflows/_publish_t5x_results.yaml
-    if: needs.metadata.outputs.PUBLISH_RESULTS == 'true'
     with:
       BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }}
       EXPERIMENT_SUBDIR: T5X
     secrets: inherit
 
-  publish-verified:
-    if: needs.run-jobs.outputs.TEST_STATUS == 'success' && needs.metadata.outputs.PUBLISH_CONTAINER == 'true'
+  publish-verified-image:
     needs: [metadata, run-jobs]
+    if: needs.metadata.outputs.PUBLISH == 'true' && needs.run-jobs.outputs.TEST_STATUS == 'success'
     uses: ./.github/workflows/_publish_container.yaml
     secrets: inherit
     with:
@@ -83,10 +99,10 @@ jobs:
       TARGET_TAGS: |
         type=raw,value=latest-verified,priority=1000
 
-  triage:
+  failure-triage:
     needs: [metadata, run-jobs]
+    if: needs.run-jobs.outputs.TEST_STATUS != 'success'
     uses: ./.github/workflows/_triage.yaml
-    if: needs.run-jobs.outputs.TEST_STATUS != 'success' && needs.metadata.outputs.SHOULD_TRIAGE == 'true'
     secrets: inherit
     with:
       BROKEN_IMAGE: ${{ needs.metadata.outputs.T5X_IMAGE }}
@@ -94,16 +110,10 @@ jobs:
       REPO_DIRS: "/opt/t5x /opt/flax"
       FILE_ISSUE: true
 
-  if-upstream-failed:
-    runs-on: ubuntu-latest
-    if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'failure') && github.event_name != 'workflow_dispatch'
-    steps:
-      - run: echo 'Upstream workflow failed, aborting run' && exit 1
-
   finalize:
-    if: always()
+    if: "!cancelled()"
     needs: [metadata, run-jobs]
     uses: ./.github/workflows/_finalize.yaml
     with:
-      PUBLISH_BADGE: ${{ github.event_name == 'workflow_run' || needs.metadata.outputs.PUBLISH == 'true' }}
+      PUBLISH_BADGE: ${{ needs.metadata.outputs.PUBLISH == 'true' }}
     secrets: inherit