Skip to content

Nightly Containers on CUDA 12.1 (JAX pinned) (schedule) #37

Nightly Containers on CUDA 12.1 (JAX pinned) (schedule)

Nightly Containers on CUDA 12.1 (JAX pinned) (schedule) #37

# Workflow title as listed in the Actions UI.
name: 'Nightly Containers on CUDA 12.1 (JAX pinned)'
# Per-run title: the workflow name followed by "(nightly <created_at>)" when
# the triggering event is workflow_run, otherwise by the raw event name.
run-name: >-
  Nightly Containers on CUDA 12.1 (JAX pinned)
  (${{ github.event_name == 'workflow_run' && format('nightly {0}', github.event.workflow_run.created_at) || github.event_name }})
# Triggers: a nightly cron schedule plus manual dispatch with overridable
# image and repository pins.
on:
  schedule:
    # 09:30 UTC == 01:30 AM Pacific Standard Time
    - cron: '30 9 * * *'
  workflow_dispatch:
    inputs:
      # Image that the T5X and PAX builds are layered on.
      JAX_BASE_IMAGE:
        description: 'Base Multiarch JAX Image'
        type: string
        required: true
        default: 'ghcr.io/nvidia/jax-toolbox-internal:6473019396-jax-multiarch'
      # T5X source repository and the ref to check out from it.
      REPO_T5X:
        description: 'URL of T5X repository to check out'
        type: string
        required: false
        default: 'https://github.com/nvjax-svc-0/t5x.git'
      REF_T5X:
        description: 'Git commit, tag, or branch for T5X'
        type: string
        required: false
        default: 'unpin-tfds-gpu-extra'
      # TransformerEngine source repository and the ref to check out from it.
      REPO_TE:
        description: 'URL of TE repository to check out'
        type: string
        required: false
        default: 'https://github.com/NVIDIA/TransformerEngine.git'
      REF_TE:
        description: 'Git commit, tag, or branch for TE'
        type: string
        required: false
        default: 'v0.13'
      PUBLISH:
        description: "Publish dated images and update the 'latest' tag?"
        type: boolean
        required: false
        default: false
# Runs are grouped by workflow name plus the head ref, falling back to the run
# id when no head ref is set. An in-progress run in the same group is
# cancelled unless the run is on refs/heads/main.
concurrency:
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
# Token scopes granted to the jobs of this workflow.
permissions:
  # to cancel previous workflows
  actions: write
  # to fetch code
  contents: read
  # to upload container
  packages: write
# Fallback values used by the metadata job whenever the matching
# workflow_dispatch input is empty (for example on scheduled runs).
env:
  DEFAULT_JAX_BASE_IMAGE: 'ghcr.io/nvidia/jax-toolbox-internal:6473019396-jax-multiarch'
  DEFAULT_REPO_T5X: 'https://github.com/nvjax-svc-0/t5x.git'
  DEFAULT_REF_T5X: 'unpin-tfds-gpu-extra'
  DEFAULT_REPO_TE: 'https://github.com/NVIDIA/TransformerEngine.git'
  DEFAULT_REF_TE: 'v0.13'
jobs:
# Resolves the effective build parameters once so every downstream job reads
# the same values: the build date (US/Los_Angeles calendar day) and, for each
# dispatch input, either the supplied value or its DEFAULT_* fallback from the
# workflow-level env.
metadata:
  runs-on: ubuntu-22.04
  outputs:
    BUILD_DATE: ${{ steps.meta.outputs.BUILD_DATE }}
    JAX_BASE_IMAGE: ${{ steps.meta.outputs.JAX_BASE_IMAGE }}
    REPO_T5X: ${{ steps.meta.outputs.REPO_T5X }}
    REF_T5X: ${{ steps.meta.outputs.REF_T5X }}
    REPO_TE: ${{ steps.meta.outputs.REPO_TE }}
    REF_TE: ${{ steps.meta.outputs.REF_TE }}
  steps:
    - name: Set build date and base image
      id: meta
      shell: bash -x -e {0}
      # Inputs are handed to the script as environment variables rather than
      # being spliced into the script text, so a value containing quotes or
      # shell metacharacters cannot alter the commands that run.
      env:
        INPUT_JAX_BASE_IMAGE: ${{ inputs.JAX_BASE_IMAGE }}
        INPUT_REPO_T5X: ${{ inputs.REPO_T5X }}
        INPUT_REF_T5X: ${{ inputs.REF_T5X }}
        INPUT_REPO_TE: ${{ inputs.REPO_TE }}
        INPUT_REF_TE: ${{ inputs.REF_TE }}
      run: |
        BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d')
        # ${VAR:-fallback} yields the fallback when VAR is unset or empty,
        # which is the case for every input on scheduled runs.
        {
          echo "BUILD_DATE=${BUILD_DATE}"
          echo "JAX_BASE_IMAGE=${INPUT_JAX_BASE_IMAGE:-${DEFAULT_JAX_BASE_IMAGE}}"
          echo "REPO_T5X=${INPUT_REPO_T5X:-${DEFAULT_REPO_T5X}}"
          echo "REF_T5X=${INPUT_REF_T5X:-${DEFAULT_REF_T5X}}"
          echo "REPO_TE=${INPUT_REPO_TE:-${DEFAULT_REPO_TE}}"
          echo "REF_TE=${INPUT_REF_TE:-${DEFAULT_REF_TE}}"
        } >> "$GITHUB_OUTPUT"
# Calls the reusable _build_pax.yaml workflow with the resolved build date and
# JAX base image from the metadata job.
build-pax:
  uses: ./.github/workflows/_build_pax.yaml
  needs:
    - metadata
  secrets: inherit
  with:
    BASE_IMAGE: ${{ needs.metadata.outputs.JAX_BASE_IMAGE }}
    BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }}
# Calls the reusable _build_rosetta.yaml workflow on top of the image tags
# produced by build-pax, with BASE_LIBRARY set to pax and amd64 as the only
# platform.
build-rosetta-pax:
  uses: ./.github/workflows/_build_rosetta.yaml
  needs:
    - metadata
    - build-pax
  secrets: inherit
  with:
    BASE_LIBRARY: pax
    BASE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAGS }}
    BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }}
    # JSON list passed as a string.
    PLATFORMS: '["amd64"]'
# Calls the reusable _build_t5x.yaml workflow with the resolved build date,
# JAX base image, and the T5X / TransformerEngine repository pins from the
# metadata job.
build-t5x:
  uses: ./.github/workflows/_build_t5x.yaml
  needs:
    - metadata
  secrets: inherit
  with:
    BASE_IMAGE: ${{ needs.metadata.outputs.JAX_BASE_IMAGE }}
    BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }}
    REPO_T5X: ${{ needs.metadata.outputs.REPO_T5X }}
    REF_T5X: ${{ needs.metadata.outputs.REF_T5X }}
    REPO_TE: ${{ needs.metadata.outputs.REPO_TE }}
    REF_TE: ${{ needs.metadata.outputs.REF_TE }}
# Calls the reusable _build_rosetta.yaml workflow on top of the image tags
# produced by build-t5x, with BASE_LIBRARY set to t5x and amd64 as the only
# platform.
build-rosetta-t5x:
  uses: ./.github/workflows/_build_rosetta.yaml
  needs:
    - metadata
    - build-t5x
  secrets: inherit
  with:
    BASE_LIBRARY: t5x
    BASE_IMAGE: ${{ needs.build-t5x.outputs.DOCKER_TAGS }}
    BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }}
    # JSON list passed as a string.
    PLATFORMS: '["amd64"]'
# Writes a Markdown table of the input image and every built image tag to the
# job summary. `if: always()` makes it run even when a build job failed, in
# which case the corresponding cells are simply empty.
build-summary:
  runs-on: ubuntu-22.04
  if: always()
  needs:
    - metadata
    - build-t5x
    - build-rosetta-t5x
    - build-pax
    - build-rosetta-pax
  steps:
    - name: Generate job summary for container build
      shell: bash -x -e {0}
      run: |
        cat << EOF > $GITHUB_STEP_SUMMARY
        # Images created
        | Image | Link |
        | ------------ | -------------------------------------------------- |
        | JAX (input) | ${{ needs.metadata.outputs.JAX_BASE_IMAGE }} |
        | T5X | ${{ needs.build-t5x.outputs.DOCKER_TAGS }} |
        | ROSETTA(T5X) | ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAGS }} |
        | PAX | ${{ needs.build-pax.outputs.DOCKER_TAGS }} |
        | ROSETTA(pax) | ${{ needs.build-rosetta-pax.outputs.DOCKER_TAGS }} |
        EOF
# Calls the reusable _test_jax.yaml workflow against the resolved JAX base
# image.
test-jax:
  uses: ./.github/workflows/_test_jax.yaml
  needs:
    - metadata
  secrets: inherit
  with:
    JAX_IMAGE: ${{ needs.metadata.outputs.JAX_BASE_IMAGE }}
# Calls the reusable _test_pax.yaml workflow against the image tags produced
# by build-pax.
test-pax:
  uses: ./.github/workflows/_test_pax.yaml
  needs:
    - build-pax
  secrets: inherit
  with:
    PAX_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAGS }}
# Calls the reusable _test_t5x.yaml workflow against the image tags produced
# by build-t5x.
test-t5x:
  uses: ./.github/workflows/_test_t5x.yaml
  needs:
    - build-t5x
  secrets: inherit
  with:
    T5X_IMAGE: ${{ needs.build-t5x.outputs.DOCKER_TAGS }}
# TODO(terry): The rosetta tests are missing here; they can only be added
# after a fix for the TB log collision is pushed.
# Hands off to the reusable _finalize.yaml workflow once the build summary and
# every test job have completed. `if: always()` makes it run even when one of
# those jobs failed. Badge publishing is disabled for this workflow.
finalize:
  if: always()
  # TODO: use dynamic matrix to make dependencies self-updating
  # Until then, keep this list in sync with the test-* jobs by hand: a test
  # job that is not listed here may still be running when finalization starts.
  needs: [build-summary, test-jax, test-pax, test-t5x]
  uses: ./.github/workflows/_finalize.yaml
  with:
    PUBLISH_BADGE: false
  secrets: inherit